Skip to content

Commit b683084

Browse files
committed
init
1 parent 349be34 commit b683084

9 files changed

Lines changed: 464 additions & 0 deletions

File tree

classify.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from mode import Mode
2+
3+
class Classify(Mode):
    """Command-line mode that loads a single file for classification.

    Invoked as ``<prog> classify <file>``.
    """

    def name(self):
        """Return the name of this mode."""
        return 'classify'

    def validate(self, args):
        """Check the argument vector and read the file to classify.

        Args:
            args: the full argument vector; ``args[0]`` is the program
                name, ``args[1]`` the mode name and ``args[2]`` the path
                of the file to read.

        Returns:
            A callable that accepts a one-argument function and returns
            the result of applying it to the file contents.

        Raises:
            ValueError: if the number of arguments is not exactly three,
                or the file cannot be read.
        """
        # Built up front so both failure paths can report it; the read-error
        # path previously referenced this name without ever defining it.
        usage = 'Usage: %s classify <file>' % args[0]

        if len(args) != 3:
            raise ValueError(usage)

        try:
            # The context manager closes the handle even if read() fails.
            with open(args[2], 'r') as source:
                file_contents = source.read()
        except (IOError, OSError, UnicodeError) as e:
            raise ValueError(usage + '\nUnable to read specified file "%s", the error message was: %s' % (args[2], e))

        # Kept on the instance as well, mirroring how Learn.validate stores
        # its inputs for a later execute() call.
        self.file_contents = file_contents
        return lambda f: f(file_contents)

    def execute(self):
        """Run the mode; currently this only prints the mode name."""
        print('classify')

db.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import sqlite3
2+
3+
'''
4+
create table ok_words(word, count);
5+
create table scam_words(word, count);
6+
create table ad_counts(is_scam, count);
7+
insert into ad_counts (is_scam, count) values (1, 0);
8+
insert into ad_counts (is_scam, count) values (0, 0);
9+
10+
create index i1 on ok_words(word);
11+
create index i2 on scam_words(word);
12+
13+
P(W|L) = P(L|W) . P(W) / P(L)
14+
15+
16+
delete from ok_words;
17+
delete from scam_words;
18+
update ad_counts set count = 0;
19+
'''
20+
21+
class Db:
    """Access layer for the SQLite database holding word and advert counts.

    The database is expected to contain the tables ``ok_words(word, count)``,
    ``scam_words(word, count)`` and ``ad_counts(is_scam, count)``; this class
    does not create them.

    Every write runs inside ``with self.conn:`` so that it is committed when
    it succeeds and rolled back when it raises, instead of committing a
    half-applied change.
    """

    # The only table names that may be spliced into SQL text. SQL parameters
    # cannot stand in for identifiers, so the name is checked against this.
    _WORD_TABLES = ('scam_words', 'ok_words')

    def __init__(self, path='./scam.db'):
        """Open the database.

        Args:
            path: location of the SQLite file; defaults to ``./scam.db``.
                ``':memory:'`` gives a private in-memory database.
        """
        self.conn = sqlite3.connect(path)

    def reset(self):
        """Delete all word rows and set every advert count back to zero."""
        with self.conn:
            self.conn.execute('delete from ok_words')
            self.conn.execute('delete from scam_words')
            self.conn.execute('update ad_counts set count = 0')

    def get_table_name(self, is_scam):
        """Return the word table used for scam (truthy) or non-scam data."""
        if is_scam:
            return 'scam_words'
        return 'ok_words'

    def update_word_count(self, c, word, num_to_add_to_count, table_name):
        """Add ``num_to_add_to_count`` to the stored count of ``word``.

        A row is inserted when the word is not present yet. The caller owns
        the cursor and the surrounding transaction.

        Args:
            c: an open cursor on this database.
            word: the word whose count changes.
            num_to_add_to_count: amount to add to the stored count.
            table_name: ``'scam_words'`` or ``'ok_words'``.

        Raises:
            ValueError: if ``table_name`` is not one of the word tables.
        """
        if table_name not in self._WORD_TABLES:
            raise ValueError('Unknown word table: %r' % (table_name,))

        # One UPDATE does the increment inside SQLite instead of reading the
        # value into Python and writing it back.
        c.execute('update ' + table_name + ' set count = count + ? where word=?', (num_to_add_to_count, word))
        if c.rowcount == 0:
            c.execute('insert into ' + table_name + ' (count, word) values (?,?)', (num_to_add_to_count, word))

    def update_word_counts(self, d, is_scam):
        """Add every ``word -> count`` pair of ``d`` to the matching table.

        All words are applied in a single transaction.
        """
        table_name = self.get_table_name(is_scam)
        with self.conn:
            c = self.conn.cursor()
            try:
                for word, count in d.items():
                    self.update_word_count(c, word, count, table_name)
            finally:
                c.close()

    def get_ad_count(self, is_scam):
        """Return the stored advert count for scam or non-scam adverts.

        Returns 0 when the ``ad_counts`` table has no row for ``is_scam``.
        """
        c = self.conn.cursor()
        try:
            c.execute('select count from ad_counts where is_scam=?', (is_scam,))
            row = c.fetchone()
        finally:
            c.close()
        return row[0] if row else 0

    def get_word_count(self, word, is_scam):
        """Return the stored count of ``word``, or 0 if it is not stored."""
        table_name = self.get_table_name(is_scam)
        c = self.conn.cursor()
        try:
            c.execute('select count from ' + table_name + ' where word=?', (word,))
            row = c.fetchone()
        finally:
            c.close()
        return row[0] if row else 0

    def update_ad_count(self, num_new_ads, is_scam_data):
        """Add ``num_new_ads`` to the advert count for the given category.

        The row is created when the category has no row yet.
        """
        with self.conn:
            c = self.conn.cursor()
            try:
                c.execute('update ad_counts set count = count + ? where is_scam=?', (num_new_ads, is_scam_data))
                if c.rowcount == 0:
                    c.execute('insert into ad_counts (is_scam, count) values (?,?)', (is_scam_data, num_new_ads))
            finally:
                c.close()
93+

learn.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from mode import Mode
2+
3+
class Learn(Mode):
    """Command-line mode that loads a labelled training file.

    Invoked as ``<prog> learn scam|nonscam <file> <count>``.
    """

    def name(self):
        """Return the name of this mode."""
        return 'learn'

    def validate(self, args):
        """Check the argument vector and store the parsed inputs on ``self``.

        On success sets ``self.file_contents`` (text of the file),
        ``self.count`` (the integer count argument) and ``self.is_scam``
        (True when the document type is ``scam``).

        Args:
            args: the full argument vector: program name, mode name,
                document type, file path and count.

        Raises:
            ValueError: if the number of arguments is not five, the document
                type is not recognised, the file cannot be read, or the
                count is not an integer.
        """
        scam_type = 'scam'
        non_scam_type = 'nonscam'
        usage = 'Usage: %s learn %s|%s <file> <count>' % (args[0], scam_type, non_scam_type)

        if len(args) != 5:
            raise ValueError(usage)

        learn_type = args[2]
        if learn_type not in (scam_type, non_scam_type):
            raise ValueError(usage + '\nInvalid document type argument, expected "%s" or "%s"' % (scam_type, non_scam_type))

        try:
            # The context manager closes the handle even if read() fails.
            with open(args[3], 'r') as source:
                file_contents = source.read()
        except (IOError, OSError, UnicodeError) as e:
            raise ValueError(usage + '\nUnable to read specified file "%s", the error message was: %s' % (args[3], e))

        try:
            count = int(args[4])
        except (TypeError, ValueError):
            # Only conversion failures are translated; anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            raise ValueError(usage + '\nEnter an integer value for the "count" parameter')

        self.file_contents = file_contents
        self.count = count
        self.is_scam = learn_type == scam_type

    def execute(self):
        """Run the mode; currently this only prints the mode name."""
        print('learn')

mbox_extract.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import mailbox
2+
3+
def getcharsets(msg):
    """Return the set of charset names reported by ``msg.get_charsets()``.

    Entries that are ``None`` (parts declaring no charset) are left out.
    """
    return {name for name in msg.get_charsets() if name is not None}
9+
10+
def handleerror(errmsg, emailmsg, cs):
    """Print a report about a decoding failure to standard output.

    The report consists of a blank line, ``errmsg``, the charset ``cs``
    that was being tried, every charset found in ``emailmsg``, and the
    message's subject and sender headers.
    """
    print()
    print(errmsg)
    print("This error occurred while decoding with ", cs, " charset.")
    found_charsets = getcharsets(emailmsg)
    print("These charsets were found in the one email.", found_charsets)
    subject = emailmsg['subject']
    print("This is the subject:", subject)
    sender = emailmsg['From']
    print("This is the sender:", sender)
17+
18+
def getbodyfromemail(msg):
    """Return the text/plain body of ``msg`` as text, or None if it has none.

    When the message contains several text/plain parts the last one in
    traversal order is used. The body is decoded with the charset declared
    by that same part, not with every charset found anywhere in the
    message.

    Args:
        msg: an ``email.message.Message`` (for example one item of a
            ``mailbox.mbox``).

    Returns:
        The decoded body, or None when no text/plain part is present. If
        the part declares no charset, or its declared charset fails, the
        bytes are decoded as UTF-8 with undecodable bytes replaced.
    """
    payload = None
    charset = None

    # walk() visits the message itself and every nested part depth-first,
    # so one pass covers plain, multipart and nested-multipart messages.
    for part in msg.walk():
        if part.get_content_type() != 'text/plain':
            continue
        candidate = part.get_payload(decode=True)
        if candidate is None:
            continue
        # A later text/plain part replaces an earlier one.
        payload = candidate
        charset = part.get_content_charset()

    if payload is None:
        return None

    if charset is not None:
        try:
            return payload.decode(charset)
        except (UnicodeDecodeError, LookupError) as err:
            # LookupError covers charset names Python has no codec for.
            handleerror("%s: encountered." % type(err).__name__, msg, charset)

    # Best effort: never hand back undecoded bytes.
    return payload.decode('utf-8', 'replace')
51+
52+
53+
import sys

# Script body: print the first 1000 characters of every message body found
# in the mbox file named below.
mboxfile = 'allspam.txt'
print(mboxfile)
for thisemail in mailbox.mbox(mboxfile):
    try:
        body = getbodyfromemail(thisemail)
        if body:
            print(body[0:1000])
    except Exception as err:
        # Still best-effort (one bad message must not stop the run), but the
        # failure is reported on stderr instead of vanishing, and
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        sys.stderr.write('Skipping message: %s\n' % (err,))

mode.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
class Mode(object):
    """Base class for the command-line modes.

    Subclasses override all three methods; each default implementation
    raises NotImplementedError.
    """

    def validate(self, args=None):
        """Check the argument vector for this mode.

        Args:
            args: the full argument vector. It is accepted here because
                subclasses define ``validate(self, args)`` and are called
                that way; the default keeps a bare ``validate()`` call
                working too.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def execute(self):
        """Run the mode.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def name(self):
        """Return the mode's name.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

reset.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from mode import Mode
2+
from status import Status
3+
from db import Db
4+
5+
class Reset(Mode):
    """Command-line mode that calls ``Db.reset()`` and then runs Status.

    Invoked as ``<prog> reset``.
    """

    def name(self):
        """Return the name of this mode."""
        return 'reset'

    def validate(self, args):
        """Raise ValueError unless ``args`` holds exactly two entries."""
        arg_count_ok = len(args) == 2
        if not arg_count_ok:
            usage = 'Usage: %s reset' % args[0]
            raise ValueError(usage)

    def execute(self):
        """Reset the database, report completion, then run the status mode."""
        database = Db()
        database.reset()
        print('Reset Complete')
        status_mode = Status()
        status_mode.execute()

scam.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import sys
2+
import re
3+
import sqlite3
4+
import getopt
5+
import codecs
6+
from collections import defaultdict
7+
from learn import Learn
8+
from classify import Classify
9+
from reset import Reset
10+
from status import Status
11+
from db import Db
12+
13+
'''
14+
create table ok_words(word, count);
15+
create table scam_words(word, count);
16+
create table ad_counts(is_scam, count);
17+
insert into ad_counts (is_scam, count) values (1, 0);
18+
insert into ad_counts (is_scam, count) values (0, 0);
19+
20+
create index i1 on ok_words(word);
21+
create index i2 on scam_words(word);
22+
23+
P(W|L) = P(L|W) . P(W) / P(L)
24+
25+
26+
delete from ok_words;
27+
delete from scam_words;
28+
update ad_counts set count = 0;
29+
30+
'''
31+
# Common English words to leave out of the word counts: cleanUpWord, in the
# disabled legacy code kept in the string literal below, returns None for any
# word found in this tuple. No live code visible in this module reads it.
commonWords = ('the','be','to','of','and','a','in','that','have','it','is','im','are','was','for','on','with','he','as','you','do','at','this','but','his','by','from','they','we','say','her','she','or','an','will','my','one','all','would','there','their','what','so','up','out','if','about','who','get','which','go','me','when','make','can','like','time','just','him','know','take','person','into','year','your','some','could','them','see','other','than','then','now','look','only','come','its','over','think','also','back','after','use','two','how','our','way','even','because','any','these','us')
32+
'''
33+
def cleanUpWord(word):
34+
word = re.sub('\W', '', word.lower())
35+
if (len(word) < 2):
36+
return None
37+
elif (word.isdigit()):
38+
return None
39+
elif (word in commonWords):
40+
return None
41+
42+
return word
43+
44+
45+
def update_db_with_dict(d, ad_count, is_scam_data):
46+
db = Db()
47+
db.update_ad_count(ad_count, is_scam_data)
48+
db.update_word_counts(d, is_scam_data)
49+
50+
def add_words_to_dict(l, d):
51+
for word in l:
52+
d[word] += 1
53+
54+
def ad_to_word_list(ad_text):
55+
cleaned_words = map(cleanUpWord, ad_text.split(' '))
56+
return filter(lambda word : word and (len(word) > 0), cleaned_words)
57+
58+
def file_to_ad_list(file_name):
59+
return codecs.open(file_name, 'r', 'utf-8').read().split('\n')
60+
61+
def process_file(file, is_scam_data):
62+
print 'Processing ', file
63+
ad_list = file_to_ad_list(file)
64+
ad_count = len(ad_list)
65+
print 'Found ',ad_count,'adverts in file - building dictionary...'
66+
d = defaultdict(int)
67+
i=1
68+
for ad_text in ad_list:
69+
word_list = ad_to_word_list(ad_text)
70+
add_words_to_dict(word_list, d)
71+
i += 1
72+
if i % 10000 == 0:
73+
print 'Processed ',i
74+
if i > 100000:
75+
break
76+
77+
update_db_with_dict(d, ad_count, is_scam_data)
78+
79+
def learn(non_scam_file, scam_file):
80+
process_file(non_scam_file, False)
81+
process_file(scam_file, True)
82+
83+
def calc_p_word(word):
84+
pass
85+
86+
def calc_p_words(p_word_list):
87+
pass
88+
89+
def get_total_word_count(c, for_scams):
90+
pass
91+
92+
def classify(input_file):
93+
text = open(input_file).read()
94+
words = ad_to_word_list(text)
95+
96+
conn = sqlite3.connect('./scam.db')
97+
c = conn.cursor()
98+
scam_word_count = get_ad_count(c, True)
99+
ok_word_count = get_ad_count(c, False)
100+
101+
102+
p_word_list = []
103+
for word in words:
104+
p_word_list.append(calc_p_word(word))
105+
106+
return calc_p_words(p_word_list)
107+
108+
if __name__ == '__main__':
109+
if (len(sys.argv) != 2):
110+
print 'Usage: %s <input file>' % sys.argv[0]
111+
112+
else:
113+
input_file = sys.argv[1]
114+
classify(input_file)
115+
116+
'''
117+
118+
# Registry of available modes: lower-cased class name -> mode class.
modes = {}


def register_mode(mode_class):
    """Add ``mode_class`` to ``modes`` under its lower-cased class name."""
    key = mode_class.__name__.lower()
    modes[key] = mode_class
122+
123+
if __name__ == '__main__':
    # Entry point: register the modes, pick the one named by the first
    # command-line argument, then validate and run it.
    for mode_class in (Learn, Classify, Reset, Status):
        register_mode(mode_class)

    args = sys.argv
    usage = 'Usage: %s %s <mode specific args>' % (args[0], '|'.join(modes.keys()))

    try:
        if len(args) < 2:
            raise ValueError(usage)

        mode_name = args[1]
        if mode_name not in modes:
            raise ValueError(usage + '\nUnrecognised mode: ' + mode_name)

        mode = modes[mode_name]()
        mode.validate(args)
        mode.execute()

    except Exception as e:
        # Top-level boundary: report the problem on stderr and exit non-zero
        # so that shells and scripts can tell the run failed.
        sys.stderr.write('%s\n' % (e,))
        sys.exit(1)

0 commit comments

Comments
 (0)