init

codebox · codebox · commit b683084759ef · 2012-11-18T16:55:21.000Z
diff --git a/classify.py b/classify.py
@@ -0,0 +1,20 @@
+from mode import Mode
+
+class Classify(Mode):
+	def name(self):
+		return 'classify'
+
+	def validate(self, args):
+		if len(args) != 3:
+			raise ValueError('Usage: %s classify <file>' % args[0])
+
+		file_contents = None
+		try:
+			file_contents = open(args[2], 'r').read()
+		except Exception as e:
+			raise ValueError(usage + '\nUnable to read specified file "%s", the error message was: %s' % (args[2], e))
+		
+		return lambda f: f(file_contents)
+
+	def execute(self):
+		print 'classify'
diff --git a/db.py b/db.py
@@ -0,0 +1,93 @@
+import sqlite3
+
+'''
+create table ok_words(word, count);
+create table scam_words(word, count);
+create table ad_counts(is_scam, count);
+insert into ad_counts (is_scam, count) values (1, 0);
+insert into ad_counts (is_scam, count) values (0, 0);
+
+create index i1 on ok_words(word);
+create index i2 on scam_words(word);
+
+P(W|L) = P(L|W) . P(W) / P(L)
+
+
+delete from ok_words;
+delete from scam_words;
+update ad_counts set count = 0;
+'''
+
+class Db:
+	def __init__(self):
+		self.conn = sqlite3.connect('./scam.db')
+
+	def reset(self):
+		c = self.conn.cursor()
+		try:
+			c.execute('delete from ok_words')
+			c.execute('delete from scam_words')
+			c.execute('update ad_counts set count = 0')
+
+		finally:
+			c.close()
+			self.conn.commit()
+
+	def get_table_name(self, is_scam):
+		if is_scam:
+			return 'scam_words'
+		else:
+			return 'ok_words'
+
+	def update_word_count(self, c, word, num_to_add_to_count, table_name):
+		c.execute('select count from ' + table_name + ' where word=?', (word,))
+		r = c.fetchone()
+		if r:
+			c.execute('update ' + table_name + ' set count=? where word=?', (r[0] + num_to_add_to_count, word))
+		else:
+			c.execute('insert into ' + table_name + ' (count, word) values (?,?)', (num_to_add_to_count, word))
+
+	def update_word_counts(self, d, is_scam):
+		table_name = self.get_table_name(is_scam)
+		c = self.conn.cursor()
+		try:
+			for word, count in d.items():
+				self.update_word_count(c, word, count, table_name)
+		finally:
+			c.close()
+			self.conn.commit()
+
+	def get_ad_count(self, is_scam):
+		c = self.conn.cursor()
+		try:
+			c.execute('select count from ad_counts where is_scam=?', (is_scam,))
+			return c.fetchone()[0]
+		finally:
+			c.close()
+			self.conn.commit()
+		
+	def get_word_count(self, word, is_scam):
+		table_name = self.get_table_name(is_scam)
+		c = self.conn.cursor()
+		try:
+			c.execute('select count from ' + table_name + ' where word=?', (word,))
+			r = c.fetchone()
+			if r:
+				return r[0]
+			else:
+				return 0
+
+		finally:
+			c.close()
+			self.conn.commit()
+
+	def update_ad_count(self, num_new_ads, is_scam_data):
+		c = self.conn.cursor()
+		try:
+			current_count = self.get_ad_count(is_scam_data)
+			c.execute('update ad_counts set count=? where is_scam=?', (current_count + num_new_ads, is_scam_data))
+				
+		finally:
+			c.close()
+			self.conn.commit()
+
diff --git a/learn.py b/learn.py
@@ -0,0 +1,38 @@
+from mode import Mode
+
+class Learn(Mode):
+	def name(self):
+		return 'learn'
+
+	def validate(self, args):
+		scam_type = 'scam'
+		non_scam_type = 'nonscam'
+		valid_args = False
+		usage = 'Usage: %s learn %s|%s <file> <count>' % (args[0], scam_type, non_scam_type)
+
+		if len(args) == 5:
+			learn_type = args[2]
+			if learn_type not in [scam_type, non_scam_type]:
+				raise ValueError(usage + '\nInvalid document type argument, expected "%s" or "%s"' % (scam_type, non_scam_type))
+			
+			file_contents = None
+			try:
+				file_contents = open(args[3], 'r').read()
+			except Exception as e:
+				raise ValueError(usage + '\nUnable to read specified file "%s", the error message was: %s' % (args[3], e))
+
+			count = 0
+			try:
+				count = int(args[4])
+			except:
+				raise ValueError(usage + '\nEnter an integer value for the "count" parameter')			
+
+			self.file_contents = file_contents
+			self.count = count
+			self.is_scam = learn_type == scam_type
+
+		else:
+			raise ValueError(usage)				
+
+	def execute(self):
+		print 'learn'
diff --git a/mbox_extract.py b/mbox_extract.py
@@ -0,0 +1,61 @@
+import mailbox
+
+def getcharsets(msg):
+    charsets = set({})
+    for c in msg.get_charsets():
+        if c is not None:
+            charsets.update([c])
+    return charsets
+
+def handleerror(errmsg, emailmsg,cs):
+    print()
+    print(errmsg)
+    print("This error occurred while decoding with ",cs," charset.")
+    print("These charsets were found in the one email.",getcharsets(emailmsg))
+    print("This is the subject:",emailmsg['subject'])
+    print("This is the sender:",emailmsg['From'])
+
+def getbodyfromemail(msg):
+    body = None
+    #Walk through the parts of the email to find the text body.    
+    if msg.is_multipart():    
+        for part in msg.walk():
+
+            # If part is multipart, walk through the subparts.            
+            if part.is_multipart(): 
+
+                for subpart in part.walk():
+                    if subpart.get_content_type() == 'text/plain':
+                        # Get the subpart payload (i.e the message body)
+                        body = subpart.get_payload(decode=True) 
+                        #charset = subpart.get_charset()
+
+            # Part isn't multipart so get the email body
+            elif part.get_content_type() == 'text/plain':
+                body = part.get_payload(decode=True)
+                #charset = part.get_charset()
+
+    # If this isn't a multi-part message then get the payload (i.e the message body)
+    elif msg.get_content_type() == 'text/plain':
+        body = msg.get_payload(decode=True) 
+
+   # No checking done to match the charset with the correct part. 
+    for charset in getcharsets(msg):
+        try:
+            body = body.decode(charset)
+        except UnicodeDecodeError:
+            handleerror("UnicodeDecodeError: encountered.",msg,charset)
+        except AttributeError:
+             handleerror("AttributeError: encountered" ,msg,charset)
+    return body    
+
+
+mboxfile = 'allspam.txt'
+print(mboxfile)
+for thisemail in mailbox.mbox(mboxfile):
+    try:
+        body = getbodyfromemail(thisemail)
+        if body:
+            print(body[0:1000])
+    except:
+        pass
diff --git a/mode.py b/mode.py
@@ -0,0 +1,9 @@
+class Mode:
+	def validate(self):
+		raise NotImplementedError()
+
+	def execute(self):
+		raise NotImplementedError()
+
+	def name(self):
+		raise NotImplementedError()
diff --git a/reset.py b/reset.py
@@ -0,0 +1,16 @@
+from mode import Mode
+from status import Status
+from db import Db
+
+class Reset(Mode):
+	def name(self):
+		return 'reset'
+
+	def validate(self, args):
+		if len(args) != 2:
+			raise ValueError('Usage: %s reset' % args[0])
+
+	def execute(self):
+		Db().reset()
+		print 'Reset Complete'
+		Status().execute()
diff --git a/scam.py b/scam.py
@@ -0,0 +1,145 @@
+import sys
+import re
+import sqlite3
+import getopt
+import codecs
+from collections import defaultdict
+from learn import Learn
+from classify import Classify
+from reset import Reset
+from status import Status
+from db import Db
+
+'''
+create table ok_words(word, count);
+create table scam_words(word, count);
+create table ad_counts(is_scam, count);
+insert into ad_counts (is_scam, count) values (1, 0);
+insert into ad_counts (is_scam, count) values (0, 0);
+
+create index i1 on ok_words(word);
+create index i2 on scam_words(word);
+
+P(W|L) = P(L|W) . P(W) / P(L)
+
+
+delete from ok_words;
+delete from scam_words;
+update ad_counts set count = 0;
+
+'''
+# Lower-case words to ignore. In this file the tuple is referenced only by
+# the disabled cleanUpWord() inside the string literal that follows.
+commonWords = ('the','be','to','of','and','a','in','that','have','it','is','im','are','was','for','on','with','he','as','you','do','at','this','but','his','by','from','they','we','say','her','she','or','an','will','my','one','all','would','there','their','what','so','up','out','if','about','who','get','which','go','me','when','make','can','like','time','just','him','know','take','person','into','year','your','some','could','them','see','other','than','then','now','look','only','come','its','over','think','also','back','after','use','two','how','our','way','even','because','any','these','us')
+'''
+def cleanUpWord(word):
+	word = re.sub('\W', '', word.lower())
+	if (len(word) < 2):
+		return None
+	elif (word.isdigit()):
+		return None
+	elif (word in commonWords):
+		return None
+	
+	return word
+
+
+def update_db_with_dict(d, ad_count, is_scam_data):
+	db = Db()
+	db.update_ad_count(ad_count, is_scam_data)
+	db.update_word_counts(d, is_scam_data)
+
+def add_words_to_dict(l, d):
+	for word in l:
+		d[word] += 1
+
+def ad_to_word_list(ad_text):
+	cleaned_words = map(cleanUpWord, ad_text.split(' '))
+	return filter(lambda word : word and (len(word) > 0), cleaned_words)
+
+def file_to_ad_list(file_name):
+	return codecs.open(file_name, 'r', 'utf-8').read().split('\n')
+
+def process_file(file, is_scam_data):
+	print 'Processing ', file
+	ad_list = file_to_ad_list(file)
+	ad_count = len(ad_list)
+	print 'Found ',ad_count,'adverts in file - building dictionary...'
+	d = defaultdict(int)
+	i=1
+	for ad_text in ad_list:
+		word_list = ad_to_word_list(ad_text)
+		add_words_to_dict(word_list, d)
+		i += 1
+		if i % 10000 == 0:
+			print 'Processed ',i
+		if i > 100000:
+			break
+
+	update_db_with_dict(d, ad_count, is_scam_data)
+
+def learn(non_scam_file, scam_file):
+	process_file(non_scam_file, False)
+	process_file(scam_file, True)
+
+def calc_p_word(word):
+	pass
+
+def calc_p_words(p_word_list):
+	pass
+
+def get_total_word_count(c, for_scams):
+	pass
+
+def classify(input_file):
+	text = open(input_file).read()
+	words = ad_to_word_list(text)
+
+	conn = sqlite3.connect('./scam.db')
+	c = conn.cursor()
+	scam_word_count = get_ad_count(c, True)
+	ok_word_count = get_ad_count(c, False)
+
+
+	p_word_list = []
+	for word in words:
+		p_word_list.append(calc_p_word(word))
+
+	return calc_p_words(p_word_list)
+
+if __name__ == '__main__':
+	if (len(sys.argv) != 2):
+		print 'Usage: %s <input file>' % sys.argv[0]
+		
+	else:
+		input_file = sys.argv[1]
+		classify(input_file)
+			
+'''
+
+modes = {}
+
+def register_mode(mode_class):
+	modes[mode_class.__name__.lower()] = mode_class
+
+if __name__ == '__main__':
+	register_mode(Learn)
+	register_mode(Classify)
+	register_mode(Reset)
+	register_mode(Status)
+
+	args = sys.argv
+	usage = 'Usage: %s %s <mode specific args>' % (args[0], '|'.join(modes.keys()))
+
+	try:
+		if (len(args) < 2):
+			raise ValueError(usage)
+
+		mode_name = args[1]
+		if mode_name not in modes:
+			raise ValueError(usage + '\nUnrecognised mode: ' + mode_name)
+
+		mode = modes[mode_name]()
+		mode.validate(args)
+		mode.execute()
+
+	except Exception as e:
+		print e
diff --git a/status.py b/status.py
diff --git a/validate.py b/validate.py