import numpy as np
import random
3+
4+
def loadDataSet():
    """Return a small example corpus.

    Returns:
        postingList: six tokenized posts.
        classVec: per-post labels; 1 = abusive post, 0 = normal post.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
15+
16+
def createVocabList(dataSet):
    """Build the vocabulary of all distinct words across the documents.

    Args:
        dataSet: iterable of tokenized documents.
    Returns:
        List of unique words (iteration order unspecified, as with the
        original set-based construction).
    """
    # One union over all per-document token sets replaces the manual loop.
    return list(set().union(*dataSet))
30+
31+
# Set-of-words model: 1 if the word appears in the document, else 0.
def setOfWord2Vec(vocabList, inputSet):
    """Convert a document into a set-of-words (binary presence) vector.

    Args:
        vocabList: the vocabulary (list of words).
        inputSet: the tokenized input document.
    Returns:
        returnVec: list the same length as vocabList; 1 where the word
        occurs in the document, 0 otherwise. Words absent from the
        vocabulary are reported via print.

    Example:
        vocabList = ['I', 'love', 'python', 'and', 'machine', 'learning']
        inputSet  = ['python', 'machine', 'learning']
        returnVec = [0, 0, 1, 0, 1, 1]
    """
    returnVec = [0] * len(vocabList)
    # Build a word -> index map once: the original called
    # vocabList.index(word) inside the loop, an O(n) scan per token
    # (O(n*m) overall). Dict lookup makes each token O(1).
    wordIndex = {word: i for i, word in enumerate(vocabList)}
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] = 1
        else:
            print("the word: %s is not in my vocabulary!" % word)
    return returnVec
54+
55+
# Bag-of-words model: count of each word's occurrences in the document.
def bagOfWords2Vec(vocabList, inputSet):
    """Convert a document into a bag-of-words (occurrence count) vector.

    Args:
        vocabList: the vocabulary (list of words).
        inputSet: the tokenized input document.
    Returns:
        returnVec: list the same length as vocabList; each position holds
        the number of times that word occurs in the document. Words absent
        from the vocabulary are reported via print.

    Example:
        vocabList = ['I', 'love', 'python', 'and', 'machine', 'learning']
        inputSet  = ['python', 'machine', 'learning', 'python', 'machine']
        returnVec = [0, 0, 2, 0, 2, 1]
    """
    returnVec = [0] * len(vocabList)
    # Word -> index map built once; the original's vocabList.index(word)
    # per token was an O(n) scan, O(n*m) for the whole document.
    wordIndex = {word: i for i, word in enumerate(vocabList)}
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] += 1
        else:
            print("the word: %s is not in my vocabulary!" % word)
    return returnVec
78+
79+
def trainNB0(trainMatrix, trainCategory):
    """Train the naive Bayes classifier: estimate p(Ci) and p(w|Ci).

    Args:
        trainMatrix: vectorized training documents (one row per document,
            one column per vocabulary word).
        trainCategory: class label (0 or 1) for each training document.
    Returns:
        p0Vect: log-probability vector (log p(w1|C0), ..., log p(wn|C0)).
        p1Vect: log-probability vector (log p(w1|C1), ..., log p(wn|C1)).
        pAbusive: prior probability of class 1.
    """
    numDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])  # vocabulary size
    # Class prior p(C1): labels are 0/1, so the mean of the labels.
    pAbusive = sum(trainCategory) / float(numDocs)
    # Laplace smoothing: counts start at 1 and denominators at 2, so that
    # a word unseen in one class never forces p(w|Ci) = 0 (which would
    # zero out the whole product p(w1|Ci)*...*p(wn|Ci)).
    p0Num, p1Num = np.ones(numWords), np.ones(numWords)
    p0Denom, p1Denom = 2.0, 2.0
    for docVec, label in zip(trainMatrix, trainCategory):
        if label == 1:
            p1Num += docVec
            p1Denom += sum(docVec)
        else:
            p0Num += docVec
            p0Denom += sum(docVec)
    # Log-space conditionals avoid underflow when many small
    # probabilities are later multiplied together.
    p0Vect = np.log(p0Num / p0Denom)
    p1Vect = np.log(p1Num / p1Denom)
    return p0Vect, p1Vect, pAbusive
121+
122+
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a document vector with the trained naive Bayes model.

    Compares log p(w|C1) + log p(C1) against log p(w|C0) + log p(C0);
    the sum of elementwise products implements the log of the naive
    independence product over the words present in the document.

    Args:
        vec2Classify: document vector to classify (numpy array).
        p0Vec: log p(w|C0) vector.
        p1Vec: log p(w|C1) vector.
        pClass1: prior p(C1).
    Returns:
        1 if the abusive class scores higher, else 0.
    """
    logP1 = np.log(pClass1) + np.sum(vec2Classify * p1Vec)
    logP0 = np.log(1 - pClass1) + np.sum(vec2Classify * p0Vec)
    return 1 if logP1 > logP0 else 0
142+
143+
# -------------------------------- smoke test on the toy example --------------------------------
def testingNB():
    """Train on the toy corpus and print predictions for two test posts."""
    posts, classes = loadDataSet()
    vocab = createVocabList(posts)
    trainMat = [setOfWord2Vec(vocab, doc) for doc in posts]
    p0Vec, p1Vec, pAb = trainNB0(np.array(trainMat), np.array(classes))

    testEntry1 = ['love', 'my', 'dalmation']
    doc1 = np.array(setOfWord2Vec(vocab, testEntry1))
    print(testEntry1, 'classified as:', classifyNB(doc1, p0Vec, p1Vec, pAb))

    testEntry2 = ['stupid', 'garbage']
    doc2 = np.array(setOfWord2Vec(vocab, testEntry2))
    print(testEntry2, 'classified as:', classifyNB(doc2, p0Vec, p1Vec, pAb))
161+
162+
# -------------------------------- spam e-mail filtering test --------------------------------
def textParse(bigString):
    """Tokenize raw text into lowercase words longer than two characters.

    Args:
        bigString: raw document text.
    Returns:
        listOfTokens: lowercase tokens with punctuation/whitespace removed
        (strings of length <= 2 are discarded).
    """
    import re
    # BUGFIX: r'\W+' instead of r'\W*' -- the zero-width pattern can
    # match the empty string, which makes re.split shred the text into
    # single characters on Python 3.7+.
    listOfTokens = re.split(r'\W+', bigString)
    # BUGFIX: tok.lower() with parentheses -- the original's bare
    # tok.lower returned the bound method object, not the string.
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
175+
176+
def spamTest():
    """Train/test the spam filter on the email/spam and email/ham corpora.

    Loads 25 spam and 25 ham messages, holds out 10 random documents as a
    test set, trains naive Bayes on the remaining 40, and prints the
    hold-out error rate.
    """
    docList, classList, fullText = [], [], []
    # Files are numbered 1..25 in each folder; spam is labeled 1, ham 0.
    for i in range(1, 26):
        spamWords = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(spamWords)
        fullText.extend(spamWords)
        classList.append(1)
        hamWords = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(hamWords)
        fullText.extend(hamWords)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    # Randomly move 10 document indices from the training set to the
    # hold-out test set.
    for _ in range(10):
        pick = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[pick])
        del trainingSet[pick]
    trainMat, trainClasses = [], []
    for docIndex in trainingSet:
        trainMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWord2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount / len(testSet)))
209+
if __name__ == "__main__":
    # Run the toy-example smoke test only when executed as a script,
    # not as a side effect of importing this module. (The trailing
    # "0 commit comments" line was web-scrape residue, not Python.)
    testingNB()