 import numpy as np
-import random
-
+from utils.word_utils import *
+
+
+
+class NaiveBayesBase(object):
+
+    def __init__(self):
+        pass
+
+
+    def fit(self, trainMatrix, trainCategory):
+        '''
+        Train the naive Bayes classifier: estimate the class prior p(Ci) and the
+        per-word conditional probabilities p(w|Ci) over the vocabulary.
+        Args:
+            trainMatrix: training matrix, i.e. the documents after vectorization
+            trainCategory: list of class labels, one per training document
+        Return:
+            self, with fitted attributes:
+            p0Vect: log-probability vector for class 0 (p(w1|C0), p(w2|C0), ..., p(wn|C0))
+            p1Vect: log-probability vector for class 1 (p(w1|C1), p(w2|C1), ..., p(wn|C1))
+            pAbusive: prior probability that a document belongs to class 1
+        '''
+        numTrainDocs = len(trainMatrix)
+        # the row length equals the vocabulary size
+        numWords = len(trainMatrix[0])
+        # class prior p(C1)
+        self.pAbusive = sum(trainCategory) / float(numTrainDocs)
+        # Later we compute p(w|Ci) = p(w1|Ci)*p(w2|Ci)*...*p(wn|Ci). If some word wj
+        # never occurs in class Ci, then p(wj|Ci) = 0 and the whole product collapses
+        # to 0, which is clearly wrong. So initialize every word count to 1 and each
+        # denominator (the total word count per class) to 2 (Laplace smoothing).
+        p0Num = np.ones(numWords)
+        p1Num = np.ones(numWords)
+        p0Denom = 2.0
+        p1Denom = 2.0
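+        # For example (hypothetical counts): if "stupid" occurred 3 times among the
+        # 10 words seen in class-1 documents, the smoothed estimate below would be
+        # (3 + 1) / (10 + 2) instead of 3/10, and an unseen word gets 1/12, not 0.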
+        for i in range(numTrainDocs):
+            if trainCategory[i] == 1:
+                p1Num += trainMatrix[i]
+                p1Denom += sum(trainMatrix[i])
+            else:
+                p0Num += trainMatrix[i]
+                p0Denom += sum(trainMatrix[i])
+        # p(wi|C1)
+        # Take logs to avoid underflow: the raw probabilities are all small, and
+        # multiplying many of them together would round to 0.0.
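+        # Since log(a*b) = log(a) + log(b), predict() can then sum per-word
+        # log-probabilities instead of multiplying the raw probabilities.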
+        self.p1Vect = np.log(p1Num / p1Denom)
+        # p(wi|C0)
+        self.p0Vect = np.log(p0Num / p0Denom)
+        return self
+
+
+    def predict(self, testX):
+        '''
+        Naive Bayes classifier.
+        Args:
+            testX: the document vector to classify (already converted to an array)
+        Uses the fitted attributes:
+            p0Vect: p(w|C0)
+            p1Vect: p(w|C1)
+            pAbusive: p(C1)
+        Return:
+            1: abusive document, chosen when log p(w|C1) + log p(C1) is larger
+               (equivalent to comparing p(w|C1)*p(C1) against p(w|C0)*p(C0))
+            0: non-abusive document otherwise
+        '''
+
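+        # testX is a vocabulary-length vector (0/1 for set-of-words, counts for
+        # bag-of-words), so testX * self.p1Vect keeps log p(wi|C1) only for the
+        # words present in the document; np.sum then adds those terms up.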
+        p1 = np.sum(testX * self.p1Vect) + np.log(self.pAbusive)
+        p0 = np.sum(testX * self.p0Vect) + np.log(1 - self.pAbusive)
+        if p1 > p0:
+            return 1
+        else:
+            return 0
 
 def loadDataSet():
     '''Data loading function; a small toy example.'''
@@ -14,197 +78,25 @@ def loadDataSet():
     return postingList, classVec
 
 
-def createVocabList(dataSet):
-    '''
-    Build the list of unique words appearing across all documents.
-    Args:
-        dataSet: all documents
-    Return:
-        the list of unique words over all documents, i.e. the vocabulary
-    '''
-    vocabSet = set([])
-    # union the running vocabulary with this document's word set
-    for document in dataSet:
-        vocabSet = vocabSet | set(document)
-    return list(vocabSet)
-
-
-# Set-of-words model: 1 if a word occurs in the document, 0 otherwise
-def setOfWord2Vec(vocabList, inputSet):
-    '''
-    Convert an input document into a set-of-words vector using the vocabulary.
-    Args:
-        vocabList: the vocabulary
-        inputSet: the current input document
-    Return:
-        returnVec: the document converted into a word vector
-    Example:
-        vocabList = ['I', 'love', 'python', 'and', 'machine', 'learning']
-        inputSet = ['python', 'machine', 'learning']
-        returnVec = [0, 0, 1, 0, 1, 1]
-    Same length as the vocabulary: 1 where a word occurs, 0 where it does not;
-    words missing from the vocabulary are reported via print.
-    '''
-    returnVec = [0] * len(vocabList)
-    for word in inputSet:
-        if word in vocabList:
-            returnVec[vocabList.index(word)] = 1
-        else:
-            print("the word: %s is not in my vocabulary!" % word)
-    return returnVec
-
-
-# Bag-of-words model: the number of times each word occurs in the document
-def bagOfWords2Vec(vocabList, inputSet):
-    '''
-    Convert an input document into a bag-of-words vector using the vocabulary.
-    Args:
-        vocabList: the vocabulary
-        inputSet: the current input document
-    Return:
-        returnVec: the document converted into a word vector
-    Example:
-        vocabList = ['I', 'love', 'python', 'and', 'machine', 'learning']
-        inputSet = ['python', 'machine', 'learning', 'python', 'machine']
-        returnVec = [0, 0, 2, 0, 2, 1]
-    Same length as the vocabulary: each position holds the word's occurrence count;
-    words missing from the vocabulary are reported via print.
-    '''
-    returnVec = [0] * len(vocabList)
-    for word in inputSet:
-        if word in vocabList:
-            returnVec[vocabList.index(word)] += 1
-        else:
-            print("the word: %s is not in my vocabulary!" % word)
-    return returnVec
-
-
-def trainNB0(trainMatrix, trainCategory):
-    '''
-    Train the naive Bayes classifier: estimate p(Ci) and the per-word p(w|Ci)
-    over the vocabulary.
-    Args:
-        trainMatrix: training matrix, i.e. the documents after vectorization
-        trainCategory: list of class labels, one per training document
-    Return:
-        p0Vect: probability vector for class 0 (p(w1|C0), p(w2|C0), ..., p(wn|C0))
-        p1Vect: probability vector for class 1 (p(w1|C1), p(w2|C1), ..., p(wn|C1))
-        pAbusive: probability that a document belongs to class 1
-    '''
-    numTrainDocs = len(trainMatrix)
-    # the row length equals the vocabulary size
-    numWords = len(trainMatrix[0])
-    # class prior p(Ci)
-    pAbusive = sum(trainCategory) / float(numTrainDocs)
-    # p0Num = np.zeros(numWords)
-    # p1Num = np.zeros(numWords)
-    # p0Denom = 0.0
-    # p1Denom = 0.0
-    # Later we compute p(w|Ci) = p(w1|Ci)*p(w2|Ci)*...*p(wn|Ci); if some word wj
-    # never occurs, p(wj|Ci) = 0 and the whole product becomes 0, which is wrong.
-    # So initialize every word count to 1 and each denominator to 2.
-    p0Num = np.ones(numWords)
-    p1Num = np.ones(numWords)
-    p0Denom = 2.0
-    p1Denom = 2.0
-    for i in range(numTrainDocs):
-        if trainCategory[i] == 1:
-            p1Num += trainMatrix[i]
-            p1Denom += sum(trainMatrix[i])
-        else:
-            p0Num += trainMatrix[i]
-            p0Denom += sum(trainMatrix[i])
-    # p(wi|C1)
-    # p1Vect = p1Num/p1Denom
-    # Take logs to avoid underflow (when all p are small, their product rounds to 0.0)
-    p1Vect = np.log(p1Num / p1Denom)
-    # p(wi|C0)
-    # p0Vect = p0Num/p0Denom
-    p0Vect = np.log(p0Num / p0Denom)
-    return p0Vect, p1Vect, pAbusive
-
-
-def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
-    '''
-    Naive Bayes classifier.
-    Args:
-        vec2Classify: the document vector to classify (already converted to an array)
-        p0Vec: p(w|C0)
-        p1Vec: p(w|C1)
-        pClass1: p(C1)
-    Return:
-        1: abusive document, chosen when log p(w|C1) + log p(C1) is larger
-           (equivalent to comparing p(w|C1)*p(C1) against p(w|C0)*p(C0))
-        0: non-abusive document otherwise
-    '''
-
-    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
-    p0 = np.sum(vec2Classify * p0Vec) + np.log(1 - pClass1)
-    if p1 > p0:
-        return 1
-    else:
-        return 0
-
-
-# -------------------------------- test on the small toy example --------------------------------
-def testingNB():
+def checkNB():
     '''Test'''
     listOPosts, lisClasses = loadDataSet()
     myVocabList = createVocabList(listOPosts)
     trainMat = []
     for postinDoc in listOPosts:
         trainMat.append(setOfWord2Vec(myVocabList, postinDoc))
-    p0Vec, p1Vec, pAb = trainNB0(np.array(trainMat), np.array(lisClasses))
+
+    nb = NaiveBayesBase()
+    nb.fit(np.array(trainMat), np.array(lisClasses))
 
     testEntry1 = ['love', 'my', 'dalmation']
     thisDoc = np.array(setOfWord2Vec(myVocabList, testEntry1))
-    print(testEntry1, 'classified as:', classifyNB(thisDoc, p0Vec, p1Vec, pAb))
+    print(testEntry1, 'classified as:', nb.predict(thisDoc))
 
     testEntry2 = ['stupid', 'garbage']
     thisDoc2 = np.array(setOfWord2Vec(myVocabList, testEntry2))
-    print(testEntry2, 'classified as:', classifyNB(thisDoc2, p0Vec, p1Vec, pAb))
-
-
-# -------------------------------- spam email test --------------------------------
-def textParse(bigString):
-    '''
-    Tokenizer.
-    Args:
-        bigString: the document to tokenize
-    Return:
-        listOfTokens: lowercased tokens with punctuation and whitespace stripped
-        (tokens shorter than three characters are dropped)
-    '''
-    import re
-    listOfTokens = re.split(r'\W+', bigString)
-    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
-
-
-def spamTest():
-    docList = []
-    classList = []
-    fullText = []
-    for i in range(1, 26):
-        wordList = textParse(open('email/spam/%d.txt' % i).read())
-        docList.append(wordList)
-        fullText.extend(wordList)
-        classList.append(1)
-        wordList = textParse(open('email/ham/%d.txt' % i).read())
-        docList.append(wordList)
-        fullText.extend(wordList)
-        classList.append(0)
-    vocabList = createVocabList(docList)
-    trainingSet = list(range(50))  # trainingSet = [0, 1, 2, ..., 49]
-    testSet = []
-    for i in range(10):
-        randIndex = int(random.uniform(0, len(trainingSet)))
-        testSet.append(trainingSet[randIndex])
-        del trainingSet[randIndex]
-    trainMat = []
-    trainClasses = []
-    for docIndex in trainingSet:
-        trainMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
-        trainClasses.append(classList[docIndex])
-    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
-    errorCount = 0
-    for docIndex in testSet:
-        wordVector = setOfWord2Vec(vocabList, docList[docIndex])
-        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
-            errorCount += 1
-    print('the error rate is:', float(errorCount / len(testSet)))
-
-testingNB()
+    print(testEntry2, 'classified as:', nb.predict(thisDoc2))
+
+
+if __name__ == "__main__":
+    checkNB()
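
For reference, fit() returns self, so the refactored class chains naturally. A minimal usage sketch with the bag-of-words representation, assuming the helpers deleted above (createVocabList, bagOfWords2Vec, ...) now live in utils/word_utils.py, as the new wildcard import suggests:

    import numpy as np
    from utils.word_utils import *  # assumed to provide createVocabList, bagOfWords2Vec

    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    # bag-of-words counts instead of the 0/1 set-of-words vectors used in checkNB()
    trainMat = [bagOfWords2Vec(vocab, doc) for doc in posts]
    nb = NaiveBayesBase().fit(np.array(trainMat), np.array(labels))
    print(nb.predict(np.array(bagOfWords2Vec(vocab, ['stupid', 'garbage']))))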