Skip to content

Commit e1b9163

Browse files
naive bayes base
1 parent 361f197 commit e1b9163

2 files changed

Lines changed: 281 additions & 0 deletions

File tree

naive_bayes/naiveBayesBase.py

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
import numpy as np
2+
import random
3+
4+
5+
def loadDataSet():
    """Return a toy corpus of tokenized posts and their class labels.

    Returns:
        postingList: six sample documents, each a list of word tokens.
        classVec: label per document; 1 = abusive text, 0 = normal text.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
15+
16+
17+
def createVocabList(dataSet):
    """Build the vocabulary: every distinct word appearing in any document.

    Args:
        dataSet: iterable of documents, each a list of word tokens.
    Returns:
        A list of unique words (order unspecified, as with the original
        set-based construction).
    """
    vocab = set()
    for document in dataSet:
        # Accumulate each document's words into the running union.
        vocab.update(document)
    return list(vocab)
30+
31+
32+
# Set-of-words model: a vocabulary word is marked 1 if present in the
# document, 0 if absent (presence only, not counts).
def setOfWord2Vec(vocabList, inputSet):
    '''
    Convert an input document into a set-of-words (binary) vector over the
    vocabulary.

    Args:
        vocabList: the vocabulary (list of words)
        inputSet: the current input document (list of words)
    Return:
        returnVec: the document as a 0/1 vector, same length as vocabList;
                   positions of words present in the document are 1, others 0.
                   Words absent from the vocabulary are reported via print.
    Example:
        vocabList = ['I', 'love', 'python', 'and', 'machine', 'learning']
        inputSet = ['python', 'machine', 'learning']
        returnVec = [0, 0, 1, 0, 1, 1]
    '''
    # Build word -> position once; the original called vocabList.index(word)
    # per input word, an O(len(vocabList)) scan each time (O(n*m) overall).
    wordIndex = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] = 1
        else:
            print("the word: %s is not in my vocabulary!" % word)
    return returnVec
54+
55+
56+
# Bag-of-words model: each vocabulary position holds the number of times the
# word occurs in the document (counts, not just presence).
def bagOfWords2Vec(vocabList, inputSet):
    '''
    Convert an input document into a bag-of-words (count) vector over the
    vocabulary.

    Args:
        vocabList: the vocabulary (list of words)
        inputSet: the current input document (list of words)
    Return:
        returnVec: the document as a count vector, same length as vocabList;
                   each position holds the occurrence count of that word.
                   Words absent from the vocabulary are reported via print.
    Example:
        vocabList = ['I', 'love', 'python', 'and', 'machine', 'learning']
        inputSet = ['python', 'machine', 'learning', 'python', 'machine']
        returnVec = [0, 0, 2, 0, 2, 1]
    '''
    # Hoisted word -> position map; the original's per-word
    # vocabList.index(word) made the loop accidentally O(n*m).
    wordIndex = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] += 1
        else:
            print("the word: %s is not in my vocabulary!" % word)
    return returnVec
78+
79+
80+
def trainNB0(trainMatrix, trainCategory):
    '''
    Naive Bayes training: estimate p(Ci) and the per-word likelihoods
    p(w|Ci) over the vocabulary.

    Args:
        trainMatrix : training matrix — documents already vectorized over
                      the vocabulary (one row per document)
        trainCategory : class label (0 or 1) for each document row
    Return:
        p0Vect : log-likelihood vector for class 0
                 (log p(w1|C0), ..., log p(wn|C0))
        p1Vect : log-likelihood vector for class 1
                 (log p(w1|C1), ..., log p(wn|C1))
        pAbusive : prior probability of class 1
    '''
    docCount = len(trainMatrix)
    # Vector length equals the vocabulary size.
    vocabSize = len(trainMatrix[0])
    # p(C1): fraction of documents labelled 1.
    pAbusive = sum(trainCategory) / float(docCount)
    # Laplace smoothing: start every word count at 1 and each denominator at
    # 2, so an unseen word never zeroes out the product
    # p(w1|Ci)*p(w2|Ci)*...*p(wn|Ci).
    class0Counts = np.ones(vocabSize)
    class1Counts = np.ones(vocabSize)
    class0Total = 2.0
    class1Total = 2.0
    for doc, label in zip(trainMatrix, trainCategory):
        if label == 1:
            class1Counts += doc
            class1Total += sum(doc)
        else:
            class0Counts += doc
            class0Total += sum(doc)
    # Work in log space to avoid underflow when many small probabilities are
    # later multiplied together.
    p1Vect = np.log(class1Counts / class1Total)
    p0Vect = np.log(class0Counts / class0Total)
    return p0Vect, p1Vect, pAbusive
121+
122+
123+
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    '''
    Naive Bayes classifier: compare the log-posterior of the two classes.

    Args:
        vec2Classify : the document vector to classify (already an array)
        p0Vec : log p(w|C0) per vocabulary word
        p1Vec : log p(w|C1) per vocabulary word
        pClass1 : prior p(C1)
    Return:
        1 : abusive document   (log p(w|C1) + log p(C1) is larger)
        0 : normal document    (log p(w|C0) + log p(C0) is larger or equal)
    '''
    # Sums of element-wise products implement the log of the naive Bayes
    # likelihood product over the words present in the document.
    logPosterior1 = np.log(pClass1) + np.sum(vec2Classify * p1Vec)
    logPosterior0 = np.log(1 - pClass1) + np.sum(vec2Classify * p0Vec)
    return 1 if logPosterior1 > logPosterior0 else 0
142+
143+
144+
# -------------------- Smoke test on the small toy corpus --------------------
def testingNB():
    '''Train on the toy corpus and classify two sample documents.'''
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    trainMat = [setOfWord2Vec(vocab, doc) for doc in posts]
    p0Vec, p1Vec, pAb = trainNB0(np.array(trainMat), np.array(labels))

    # Classify one clearly normal and one clearly abusive document.
    for entry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        docVec = np.array(setOfWord2Vec(vocab, entry))
        print(entry, 'classified as:', classifyNB(docVec, p0Vec, p1Vec, pAb))
161+
162+
163+
# -------------------- Spam e-mail filtering test --------------------
def textParse(bigString):
    '''
    Tokenizer.

    Args:
        bigString: the document to tokenize
    Return:
        listOfTokens: lower-cased word tokens with punctuation/whitespace
                      removed; tokens of length <= 2 are dropped.

    Bug fixes vs. the original:
    - the split pattern was r'\\W*', which can match the empty string; on
      Python >= 3.7 that splits at every position, yielding single characters.
      r'\\W+' splits on runs of non-word characters as intended.
    - `tok.lower` was missing the call parentheses, so the list held bound
      method objects instead of lower-cased strings.
    '''
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
175+
176+
177+
def spamTest():
    '''
    Spam-filter evaluation: read 25 spam and 25 ham messages, hold out 10
    random documents for testing, train naive Bayes on the remaining 40, and
    print the test error rate.

    Side effects: reads email/spam/*.txt and email/ham/*.txt, prints the
    error rate, and consumes random numbers (results vary per run).
    '''
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # 'with' guarantees the file handles are closed; the original's
        # open(...).read() leaked them.
        with open('email/spam/%d.txt' % i) as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        with open('email/ham/%d.txt' % i) as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))  # indices 0..49 into docList
    testSet = []
    # Randomly move 10 document indices into the hold-out test set.
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWord2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount / len(testSet)))
209+
210+
if __name__ == '__main__':
    # Run the toy-corpus smoke test only when executed as a script; the
    # original bare call also ran on import of this module.
    testingNB()

naive_bayes/naiveBayesGaussian.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import math
2+
import numpy as np
3+
from sklearn.datasets import load_iris
4+
from sklearn.model_selection import train_test_split
5+
from scipy.stats import norm
6+
7+
class NaiveBayes:
    """Gaussian naive Bayes classifier.

    Each feature is modelled, per class, as an independent Gaussian. `fit`
    estimates the per-class, per-feature mean and standard deviation;
    `predict` returns the class maximising the product of per-feature
    densities (class priors are not used).
    """

    def __init__(self):
        # {label: [(mean, stdev) for each feature]}; populated by fit().
        self.model = None

    # Sample mean of one feature column.
    @staticmethod
    def mean(X):
        return np.mean(X)

    # Standard deviation of one feature column (np.std: population std).
    def stdev(self, X):
        return np.std(X)

    # Gaussian probability density of x under N(mean, stdev**2).
    def gaussian_probability(self, x, mean, stdev):
        return norm.pdf(x, loc=mean, scale=stdev)

    # Per-feature (mean, stdev) pairs over a list of sample rows.
    def summarize(self, train_data):
        return [(self.mean(col), self.stdev(col)) for col in zip(*train_data)]

    def fit(self, X, y):
        """Estimate Gaussian parameters for each class; returns self."""
        # Group sample rows by label. (The original materialised
        # labels = list(set(y)) only to build this dict.)
        data = {label: [] for label in set(y)}
        for features, label in zip(X, y):
            data[label].append(features)
        self.model = {label: self.summarize(rows) for label, rows in data.items()}
        return self

    def calculate_probabilities(self, input_data):
        """Unnormalised likelihood p(x|label) for every known label."""
        probabilities = {}
        for label, params in self.model.items():
            prob = 1.0
            for i, (mean, stdev) in enumerate(params):
                prob *= self.gaussian_probability(input_data[i], mean, stdev)
            probabilities[label] = prob
        return probabilities

    def predict(self, X_test):
        """Most likely label for a single sample.

        Uses max(key=...) instead of the original's sort of the whole
        probability dict — O(n) rather than O(n log n), same result.
        """
        probs = self.calculate_probabilities(X_test)
        return max(probs.items(), key=lambda item: item[1])[0]

    def score(self, X_test, y_test):
        """Classification accuracy over a labelled test set."""
        right = sum(1 for features, label in zip(X_test, y_test)
                    if self.predict(features) == label)
        return right / float(len(X_test))
61+
62+
if __name__ == "__main__":
    # Demo: train on a random 70/30 split of the iris data set and report
    # the sizes, one sample prediction, and the test accuracy.
    iris = load_iris()
    features, targets = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.3)
    print(len(X_train))
    print(len(X_test))
    # fit() returns self, so construction and training chain together.
    model = NaiveBayes().fit(X_train, y_train)
    print(model.predict([4.4, 3.2, 1.3, 0.2]))
    print(model.score(X_test, y_test))

0 commit comments

Comments
 (0)