Skip to content

Commit bbbb32f

Browse files
author
linyiqun
committed
朴素贝叶斯算法实现
朴素贝叶斯算法实现
1 parent 80b4af0 commit bbbb32f

2 files changed

Lines changed: 226 additions & 0 deletions

File tree

DataMining_NaiveBayes/Client.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package DataMining_NaiveBayes;
2+
3+
4+
/**
5+
* 朴素贝叶斯算法场景调用类
6+
* @author lyq
7+
*
8+
*/
9+
public class Client {
10+
public static void main(String[] args){
11+
//训练集数据
12+
String filePath = "C:\\Users\\lyq\\Desktop\\icon\\input.txt";
13+
String testData = "Youth Medium Yes Fair";
14+
NaiveBayesTool tool = new NaiveBayesTool(filePath);
15+
System.out.println(testData + " 数据的分类为:" + tool.naiveBayesClassificate(testData));
16+
}
17+
}
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
package DataMining_NaiveBayes;
2+
3+
import java.io.BufferedReader;
4+
import java.io.File;
5+
import java.io.FileReader;
6+
import java.io.IOException;
7+
import java.util.ArrayList;
8+
import java.util.HashMap;
9+
import java.util.Map;
10+
11+
/**
 * Naive Bayes classifier for categorical data with two class labels.
 *
 * <p>The training file is whitespace-separated text. Its first non-blank line
 * is the header row. Column 0 is never used as an attribute (the attribute scan
 * starts at column 1) and the last header column holds the class label. A row
 * whose label is not {@code "Yes"} is counted as {@code "No"}.
 *
 * <p>No smoothing is applied: an attribute value that never occurs together
 * with a class gives that class a probability of zero.
 *
 * @author lyq
 */
public class NaiveBayesTool {
    // Class labels; there are exactly two classes, YES and NO.
    private static final String YES = "Yes";
    private static final String NO = "No";

    // Path of the labelled training data file.
    private final String filePath;
    // Attribute names, i.e. the header row of the training file.
    private String[] attrNames;
    // Training data set; row 0 is the header row.
    private String[][] data;

    // Distinct values observed for each attribute, keyed by attribute name.
    private HashMap<String, ArrayList<String>> attrValue;

    /**
     * Loads the training data and indexes the values of every attribute.
     *
     * @param filePath path of the training data file
     * @throws IllegalStateException if the file cannot be read, contains no
     *         header row, or contains a row shorter than the header
     */
    public NaiveBayesTool(String filePath) {
        this.filePath = filePath;

        readDataFile();
        initAttrValue();
    }

    /**
     * Reads the training file into {@code data} and {@code attrNames}.
     *
     * <p>Blank lines are skipped and fields may be separated by any run of
     * whitespace.
     */
    private void readDataFile() {
        ArrayList<String[]> dataArray = new ArrayList<>();

        // try-with-resources closes the reader even when readLine() fails.
        try (BufferedReader in = new BufferedReader(new FileReader(filePath))) {
            String str;
            while ((str = in.readLine()) != null) {
                String line = str.trim();
                if (line.isEmpty()) {
                    continue;
                }
                dataArray.add(line.split("\\s+"));
            }
        } catch (IOException e) {
            // Fail here with the real cause instead of later with an index error.
            throw new IllegalStateException(
                    "Unable to read training data file: " + filePath, e);
        }

        if (dataArray.isEmpty()) {
            throw new IllegalStateException(
                    "Training data file has no header row: " + filePath);
        }

        data = dataArray.toArray(new String[dataArray.size()][]);
        attrNames = data[0];

        // Every later step indexes rows by header position, so reject short rows now.
        for (int i = 1; i < data.length; i++) {
            if (data[i].length < attrNames.length) {
                throw new IllegalStateException("Row " + i + " of " + filePath + " has "
                        + data[i].length + " fields but the header has " + attrNames.length);
            }
        }
    }

    /**
     * Collects, for every column from 1 to the last, the distinct values that
     * occur in the training rows and stores them under the column's name.
     */
    private void initAttrValue() {
        attrValue = new HashMap<>();

        // Walk the columns from left to right, skipping column 0.
        for (int j = 1; j < attrNames.length; j++) {
            ArrayList<String> values = new ArrayList<>();
            // Walk the column from top to bottom, skipping the header row.
            for (int i = 1; i < data.length; i++) {
                if (!values.contains(data[i][j])) {
                    values.add(data[i][j]);
                }
            }
            attrValue.put(attrNames[j], values);
        }
    }

    /**
     * Computes the probability of a condition given a class, or the prior
     * probability of the class when no condition is supplied.
     *
     * @param condition attribute value, or {@code null} to get the class prior
     * @param attrIndex column the condition belongs to; a negative value means
     *                  the value is unknown and yields probability 0
     * @param classType class label, {@code YES} or {@code NO}
     * @return the probability; 0 when the class (or the data set) has no rows,
     *         so that the result is never {@code NaN}
     */
    private double computeConditionProbably(String condition, int attrIndex,
            String classType) {
        int classIndex = attrNames.length - 1;
        boolean wantYes = YES.equals(classType);
        // Number of training rows in the requested class.
        int classCount = 0;
        // Number of those rows that also satisfy the condition.
        int count = 0;

        for (int i = 1; i < data.length; i++) {
            String[] row = data[i];
            // Rows are split into "label is YES" and "everything else".
            if (YES.equals(row[classIndex]) != wantYes) {
                continue;
            }
            classCount++;
            if (condition != null && attrIndex >= 0 && condition.equals(row[attrIndex])) {
                count++;
            }
        }

        // Without a condition the result is the plain class probability.
        if (condition == null) {
            int total = data.length - 1;
            return total == 0 ? 0.0 : 1.0 * classCount / total;
        }

        return classCount == 0 ? 0.0 : 1.0 * count / classCount;
    }

    /**
     * Finds the column an attribute value belongs to.
     *
     * <p>The class column (the last one) is never returned. If
     * {@code preferredIndex} is a valid attribute column that contains the
     * value, it wins; this resolves values shared by several attributes when
     * features are given in header order. Otherwise the first attribute column,
     * in header order, that contains the value is returned.
     *
     * @param condition      attribute value to look up
     * @param preferredIndex column to try first, or a negative value for none
     * @return the column index, or -1 if no attribute column contains the value
     */
    private int getConditionAttrName(String condition, int preferredIndex) {
        int classIndex = attrNames.length - 1;

        if (preferredIndex >= 1 && preferredIndex < classIndex
                && attrValue.get(attrNames[preferredIndex]).contains(condition)) {
            return preferredIndex;
        }

        for (int j = 1; j < classIndex; j++) {
            if (attrValue.get(attrNames[j]).contains(condition)) {
                return j;
            }
        }

        return -1;
    }

    /**
     * Classifies one record with the naive Bayes rule, comparing
     * P(X|Yes)*P(Yes) against P(X|No)*P(No).
     *
     * @param data record to classify: attribute values separated by whitespace,
     *             normally in header order (without column 0 and the class
     *             column); an empty record is classified by the class priors
     * @return {@code "Yes"} if the Yes score is strictly greater, otherwise
     *         {@code "No"}
     * @throws NullPointerException if {@code data} is {@code null}
     */
    public String naiveBayesClassificate(String data) {
        String record = data.trim();
        // Attribute values of the record under test.
        String[] dataFeatures = record.isEmpty() ? new String[0] : record.split("\\s+");
        // Probability of the record given class Yes.
        double xWhenYes = 1.0;
        // Probability of the record given class No.
        double xWhenNo = 1.0;

        for (int i = 0; i < dataFeatures.length; i++) {
            // Feature i normally sits in column i + 1 because column 0 is not an attribute.
            int attrIndex = getConditionAttrName(dataFeatures[i], i + 1);
            // Class-conditional independence lets the probabilities be multiplied.
            xWhenYes *= computeConditionProbably(dataFeatures[i], attrIndex, YES);
            xWhenNo *= computeConditionProbably(dataFeatures[i], attrIndex, NO);
        }

        // Final scores for the two classes: P(X|Ci) * P(Ci).
        double pYes = xWhenYes * computeConditionProbably(null, -1, YES);
        double pNo = xWhenNo * computeConditionProbably(null, -1, NO);

        return (pYes > pNo ? YES : NO);
    }

}

0 commit comments

Comments
 (0)