Skip to content

Commit c056d1a

Browse files
merge from git
2 parents e238d88 + 7f3710b commit c056d1a

5 files changed

Lines changed: 522 additions & 2 deletions

File tree

adaboost/AdaBoost.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
from sklearn.ensemble import AdaBoostClassifier
2+
3+
from utils.data_generater import *
4+
5+
class AdaBoost:
    """Binary AdaBoost with one-dimensional decision-stump weak learners.

    Implements the boosting algorithm from Li Hang, "Statistical Learning
    Methods", ch. 8: each round selects the (feature, threshold, direction)
    stump with the smallest weighted training error, weights it by its
    alpha coefficient, and re-weights the samples.  Labels must be +1 / -1.
    """

    def __init__(self, n_estimators=50, learning_rate=1.0):
        # Number of weak classifiers (boosting rounds).
        self.clf_num = n_estimators
        # Step size used to scan candidate thresholds; despite the name this
        # is NOT a gradient learning rate.
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        """Cache the training data and reset all per-fit state."""
        self.X = datasets
        self.Y = labels
        self.M, self.N = datasets.shape  # M samples, N features

        # Chosen weak classifiers: (feature index, threshold, direction).
        self.clf_sets = []

        # Sample weights, initialised uniformly (eq. 8.1 step 1).
        self.weights = [1.0 / self.M] * self.M

        # Coefficient alpha for each weak classifier.
        self.alpha = []

    def _G(self, features, labels, weights):
        """Find the best threshold stump for one feature column.

        Scans thresholds between min and max in steps of ``learning_rate``
        (skipping values that coincide with a sample) and returns
        ``(best threshold, direction, weighted error, predictions)``.
        Direction/predictions are None when no admissible threshold exists.
        """
        m = len(features)
        error = float('inf')  # best weighted error found so far
        best_v = 0.0
        best_direct, compare_array = None, None
        features_min = min(features)
        features_max = max(features)
        # Number of candidate thresholds in the scan.
        n_step = (features_max - features_min + self.learning_rate) // self.learning_rate
        for i in range(1, int(n_step)):
            v = features_min + self.learning_rate * i

            # Only thresholds that do not coincide with a sample value.
            if v not in features:
                # 'positive' stump: predict +1 when feature > v.
                compare_array_positive = np.array(
                    [1 if features[k] > v else -1 for k in range(m)])
                weight_error_positive = sum(
                    [weights[k] for k in range(m)
                     if compare_array_positive[k] != labels[k]])

                # 'nagetive' stump (misspelling kept: the string is stored in
                # clf_sets and callers may depend on it): predict -1 when
                # feature > v.
                compare_array_nagetive = np.array(
                    [-1 if features[k] > v else 1 for k in range(m)])
                weight_error_nagetive = sum(
                    [weights[k] for k in range(m)
                     if compare_array_nagetive[k] != labels[k]])

                if weight_error_positive < weight_error_nagetive:
                    weight_error = weight_error_positive
                    _compare_array = compare_array_positive
                    direct = 'positive'
                else:
                    weight_error = weight_error_nagetive
                    _compare_array = compare_array_nagetive
                    direct = 'nagetive'

                if weight_error < error:
                    error = weight_error
                    # BUGFIX: record the direction together with the best
                    # threshold.  Previously `direct` kept the direction of
                    # the LAST candidate scanned, not the best one, so the
                    # returned stump could be flipped.
                    best_direct = direct
                    compare_array = _compare_array
                    best_v = v
        return best_v, best_direct, error, compare_array

    # Classifier coefficient alpha (eq. 8.2).
    def _alpha(self, error):
        return 0.5 * np.log((1 - error) / error)

    # Normalisation factor Z (eq. 8.5).
    def _Z(self, weights, a, clf):
        return sum([weights[i] * np.exp(-1 * a * self.Y[i] * clf[i])
                    for i in range(self.M)])

    # Sample-weight update (eq. 8.4); mutates self.weights in place.
    def _w(self, a, clf, Z):
        for i in range(self.M):
            self.weights[i] = self.weights[i] * np.exp(-1 * a * self.Y[i] * clf[i]) / Z

    # Linear combination f(x) of the weak classifiers (placeholder, unused).
    def _f(self, alpha, clf_sets):
        pass

    def G(self, x, v, direct):
        """Evaluate a single stump at scalar x: +/-1 depending on direction."""
        if direct == 'positive':
            return 1 if x > v else -1
        else:
            return -1 if x > v else 1

    def fit(self, X, y):
        """Fit up to n_estimators stumps on X (2-D array) and y (+1/-1)."""
        self.init_args(X, y)

        for epoch in range(self.clf_num):
            best_clf_error, best_v, clf_result = float('inf'), None, None
            final_direct, axis = None, None
            # Across all feature dimensions, pick the stump with the
            # smallest weighted error.
            for j in range(self.N):
                features = self.X[:, j]
                v, direct, error, compare_array = self._G(features, self.Y, self.weights)

                if error < best_clf_error:
                    best_clf_error = error
                    best_v = v
                    final_direct = direct
                    clf_result = compare_array
                    axis = j

            # No admissible threshold on any feature: nothing to record.
            if clf_result is None:
                break

            # A perfect stump would make _alpha divide by zero; stop early.
            # NOTE(review): the perfect stump itself is not recorded, which
            # mirrors the original behaviour.
            if best_clf_error == 0:
                break

            # Coefficient a for this round's classifier.
            a = self._alpha(best_clf_error)
            self.alpha.append(a)
            # Record the classifier.
            self.clf_sets.append((axis, best_v, final_direct))
            # Normalisation factor and weight update for the next round.
            Z = self._Z(self.weights, a, clf_result)
            self._w(a, clf_result, Z)

    def predict(self, feature):
        """Return sign(sum_i alpha_i * G_i(feature)) as +1 or -1."""
        result = 0.0
        for i in range(len(self.clf_sets)):
            axis, clf_v, direct = self.clf_sets[i]
            f_input = feature[axis]
            result += self.alpha[i] * self.G(f_input, clf_v, direct)
        # sign of the weighted vote
        return 1 if result > 0 else -1

    def score(self, X_test, y_test):
        """Return classification accuracy on (X_test, y_test)."""
        right_count = 0
        for i in range(len(X_test)):
            feature = X_test[i]
            if self.predict(feature) == y_test[i]:
                right_count += 1

        return right_count / len(X_test)
138+
139+
if __name__ == "__main__":
    # Benchmark the hand-rolled AdaBoost against scikit-learn's version
    # on the same synthetic dataset.
    X_train, X_test, y_train, y_test = create_svm_data()

    custom_model = AdaBoost(n_estimators=10, learning_rate=0.2)
    custom_model.fit(X_train, y_train)
    print("my AdaBoost score", custom_model.score(X_test, y_test))

    reference_model = AdaBoostClassifier(n_estimators=100, learning_rate=0.5)
    reference_model.fit(X_train, y_train)
    print("sklearn AdaBoost score", reference_model.score(X_test, y_test))
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from sklearn.linear_model import LogisticRegression
2+
3+
from math import exp
4+
from utils.data_generater import *
5+
6+
7+
class LogisticRegressionClassifier:
    """Binary logistic regression trained with per-sample gradient ascent.

    Labels are expected to be 0/1.  The bias term is folded into the
    weight vector by prepending a constant 1 feature to every sample.
    """

    def __init__(self, max_iter=200, learning_rate=0.01):
        # Maximum number of passes over the training set.
        self.max_iter = max_iter
        # Gradient step size.
        self.learning_rate = learning_rate

    def sigmoid(self, x):
        """Numerically stable logistic function 1 / (1 + e^-x).

        BUGFIX: the original ``1 / (1 + exp(-x))`` raises OverflowError for
        large negative x (math.exp overflows near x < -709).  Branching on
        the sign ensures exp() only ever sees a non-positive argument.
        """
        if x >= 0:
            return 1 / (1 + exp(-x))
        z = exp(x)
        return z / (1 + z)

    def data_matrix(self, X):
        """Prepend a constant-1 column so weight and bias are learned together."""
        data_mat = []
        for d in X:
            data_mat.append([1.0, *d])
        return data_mat

    def fit(self, X, y):
        """Train on samples X and 0/1 labels y via stochastic gradient ascent."""
        data_mat = self.data_matrix(X)
        # self.weights holds [bias, w1, ..., wn] as a column vector.
        self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32)

        for iter_ in range(self.max_iter):
            for i in range(len(X)):
                # float() collapses the shape-(1,) dot product to a scalar
                # (implicit ndarray-to-scalar conversion is deprecated).
                result = self.sigmoid(float(np.dot(data_mat[i], self.weights)))
                error = y[i] - result
                # Gradient ascent step on the log-likelihood.
                self.weights += self.learning_rate * error * np.transpose([data_mat[i]])
        print('LogisticRegression Model(learning_rate={},max_iter={})'.format(
            self.learning_rate, self.max_iter))

    def score(self, X_test, y_test):
        """Return classification accuracy on (X_test, y_test)."""
        right = 0
        X_test = self.data_matrix(X_test)
        for x, y in zip(X_test, y_test):
            result = np.dot(x, self.weights)
            # NOTE(review): a sample exactly on the boundary (result == 0)
            # is counted as wrong for both classes.
            if (result > 0 and y == 1) or (result < 0 and y == 0):
                right += 1
        return right / len(X_test)
48+
49+
50+
if __name__ == "__main__":
    X_train, X_test, y_train, y_test = create_logistic_data()

    # Hand-rolled implementation.
    custom_lr = LogisticRegressionClassifier()
    custom_lr.fit(X_train, y_train)
    print("my LogisticRegression score", custom_lr.score(X_test, y_test))

    # scikit-learn reference implementation.
    reference_lr = LogisticRegression(max_iter=200)
    reference_lr.fit(X_train, y_train)
    print("sklearn LogisticRegression score", reference_lr.score(X_test, y_test))

logistic_regression/max_entropy.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import math
2+
from copy import deepcopy
3+
4+
5+
class MaxEntropy:
    """Maximum-entropy classifier trained with the IIS-style update
    w_i += log(E~[f_i] / E[f_i]) / C (Li Hang, ch. 6).

    Each training sample is a list whose first element is the label and
    whose remaining elements are feature values; every distinct
    (feature value, label) pair defines one indicator feature f_i(x, y).
    """

    def __init__(self, EPS=0.005):
        self._samples = []   # training samples (label first)
        self._Y = set()      # distinct labels seen in the data
        self._numXY = {}     # (x, y) pair -> occurrence count
        self._N = 0          # number of samples
        self._Ep_ = []       # empirical expectation E~[f_i] per feature
        self._xyID = {}      # (x, y) pair -> feature id
        self._n = 0          # number of distinct (x, y) features
        self._C = 0          # maximum feature count per sample
        self._IDxy = {}      # feature id -> (x, y) pair
        self._w = []         # model weights
        self._EPS = EPS      # convergence threshold
        self._lastw = []     # weights from the previous iteration

    def loadData(self, dataset):
        """Index the dataset: count (x, y) pairs and initialise weights."""
        self._samples = deepcopy(dataset)
        for record in self._samples:
            label, attrs = record[0], record[1:]
            self._Y.add(label)  # duplicates are ignored by the set
            for attr in attrs:
                key = (attr, label)
                self._numXY[key] = self._numXY.get(key, 0) + 1

        self._N = len(self._samples)
        self._n = len(self._numXY)
        self._C = max(len(record) - 1 for record in self._samples)
        self._w = [0] * self._n
        self._lastw = self._w[:]

        # Empirical expectation E~[f_i] = count(x, y) / N, and id maps
        # in dict insertion order.
        self._Ep_ = [0] * self._n
        for idx, pair in enumerate(self._numXY):
            self._Ep_[idx] = self._numXY[pair] / self._N
            self._xyID[pair] = idx
            self._IDxy[idx] = pair

    def _Zx(self, X):
        """Partition function Z(x) = sum over labels of exp(active weights)."""
        total = 0
        for label in self._Y:
            exponent = 0
            for attr in X:
                if (attr, label) in self._numXY:
                    exponent += self._w[self._xyID[(attr, label)]]
            total += math.exp(exponent)
        return total

    def _model_pyx(self, y, X):
        """Model conditional probability P(y|x)."""
        exponent = 0
        for attr in X:
            if (attr, y) in self._numXY:
                exponent += self._w[self._xyID[(attr, y)]]
        return math.exp(exponent) / self._Zx(X)

    def _model_ep(self, index):
        """Model expectation E[f_i] of feature `index` under P(y|x)."""
        attr, label = self._IDxy[index]
        expectation = 0
        for sample in self._samples:
            # Feature fires only on samples containing this attribute value.
            if attr not in sample:
                continue
            expectation += self._model_pyx(label, sample) / self._N
        return expectation

    def _convergence(self):
        """True when every weight moved less than EPS since last iteration."""
        return all(abs(prev - cur) < self._EPS
                   for prev, cur in zip(self._lastw, self._w))

    def predict(self, X):
        """Return {label: P(label|X)} for the feature values X."""
        Z = self._Zx(X)
        probs = {}
        for label in self._Y:
            exponent = 0
            for attr in X:
                if (attr, label) in self._numXY:
                    exponent += self._w[self._xyID[(attr, label)]]
            probs[label] = math.exp(exponent) / Z
        return probs

    def train(self, maxiter=1000):
        """Run up to `maxiter` IIS iterations, stopping on convergence."""
        for loop in range(maxiter):
            print("iter:%d" % loop)
            self._lastw = self._w[:]
            for i in range(self._n):
                # Update each weight by the log ratio of empirical to
                # model expectation, damped by 1/C.
                self._w[i] += math.log(self._Ep_[i] / self._model_ep(i)) / self._C
            print("w:", self._w)
            if self._convergence():
                break
101+
102+
if __name__ == "__main__":
    # The classic "play tennis" toy dataset: label first, then outlook,
    # temperature, humidity, windy.
    dataset = [
        ['no', 'sunny', 'hot', 'high', 'FALSE'],
        ['no', 'sunny', 'hot', 'high', 'TRUE'],
        ['yes', 'overcast', 'hot', 'high', 'FALSE'],
        ['yes', 'rainy', 'mild', 'high', 'FALSE'],
        ['yes', 'rainy', 'cool', 'normal', 'FALSE'],
        ['no', 'rainy', 'cool', 'normal', 'TRUE'],
        ['yes', 'overcast', 'cool', 'normal', 'TRUE'],
        ['no', 'sunny', 'mild', 'high', 'FALSE'],
        ['yes', 'sunny', 'cool', 'normal', 'FALSE'],
        ['yes', 'rainy', 'mild', 'normal', 'FALSE'],
        ['yes', 'sunny', 'mild', 'normal', 'TRUE'],
        ['yes', 'overcast', 'mild', 'high', 'TRUE'],
        ['yes', 'overcast', 'hot', 'normal', 'FALSE'],
        ['no', 'rainy', 'mild', 'high', 'TRUE'],
    ]

    model = MaxEntropy()
    query = ['overcast', 'mild', 'high', 'FALSE']
    model.loadData(dataset)
    model.train()
    print('predict:', model.predict(query))

0 commit comments

Comments
 (0)