-
Notifications
You must be signed in to change notification settings - Fork 266
Expand file tree
/
Copy pathAdaBoost.py
More file actions
147 lines (119 loc) · 5.21 KB
/
AdaBoost.py
File metadata and controls
147 lines (119 loc) · 5.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
from sklearn.ensemble import AdaBoostClassifier
from utils.data_generater import *
class AdaBoost(object):
    """AdaBoost binary classifier built from single-feature threshold stumps.

    Every boosting round scans each feature column for the threshold stump
    with the lowest weighted error, stores it together with its coefficient
    alpha, and re-weights the training samples. Stump outputs are +1 / -1 and
    are compared directly against the labels, so labels should be +1 / -1.

    Note that ``learning_rate`` is used as the step of the threshold grid
    scanned by ``_G``; it does not scale alpha.
    """

    # Weighted errors are clamped to [_EPS, 1 - _EPS] before computing alpha
    # so that a perfect stump gets a large finite coefficient instead of
    # triggering a division by zero.
    _EPS = 1e-10

    def __init__(self, n_estimators=50, learning_rate=1.0):
        """Store the maximum number of stumps and the threshold grid step."""
        self.clf_num = n_estimators
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        """Attach the training data and reset all boosting state.

        ``datasets`` must expose ``.shape`` as (n_samples, n_features).
        """
        self.X = datasets
        self.Y = labels
        self.M, self.N = datasets.shape
        # Fitted weak classifiers as (feature index, threshold, direction).
        self.clf_sets = []
        # Sample weights start out uniform.
        self.weights = [1.0 / self.M] * self.M
        # Coefficient alpha of each fitted weak classifier.
        self.alpha = []

    def _G(self, features, labels, weights):
        """Find the best threshold stump for one feature column.

        Candidate thresholds are ``min + learning_rate * i`` for i >= 1 up to
        the column maximum; candidates equal to an observed feature value are
        skipped. For each candidate both orientations are evaluated.

        Returns:
            ``(best_v, direct, error, compare_array)``: the threshold, the
            orientation (``'positive'`` means "x > v -> +1", ``'nagetive'``
            the opposite), the weighted error and the stump's +1/-1 outputs
            on ``features``. When no candidate exists the result is
            ``(0.0, None, 100000.0, None)``.
        """
        features = np.asarray(features)
        error = 100000.0  # sentinel: larger than any real weighted error
        best_v = 0.0
        direct, compare_array = None, None

        features_min = features.min()
        features_max = features.max()
        n_step = (features_max - features_min + self.learning_rate) // self.learning_rate

        for i in range(1, int(n_step)):
            v = features_min + self.learning_rate * i
            if np.any(features == v):
                continue

            above = features > v
            positive_out = np.where(above, 1, -1)
            negative_out = np.where(above, -1, 1)
            positive_error = sum(
                w for w, out, label in zip(weights, positive_out, labels) if out != label
            )
            negative_error = sum(
                w for w, out, label in zip(weights, negative_out, labels) if out != label
            )

            if positive_error < negative_error:
                candidate_error = positive_error
                candidate_out = positive_out
                candidate_direct = 'positive'
            else:
                candidate_error = negative_error
                candidate_out = negative_out
                # Spelling kept as-is: this string is stored in clf_sets.
                candidate_direct = 'nagetive'

            # The direction must be recorded together with the winning
            # threshold; otherwise the direction of the last candidate scanned
            # would be returned alongside the best threshold.
            if candidate_error < error:
                error = candidate_error
                compare_array = candidate_out
                best_v = v
                direct = candidate_direct

        return best_v, direct, error, compare_array

    def _alpha(self, error):
        """Return the stump coefficient ``0.5 * ln((1 - error) / error)``.

        ``error`` is clamped to ``[_EPS, 1 - _EPS]`` first so that an error of
        exactly 0 (or 1) yields a finite value.
        """
        error = min(max(error, self._EPS), 1 - self._EPS)
        return 0.5 * np.log((1 - error) / error)

    def _Z(self, weights, a, clf):
        """Return the normalisation factor of the weight update."""
        return sum(
            w * np.exp(-a * label * out)
            for w, label, out in zip(weights, self.Y, clf)
        )

    def _w(self, a, clf, Z):
        """Update ``self.weights`` in place and normalise them by ``Z``."""
        for i, (label, out) in enumerate(zip(self.Y, clf)):
            self.weights[i] = self.weights[i] * np.exp(-a * label * out) / Z

    def _f(self, alpha, clf_sets):
        """Placeholder for the linear combination of stumps; does nothing.

        ``predict`` computes the weighted sum inline.
        """
        pass

    def G(self, x, v, direct):
        """Evaluate one stump on the scalar ``x``.

        ``'positive'`` maps ``x > v`` to +1; any other ``direct`` value maps
        ``x > v`` to -1.
        """
        if direct == 'positive':
            return 1 if x > v else -1
        return -1 if x > v else 1

    def fit(self, X, y):
        """Fit up to ``n_estimators`` stumps on ``X`` / ``y``.

        Boosting stops early when no feature offers a usable threshold, or
        right after a stump with zero weighted error has been recorded.
        """
        self.init_args(X, y)

        for _ in range(self.clf_num):
            best_clf_error, best_v, clf_result = 100000, None, None
            axis, final_direct = None, None

            # Pick the feature whose best stump has the lowest weighted error.
            for j in range(self.N):
                v, direct, error, compare_array = self._G(
                    self.X[:, j], self.Y, self.weights
                )
                if error < best_clf_error:
                    best_clf_error = error
                    best_v = v
                    final_direct = direct
                    clf_result = compare_array
                    axis = j
                if best_clf_error == 0:
                    break

            if clf_result is None:
                # No column produced a candidate threshold (e.g. constant
                # features for this grid step): there is nothing to add.
                break

            a = self._alpha(best_clf_error)
            self.alpha.append(a)
            self.clf_sets.append((axis, best_v, final_direct))

            if best_clf_error == 0:
                # A perfect stump leaves no misclassified samples to
                # up-weight, so further rounds would add nothing.
                break

            Z = self._Z(self.weights, a, clf_result)
            self._w(a, clf_result, Z)

    def predict(self, feature):
        """Classify one sample: the sign of the alpha-weighted stump votes.

        Returns 1 when the weighted sum is strictly positive, otherwise -1.
        """
        result = sum(
            (
                a * self.G(feature[axis], clf_v, direct)
                for a, (axis, clf_v, direct) in zip(self.alpha, self.clf_sets)
            ),
            0.0,
        )
        return 1 if result > 0 else -1

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` predicted correctly.

        Raises ``ZeroDivisionError`` when ``X_test`` is empty.
        """
        right_count = sum(
            1
            for feature, label in zip(X_test, y_test)
            if self.predict(feature) == label
        )
        return right_count / len(X_test)
if __name__ == "__main__":
    # Demo: train the hand-written booster and scikit-learn's reference
    # implementation on the same split and print both test accuracies.
    X_train, X_test, y_train, y_test = create_svm_data()

    def _fit_and_report(title, model):
        """Fit ``model`` on the training split and print its test score."""
        model.fit(X_train, y_train)
        print(title, model.score(X_test, y_test))

    _fit_and_report(
        "my AdaBoost score",
        AdaBoost(n_estimators=10, learning_rate=0.2),
    )
    _fit_and_report(
        "sklearn AdaBoost score",
        AdaBoostClassifier(n_estimators=100, learning_rate=0.5),
    )