1+ from sklearn .ensemble import AdaBoostClassifier
2+
3+ from utils .data_generater import *
4+
class AdaBoost:
    """AdaBoost binary classifier built from single-feature threshold stumps.

    Labels are expected to be +1/-1.  Each weak classifier is a threshold
    on one feature column; ``learning_rate`` is the step size used when
    scanning candidate thresholds (not the shrinkage rate of sklearn).
    """

    def __init__(self, n_estimators=50, learning_rate=1.0):
        # Number of weak classifiers to train.
        self.clf_num = n_estimators
        # Threshold scan step used in _G.
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        """Store the training set and reset all per-fit state.

        datasets: (M, N) array of samples; labels: length-M array of +1/-1.
        """
        self.X = datasets
        self.Y = labels
        self.M, self.N = datasets.shape

        # Trained weak classifiers: (feature index, threshold, direction).
        self.clf_sets = []

        # Sample weights, initialised uniformly.
        self.weights = [1.0 / self.M] * self.M

        # Coefficient alpha for each weak classifier G(x).
        self.alpha = []

    def _G(self, features, labels, weights):
        """Find the best threshold stump on a single feature column.

        Returns ``(best_v, direct, error, compare_array)``: the threshold,
        its direction ('positive' => predict +1 when x > v), the weighted
        classification error, and the per-sample predictions of the best
        stump.  Returns ``direct``/``compare_array`` as None if no valid
        threshold exists.
        """
        m = len(features)
        error = float('inf')  # was a magic 100000.0 sentinel
        best_v = 0.0
        features_min = min(features)
        features_max = max(features)
        # Number of candidate thresholds between min and max.
        n_step = (features_max - features_min + self.learning_rate) // self.learning_rate
        direct, compare_array = None, None
        for i in range(1, int(n_step)):
            v = features_min + self.learning_rate * i

            # Skip thresholds that coincide with a sample value.
            if v in features:
                continue

            # 'positive' direction: predict +1 when feature > v.
            compare_array_positive = np.array([1 if features[k] > v else -1 for k in range(m)])
            weight_error_positive = sum(weights[k] for k in range(m)
                                        if compare_array_positive[k] != labels[k])

            # 'negative' direction: predict -1 when feature > v.
            compare_array_negative = np.array([-1 if features[k] > v else 1 for k in range(m)])
            weight_error_negative = sum(weights[k] for k in range(m)
                                        if compare_array_negative[k] != labels[k])

            if weight_error_positive < weight_error_negative:
                weight_error, _compare_array, _direct = \
                    weight_error_positive, compare_array_positive, 'positive'
            else:
                weight_error, _compare_array, _direct = \
                    weight_error_negative, compare_array_negative, 'negative'

            if weight_error < error:
                error = weight_error
                compare_array = _compare_array
                # BUG FIX: the direction must be recorded together with the
                # best threshold; the original overwrote `direct` on every
                # candidate, returning the direction of the LAST v scanned.
                direct = _direct
                best_v = v
        return best_v, direct, error, compare_array

    # alpha coefficient of a weak classifier
    def _alpha(self, error):
        """Return 0.5 * ln((1 - e) / e).

        The error is clamped away from zero: a perfect stump (e == 0)
        previously caused a division by zero, giving an inf alpha and then
        nan sample weights in _w (Z becomes 0).
        """
        return 0.5 * np.log((1 - error) / max(error, 1e-10))

    # normalisation factor
    def _Z(self, weights, a, clf):
        """Normalisation factor Z = sum_i w_i * exp(-a * y_i * G(x_i))."""
        return sum(weights[i] * np.exp(-1 * a * self.Y[i] * clf[i])
                   for i in range(self.M))

    # weight update
    def _w(self, a, clf, Z):
        """Update sample weights in place: w_i <- w_i * exp(-a*y_i*G(x_i)) / Z."""
        for i in range(self.M):
            self.weights[i] = self.weights[i] * np.exp(-1 * a * self.Y[i] * clf[i]) / Z

    # linear combination of G(x)
    def _f(self, alpha, clf_sets):
        """Placeholder for the explicit linear combination (unused)."""
        pass

    def G(self, x, v, direct):
        """Evaluate one stump at scalar x with threshold v and direction."""
        if direct == 'positive':
            return 1 if x > v else -1
        return -1 if x > v else 1

    def fit(self, X, y):
        """Train ``clf_num`` weak classifiers on (X, y) with labels +1/-1."""
        self.init_args(X, y)

        for epoch in range(self.clf_num):
            best_clf_error, best_v, clf_result = float('inf'), None, None
            # Initialise so they can never be referenced unbound below.
            final_direct, axis = None, 0
            # Across feature dimensions, pick the stump with the least error.
            for j in range(self.N):
                features = self.X[:, j]
                # threshold, direction, weighted error, per-sample result
                v, direct, error, compare_array = self._G(features, self.Y, self.weights)

                if error < best_clf_error:
                    best_clf_error = error
                    best_v = v
                    final_direct = direct
                    clf_result = compare_array
                    axis = j

                if best_clf_error == 0:
                    break

            # Classifier coefficient a (error clamped inside _alpha).
            a = self._alpha(best_clf_error)
            self.alpha.append(a)
            # Record the weak classifier.
            self.clf_sets.append((axis, best_v, final_direct))
            # Normalisation factor, then weight update.
            Z = self._Z(self.weights, a, clf_result)
            self._w(a, clf_result, Z)

    def predict(self, feature):
        """Sign of the weighted vote of all weak classifiers for one sample."""
        result = 0.0
        for a, (axis, clf_v, direct) in zip(self.alpha, self.clf_sets):
            result += a * self.G(feature[axis], clf_v, direct)
        # sign
        return 1 if result > 0 else -1

    def score(self, X_test, y_test):
        """Fraction of samples in X_test classified correctly."""
        right_count = sum(1 for i in range(len(X_test))
                          if self.predict(X_test[i]) == y_test[i])
        return right_count / len(X_test)
138+
if __name__ == "__main__":
    # Compare this hand-rolled AdaBoost against scikit-learn's on the
    # same generated train/test split.
    X_train, X_test, y_train, y_test = create_svm_data()

    mine = AdaBoost(n_estimators=10, learning_rate=0.2)
    mine.fit(X_train, y_train)
    print("my AdaBoost score", mine.score(X_test, y_test))

    reference = AdaBoostClassifier(n_estimators=100, learning_rate=0.5)
    reference.fit(X_train, y_train)
    print("sklearn AdaBoost score", reference.score(X_test, y_test))