1. Code
from collections import Counter

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split


class weakLearner():
    def __init__(self):
        self.type_feature = None  # 0 = discrete feature, 1 = continuous feature
        self.w = None             # per-sample weights supplied by the booster

    def __Gini(self, y, sample_weight):
        """
        :param y: target labels
        :param sample_weight: per-sample weights
        :return: the weighted Gini index of y
        """
        K = np.unique(y)
        gini = 1 - np.sum([(np.sum(sample_weight[y == k]) / np.sum(sample_weight)) ** 2 for k in K])
        return gini
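    # For reference, __Gini above computes
    #     Gini(y, w) = 1 - sum_k (sum_{i: y_i = k} w_i / sum_i w_i) ** 2.
    # A small worked example with made-up values: y = [1, 1, -1] and
    # w = [0.5, 0.25, 0.25] give 1 - (0.75 ** 2 + 0.25 ** 2) = 0.375.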
    def __typeFeature(self, X):
        """Label each feature as discrete (0) or continuous (1); a feature
        with fewer than 10 distinct values is treated as discrete."""
        n_sample, n_feature = X.shape
        self.type_feature = []
        for f_idx in range(n_feature):
            if len(np.unique(X[:, f_idx])) < 10:
                self.type_feature.append(0)  # discrete: split by equality
            else:
                self.type_feature.append(1)  # continuous: split by threshold
        return self.type_feature
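    # Note: with the breast-cancer data used below, all 30 features are
    # real-valued with many distinct values, so this list is expected to be
    # all ones, i.e. every split becomes a threshold split.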
    def __binSplitData(self, X, y, f_idx, f_val):
        """
        :param X: data set to split
        :param y: labels of the data set
        :param f_idx: index of the splitting feature
        :param f_val: value of that feature to split on
        :return: the left/right data sets, labels and weights of the binary split
        """
        att = X[:, f_idx]
        if self.type_feature[f_idx] == 0:
            # discrete feature: the left branch takes samples equal to f_val
            X_left, X_right = X[att == f_val], X[att != f_val]
            y_left, y_right = y[att == f_val], y[att != f_val]
            weight_left, weight_right = self.w[att == f_val], self.w[att != f_val]
        else:
            # continuous feature: the left branch takes samples at or below f_val
            X_left, X_right = X[att <= f_val], X[att > f_val]
            y_left, y_right = y[att <= f_val], y[att > f_val]
            weight_left, weight_right = self.w[att <= f_val], self.w[att > f_val]
        return X_left, X_right, y_left, y_right, weight_left, weight_right
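    # Example with made-up values: att = [1.0, 3.0, 2.0] and a continuous
    # split at f_val = 2.0 send rows 0 and 2 to the left branch and row 1 to
    # the right branch; y and self.w are partitioned by the same masks.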
    def __bestSplit(self, X, y):
        """
        :param X: training samples
        :param y: training labels
        :return: the best splitting feature best_f_idx and value best_f_val
        """
        best_gini = np.inf
        best_f_idx, best_f_val = None, None
        n_sample, n_feature = X.shape
        for f_idx in range(n_feature):
            if self.type_feature[f_idx] == 0:
                values = np.unique(X[:, f_idx])
                for f_val in values:
                    # a binary feature gives the same partition for either
                    # value, so try only the second one
                    if len(values) == 2 and f_val == values[0]:
                        continue
                    X_left, X_right, y_left, y_right, weight_left, weight_right = self.__binSplitData(X, y, f_idx, f_val)
                    gini_after = (np.sum(weight_left) * self.__Gini(y_left, weight_left)
                                  + np.sum(weight_right) * self.__Gini(y_right, weight_right))
                    if gini_after < best_gini:
                        best_gini = gini_after
                        best_f_idx, best_f_val = f_idx, f_val
            else:
                # continuous feature: try 50 evenly spaced candidate thresholds
                # inside the feature's range
                for f_val in np.linspace(np.nanmin(X[:, f_idx]) + 1, np.nanmax(X[:, f_idx]) - 1, num=50):
                    X_left, X_right, y_left, y_right, weight_left, weight_right = self.__binSplitData(X, y, f_idx, f_val)
                    gini_after = (np.sum(weight_left) * self.__Gini(y_left, weight_left)
                                  + np.sum(weight_right) * self.__Gini(y_right, weight_right))
                    if gini_after < best_gini:
                        best_gini = gini_after
                        best_f_idx, best_f_val = f_idx, f_val
        return best_f_idx, best_f_val
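    # The split score minimised above is the weight-weighted impurity
    #     Gini_after = W_left * Gini(y_left, w_left) + W_right * Gini(y_right, w_right),
    # where W_left and W_right are the total sample weights of the two branches.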
    def __CART(self, X, y):
        """
        :param X: training samples
        :param y: training labels
        :return: a depth-1 CART tree (a decision stump)
        """
        best_f_idx, best_f_val = self.__bestSplit(X, y)
        X_left, X_right, y_left, y_right, weight_left, weight_right = self.__binSplitData(X, y, best_f_idx, best_f_val)
        tree = dict()
        tree['cut_f_idx'] = best_f_idx
        tree['cut_f_val'] = best_f_val
        # the leaves store the raw labels and weights; prediction uses the
        # majority label of each leaf
        tree['left_tree'] = y_left
        tree['right_tree'] = y_right
        tree['weight_left'] = weight_left
        tree['weight_right'] = weight_right
        return tree

    def train(self, X, y, sample_weight):
        self.w = sample_weight
        self.type_feature = self.__typeFeature(X)
        self.tree = self.__CART(X, y)
        return self.tree
    def predict(self, X_test):
        return np.array([self.__predict_one(x_test, self.tree) for x_test in X_test])

    def __predict_one(self, x_test, tree):
        cut_f_idx, cut_val = tree['cut_f_idx'], tree['cut_f_val']
        # each leaf predicts its majority label (majority by count, not weight)
        label_left = Counter(tree['left_tree']).most_common(1)[0][0]
        label_right = Counter(tree['right_tree']).most_common(1)[0][0]
        if self.type_feature[cut_f_idx] == 0:
            result = label_left if x_test[cut_f_idx] == cut_val else label_right
        else:
            result = label_left if x_test[cut_f_idx] <= cut_val else label_right
        return result
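
# A minimal usage sketch for the stump on its own (X and y here stand for
# hypothetical numpy arrays; the weights start uniform):
#     stump = weakLearner()
#     stump.train(X, y, np.full(len(X), 1 / len(X)))
#     y_hat = stump.predict(X)
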
class Adaboost():
    def __init__(self, estimators: int = 10, classifier=weakLearner):
        self.estimators = estimators
        self.weakLearner = classifier
        self.w = None      # per-sample weights, kept in the row order of X
        self.alphas = []   # vote weight of each stump
        self.stumps = []   # the fitted weak learners

    def fit(self, X, y):
        # start from uniform weights
        self.w = np.full(len(X), 1 / len(X))
        for m in range(self.estimators):
            G_m = self.weakLearner()
            G_m.train(X, y, self.w)
            # evaluate the stump in the original sample order so the
            # predictions stay aligned with self.w and y
            y_pred = G_m.predict(X)
            # weighted training error; the small constant guards against
            # division by zero when a stump is perfect
            error = 1e-6 + np.sum(self.w[y_pred != y])
            alpha = 1 / 2 * np.log((1 - error) / error)
            # reweight: misclassified samples (y * y_pred == -1) gain weight,
            # then normalise by Z_m so the weights sum to one again
            w_new = self.w * np.exp(-alpha * y * y_pred)
            self.w = w_new / np.sum(w_new)
            self.stumps.append(G_m)
            self.alphas.append(alpha)
    def predict(self, X_test):
        # weighted vote of all stumps, thresholded at zero
        y_ = 0
        for m in range(self.estimators):
            y_ += self.alphas[m] * self.stumps[m].predict(X_test)
        return np.sign(y_)
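
# For reference, each boosting round in fit follows the standard AdaBoost
# update:
#     e_m     = sum_i w_i * 1[G_m(x_i) != y_i]
#     alpha_m = 1/2 * ln((1 - e_m) / e_m)
#     w_i    <- w_i * exp(-alpha_m * y_i * G_m(x_i)) / Z_m
# where Z_m normalises the weights so they sum to one.
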
if __name__ == '__main__':
    data = datasets.load_breast_cancer()
    # map the {0, 1} targets to {-1, +1}, as the boosting update expects
    data.target[data.target > 0] = 1
    data.target[data.target == 0] = -1
    X, Y = data.data, data.target
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
    tree_clf = Adaboost()
    tree_clf.fit(X_train, Y_train)
    Y_pred = tree_clf.predict(X_test)
    accuracy = np.sum(Y_pred == Y_test) / len(Y_test)
    print('acc:{}'.format(accuracy))
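
As a quick sanity check (not part of the implementation above), scikit-learn's AdaBoostClassifier can be run on the same split. Appended at the end of the __main__ block, the lines below should land in the same accuracy ballpark, though not match exactly, since sklearn's default weak learner is a depth-1 decision tree fitted over all thresholds.

    # Optional cross-check against scikit-learn's AdaBoostClassifier on the
    # same train/test split; similar, not identical, accuracy is expected.
    from sklearn.ensemble import AdaBoostClassifier
    sk_clf = AdaBoostClassifier(n_estimators=10)
    sk_clf.fit(X_train, Y_train)
    print('sklearn acc:{}'.format(sk_clf.score(X_test, Y_test)))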