1. Code
from collections import Counter

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split


class weakLearner():
    def __init__(self):
        self.type_feature = None  # 0 = discrete feature, 1 = continuous feature
        self.w = None             # per-sample weights supplied by the booster

    def __Gini(self, y, sample_weight):
        """
        :param y: target labels
        :param sample_weight: per-sample weights
        :return: the weighted Gini index of y
        """
        K = np.unique(y)
        gini = 1 - np.sum([(np.sum(sample_weight[y == k]) / np.sum(sample_weight)) ** 2 for k in K])
        return gini
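    # For reference, __Gini above computes
    #     Gini(y, w) = 1 - sum_k (sum_{i: y_i = k} w_i / sum_i w_i) ** 2.
    # A small worked example with made-up values: y = [1, 1, -1] and
    # w = [0.5, 0.25, 0.25] give 1 - (0.75 ** 2 + 0.25 ** 2) = 0.375.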
    def __typeFeature(self, X):
        """Label each feature as discrete (0) or continuous (1); a feature
        with fewer than 10 distinct values is treated as discrete."""
        n_sample, n_feature = X.shape
        self.type_feature = []
        for f_idx in range(n_feature):
            if len(np.unique(X[:, f_idx])) < 10:
                self.type_feature.append(0)  # discrete: split by equality
            else:
                self.type_feature.append(1)  # continuous: split by threshold
        return self.type_feature
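    # Note: with the breast-cancer data used below, all 30 features are
    # real-valued with many distinct values, so this list is expected to be
    # all ones, i.e. every split becomes a threshold split.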
    def __binSplitData(self, X, y, f_idx, f_val):
        """
        :param X: data set to split
        :param y: labels of the data set
        :param f_idx: index of the splitting feature
        :param f_val: value of that feature to split on
        :return: the left/right data sets, labels and weights of the binary split
        """
        att = X[:, f_idx]
        if self.type_feature[f_idx] == 0:
            # discrete feature: the left branch takes samples equal to f_val
            X_left, X_right = X[att == f_val], X[att != f_val]
            y_left, y_right = y[att == f_val], y[att != f_val]
            weight_left, weight_right = self.w[att == f_val], self.w[att != f_val]
        else:
            # continuous feature: the left branch takes samples at or below f_val
            X_left, X_right = X[att <= f_val], X[att > f_val]
            y_left, y_right = y[att <= f_val], y[att > f_val]
            weight_left, weight_right = self.w[att <= f_val], self.w[att > f_val]
        return X_left, X_right, y_left, y_right, weight_left, weight_right
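    # Example with made-up values: att = [1.0, 3.0, 2.0] and a continuous
    # split at f_val = 2.0 send rows 0 and 2 to the left branch and row 1 to
    # the right branch; y and self.w are partitioned by the same masks.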
    def __bestSplit(self, X, y):
        """
        :param X: training samples
        :param y: training labels
        :return: the best splitting feature best_f_idx and value best_f_val
        """
        best_gini = np.inf
        best_f_idx, best_f_val = None, None
        n_sample, n_feature = X.shape
        for f_idx in range(n_feature):
            if self.type_feature[f_idx] == 0:
                values = np.unique(X[:, f_idx])
                for f_val in values:
                    # a binary feature gives the same partition for either
                    # value, so try only the second one
                    if len(values) == 2 and f_val == values[0]:
                        continue
                    X_left, X_right, y_left, y_right, weight_left, weight_right = self.__binSplitData(X, y, f_idx, f_val)
                    gini_after = (np.sum(weight_left) * self.__Gini(y_left, weight_left)
                                  + np.sum(weight_right) * self.__Gini(y_right, weight_right))
                    if gini_after < best_gini:
                        best_gini = gini_after
                        best_f_idx, best_f_val = f_idx, f_val
            else:
                # continuous feature: try 50 evenly spaced candidate thresholds
                # inside the feature's range
                for f_val in np.linspace(np.nanmin(X[:, f_idx]) + 1, np.nanmax(X[:, f_idx]) - 1, num=50):
                    X_left, X_right, y_left, y_right, weight_left, weight_right = self.__binSplitData(X, y, f_idx, f_val)
                    gini_after = (np.sum(weight_left) * self.__Gini(y_left, weight_left)
                                  + np.sum(weight_right) * self.__Gini(y_right, weight_right))
                    if gini_after < best_gini:
                        best_gini = gini_after
                        best_f_idx, best_f_val = f_idx, f_val
        return best_f_idx, best_f_val
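    # The split score minimised above is the weight-weighted impurity
    #     Gini_after = W_left * Gini(y_left, w_left) + W_right * Gini(y_right, w_right),
    # where W_left and W_right are the total sample weights of the two branches.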
    def __CART(self, X, y):
        """
        :param X: training samples
        :param y: training labels
        :return: a depth-1 CART tree (a decision stump)
        """
        best_f_idx, best_f_val = self.__bestSplit(X, y)
        X_left, X_right, y_left, y_right, weight_left, weight_right = self.__binSplitData(X, y, best_f_idx, best_f_val)
        tree = dict()
        tree['cut_f_idx'] = best_f_idx
        tree['cut_f_val'] = best_f_val
        # the leaves store the raw labels and weights; prediction uses the
        # majority label of each leaf
        tree['left_tree'] = y_left
        tree['right_tree'] = y_right
        tree['weight_left'] = weight_left
        tree['weight_right'] = weight_right
        return tree

    def train(self, X, y, sample_weight):
        self.w = sample_weight
        self.type_feature = self.__typeFeature(X)
        self.tree = self.__CART(X, y)
        return self.tree
    def predict(self, X_test):
        return np.array([self.__predict_one(x_test, self.tree) for x_test in X_test])

    def __predict_one(self, x_test, tree):
        cut_f_idx, cut_val = tree['cut_f_idx'], tree['cut_f_val']
        # each leaf predicts its majority label (majority by count, not weight)
        label_left = Counter(tree['left_tree']).most_common(1)[0][0]
        label_right = Counter(tree['right_tree']).most_common(1)[0][0]
        if self.type_feature[cut_f_idx] == 0:
            result = label_left if x_test[cut_f_idx] == cut_val else label_right
        else:
            result = label_left if x_test[cut_f_idx] <= cut_val else label_right
        return result
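
# A minimal usage sketch for the stump on its own (X and y here stand for
# hypothetical numpy arrays; the weights start uniform):
#     stump = weakLearner()
#     stump.train(X, y, np.full(len(X), 1 / len(X)))
#     y_hat = stump.predict(X)
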
class Adaboost():
    def __init__(self, estimators: int = 10, classifier=weakLearner):
        self.estimators = estimators
        self.weakLearner = classifier
        self.w = None      # per-sample weights, kept in the row order of X
        self.alphas = []   # vote weight of each stump
        self.stumps = []   # the fitted weak learners

    def fit(self, X, y):
        # start from uniform weights
        self.w = np.full(len(X), 1 / len(X))
        for m in range(self.estimators):
            G_m = self.weakLearner()
            G_m.train(X, y, self.w)
            # evaluate the stump in the original sample order so the
            # predictions stay aligned with self.w and y
            y_pred = G_m.predict(X)
            # weighted training error; the small constant guards against
            # division by zero when a stump is perfect
            error = 1e-6 + np.sum(self.w[y_pred != y])
            alpha = 1 / 2 * np.log((1 - error) / error)
            # reweight: misclassified samples (y * y_pred == -1) gain weight,
            # then normalise by Z_m so the weights sum to one again
            w_new = self.w * np.exp(-alpha * y * y_pred)
            self.w = w_new / np.sum(w_new)
            self.stumps.append(G_m)
            self.alphas.append(alpha)
    def predict(self, X_test):
        # weighted vote of all stumps, thresholded at zero
        y_ = 0
        for m in range(self.estimators):
            y_ += self.alphas[m] * self.stumps[m].predict(X_test)
        return np.sign(y_)
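
# For reference, each boosting round in fit follows the standard AdaBoost
# update:
#     e_m     = sum_i w_i * 1[G_m(x_i) != y_i]
#     alpha_m = 1/2 * ln((1 - e_m) / e_m)
#     w_i    <- w_i * exp(-alpha_m * y_i * G_m(x_i)) / Z_m
# where Z_m normalises the weights so they sum to one.
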
if __name__ == '__main__':
    data = datasets.load_breast_cancer()
    # map the {0, 1} targets to {-1, +1}, as the boosting update expects
    data.target[data.target > 0] = 1
    data.target[data.target == 0] = -1
    X, Y = data.data, data.target
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
    tree_clf = Adaboost()
    tree_clf.fit(X_train, Y_train)
    Y_pred = tree_clf.predict(X_test)
    accuracy = np.sum(Y_pred == Y_test) / len(Y_test)
    print('acc:{}'.format(accuracy))
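
As a quick sanity check (not part of the implementation above), scikit-learn's AdaBoostClassifier can be run on the same split. Appended at the end of the __main__ block, the lines below should land in the same accuracy ballpark, though not match exactly, since sklearn's default weak learner is a depth-1 decision tree fitted over all thresholds.

    # Optional cross-check against scikit-learn's AdaBoostClassifier on the
    # same train/test split; similar, not identical, accuracy is expected.
    from sklearn.ensemble import AdaBoostClassifier
    sk_clf = AdaBoostClassifier(n_estimators=10)
    sk_clf.fit(X_train, Y_train)
    print('sklearn acc:{}'.format(sk_clf.score(X_test, Y_test)))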