[Machine Learning & Algorithms] Hand-writing algorithms in Python: re-implementing XGBoost from scratch
Background
For the explanation and derivation of how XGB works, the best reference is the slide deck by the original author, Tianqi Chen. The theory is not repeated here; the link is attached directly:
tqchen/pdf/BoostedTree.pdf.
Following the slides, we will re-implement the XGB algorithm in Python, supporting two objective functions: linear and logistic. Their loss functions are, respectively, the squared error L = 1/2*(y - y_hat)^2 and the log loss L = y*ln(1 + e^(-y_hat)) + (1 - y)*ln(1 + e^(y_hat)).
After working through an XGBoost re-implementation in Python, you should come away with a much clearer picture of the algorithm's flow, what each parameter means, where it takes effect, and what effect it has.
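Before the class itself, here is a minimal sketch of the first- and second-order gradients (g and h) that these two objectives yield; the helper names grad_hess_linear and grad_hess_logistic are for illustration only and are not part of the class below, which stores these quantities in the 'g' and 'h' columns:

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Squared-error ('linear') objective: L = 1/2*(y - y_hat)**2
# g = y_hat - y, h = 1
def grad_hess_linear(y, y_hat):
    return y_hat - y, np.ones_like(y_hat)

# Log-loss ('logistic') objective on the raw score y_hat
# g = sigmoid(y_hat) - y, h = sigmoid(y_hat)*(1 - sigmoid(y_hat))
def grad_hess_logistic(y, y_hat):
    p = sigmoid(y_hat)
    return p - y, p * (1 - p)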
The code
import pandas as pd
import numpy as np
class XGB:
def __init__(self,
base_score = 0.5,
max_depth=3,
n_estimators=10,
learning_rate = 0.1,
reg_lambda = 1,
gamma = 0,
min_child_sample = None,
min_child_weight = 1,
objective = 'linear'):
        self.base_score = base_score  # initial value assigned to the leaf weights, default 0.5; with enough boosting rounds the result is insensitive to this initial value
        self.max_depth = max_depth  # maximum tree depth
        self.n_estimators = n_estimators  # number of trees
        self.learning_rate = learning_rate  # learning rate; not the gradient-descent one, but the shrinkage factor each tree is multiplied by
        self.reg_lambda = reg_lambda  # weight of the L2 regularization term
        self.gamma = gamma  # weight of the number of leaves T in the regularization term
        self.min_child_sample = min_child_sample  # minimum number of samples in each leaf node (my own addition)
        self.min_child_weight = min_child_weight  # minimum sum of the second derivatives (Hessian) over each leaf node; explained in detail in the code below
        self.objective = objective  # objective function, either 'linear' or 'logistic'
        self.tree_structure = {}  # dictionary that stores the structure of every tree
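    # Hypothetical construction example using the parameters above; the rest of the
    # class (training and prediction) is not shown in this snippet:
    #   model = XGB(objective='logistic', n_estimators=10, max_depth=3, learning_rate=0.1)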
def xgb_cart_tree(self, X, w, m_dpth):
        '''
        Recursively build the CART tree used inside XGB.
        X: training data
        w: weight of each sample, assigned recursively
        m_dpth: current depth of the tree
        '''
        # Stopping condition: stop recursing once the specified maximum depth is exceeded
if m_dpth > self.max_depth:
return
best_var, best_cut = None, None
        # The initial gain must be set to 0: this effectively prunes the tree, i.e. no split is made if the computed gain is below 0
max_gain = 0
G_left_best, G_right_best, H_left_best, H_right_best = 0,0,0,0
        # Enumerate every candidate cut point of every feature, find the split with the largest gain and record it
for item in [x for x in X.columns if x not in ['g','h','y']]:
for cut in list(set(X[item])):
                # If min_child_sample is specified, neither child produced by the split may contain fewer samples than that value
if self.min_child_sample:
if (X.loc[X[item]<cut].shape[0]<self.min_child_sample)\
|(X.loc[X[item]>=cut].shape[0]<self.min_child_sample):
continue
G_left = X.loc[X[item]<cut,'g'].sum()
G_right = X.loc[X[item]>=cut,'g'].sum()
H_left = X.loc[X[item]<cut,'h'].sum()
H_right = X.loc[X[item]>=cut,'h'].sum()
                # min_child_weight acts here: it is the minimum sum over each leaf of H, the second derivative of the objective
                # For the linear objective 1/2*(y - y_hat)**2 the second derivative is 1, so min_child_weight is equivalent to min_child_sample
                # For the logistic objective the second derivative is sigmoid(y_hat)*(1 - sigmoid(y_hat)), which can be read as a measure of leaf purity; for a fuller explanation see:
                # https://stats.stackexchange.com/questions/317073/explanation-of-min-child-weight-in-xgboost-algorithm#
if self.min_child_weight:
if (H_left<self.min_child_weight)|(H_right<self.min_child_weight):
continue
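                # For reference, the split gain from the BoostedTree.pdf slides is
                #   Gain = 1/2 * [ G_L**2/(H_L + lambda) + G_R**2/(H_R + lambda) - (G_L + G_R)**2/(H_L + H_R + lambda) ] - gamma
                # with lambda = self.reg_lambda and gamma = self.gamma; the expression below
                # follows this structure using G_left/G_right and H_left/H_right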
gain = G_left**2/(H_left + self.reg_lambda) + \
G_right**2