最近看了论文Multi-Level Deep Cascade Trees for Conversion Rate Prediction
发现作者没有开源相关代码,就自己试着写了一下
核心思想是:把上一层GBDT中每棵树各叶子节点上的交叉熵,作为下一层的输入特征。
核心代码如下
def gen_new_train_X(self, X, y, gb_classifier):
    """Build next-layer features from the per-leaf cross-entropy of each tree.

    For every tree in ``gb_classifier.estimators_`` each sample is routed to
    its leaf, the leaf's stored value is squashed with ``self.sigmoid`` and
    the log-loss of the samples that landed in that leaf is computed. Each
    sample then receives, as a new feature, the log-loss of the leaf it fell
    into.

    Side effect: one ``{leaf_id: cross_entropy}`` dict per tree is appended
    to ``self.cross_entropys``.

    Args:
        X: Feature matrix (anything ``np.asarray`` can turn into a float32
            array).
        y: Labels, one per row of ``X``. They are matched to the rows of
            ``X`` by position, so a pandas Series carrying an arbitrary
            index is handled correctly.
        gb_classifier: Fitted boosting model whose ``estimators_`` is
            indexable as ``[stage][0]`` and whose members expose ``tree_``
            (presumably sklearn's GradientBoostingClassifier fitted on a
            binary target -- confirm against the caller).

    Returns:
        A DataFrame with one ``cross_entropy<j>`` column per tree, or
        ``None`` when the model has no estimators.
    """
    # The low-level tree object is handed a float32 array, as before.
    features = np.asarray(X, dtype=np.float32)
    # Strip any pandas index from the labels: the leaf masks below are
    # positional, and indexing a Series with a differently-indexed boolean
    # mask would align by label and pick the wrong rows (or raise).
    y_true = np.asarray(y)

    columns = {}
    for j, stage in enumerate(gb_classifier.estimators_):
        decision_tree = stage[0].tree_
        leaf_ids = decision_tree.apply(features)

        # Value stored at every node, looked up for each sample's leaf.
        node_values = decision_tree.value[:, 0, 0]
        frame = DataFrame({'leave_id': leaf_ids, 'raw': node_values[leaf_ids]})
        prob = np.asarray(self.sigmoid(frame['raw']))

        # Cross-entropy of the samples in each leaf that was actually reached.
        leaf_loss = {}
        column = np.empty(len(leaf_ids), dtype=np.float64)
        for leave_id in frame['leave_id'].unique():
            in_leaf = leaf_ids == leave_id
            loss = metrics.log_loss(y_true[in_leaf], prob[in_leaf], labels=[1, 0])
            leaf_loss[leave_id] = loss
            column[in_leaf] = loss
        self.cross_entropys.append(leaf_loss)
        columns['cross_entropy' + str(j)] = column

    if not columns:
        return None
    # Build the frame once instead of inserting one column per iteration.
    return DataFrame(columns)
全部代码见github:https://github.com/sunjiaxin111/ldcTree