1.1 打印特征重要性
# Print whatever get_feature_importance_pair returns for the model `gbm`.
# NOTE(review): both helpers are defined outside this snippet; presumably they
# pair feature names with importance scores - confirm against their definitions.
print(get_feature_importance_pair(gbm))
# Hand the feature list and the model to save_feature_importance_pair
# (presumably persists the pairs somewhere - TODO confirm the destination).
save_feature_importance_pair(feature_list, gbm)
或
# Gain-based importance for every feature in `feature_list`, largest first.
importances = (
    pd.Series(data=gbm.feature_importance(importance_type='gain'), index=feature_list)
    .sort_values(ascending=False)
)
1.2 给训练样本增加权重
from woe.eval import eval_segment_metrics
# Every row starts with weight 1; only new-user rows are re-weighted below.
train['weight'] = 1

# (row count - sum of d7_open) / sum of d7_open, computed over the whole
# training frame. With 0/1 labels this is the negative-to-positive ratio.
positive_cnt = sum(train['d7_open'])
neg_pos_ratio = (len(train) - positive_cnt) / positive_cnt

is_new_user = train['new_user'] == 1
opened_d7 = train['d7_open'] == 1
not_opened_d7 = train['d7_open'] == 0

# New users who opened on day 7 get the ratio; new users who did not get twice it.
train.loc[opened_d7 & is_new_user, 'weight'] = neg_pos_ratio
train.loc[not_opened_d7 & is_new_user, 'weight'] = neg_pos_ratio * 2
# Existing users who did not open keep weight 1 (already the default; kept explicit).
train.loc[not_opened_d7 & (train['new_user'] == 0), 'weight'] = 1

weights = train['weight']
trainset = lgb.Dataset(
    data=train.loc[:, feature_list],
    label=train.loc[:, 'd7_open'],
    weight=weights,
)

# NOTE(review): the training call that produces `gbm` from `trainset` is not
# part of this snippet.
test_scores = gbm.predict(test.loc[:, feature_list])
eval_segment_metrics(
    target=test.loc[:, 'd7_open'].values,
    segment_cnt=50,
    predict_proba=test_scores,
    out_path='../../reports/d7open.xlsx',
)
1.3 计算woe
def calc_woe(counter):
    """Build a per-app table with WoE, |WoE| and the day-7 open rate.

    ``counter`` is turned into a DataFrame and transposed; the resulting index
    is named ``app``. The frame must then expose ``count`` and ``d7_open``
    columns (assumes ``counter`` maps app -> {'count': ..., 'd7_open': ...} -
    TODO confirm against the caller).

    Relies on a module-level ``total_mean`` (not defined in this function) as
    the baseline rate whose log-odds are subtracted from each row's log-odds.

    Returns the frame with the added columns ``count-d7_open``, ``woe``,
    ``abs(woe)`` and ``r7r``.
    """
    smoothing = 1e-4  # keeps the log finite when either count is zero

    stats = pd.DataFrame(counter).T
    stats.index.name = 'app'

    opened = stats['d7_open']
    not_opened = stats['count'] - opened
    stats['count-d7_open'] = not_opened

    row_log_odds = np.log((opened + smoothing) / (not_opened + smoothing))
    baseline_log_odds = np.log(total_mean / (1 - total_mean))
    stats['woe'] = row_log_odds - baseline_log_odds
    stats['abs(woe)'] = np.abs(stats['woe'])

    stats['r7r'] = opened / stats['count']
    return stats
1.4 决策树可视化
def plot_tree(gbm_model):
    """Draw the first tree (``tree_index=0``) of ``gbm_model`` in two forms.

    1. A matplotlib rendering via ``lgb.plot_tree``, saved to ``lgb.png`` in the
       current working directory.
    2. A digraph named ``Treepdf`` built by ``lgb.create_tree_digraph`` and
       rendered with ``view=True``.
    """
    node_annotations = [
        'split_gain', 'internal_value', 'internal_count', 'internal_weight',
        'leaf_count', 'leaf_weight', 'data_percentage',
    ]
    lgb.plot_tree(
        gbm_model,
        tree_index=0,
        figsize=(100, 40),
        show_info=node_annotations,
    )
    # The figure is written to disk rather than shown interactively.
    plt.savefig("lgb.png")

    tree_graph = lgb.create_tree_digraph(gbm_model, tree_index=0, name='Treepdf')
    tree_graph.render(view=True)
# Alternative: draw the tree with pydotplus.
# The snippet below is disabled - it is wrapped in a bare string literal, so it
# is never executed.
# NOTE(review): `class_names=iris.target_names` refers to an `iris` object that
# is not defined in this file, and the `export_graphviz` call looks adapted
# from a scikit-learn example - confirm it works with LightGBM before enabling.
'''
import pydotplus
from IPython.display import Image
dot_data = lgb.export_graphviz(gbm, out_file=None,
feature_names=feature_list,
class_names=iris.target_names,
filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
graph.write_png(r'pang.png')
'''
1.5 门槛统计
def model_stat(train, test, df_opt, dump_report=False):
    """Tabulate sample counts and positive (``d3_open``) counts.

    NOTE(review): the ``df_opt`` argument is discarded. It is immediately
    rebound to the rows of the module-level ``df`` whose ``score`` is at least
    the module-level ``THRESHOLD``. Confirm whether the caller's ``df_opt``
    (or ``test``) was meant to be used instead.

    The result has two columns, '样本量' (sample count) and '正样本量'
    (positive count), and six rows: training data, test data, thresholded
    subset, ratio of subset to test, ``is_new_user`` sum on test, and
    ``is_new_user`` sum on the subset. The last two rows have no positive count.

    An earlier revision also printed r7r / cvr lift figures; that code had been
    commented out and is omitted here.

    Args:
        train: frame with a ``d3_open`` column.
        test: frame with ``d3_open`` and ``is_new_user`` columns.
        df_opt: ignored (see note above).
        dump_report: when true, also write the table to
            ``../../reports/model_stat.xlsx``.

    Returns:
        The summary DataFrame.
    """
    df_opt = df[df['score'] >= THRESHOLD]

    test_size = len(test)
    selected_size = len(df_opt)
    sample_counts = [
        len(train),
        test_size,
        selected_size,
        selected_size / test_size,
        test['is_new_user'].sum(),
        df_opt['is_new_user'].sum(),
    ]

    test_positives = test['d3_open'].sum()
    selected_positives = df_opt['d3_open'].sum()
    positive_counts = [
        train['d3_open'].sum(),
        test_positives,
        selected_positives,
        selected_positives / test_positives,
        None,
        None,
    ]

    res = pd.DataFrame()
    res['样本量'] = sample_counts
    res['正样本量'] = positive_counts
    res.index = ['训练数据', '测试数据', '测试门槛召回数据', '占比', '原始打开用户', '优化打开用户']

    if dump_report:
        res.to_excel('../../reports/model_stat.xlsx', index=True)
    return res
1.6 画PR曲线
def plot_pr_curve(y_test, y_score,
                  out_path='../../reports/figures/PR-curve.png',
                  ylim=(0.0, 0.0175)):
    """Plot a precision-recall curve and save it as an image.

    The y-axis range and the output location used to be hard-coded, so any
    curve with precision above 1.75% was clipped. Both are now keyword
    parameters whose defaults reproduce the previous behaviour.

    Args:
        y_test: true labels, passed to ``precision_recall_curve``.
        y_score: predicted scores, passed to ``precision_recall_curve``.
        out_path: file the figure is saved to.
        ylim: ``(low, high)`` limits for the precision axis, or ``None`` to
            leave the axis on matplotlib's automatic scaling.
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import precision_recall_curve

    precision, recall, _ = precision_recall_curve(y_test, y_score)

    # Step outline plus a translucent fill underneath it.
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    if ylim is not None:
        plt.ylim(ylim)
    plt.xlim([0.0, 1.0])

    plt.savefig(out_path)
1.7 测试集训练集划分
# Split by the `pt` column: rows before the cut-off day go to the training
# file, rows on the cut-off day go to the test file.
# NOTE(review): `pt` is compared as a string - assumes 'YYYY-MM-DD' text so
# that lexicographic order matches date order; confirm.
split_day = '2021-01-03'

# Row count per `pt` value, printed as a quick sanity check before splitting.
print(data.groupby('pt')['uid'].count())

pt_values = data['pt']
tr = data[pt_values < split_day]
te = data[pt_values == split_day]

tr.to_csv('../data/train.data_huo_v4', sep='\t', index=False)
te.to_csv('../data/test.data_huo_v4', sep='\t', index=False)
1.8 按预测概率降序分组并标记组号(grp_id)
def assign_grp_idx(df, segment_cnt, y_pred_prob=None):
    """Write a 1-based group id into ``df['grp']`` by descending predicted score.

    Rows are ranked by ``y_pred_prob`` from highest to lowest and cut into
    consecutive segments of ``len(y_pred_prob) // segment_cnt`` rows. The
    top-scored segment gets id 1, the next gets 2, and so on. When the row
    count is not a multiple of ``segment_cnt``, the leftover rows form extra
    groups, so the largest id can exceed ``segment_cnt``.

    Fixes over the previous version:
      * fewer rows than ``segment_cnt`` no longer loops forever (segment size
        is clamped to at least 1);
      * rows are addressed by position, so a non-default index on ``df`` no
        longer mis-assigns or fails;
      * the scores can be passed explicitly instead of through a global.

    Args:
        df: frame to annotate; modified in place.
        segment_cnt: target number of segments, must be >= 1.
        y_pred_prob: one score per row of ``df``, in row order. If omitted,
            the module-level ``y_pred_prob`` is used, as before.

    Raises:
        ValueError: if ``segment_cnt`` < 1 or the score count differs from
            ``len(df)``.
        NameError: if no scores are passed and no module-level
            ``y_pred_prob`` exists.
    """
    if y_pred_prob is None:
        # Legacy call style: scores come from a module-level variable.
        try:
            y_pred_prob = globals()["y_pred_prob"]
        except KeyError:
            raise NameError("name 'y_pred_prob' is not defined") from None

    if segment_cnt < 1:
        raise ValueError("segment_cnt must be >= 1")

    scores = np.asarray(y_pred_prob)
    total_sample_cnt = len(df)
    if len(scores) != total_sample_cnt:
        raise ValueError("y_pred_prob must contain one score per row of df")

    # Clamp to 1 so the segment size is never 0 when rows < segment_cnt.
    segment_sample_cnt = max(1, total_sample_cnt // segment_cnt)

    # Positions of the rows ordered from highest to lowest score.
    proba_descend_idx = np.argsort(scores)[::-1]

    # The row at rank r (0-based) falls into group r // segment_size + 1.
    grp_ids = np.empty(total_sample_cnt, dtype=int)
    grp_ids[proba_descend_idx] = np.arange(total_sample_cnt) // segment_sample_cnt + 1

    # Assigning an ndarray is positional, independent of df's index labels.
    df['grp'] = grp_ids
if __name__ == "__main__":
    segment_cnt = 50

    # Placeholder group id; assign_grp_idx fills in the real ids in place.
    df['grp'] = -1
    assign_grp_idx(df, segment_cnt)

    by_grp = df.groupby('grp')
    # Per group: sum of LABEL divided by sum of is_recall.
    print(by_grp.apply(lambda rows: np.sum(rows[LABEL]) / np.sum(rows['is_recall'])))
    # Per group: is_recall divided by the group size, then summed.
    print(by_grp.apply(lambda rows: np.sum(rows['is_recall'] / len(rows))))
    # Per group: LABEL divided by the group size, then summed.
    print(by_grp.apply(lambda rows: np.sum(rows[LABEL] / len(rows))))
本文介绍了机器学习中特征重要性的获取与展示方法,并详细解释了如何为训练样本分配权重以提升模型性能。此外,还提供了计算WoE值、决策树可视化及绘制PR曲线等实用技巧,帮助读者更好地理解并优化机器学习模型。
603

被折叠的 条评论
为什么被折叠?



