1. Model pipeline breakdown
# -*- coding: utf-8 -*-
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from woe.eval import eval_segment_metrics

feature_list = ['fea1', 'fea2', 'fea3']
LABEL = 'is_later_30d_loss'
drop_cols = ['uid', LABEL, 'pt']
keep_cols = ['uid', 'score']
# Threshold chosen from the segment report
THRESHOLD = 0.5034
# Data to be scored, and where to write the predictions
data_input_path = '../data/raw_data.txt'
data_output_path = '../data/predict_data.csv'
model_file_path = 'model.txt'
# Sensible defaults; rarely need changing
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 6,        # 32
    'max_bin': 10,          # 16
    'learning_rate': 0.01,  # 0.005
    # 'feature_fraction': 0.9,
    # 'bagging_fraction': 0.8,
    # 'bagging_freq': 5,
    # 'max_depth': 5,
    # 'min_data_in_leaf': 6,
    'is_unbalance': False,
    'verbose': 0
}
if __name__ == "__main__":
    train_data_path = '../data/train.txt'
    test_data_path = '../data/test.txt'
    # Load the train and test sets
    X_tr, y_tr, X_te, y_te = get_dataset(train_data_path, test_data_path)
    # Train the model
    train(X_tr, y_tr, X_te, y_te)
    # Evaluate the model
    eval(X_te, y_te)
    # Print feature importances
    feature_names = X_tr.columns.tolist()
    print_important_feature(feature_names)
    # Score new data
    predict()
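The params above are sensible starting points rather than a tuned configuration. A minimal sketch, assuming the same train.txt layout as get_dataset, of using lgb.cv to sanity-check num_boost_round and the leaf settings before committing to a full run:

# Hedged cross-validation check; reuses params, drop_cols and LABEL from above.
tr = pd.read_csv('../data/train.txt', sep='\t')
cv_set = lgb.Dataset(tr.drop(columns=drop_cols), label=tr[LABEL])
# 5-fold stratified CV; stops adding rounds when validation AUC plateaus.
cv_results = lgb.cv(params, cv_set, num_boost_round=2000, nfold=5,
                    stratified=True, early_stopping_rounds=100)
# The result key is 'auc-mean' in LightGBM < 4.0 ('valid auc-mean' in 4.x).
print('best rounds = %d, cv auc = %.4f'
      % (len(cv_results['auc-mean']), cv_results['auc-mean'][-1]))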
1.1 Loading the data
1.1.1 Loading the train and test sets
def get_dataset(train_data_path, test_data_path):
    tr = pd.read_csv(train_data_path, sep='\t')
    te = pd.read_csv(test_data_path, sep='\t')
    # Row counts and number of positives in each set
    print('tr = %d, tr_pos_num = %d' % (len(tr), len(tr[tr[LABEL] == 1])))
    print('te = %d, te_pos_num = %d' % (len(te), len(te[te[LABEL] == 1])))
    y_tr = tr.loc[:, LABEL]
    y_te = te.loc[:, LABEL]
    tr.drop(labels=drop_cols, axis=1, inplace=True)
    te.drop(labels=drop_cols, axis=1, inplace=True)
    # Or keep only the whitelisted features:
    # tr = tr.loc[:, feature_list]
    # te = te.loc[:, feature_list]
    X_tr, X_te = tr, te
    return X_tr, y_tr, X_te, y_te
1.1.2 Adding sample weights
# Sample weights: recalled positives get the neg/pos ratio,
# recalled negatives get twice that ratio
tr['weight'] = 1
te['weight'] = 1
neg_pos_ratio = (len(tr) - sum(tr[LABEL])) / sum(tr[LABEL])
tr.loc[(tr[LABEL] == 1) & (tr['is_recall'] == 1), 'weight'] = neg_pos_ratio
tr.loc[(tr[LABEL] == 0) & (tr['is_recall'] == 1), 'weight'] = neg_pos_ratio * 2
tr.loc[(tr[LABEL] == 0) & (tr['is_recall'] == 0), 'weight'] = 1
weights = tr['weight']
trainset = lgb.Dataset(data=X_tr, label=y_tr, weight=weights)
testset = lgb.Dataset(data=X_te, label=y_te)
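The weights above encode a business rule around is_recall. For plain class imbalance, a hedged alternative is sklearn's compute_sample_weight, which applies the same neg/pos scaling to every positive:

from sklearn.utils.class_weight import compute_sample_weight

# 'balanced' weights each class by n_samples / (n_classes * class_count),
# so positives end up weighted by roughly the neg/pos ratio.
weights = compute_sample_weight(class_weight='balanced', y=y_tr)
trainset = lgb.Dataset(data=X_tr, label=y_tr, weight=weights)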
1.1.3 Splitting into train and test sets
def split_dataset(data_path):
    data = pd.read_csv(data_path, sep='\t')
    # fillna(inplace=True) returns None and cannot be chained,
    # so impute and round before converting to arrays
    X = data[feature_list].fillna(0).round(decimals=2).values
    y = data[LABEL].values
    # Three-way split variant (train/val/test):
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)
    # X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0, stratify=y_test)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
    return X_tr, y_tr, X_te, y_te
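stratify=y keeps the positive rate consistent across the splits; a quick sketch to verify, assuming the names above:

# All three rates should agree up to rounding.
print('full: %.4f, train: %.4f, test: %.4f' % (y.mean(), y_tr.mean(), y_te.mean()))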
1.2 Model training
def train(X_tr, y_tr, X_te, y_te):
    trainset = lgb.Dataset(data=X_tr, label=y_tr)
    testset = lgb.Dataset(data=X_te, label=y_te)
    # testset = lgb.Dataset(X_te, y_te, reference=trainset)
    lgb_model = lgb.train(params,
                          train_set=trainset,
                          valid_sets=[trainset, testset],
                          num_boost_round=2000,  # 200
                          early_stopping_rounds=100)
    # sklearn-wrapper equivalent (takes raw arrays, not lgb.Dataset):
    # clf = lgb.LGBMClassifier(n_estimators=24)
    # clf.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_te, y_te)], eval_metric='auc')
    # clf.best_score_['valid_1']['auc']
    print(lgb_model.best_iteration)  # best iteration found by early stopping
    lgb_model.save_model(model_file_path)
Training with categorical features
def train(X_tr, y_tr, X_te, y_te):
    # category_feature_list holds the names of the categorical columns
    trainset = lgb.Dataset(data=X_tr, label=y_tr, categorical_feature=category_feature_list)
    testset = lgb.Dataset(X_te, y_te, reference=trainset, categorical_feature=category_feature_list)
    lgb_model = lgb.train(params,
                          train_set=trainset,
                          valid_sets=[trainset, testset],
                          categorical_feature=category_feature_list,
                          num_boost_round=2000,
                          early_stopping_rounds=100)
    lgb_model.save_model(model_file_path)
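category_feature_list is assumed to hold the names of the categorical columns. LightGBM wants those columns integer-coded or typed as pandas 'category'; a minimal preparation sketch (the column name is a placeholder):

# Hypothetical categorical column; replace with the real names.
category_feature_list = ['fea3']
for col in category_feature_list:
    # 'category' dtype lets LightGBM split on raw categories without one-hot encoding
    X_tr[col] = X_tr[col].astype('category')
    X_te[col] = X_te[col].astype('category')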
1.3 Model evaluation and segment extraction
def eval(X, y):
    gbm = lgb.Booster(model_file=model_file_path)
    # Generate the segment report
    eval_segment_metrics(target=y.values,
                         segment_cnt=50,
                         predict_proba=gbm.predict(X),
                         out_path='aa_segment_' + LABEL + 'v8.xlsx')
    pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    # pred holds probabilities, so threshold before comparing to labels
    y_pred = (pred >= THRESHOLD).astype(int)
    print('predicting, classification error=%f' % (y_pred != y.values).mean())
    print('ACC: %.4f' % metrics.accuracy_score(y, y_pred))
    print('AUC: %.4f' % metrics.roc_auc_score(y, pred))
    print('Recall: %.4f' % metrics.recall_score(y, y_pred))
    print('F1-score: %.4f' % metrics.f1_score(y, y_pred))
    print('Precision: %.4f' % metrics.precision_score(y, y_pred))
    print(metrics.confusion_matrix(y, y_pred))
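eval_segment_metrics comes from an internal woe package, so its report format is not reproduced here. A hedged pandas sketch of the idea behind picking THRESHOLD from segments: bucket the scores into 50 quantiles and read off the positive rate per bucket (the 0.5 target below is a placeholder):

df = pd.DataFrame({'y': y.values, 'score': pred})
df['segment'] = pd.qcut(df['score'], q=50, duplicates='drop')
# Positive rate per score bucket, ordered from low to high scores.
seg = df.groupby('segment')['y'].agg(['count', 'mean'])
# The lowest bucket clearing the target precision suggests a threshold.
print(seg[seg['mean'] >= 0.5].head(1))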
1.4 Printing feature importance
def print_important_feature(feature_names):
    gbm = lgb.Booster(model_file=model_file_path)
    importances = pd.Series(gbm.feature_importance('gain'),
                            index=feature_names).sort_values(ascending=False)
    # Show all rows; otherwise long feature lists get truncated
    pd.set_option('display.max_rows', None)
    print(importances)
    # importance_list = list(gbm.feature_importance())
    # feature_importance_pair = [(fe, round(im, 2)) for fe, im in zip(feature_names, importance_list)]
    # feature_importance_pair = sorted(feature_importance_pair, key=lambda x: x[1], reverse=True)
    # print(feature_importance_pair)

# Or, with the sklearn wrapper (clf = lgb.LGBMClassifier(...)):
# name = clf.booster_.feature_name()
# importance = clf.feature_importances_
# feature_importance = pd.DataFrame({'name': name, 'importance': importance}
#                                   ).sort_values(by='importance', ascending=False)
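For a visual version of the same ranking, lgb.plot_importance renders the booster's importances as a bar chart (matplotlib is imported in the consolidated listing below):

import matplotlib.pyplot as plt

gbm = lgb.Booster(model_file=model_file_path)
# Gain-based ranking, top 20 features only.
lgb.plot_importance(gbm, importance_type='gain', max_num_features=20)
plt.tight_layout()
plt.savefig('feature_importance.png')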
1.5 Model scoring
def predict():
    gbm = lgb.Booster(model_file=model_file_path)
    test_data = pd.read_csv(data_input_path, sep='\t')
    # Score on the feature columns only, but keep the full frame so the
    # keep_cols ('uid', 'score') are still available afterwards
    test_data['score'] = gbm.predict(test_data[feature_list])
    result = test_data.loc[test_data['score'] >= THRESHOLD, keep_cols]
    print('-----rows above threshold: ' + str(len(result)))
    result.to_csv(data_output_path, index=False, header=False, sep='\t', float_format='%.4f')
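If raw_data.txt does not fit in memory, a hedged chunked variant of predict() (the chunk size is arbitrary):

def predict_chunked(chunksize=100000):
    gbm = lgb.Booster(model_file=model_file_path)
    first = True
    for chunk in pd.read_csv(data_input_path, sep='\t', chunksize=chunksize):
        chunk['score'] = gbm.predict(chunk[feature_list])
        result = chunk.loc[chunk['score'] >= THRESHOLD, keep_cols]
        # Write the first chunk, then append, so the output is one file.
        result.to_csv(data_output_path, index=False, header=False, sep='\t',
                      float_format='%.4f', mode='w' if first else 'a')
        first = False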
2. Consolidated code listing
# -*- coding: utf-8 -*-
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import datetime
import matplotlib.pyplot as plt
from woe.eval import eval_segment_metrics

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 6,
    'max_bin': 10,
    'learning_rate': 0.01,
    'is_unbalance': False,
    'verbose': 0
}
feature_list = ["chat_7d_cnt", "chat_cnt_self_expression", "chat_cnt_text"]
drop_cols = ["uid", "is_later_30d_loss", "pt"]
keep_cols = ['uid', 'score']
LABEL = 'is_later_30d_loss'
# Threshold chosen from the segment report
THRESHOLD = 0.5034
# Data to be scored, and where to write the predictions
data_input_path = '../data/raw_data.txt'
data_output_path = '../data/predict_data.csv'
model_file_path = 'model.txt'
def get_dataset(train_data_path, test_data_path):
    tr = pd.read_csv(train_data_path, sep='\t')
    te = pd.read_csv(test_data_path, sep='\t')
    # Row counts and number of positives in each set
    print('tr = %d, tr_pos_num = %d' % (len(tr), len(tr[tr[LABEL] == 1])))
    print('te = %d, te_pos_num = %d' % (len(te), len(te[te[LABEL] == 1])))
    y_tr = tr.loc[:, LABEL]
    y_te = te.loc[:, LABEL]
    tr.drop(labels=drop_cols, axis=1, inplace=True)
    te.drop(labels=drop_cols, axis=1, inplace=True)
    X_tr, X_te = tr, te
    return X_tr, y_tr, X_te, y_te
def train(X_tr, y_tr, X_te, y_te):
    trainset = lgb.Dataset(data=X_tr, label=y_tr)
    # trainset = lgb.Dataset(data=X_tr, label=y_tr, weight=weights)
    testset = lgb.Dataset(data=X_te, label=y_te)
    lgb_model = lgb.train(params,
                          train_set=trainset,
                          valid_sets=[trainset, testset],
                          num_boost_round=2000,
                          early_stopping_rounds=100)
    lgb_model.save_model(model_file_path)
def eval(X, y):
    gbm = lgb.Booster(model_file=model_file_path)
    # Generate the segment report
    eval_segment_metrics(target=y.values,
                         segment_cnt=50,
                         predict_proba=gbm.predict(X),
                         out_path='aa_segment_' + LABEL + 'kk.xlsx')
    pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    # pred holds probabilities, so threshold before comparing to labels
    y_pred = (pred >= THRESHOLD).astype(int)
    print('predicting, classification error=%f' % (y_pred != y.values).mean())
    print('ACC: %.4f' % metrics.accuracy_score(y, y_pred))
    print('AUC: %.4f' % metrics.roc_auc_score(y, pred))
    print('Recall: %.4f' % metrics.recall_score(y, y_pred))
    print('F1-score: %.4f' % metrics.f1_score(y, y_pred))
    print('Precision: %.4f' % metrics.precision_score(y, y_pred))
    print(metrics.confusion_matrix(y, y_pred))
def print_important_feature(feature_names):
    gbm = lgb.Booster(model_file=model_file_path)
    # gbm.feature_importance('split') gives split counts instead of gain
    importances = pd.Series(gbm.feature_importance('gain'),
                            index=feature_names).sort_values(ascending=False)
    print(importances)
def predict():
    gbm = lgb.Booster(model_file=model_file_path)
    test_data = pd.read_csv(data_input_path, sep='\t')
    # Keep the full frame so keep_cols ('uid', 'score') remain available
    test_data['score'] = gbm.predict(test_data[feature_list])
    result = test_data.loc[test_data['score'] >= THRESHOLD, keep_cols]
    print('-----rows above threshold: ' + str(len(result)))
    result.to_csv(data_output_path, index=False, header=False, sep='\t', float_format='%.4f')
if __name__ == "__main__":
    train_data_path = '../data/train.txt'
    test_data_path = '../data/test.txt'
    # Load the train and test sets
    X_tr, y_tr, X_te, y_te = get_dataset(train_data_path, test_data_path)
    # Train the model
    train(X_tr, y_tr, X_te, y_te)
    # Evaluate the model
    eval(X_te, y_te)
    # Print feature importances
    feature_names = X_tr.columns.tolist()
    print_important_feature(feature_names)
    # Score new data
    predict()