1 案例描述
赛题以预测用户贷款是否违约为任务,数据集报名后可见并可下载。该数据来自某信贷平台的贷款记录,总数据量超过120万条,包含47列变量信息,其中15列为匿名变量。为了保证比赛的公平性,将从中抽取80万条作为训练集,20万条作为测试集A,20万条作为测试集B,同时会对employmentTitle、purpose、postCode和title等信息进行脱敏。
提交结果为每个测试样本违约的概率,也就是y(isDefault)为1的概率。评价指标为AUC(ROC曲线下面积),数值越大表示模型效果越好。
2 代码详情
import pandas as pd
import numpy as np
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import auc, roc_curve
from lightgbm import LGBMRegressor
# 1 Load the data
# Both CSV files are indexed by their 'id' column.
train = pd.read_csv('./train.csv', index_col='id')
# Split the label column off so that `train` holds features only.
target = train.pop('isDefault')
# Restrict the test set to the training feature columns, in the same order.
test = pd.read_csv('./testA.csv', index_col='id')[train.columns]
# 2 Non-numeric columns
# Names of every training column whose dtype compares equal to 'object';
# these are the columns passed to the target encoder later in the script.
tecols = [col for col, dtype in train.dtypes.items() if dtype == 'object']
# 3 Model
def makelgb(**overrides):
    """Build a new, unfitted LightGBM regressor with the script's defaults.

    Args:
        **overrides: Optional keyword arguments that replace or extend the
            default hyper-parameters listed below, for example
            ``makelgb(n_estimators=200)``. Calling ``makelgb()`` with no
            arguments yields exactly the default configuration.

    Returns:
        An unfitted ``LGBMRegressor`` instance.
    """
    # NOTE(review): the label is binary, but no objective is set here, so the
    # regressor's default objective applies and predictions are presumably
    # not bounded to [0, 1]. That does not affect a ranking metric such as
    # AUC; confirm whether `objective='binary'` is wanted for submissions.
    params = dict(
        num_leaves=30,            # cap on leaves per tree
        max_depth=5,              # cap on tree depth
        learning_rate=.02,
        n_estimators=1000,        # number of boosted trees
        subsample_for_bin=5000,   # rows sampled to build feature histograms
        min_child_samples=200,    # minimum rows required in a leaf
        colsample_bytree=.2,      # fraction of features sampled per tree
        reg_alpha=.1,             # L1 regularisation
        reg_lambda=.1,            # L2 regularisation
    )
    # Caller-supplied values take precedence over the defaults.
    params.update(overrides)
    return LGBMRegressor(**params)
# 4 K-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=100)
devscore = []
for fit_rows, val_rows in kf.split(train.index):
    # The splitter yields positional row indices for the fitting part and the
    # held-out part of each fold.
    fit_x, fit_y = train.iloc[fit_rows], target.iloc[fit_rows]
    val_x, val_y = train.iloc[val_rows], target.iloc[val_rows]

    # The target encoder is fitted on the fitting part only and then applied
    # unchanged to the held-out part.
    fold_encoder = TargetEncoder(cols=tecols)
    fit_x = fold_encoder.fit_transform(fit_x, fit_y)
    val_x = fold_encoder.transform(val_x)

    # Train a fresh model for this fold and score the held-out rows.
    fold_model = makelgb()
    fold_model.fit(fit_x, fit_y)
    val_pred = fold_model.predict(val_x)

    # Fold score: area under the ROC curve of held-out predictions versus the
    # true labels (the thresholds returned by roc_curve are not needed).
    fpr, tpr, _ = roc_curve(val_y, val_pred)
    devscore.append(auc(fpr, tpr))

# Mean validation AUC over all folds.
# NOTE(review): the source had lost its indentation; this print is assumed to
# run once after the loop rather than once per fold.
print(np.mean(devscore))
# 5 Retrain on the whole training set
full_encoder = TargetEncoder(cols=tecols)
final_model = makelgb()
# Fit the encoder on all training rows and train the model on the encoded
# features.
encoded_train = full_encoder.fit_transform(train, target)
final_model.fit(encoded_train, target)

# 6 Predict the test set
# The test features go through the encoder fitted above (transform only).
encoded_test = full_encoder.transform(test)
test_pred = final_model.predict(encoded_test)

# 7 Write the result
# reset_index() turns the test index into a regular column next to the
# 'isDefault' prediction column, so the file is written without a pandas
# index column.
submission = pd.Series(test_pred, index=test.index, name='isDefault').reset_index()
submission.to_csv('submit.csv', index=False)