The data used here is still the data from that example, and the background is the same as described above.
Model building: logistic regression for the model, with LightGBM for feature selection.
For an introduction to the LightGBM model, see this link: Ensemble Learning (Boosting Algorithms): a brief look at the principles of and differences between AdaBoost, GBDT, XGBoost, and LightGBM
The code is as follows:
Import modules
# Import modules
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import math
Read the data
df = pd.read_csv('Bcard.txt')
print(df.info())
df.head()
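Before splitting, it helps to check which observation months the data covers; the latest month is the natural out-of-time holdout. A minimal inspection sketch (assuming, as the split below implies, that obs_mth stores month-end date strings):
# List the observation months; the latest one ('2018-11-30' below)
# becomes the out-of-time validation sample
print(sorted(df['obs_mth'].unique()))
print(df['bad_ind'].value_counts())  # distribution of the target label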
Split into training and out-of-time validation sets
# Split off the most recent month as an out-of-time validation set;
# sort by month first so the row order is chronological before re-indexing
train = df[df.obs_mth != '2018-11-30'].sort_values('obs_mth', ascending=False).reset_index()
val = df[df.obs_mth == '2018-11-30'].reset_index()
train.head()
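The most recent month is held out as an out-of-time (OOT) sample, so the model is later evaluated on data from a period after the one it was trained on. A quick sanity check of the split (an illustrative sketch, not part of the original code):
# Confirm the split: row counts and bad rate in each sample
print('train:', train.shape, 'bad rate:', train['bad_ind'].mean())
print('val:  ', val.shape, 'bad rate:', val['bad_ind'].mean())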
Group the training data
# Number the rows and cut them into 5 equal-size groups by time order
train['rank'] = list(range(train.shape[0]))
train['rank'] = pd.cut(train['rank'], bins=5, labels=list(range(5)))
train['rank'].value_counts()
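Because the rows were sorted by obs_mth before being numbered, pd.cut over the running row number yields five equal-size, time-ordered buckets; these groups typically support cross-time validation later on. A small inspection sketch, assuming the columns built above:
# Months covered by each of the 5 time groups
print(train.groupby('rank')['obs_mth'].agg(['min', 'max', 'count']))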
Extract the feature columns
# Keep only feature columns: drop the index column left by reset_index,
# the time-group label, the target, the observation month, and the user id
ft_lst = train.columns.drop(['index', 'rank', 'bad_ind', 'obs_mth', 'uid'])
ft_lst
Define the model function
# Define the LightGBM model function
def lgb_test(train_X, train_y, test_X, test_y):
    lgb_clf = lgb.LGBMClassifier(learning_rate=0.05, n_estimators=100)
    # early stopping is passed as a callback; the early_stopping_rounds
    # fit argument was removed in newer LightGBM versions
    lgb_clf.fit(train_X, train_y,
                eval_set=[(train_X, train_y), (test_X, test_y)],
                eval_metric='auc',
                callbacks=[lgb.early_stopping(stopping_rounds=100)])
    lgb.plot_metric(lgb_clf, metric='auc')  # plot train/test AUC per iteration
    return lgb_clf, lgb_clf.best_score_
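For context, a minimal usage sketch of lgb_test; the test_size and random_state values are illustrative assumptions, and the importance table shows how the fitted model can drive feature selection:
# Hypothetical usage: split the training sample, fit, and rank features
train_x, test_x, train_y, test_y = train_test_split(
    train[ft_lst], train['bad_ind'], test_size=0.3, random_state=0)
model, best_score = lgb_test(train_x, train_y, test_x, test_y)
print(best_score)

# Feature importances from the fitted model guide the feature selection step
feature_importance = pd.DataFrame({
    'feature': ft_lst,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance.head(10))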