This started as a Tianchi course notebook, but the lectures were not very clear, so the variable handling and data cleaning that follow are done my own way.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df=pd.read_csv(r'LoanStats_2016Q3.csv',skiprows=1,low_memory=False)
df.info()
First drop some irrelevant columns and unify the data formats.
df.drop(columns=['id', 'member_id'], inplace=True)
df.dropna(axis=0, how='all', inplace=True)  # drop rows where every value is NaN
df.emp_title = df.emp_title.astype('str').apply(lambda x: x.lower())  # the raw data mixes cases, so lowercase everything to keep the same title from splitting into different features
df.drop(columns=['emp_title'], inplace=True)  # job title: far too many categories, drop it for now and maybe add it back later
Normalize the employment-length format.
df['emp_length'].fillna(value=0, inplace=True)
# regex-replace everything that is not a digit (strips 'year'/'years', '+', '<', etc.)
df['emp_length'].replace(to_replace="[^0-9]+", value='', inplace=True, regex=True)
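After the regex replacement, emp_length is still stored as strings. A minimal sketch to coerce it to integers, assuming the column now holds only digit strings plus the 0 fill values:
# convert the cleaned strings to integers; errors='coerce' turns any
# leftover non-numeric value (e.g. an empty string) into NaN
df['emp_length'] = pd.to_numeric(df['emp_length'], errors='coerce').fillna(0).astype(int)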
Check the proportion of missing values per column.
# inspect missing values in the object-dtype columns ('O' is the dtype code for object/string, not float)
df.select_dtypes(include=['O']).describe().T.assign(missing_pct=df.apply(lambda x: (len(x)-x.count())/float(len(x))))
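The same pattern works for the non-object columns; a minimal sketch reporting the missing percentage for the numeric fields as well:
# same missing-percentage report, this time for the numeric columns
df.select_dtypes(exclude=['O']).describe().T.assign(
    missing_pct=df.apply(lambda x: (len(x) - x.count()) / float(len(x))))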
Drop columns that are mostly missing, along with a few others we will not use here.
df.drop(columns=['desc','verification_status_joint','zip_code','addr_state','earliest_cr_line','revol_util','title','term','issue_d'], inplace=True)
Drop post-loan information: these fields are only known after the loan is issued, so keeping them would leak the outcome.
df.drop(columns=['out_prncp','out_prncp_inv','total_pymnt','total_pymnt_inv','total_rec_prncp','grade','sub_grade'], inplace=True)
df.drop(columns=['total_rec_int','total_rec_late_fee','recoveries','collection_recovery_fee'], inplace=True)
df.drop(columns=['last_pymnt_d','last_pymnt_amnt','next_pymnt_d','last_credit_pull_d'], inplace=True)
df.drop(columns=['policy_code','annual_inc_joint','dti_joint'], inplace=True)
Recode loan_status as a binary label: repaid/repaying vs. defaulted.
lo = df.loan_status.value_counts().index  # lo: the distinct status strings, most frequent first
status_map = {lo[0]:1, lo[1]:1, lo[2]:0, lo[3]:np.nan, lo[4]:0, lo[5]:np.nan, lo[6]:np.nan}  # avoid shadowing the builtin dict
df.loan_status.replace(status_map, inplace=True)
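Indexing into value_counts() ties the mapping to the frequency order, which can change between data pulls. A sketch of a name-based mapping instead, assuming the standard LendingClub status strings; which bucket counts as good (1), bad (0), or undetermined (NaN) is the same judgment call as above:
# map by explicit status name instead of frequency rank
status_map = {
    'Fully Paid': 1, 'Current': 1,                        # repaid or repaying on schedule
    'Charged Off': 0, 'Default': 0,                       # clearly bad outcomes
    'In Grace Period': np.nan,                            # too early to call
    'Late (16-30 days)': np.nan, 'Late (31-120 days)': np.nan,
}
df.loan_status.replace(status_map, inplace=True)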
A few remaining variable fixes.
# Series.dropna(inplace=True) only shrinks the Series, not the DataFrame, so drop the NaN labels at the frame level
df.dropna(subset=['loan_status'], inplace=True)
df.loan_status = df.loan_status.astype('float').astype('int')
#df.loan_status = df.loan_status.apply(lambda x: int(str(x).strip()))  # int(str(x)) can fail on values like '1.0'; going through float first is more robust
df.int_rate = df.int_rate.apply(lambda x: float(x[:-1])/100 if '%' in x else np.nan)
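As an alternative to the row-wise apply above, the percent strings can be converted with vectorized string methods; a sketch that assumes int_rate still holds raw strings like '13.49%':
# vectorized version: strip the trailing '%' and rescale to a fraction
df.int_rate = pd.to_numeric(df.int_rate.str.rstrip('%'), errors='coerce') / 100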
Drop features whose correlation with each other is too high.
df.drop(columns=['funded_amnt','funded_amnt_inv','installment'], inplace=True)
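The text above asserts these columns are too correlated without showing the check; a minimal sketch for listing highly correlated columns, assuming a 0.95 threshold:
# upper triangle of the absolute correlation matrix (numeric columns only)
corr = df.select_dtypes(include=[np.number]).corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
# columns that are >0.95 correlated with some earlier column
high_corr = [c for c in upper.columns if (upper[c] > 0.95).any()]
print(high_corr)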
One-hot encode the categorical features with get_dummies.
df.home_ownership.value_counts()
home_dumm=pd.get_dummies(df.home_ownership)
df= pd.concat([df,home_dumm],axis=1)
df.drop(columns=['home_ownership'], inplace=True)
df.verification_status.value_counts()
veristatus_dumm=pd.get_dummies(df.verification_status)
df= pd.concat([df,veristatus_dumm],axis=1)
df.drop(columns=['verification_status'], inplace=True)
df.application_type.value_counts()
apptype_dumm=pd.get_dummies(df.application_type)
df= pd.concat([df,apptype_dumm],axis=1)
df.drop(columns=['application_type'], inplace=True)
df.pymnt_plan.value_counts()
pyplan_dumm=pd.get_dummies(df.pymnt_plan)
df= pd.concat([df,pyplan_dumm],axis=1)
df.drop(columns=['pymnt_plan'], inplace=True)
df.initial_list_status.value_counts()
inls_dumm=pd.get_dummies(df.initial_list_status)
df= pd.concat([df,inls_dumm],axis=1)
df.drop(columns=['initial_list_status'], inplace=True)
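The four blocks above repeat the same encode-concat-drop pattern; a minimal loop that does the same thing, assuming these five column names:
# one-hot encode each categorical column, then drop the original
for col in ['home_ownership', 'verification_status', 'application_type',
            'pymnt_plan', 'initial_list_status']:
    df = pd.concat([df, pd.get_dummies(df[col])], axis=1)
    df.drop(columns=[col], inplace=True)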
No real feature engineering yet; first just throw the data into a model and see what happens.
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
Y=df.loan_status
df.drop(columns=['loan_status'], inplace=True)
X=df.copy()
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=123)
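Since the labels are heavily imbalanced, it may be worth passing stratify so both splits keep the same default rate; a minimal variant of the split above:
# stratified variant: preserves the 0/1 label proportions in both splits
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=123, stratify=Y)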
After some light tuning the model reaches AUC of about 0.68, which is not especially high; a fairly average result.
lgb_train = lgb.Dataset(x_train, y_train)  # wrapping the data in LightGBM's Dataset (binary) format makes loading faster
lgb_eval = lgb.Dataset(x_test,y_test,reference=lgb_train)
params = {
    'task': 'train',
    'boosting_type': 'gbdt',   # type of boosting
    'objective': 'binary',     # objective function
    'metric': {'l2', 'auc'},   # evaluation metrics
    'num_leaves': 30,          # number of leaves per tree *
    'learning_rate': 0.1,      # learning rate
    'feature_fraction': 1,     # fraction of features sampled per tree (default 1.0)
    'bagging_fraction': 1,     # fraction of rows sampled per tree
    'bagging_freq': 5,         # k means bagging is performed every k iterations
    'verbose': 1,              # <0: fatal only, =0: errors/warnings, =1: info, >1: debug
    'min_data_in_leaf': 30     # guards against overfitting; use hundreds or thousands on large datasets *
}
params['is_unbalance'] = True  # reweight the classes to handle the imbalanced label
#bst = lgb.cv(params, lgb_train, nfold=3, early_stopping_rounds=5)
est = lgb.train(params, lgb_train, num_boost_round=30, valid_sets=lgb_eval)  # valid_sets=lgb_eval prints the validation metrics at every iteration
y_pre=est.predict(x_test)
roc_auc_score(y_test,y_pre)
[1] valid_0's auc: 0.671759 valid_0's l2: 0.0389263
[2] valid_0's auc: 0.672499 valid_0's l2: 0.0479194
[3] valid_0's auc: 0.678703 valid_0's l2: 0.0570938
[4] valid_0's auc: 0.679827 valid_0's l2: 0.0665111
[5] valid_0's auc: 0.681003 valid_0's l2: 0.0758469
[6] valid_0's auc: 0.683287 valid_0's l2: 0.0848328
[7] valid_0's auc: 0.683924 valid_0's l2: 0.0933776
[8] valid_0's auc: 0.682981 valid_0's l2: 0.101624
[9] valid_0's auc: 0.684177 valid_0's l2: 0.108998
[10] valid_0's auc: 0.68466 valid_0's l2: 0.116012
[11] valid_0's auc: 0.685856 valid_0's l2: 0.122292
[12] valid_0's auc: 0.687436 valid_0's l2: 0.128198
[13] valid_0's auc: 0.688577 valid_0's l2: 0.133389
[14] valid_0's auc: 0.689055 valid_0's l2: 0.138128
[15] valid_0's auc: 0.688259 valid_0's l2: 0.142351
[16] valid_0's auc: 0.68847 valid_0's l2: 0.14615
[17] valid_0's auc: 0.689827 valid_0's l2: 0.149469
[18] valid_0's auc: 0.690383 valid_0's l2: 0.152582
[19] valid_0's auc: 0.690423 valid_0's l2: 0.155276
[20] valid_0's auc: 0.689452 valid_0's l2: 0.157742
[21] valid_0's auc: 0.689922 valid_0's l2: 0.159834
[22] valid_0's auc: 0.689969 valid_0's l2: 0.161778
[23] valid_0's auc: 0.690007 valid_0's l2: 0.163318
[24] valid_0's auc: 0.689451 valid_0's l2: 0.164765
[25] valid_0's auc: 0.689326 valid_0's l2: 0.165952
[26] valid_0's auc: 0.690091 valid_0's l2: 0.166698
[27] valid_0's auc: 0.689723 valid_0's l2: 0.167523
[28] valid_0's auc: 0.689428 valid_0's l2: 0.168271
[29] valid_0's auc: 0.687731 valid_0's l2: 0.168675
[30] valid_0's auc: 0.686303 valid_0's l2: 0.169073
0.6863027484698025
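The log shows the validation AUC peaking around iteration 19 (about 0.690) and then drifting down, so early stopping should pick a better model than a fixed 30 rounds. A minimal sketch, assuming a LightGBM version that ships the early_stopping callback:
# stop once the validation metrics have not improved for 5 rounds
est = lgb.train(params, lgb_train, num_boost_round=100,
                valid_sets=lgb_eval,
                callbacks=[lgb.early_stopping(stopping_rounds=5)])
y_pre = est.predict(x_test, num_iteration=est.best_iteration)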
Second model: logistic regression. Something probably went wrong here, since the AUC comes out to exactly 0.5; the comments in the code below point at the likely bugs. Leaving the case up for now and tuning it carefully when there is time.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler    # normalization: (x - min) / (max - min)
from sklearn.preprocessing import StandardScaler  # standardization: subtract the mean, divide by the standard deviation
# StandardScaler is a class: instantiate and fit it rather than calling it on the data directly
x_train = x_train.fillna(0)
x_test = x_test.fillna(0)
scaler = StandardScaler()
x_train3 = scaler.fit_transform(x_train)
x_test3 = scaler.transform(x_test)  # reuse the training-set statistics on the test set
lr = LogisticRegression(C=190, dual=True, solver='liblinear', random_state=123)  # dual=True requires the liblinear solver
lr.fit(x_train3, y_train)
y_pre1 = lr.predict(x_test3)  # predict() returns hard 0/1 labels, which is likely why the AUC below collapses to 0.5
roc_auc_score(y_test,y_pre1)
0.5
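An AUC of exactly 0.5 from roc_auc_score(y_test, lr.predict(...)) usually means the hard predictions all land in one class, which is plausible given the label imbalance. Scoring the positive-class probability instead should give a meaningful ranking; a minimal sketch reusing the fitted model above:
# use the positive-class probability as the ranking score for the ROC curve
y_score = lr.predict_proba(x_test3)[:, 1]
print(roc_auc_score(y_test, y_score))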