金融贷款预测

本数据来源于https://tianchi.aliyun.com/course/courseConsole?spm=5176.12282070.0.0.764c290a2RIpBY&courseId=192&chapterIndex=10&sectionIndex=1

本是天池的课堂,不过讲的不是太清楚,所以后面的变量控制,清洗数据都是按照自己的想法

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# skiprows=1 drops the first line of the file — presumably a non-header notice
# row in the raw LendingClub export; verify against the CSV. low_memory=False
# reads the whole file in one pass so dtypes are inferred consistently.
df=pd.read_csv(r'LoanStats_2016Q3.csv',skiprows=1,low_memory=False)
df.info()

先删除一些无关数据,并统一数据格式

# Drop identifier columns that carry no predictive signal, plus emp_title
# (job title), which has far too many distinct categories to one-hot encode —
# it can be re-added later with frequency/target encoding.
# Fixes vs original: the all-NaN row dropna was duplicated; the column was
# lower-cased right before being dropped (dead work); positional `axis`
# arguments to drop are removed in pandas 2.x, so use columns=.
df.drop(columns=['id', 'member_id', 'emp_title'], inplace=True)
df.dropna(axis=0, how='all', inplace=True)  # drop rows where every field is NaN

统一年份信息(emp_length)的格式

# Normalise employment length: fill missing with 0, strip the "years" text so
# only the digits remain, then convert to numeric for modelling.
# (The original ran the regex replace twice; once is enough.)
df['emp_length'].fillna(value=0, inplace=True)
# Regex-strip every non-digit character: "10+ years" -> "10", "< 1 year" -> "1".
df['emp_length'].replace(to_replace="[^0-9]+", value='', inplace=True, regex=True)
# The stripped values are still strings; models need numbers. Entries with no
# digits at all become empty strings and coerce to NaN here.
df['emp_length'] = pd.to_numeric(df['emp_length'], errors='coerce')

查看缺失值的比例

# Missing-value ratio per object-dtype column.
# NOTE(review): dtype code 'O' means *object* (typically strings), not float —
# the original comment said float, which is incorrect.
df.select_dtypes(include=['O']).describe().T.assign(missing_pct=df.apply(lambda x: (len(x)-x.count())/float(len(x))))

删除缺失值太多的列

# Drop columns that are mostly missing or unusable in their raw form
# (free text, high-cardinality location codes, unparsed date/percent strings).
# Consolidated into one call; positional `axis` in drop is removed in
# pandas 2.x, so use the columns= keyword.
df.drop(columns=['desc', 'verification_status_joint', 'zip_code', 'addr_state',
                 'earliest_cr_line', 'revol_util', 'title', 'term', 'issue_d'],
        inplace=True)

删除贷款后的信息

# Drop post-origination information: these fields are only known once
# repayment has started, so keeping them would leak the target into training.
# Consolidated into one columns= call (positional axis is removed in pandas 2.x).
df.drop(columns=['out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
                 'total_rec_prncp', 'grade', 'sub_grade',
                 'total_rec_int', 'total_rec_late_fee', 'recoveries',
                 'collection_recovery_fee',
                 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d',
                 'last_credit_pull_d',
                 'policy_code', 'annual_inc_joint', 'dti_joint'],
        inplace=True)

是否还款按是否违约分类

# Map the raw loan_status categories onto a binary default label
# (1 = repaying/paid, 0 = defaulted, NaN = ambiguous states dropped later).
# BUG FIX: the original indexed an undefined name `lo` (NameError) and bound
# the mapping to `dict`, shadowing the builtin. Presumably
# lo = df.loan_status.unique() was intended — confirm that the category order
# in the data matches this 1/0/NaN assignment.
lo = df.loan_status.unique()
status_map = {lo[0]: 1, lo[1]: 1, lo[2]: 0, lo[3]: np.nan,
              lo[4]: 0, lo[5]: np.nan, lo[6]: np.nan}
df.loan_status.replace(status_map, inplace=True)

 剩下一些变量修改

# Finish cleaning the target and the interest-rate column.
# (The original also called df.loan_status.dropna(..., inplace=True), which
# only mutates a detached Series, not the DataFrame — the subset-dropna below
# is the call that actually removes those rows, so the no-op is removed.)
df.dropna(subset=['loan_status'], inplace=True)  # drop rows whose label is NaN
# Going through float first avoids int('1.0')-style parsing errors.
df.loan_status = df.loan_status.astype('float').astype('int')
# "13.56%" -> 0.1356. Non-string values (e.g. NaN floats) made the original
# `'%' in x` test raise TypeError, so guard with isinstance.
df.int_rate = df.int_rate.apply(
    lambda x: float(x[:-1]) / 100 if isinstance(x, str) and '%' in x else np.nan)

删除相关性(corr)太高的列

df.drop(['funded_amnt','funded_amnt_inv','installment'],axis=1,inplace=True)

get_dummies,处理特征

# One-hot encode home_ownership, then drop the original column.
# (columns= keyword replaces the positional axis argument removed in pandas 2.x.)
df.home_ownership.value_counts()
home_dumm = pd.get_dummies(df.home_ownership)
df = pd.concat([df, home_dumm], axis=1)
df.drop(columns=['home_ownership'], inplace=True)

# One-hot encode verification_status, then drop the original column.
df.verification_status.value_counts()
veristatus_dumm = pd.get_dummies(df.verification_status)
df = pd.concat([df, veristatus_dumm], axis=1)
df.drop(columns=['verification_status'], inplace=True)

# One-hot encode application_type, then drop the original column.
df.application_type.value_counts()
apptype_dumm = pd.get_dummies(df.application_type)
df = pd.concat([df, apptype_dumm], axis=1)
df.drop(columns=['application_type'], inplace=True)

# One-hot encode pymnt_plan, then drop the original column.
df.pymnt_plan.value_counts()
pyplan_dumm = pd.get_dummies(df.pymnt_plan)
df = pd.concat([df, pyplan_dumm], axis=1)
df.drop(columns=['pymnt_plan'], inplace=True)

# BUG FIX: this cell was an exact duplicate of the pymnt_plan dummy-encoding
# cell above. Rerunning it after pymnt_plan has already been encoded and
# dropped raises AttributeError/KeyError, so the duplicate is removed.

# One-hot encode initial_list_status, then drop the original column.
df.initial_list_status.value_counts()
inls_dumm = pd.get_dummies(df.initial_list_status)
df = pd.concat([df, inls_dumm], axis=1)
df.drop(columns=['initial_list_status'], inplace=True)

还没有做特征工程,先尝试丢进模型里面试试

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Split the cleaned frame into target and features, holding out 30% for
# validation. (columns= keyword replaces the removed positional axis arg.)
Y = df.loan_status
df.drop(columns=['loan_status'], inplace=True)  # everything left is a feature
X = df.copy()
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=123)

 简单的调试了一下模型,达到auc=0.68,不算很高,一般般的效果

# Wrapping the data in lgb.Dataset binaries makes repeated loading faster.
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)
params = {
    'task': 'train',
    'boosting_type': 'gbdt',   # gradient-boosted decision trees
    'objective': 'binary',     # binary-classification objective
    'metric': {'l2', 'auc'},   # metrics reported on the validation set each round
    'num_leaves': 30,          # max leaves per tree
    'learning_rate': 0.1,
    'feature_fraction': 1,     # fraction of features sampled per tree
    'bagging_fraction': 1,     # fraction of rows sampled per tree
    'bagging_freq': 5,         # perform bagging every k iterations
    'verbose': 1,              # <0 fatal only, 0 error/warn, 1 info, >1 debug
    'min_data_in_leaf': 30,    # guards against overfitting; raise for big data
    'is_unbalance': True,      # classes are skewed; reweight automatically
                               # (folded into params; the original set the
                               # string 'true' after the fact)
}
# bst = lgb.cv(params, lgb_train, nfold=3, early_stopping_rounds=5)
# FIX: valid_sets must be a *list* of Datasets on current LightGBM versions;
# passing it per-iteration AUC/l2 is printed during training.
est = lgb.train(params, lgb_train, num_boost_round=30, valid_sets=[lgb_eval])
y_pre = est.predict(x_test)
roc_auc_score(y_test, y_pre)
[1]	valid_0's auc: 0.671759	valid_0's l2: 0.0389263
[2]	valid_0's auc: 0.672499	valid_0's l2: 0.0479194
[3]	valid_0's auc: 0.678703	valid_0's l2: 0.0570938
[4]	valid_0's auc: 0.679827	valid_0's l2: 0.0665111
[5]	valid_0's auc: 0.681003	valid_0's l2: 0.0758469
[6]	valid_0's auc: 0.683287	valid_0's l2: 0.0848328
[7]	valid_0's auc: 0.683924	valid_0's l2: 0.0933776
[8]	valid_0's auc: 0.682981	valid_0's l2: 0.101624
[9]	valid_0's auc: 0.684177	valid_0's l2: 0.108998
[10]	valid_0's auc: 0.68466	valid_0's l2: 0.116012
[11]	valid_0's auc: 0.685856	valid_0's l2: 0.122292
[12]	valid_0's auc: 0.687436	valid_0's l2: 0.128198
[13]	valid_0's auc: 0.688577	valid_0's l2: 0.133389
[14]	valid_0's auc: 0.689055	valid_0's l2: 0.138128
[15]	valid_0's auc: 0.688259	valid_0's l2: 0.142351
[16]	valid_0's auc: 0.68847	valid_0's l2: 0.14615
[17]	valid_0's auc: 0.689827	valid_0's l2: 0.149469
[18]	valid_0's auc: 0.690383	valid_0's l2: 0.152582
[19]	valid_0's auc: 0.690423	valid_0's l2: 0.155276
[20]	valid_0's auc: 0.689452	valid_0's l2: 0.157742
[21]	valid_0's auc: 0.689922	valid_0's l2: 0.159834
[22]	valid_0's auc: 0.689969	valid_0's l2: 0.161778
[23]	valid_0's auc: 0.690007	valid_0's l2: 0.163318
[24]	valid_0's auc: 0.689451	valid_0's l2: 0.164765
[25]	valid_0's auc: 0.689326	valid_0's l2: 0.165952
[26]	valid_0's auc: 0.690091	valid_0's l2: 0.166698
[27]	valid_0's auc: 0.689723	valid_0's l2: 0.167523
[28]	valid_0's auc: 0.689428	valid_0's l2: 0.168271
[29]	valid_0's auc: 0.687731	valid_0's l2: 0.168675
[30]	valid_0's auc: 0.686303	valid_0's l2: 0.169073
0.6863027484698025

第二个模型,使用逻辑回归。原代码中 StandardScaler 的用法和 fit 的入参有误(传入了未定义的变量),导致 auc=0.5,需要修正

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler    # min-max normalisation (unused here)
from sklearn.preprocessing import StandardScaler  # zero-mean / unit-variance scaling

# BUG FIXES vs the original (which scored auc = 0.5):
#  - StandardScaler was called with the data as a constructor argument; it
#    must be fit on the training set, then used to transform both sets.
#  - lr.fit was called on an undefined name `b` (NameError).
#  - dual=True is only valid for the liblinear solver with an l2 penalty;
#    it is invalid with the default lbfgs solver, so it is dropped.
#  - AUC should be computed from predicted probabilities, not hard 0/1 labels.
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train.fillna(0))
x_test_std = scaler.transform(x_test.fillna(0))

lr = LogisticRegression(C=190, random_state=123, max_iter=1000)
lr.fit(x_train_std, y_train)
y_pre1 = lr.predict_proba(x_test_std)[:, 1]

roc_auc_score(y_test, y_pre1)

0.5

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值