该工程是2019年招商银行赛题2“基于收支记录判断借贷意愿”的完整代码。
这类题目的大致流程为:
数据分析
数据清洗
特征提取
选择模型
建模
训练模型与调参
数据预测
#导入相关库
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
from sklearn import metrics #scklearn functions
import matplotlib.pylab as plt
from datetime import datetime
# Load the six CSV tables from the FT_Camp_2 data directory.
# The files are read in the order listed and bound to names that match
# the file stems.
train, cust_bas_inf, g2, pred_users, sz_detail, trx_cod = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'data/FT_Camp_2/train.csv',
        r'data/FT_Camp_2/cust_bas_inf.csv',
        r'data/FT_Camp_2/g2.csv',
        r'data/FT_Camp_2/pred_users.csv',
        r'data/FT_Camp_2/sz_detail.csv',
        r'data/FT_Camp_2/trx_cod.csv',
    )
)
# Remove duplicated customer rows.
cust_bas_inf = cust_bas_inf.drop_duplicates()

# Fill missing values.
# Missing entries are stored as the literal two-character string backslash-N.
# Replacements stay strings because the columns are cast to numeric types in
# a later step. The specific fill values ('0', 'M', '72') are the author's
# choices -- presumably taken from data exploration; confirm before reuse.
_MISSING_MARKER = '\\N'
_FILL_VALUES = {'aum227': '0', 'aum306': '0', 'gender': 'M', 'age': '72'}

# Vectorised replacement over every row. This avoids a hard-coded row count
# and label-based lookups, which raise KeyError for any index label removed
# by drop_duplicates(). Building a new frame with assign() also guarantees
# the filled values really land in cust_bas_inf instead of depending on
# view-vs-copy behaviour of a Series pulled out of the frame.
cust_bas_inf = cust_bas_inf.assign(**{
    column: cust_bas_inf[column].replace(_MISSING_MARKER, fill)
    for column, fill in _FILL_VALUES.items()
})

# Expose the cleaned columns under the names the following steps use.
age = cust_bas_inf['age']
gender = cust_bas_inf['gender']
aum227 = cust_bas_inf['aum227']
aum306 = cust_bas_inf['aum306']
# Cast the cleaned string columns to numeric dtypes.
float_aum227 = aum227.astype('float64')
float_aum306 = aum306.astype('float64')
int_age = age.astype('int')

# Keep only the usable columns and attach the converted Series.
# The converted columns carry a '_y' suffix, the name an index-on-index
# merge would give them when they collide with the raw string columns.
# assign() aligns on the index; the Series are expected to share the
# frame's index (they were taken from it).
cust_bas_inf = cust_bas_inf[['id', 'gender']].assign(
    aum227_y=float_aum227,
    aum306_y=float_aum306,
    age_y=int_age,
)
# One-hot encode gender; get_dummies emits columns named 'gender_<value>'.
cust_bas_inf = pd.get_dummies(cust_bas_inf, columns=['gender'])

# Shorten the column names in a single pass.
cust_bas_inf = cust_bas_inf.rename(
    columns={'age_y': 'age', 'gender_F': 'F', 'gender_M': 'M'}
)
# Min-max scale age: (age - min) / (max - min).
_age_values = cust_bas_inf['age']
agemin = _age_values.min()
agemax = _age_values.max()
cust_bas_inf['age'] = (_age_values - agemin) / (agemax - agemin)

# Z-score standardise both AUM columns (keeps the sign of each value):
# (x - mean) / std, computed per column.
for _aum_column in ('aum227_y', 'aum306_y'):
    _aum_values = cust_bas_inf[_aum_column]
    cust_bas_inf[_aum_column] = (
        (_aum_values - _aum_values.mean()) / _aum_values.std()
    )
# Split into two customer feature tables: cust_t (training) and cust_p
# (prediction). They differ only in which AUM column they carry
# (aum227_y vs aum306_y); in both tables it is renamed to 'aum' so the
# two frames share one schema.
#
# rename() is chained (not inplace) on the freshly selected frame: an
# inplace rename on a frame sliced out of cust_bas_inf can trigger
# pandas' SettingWithCopyWarning.
colNameDict = {'aum227_y': 'aum'}
cust_t = cust_bas_inf[['id', 'F', 'M', 'aum227_y', 'age']].rename(columns=colNameDict)
colNameDict = {'aum306_y': 'aum'}
cust_p = cust_bas_inf[['id', 'F', 'M', 'aum306_y', 'age']].rename(columns=colNameDict)
cust_p.head()
sz_detail.head()

# Split the transaction detail into two date windows.
# Parse prt_dt ('YYYY-MM-DD' strings) in one vectorised call instead of a
# per-row strptime; this is also safe to re-run on already-parsed data.
sz_detail['prt_dt'] = pd.to_datetime(sz_detail['prt_dt'], format='%Y-%m-%d')

# Both windows start on 2019-01-01 (inclusive); the end bounds are exclusive.
_window_start = pd.Timestamp('2019-01-01')
_before_end = pd.Timestamp('2019-02-28')
_after_end = pd.Timestamp('2019-03-07')

_from_start = sz_detail['prt_dt'] >= _window_start
data_before = sz_detail[_from_start & (sz_detail['prt_dt'] < _before_end)]
data_after = sz_detail[_from_start & (sz_detail['prt_dt'] < _after_end)]
data_before.head()
# Sum rmb_amt per customer and standardise the totals.
def _normalized_rmb_sum(detail):
    """Return one row per id with the z-scored total of rmb_amt.

    Args:
        detail: DataFrame with at least the columns 'id' and 'rmb_amt'.
            Assumes rmb_amt is numeric -- TODO confirm against the CSV.

    Returns:
        DataFrame with columns ['id', 'rmb_sum'], where rmb_sum is the
        per-id sum of rmb_amt standardised as (x - mean) / std across ids.
    """
    # Use sum(), matching the 'rmb_sum' column name; nunique() would only
    # count how many distinct amounts each customer has.
    totals = detail.groupby('id')['rmb_amt'].sum().to_frame().reset_index()
    totals.columns = ['id', 'rmb_sum']
    totals['rmb_sum'] = (
        (totals['rmb_sum'] - totals['rmb_sum'].mean()) / totals['rmb_sum'].std()
    )
    return totals


data_before = _normalized_rmb_sum(data_before)
data_after = _normalized_rmb_sum(data_after)
data_after