# Tianchi Loan Default Prediction

```python
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Read the data
data_row = pd.read_csv('D:/天池/贷款违约预测/train.csv')
data_row.head(5)
# Check the class distribution of the label
data_row.isDefault.value_counts()
# Collect the column names
cols_row = data_row.columns.to_list()
# Find the text (object) columns so they can be converted to numbers
cat_ = data_row.select_dtypes(include='object').columns
cat_
```

```
Index(['grade', 'subGrade', 'employmentLength', 'issueDate',
       'earliesCreditLine'],
      dtype='object')
```

```python
data_row['employmentLength'].unique()
# Map employment length strings to numbers of years
data_row['employmentLength'].replace({'10+ years': 10, '9 years': 9, '8 years': 8, '7 years': 7,
                                      '6 years': 6, '5 years': 5, '4 years': 4, '3 years': 3,
                                      '2 years': 2, '1 year': 1, '< 1 year': 0}, inplace=True)
data_row['employmentLength'].unique()
```

```
array([ 2.,  5.,  8., 10., nan,  7.,  9.,  1.,  3.,  0.,  4.,  6.])
```

```python
# Encode grade with A (the best grade) as 7 down to G as 1
data_row['grade'].replace({'A': 7, 'B': 6, 'C': 5, 'D': 4, 'E': 3, 'F': 2, 'G': 1}, inplace=True)
# Encode subGrade as letter block * 10 + digit (A1 -> 1, B3 -> 13, ..., G5 -> 65)
data_row['subGrade'].replace({'A1': 1,  'A2': 2,  'A3': 3,  'A4': 4,  'A5': 5,
                              'B1': 11, 'B2': 12, 'B3': 13, 'B4': 14, 'B5': 15,
                              'C1': 21, 'C2': 22, 'C3': 23, 'C4': 24, 'C5': 25,
                              'D1': 31, 'D2': 32, 'D3': 33, 'D4': 34, 'D5': 35,
                              'E1': 41, 'E2': 42, 'E3': 43, 'E4': 44, 'E5': 45,
                              'F1': 51, 'F2': 52, 'F3': 53, 'F4': 54, 'F5': 55,
                              'G1': 61, 'G2': 62, 'G3': 63, 'G4': 64, 'G5': 65}, inplace=True)
```
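The subGrade dictionary above follows a regular pattern, letter block × 10 + digit, so the same encoding could be generated programmatically instead of by hand. A minimal sketch, assuming every subGrade value matches the one-letter-one-digit pattern:

```python
# Equivalent to the manual dictionary: 'A1' -> 1, 'B3' -> 13, ..., 'G5' -> 65
data_row['subGrade'] = data_row['subGrade'].map(
    lambda s: (ord(s[0]) - ord('A')) * 10 + int(s[1]))
```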
```python
# Drop columns that this model does not use
data_row.drop(columns=['issueDate', 'earliesCreditLine'], inplace=True)
data_row.drop(columns=['id', 'postCode', 'regionCode'], inplace=True)
```
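Dropping issueDate and earliesCreditLine discards potentially useful signal. As a hedged alternative (not part of the original pipeline), the two date columns could be converted to numeric features before the drop; a sketch assuming issueDate values look like 2014-07-01 and earliesCreditLine values like Aug-2001:

```python
# Alternative: derive numeric features from the date columns before dropping them
issue = pd.to_datetime(data_row['issueDate'])                            # assumed YYYY-MM-DD
credit = pd.to_datetime(data_row['earliesCreditLine'], format='%b-%Y')  # assumed Mon-YYYY
data_row['issueYear'] = issue.dt.year
data_row['creditHistoryDays'] = (issue - credit).dt.days
```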
```python
# Missing value handling

# Inspect missing values per column
total = data_row.isnull().sum().sort_values(ascending=False)
percent = total / len(data_row)  # fraction of rows missing in each column
missing_data = pd.concat([total, percent], axis=1, keys=['total', 'percent'])
missing_data.head(20)

# Forward-fill the remaining missing values, then re-check
data_row = data_row.ffill()
data_row.isnull().sum().sort_values(ascending=False)
```
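Forward fill copies the value from the previous row, which is essentially arbitrary for cross-sectional loan records. A common alternative, shown here as a sketch rather than the original author's choice, is to fill numeric columns with their median:

```python
# Alternative: median imputation for numeric columns
num_cols = data_row.select_dtypes(include=np.number).columns
data_row[num_cols] = data_row[num_cols].fillna(data_row[num_cols].median())
```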
```python
# Outlier handling: flag values more than 3 standard deviations from the mean

def find_outliers(data, fea):
    data_std = np.std(data[fea])
    data_mean = np.mean(data[fea])
    cut_off = data_std * 3
    lower_rule = data_mean - cut_off
    upper_rule = data_mean + cut_off
    data[fea + '_outliers'] = data[fea].apply(
        lambda x: 'outlier' if x > upper_rule or x < lower_rule else 'normal')
    return data
```
```python
# All feature columns except the label
fea_cols = data_row.columns.to_list()
fea_cols.remove('isDefault')

# Flag outliers in every feature column
data_train = data_row.copy()
for fea in fea_cols:
    data_train = find_outliers(data_train, fea)
```
```python
# Remove rows flagged as outliers in any feature
for fea in fea_cols:
    data_train = data_train[data_train[fea + '_outliers'] == 'normal']

# Drop the helper outlier-flag columns
for fea in fea_cols:
    data_train = data_train.drop(columns=fea + '_outliers')
data_train = data_train.reset_index(drop=True)
```
```python
# Split the features from the label
X = data_train.loc[:, data_train.columns != 'isDefault']
y = data_train['isDefault']
data_train.isDefault.value_counts()
```
```python
# Undersample the majority class to balance the labels

number_default = len(data_train[data_train.isDefault == 1])
default_indices = np.array(data_train[data_train.isDefault == 1].index)
normal_indices = data_train[data_train.isDefault == 0].index

# Randomly sample as many non-default rows as there are defaults
random_normal_indices = np.random.choice(normal_indices, number_default, replace=False)
random_normal_indices = np.array(random_normal_indices)

# Combine the two sets of row indices
under_sample_indices = np.concatenate([default_indices, random_normal_indices])

# Select the balanced subset (the index was reset above, so positions match labels)
under_sample_data = data_train.iloc[under_sample_indices, :]

X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'isDefault']
y_undersample = under_sample_data['isDefault']

print("Percentage of non-default loans: ", len(under_sample_data[under_sample_data.isDefault == 0]) / len(under_sample_data))
print("Percentage of default loans: ", len(under_sample_data[under_sample_data.isDefault == 1]) / len(under_sample_data))
print("Total number of loans in resampled data: ", len(under_sample_data))
```
```python
# Train/test split on the undersampled data

from sklearn.model_selection import train_test_split

X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0)

print("")
print("Number of loans in train dataset: ", len(X_train_undersample))
print("Number of loans in test dataset: ", len(X_test_undersample))
print("Total number of loans: ", len(X_train_undersample) + len(X_test_undersample))
```
```python
# Random forest classifier

from sklearn.metrics import roc_auc_score

rf = RandomForestClassifier(random_state=666, n_estimators=50, min_samples_split=8,
                            min_samples_leaf=2, max_depth=8)
rf.fit(X_train_undersample, y_train_undersample)

# rf.oob_score_ would only be available if the model were built with oob_score=True
# print(rf.oob_score_)

# Predicted default probabilities and hard labels on the held-out test split
y_predprob = rf.predict_proba(X_test_undersample)[:, 1]
y_pred = rf.predict(X_test_undersample)

print("AUC Score (Test): %f" % roc_auc_score(y_test_undersample, y_predprob))
```
### Overview of Top Solutions for the Tianchi Loan Default Prediction Competition

On the Tianchi platform, many teams have proposed innovative methods to improve prediction accuracy for the loan default task. These top solutions typically combine several techniques, including but not limited to feature engineering, model selection, and ensemble learning strategies.

#### Feature Engineering

To improve model performance, top entries dig deeply into the raw data during preprocessing. This goes beyond simple numeric conversion: new derived features are created to capture latent patterns, for example feeding the trend of a borrower's credit-score changes to the algorithm as an additional input [^1]. Time-series analysis may also be used to extract periodic and seasonal components from historical repayment behavior [^3].

#### Model Architecture

For this kind of classification problem, entrants tend to train increasingly sophisticated models: a common path starts from simple linear models and moves up to random forests, support vector machines, and other non-parametric methods. The entries that stand out, however, usually rely on deep neural networks or gradient-boosted decision trees such as XGBoost; the latter in particular is widely adopted in competitions for its efficiency and strong results [^2].

```python
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Assume df is the preprocessed DataFrame and target is the name of the label column
X_train, X_val, y_train, y_val = train_test_split(
    df.drop(columns=[target]), df[target], test_size=0.2)

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

params = {
    'objective': 'binary:logistic',
    'eval_metric': ['error', 'auc'],
}

bst = xgb.train(params=params, dtrain=dtrain, num_boost_round=100,
                evals=[(dval, 'validation')])
```

#### Ensembling and Hyperparameter Optimization

Beyond a single strong base learner, many winning teams use bagging/boosting to build powerful meta-estimators; LightGBM and CatBoost are both good choices here. Automated tuning, whether with the Bayesian-optimization library Hyperopt or with grid search via GridSearchCV, is likewise an indispensable step: it reduces manual intervention while systematically locating strong configurations in the search space (see the sketch below).
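To make the tuning step concrete, here is a minimal sketch of a grid search over a LightGBM classifier. The parameter grid and the reuse of `X_undersample` / `y_undersample` from earlier are assumptions for illustration, not a prescription from the winning solutions:

```python
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

# Deliberately small grid; real competition entries search far larger spaces
param_grid = {
    'n_estimators': [100, 300],
    'num_leaves': [31, 63],
    'learning_rate': [0.05, 0.1],
}

search = GridSearchCV(LGBMClassifier(random_state=0), param_grid,
                      scoring='roc_auc', cv=3)
search.fit(X_undersample, y_undersample)
print(search.best_params_, search.best_score_)
```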