# Functional requirements
import pandas as pd
# 1. Data preparation:
# (1) Load the data
df = pd.read_csv('Credit_card.csv')
# (2) View the first five rows
print(df.head(5))
# (3) View the last five rows
print(df.tail(5))
# (4) View descriptive statistics
print(df.describe())
# (5) View data info (column types, non-null counts)
df.info()
# 2. Data preprocessing
# (1) Check for missing values with pandas
if df.isnull().sum().any():
    print(df.isnull().sum())  # per-column missing-value counts
# (2) Fill 'Annual_income' and 'Birthday_count' with the column mean
df['Annual_income'] = df['Annual_income'].fillna(df['Annual_income'].mean())
df['Birthday_count'] = df['Birthday_count'].fillna(df['Birthday_count'].mean())
# (3) Fill 'GENDER' and 'Type_Occupation' with the column mode
df['GENDER'] = df['GENDER'].fillna(df['GENDER'].mode()[0])
df['Type_Occupation'] = df['Type_Occupation'].fillna(df['Type_Occupation'].mode()[0])
# (4) Re-check the missing values to confirm the fill succeeded
if df.isnull().sum().any():
    print(df.isnull().sum())
else:
    print('No missing values')
# (5) Print each unique value in the label column and its count
df_vc = df['label'].value_counts()
print(df_vc)
# (6) Drop the ID column and all phone/email fields
df.drop(columns=['Ind_ID', 'Mobile_phone', 'Work_Phone', 'Phone', 'EMAIL_ID'], inplace=True)
# (7) Binarize the following fields ('GENDER', 'Car_Owner', 'Propert_Owner')
from sklearn.preprocessing import LabelEncoder
alist = ['GENDER', 'Car_Owner', 'Propert_Owner']
for i in alist:
    df[i] = LabelEncoder().fit_transform(df[i])
# (8) Print the first few rows to confirm the binarization succeeded
print(df.head())
'''
Binarization successful
'''
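# Note: LabelEncoder assigns integer codes in alphabetical order, so which
# category maps to 0 or 1 is implicit. An explicit alternative (a sketch; the
# category letters below are assumptions about the raw data, and it would
# replace the loop above rather than run after it):
# df['GENDER'] = df['GENDER'].map({'F': 0, 'M': 1})
# df['Car_Owner'] = df['Car_Owner'].map({'N': 0, 'Y': 1})
# df['Propert_Owner'] = df['Propert_Owner'].map({'N': 0, 'Y': 1})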
# (9) One-hot encode the following fields ('Type_Income', 'EDUCATION', 'Marital_status', 'Housing_type', 'Type_Occupation')
from sklearn.preprocessing import OneHotEncoder
blist = ['Type_Income', 'EDUCATION', 'Marital_status', 'Housing_type', 'Type_Occupation']
for i in blist:
    oht = OneHotEncoder(sparse_output=False)
    df_oe = pd.DataFrame(oht.fit_transform(df[[i]]), columns=oht.get_feature_names_out())
    df = pd.concat([df.drop(columns=i), df_oe], axis=1)
# (10) Print the first few rows to confirm the one-hot encoding succeeded
print(df.head())
df.info()
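# A pandas one-liner with the same effect (a sketch; it would replace the loop
# above, and column names follow pandas' <col>_<value> convention instead of
# sklearn's feature-name convention):
# df = pd.get_dummies(df, columns=blist)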
# (11) Analyze how demographic features (e.g. gender, age, marital status) relate to the application outcome
import matplotlib.pyplot as plt
import seaborn as sns
# GENDER is already label-encoded at this point, so the bars show 0/1
sns.countplot(x=df['GENDER'], hue=df['label'])
plt.show()
# Birthday_count is continuous (days since birth), so a histogram reads better than a countplot
sns.histplot(x=df['Birthday_count'], hue=df['label'])
plt.show()
# (12) Visualize the analysis results
# (13) Summarize the conclusions in text
# (14) Analyze how socioeconomic features (e.g. income, occupation, education) relate to the application outcome (see the sketch after these steps)
# (15) Visualize the analysis results
# (16) Summarize the conclusions in text
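# Sketch for steps (14)-(15): the original categorical columns were dropped
# from df during encoding, so this assumes reloading them from the CSV;
# 'df_raw' is a name introduced here, not part of the original script.
df_raw = pd.read_csv('Credit_card.csv')
# Application outcome by income type
sns.countplot(x=df_raw['Type_Income'], hue=df_raw['label'])
plt.xticks(rotation=45)
plt.show()
# Annual income distribution by outcome
sns.boxplot(x=df_raw['label'], y=df_raw['Annual_income'])
plt.show()
# Application outcome by education level
sns.countplot(x=df_raw['EDUCATION'], hue=df_raw['label'])
plt.xticks(rotation=45)
plt.show()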
# 3. Feature selection and processing
# (1) Extract the features
# (2) Extract the label
y, x = df.pop('label'), df
# (3) Convert the label to a NumPy array
import numpy as np
y = np.array(y)
# (4) Reshape the label into a column vector
y = np.reshape(y, (-1, 1))
# (5) Split the data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=10010)
# (6) Check the shapes of the four train/test splits (via .shape)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
# (7) Convert the training label to a DataFrame
y_train = pd.DataFrame(y_train)
# (8) Check whether the training labels are balanced
print(y_train.value_counts())
'''
Imbalanced
'''
df_vc = y_train.value_counts().sort_index()
df_vc.index = [idx[0] for idx in df_vc.index]  # flatten the one-element index tuples
print(df_vc.index)
# (9) Plot the training label distribution (a bar chart works)
sns.barplot(x=df_vc.index, y=df_vc.values)
plt.show()
# (10) Use oversampling (SMOTE) to rebalance the training data (x_train, y_train)
from imblearn.over_sampling import SMOTE
x_train, y_train = SMOTE().fit_resample(x_train, y_train)
print(y_train.value_counts())
# 4. Models: (1) logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000)  # raised iteration cap; unscaled features can stall the default solver
# (2) Ensemble model (RandomForestClassifier random forest)
from sklearn.ensemble import RandomForestClassifier
rc = RandomForestClassifier()
# 5. Model training:
from sklearn.model_selection import GridSearchCV
# (1) Use grid-search cross-validation to set suitable hyperparameters, e.g. the regularization strength, tree depth, etc.
lr_gs = GridSearchCV(estimator=lr, param_grid={'C': [0.2, 0.4, 0.6, 0.7]})
rc_gs = GridSearchCV(estimator=rc, param_grid={'max_depth': [10, 20, 40, 60]})
# (2) Run the grid search
lr_gs.fit(x_train, y_train.values.ravel())
rc_gs.fit(x_train, y_train.values.ravel())
# (3) Print the best grid parameters
print(lr_gs.best_params_['C'])
print(rc_gs.best_params_['max_depth'])
# (4) Print the best grid scores (mean cross-validation score of the best parameters)
print(lr_gs.best_score_)
print(rc_gs.best_score_)
# (5) Retrain logistic regression and random forest with the best grid parameters
lr = LogisticRegression(C=lr_gs.best_params_['C'], max_iter=1000)
rc = RandomForestClassifier(max_depth=rc_gs.best_params_['max_depth'])
lr.fit(x_train, y_train.values.ravel())
rc.fit(x_train, y_train.values.ravel())
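# Logistic regression is scale-sensitive and Annual_income dwarfs the one-hot
# columns, so a scaled variant may fit better. A minimal sketch using a
# Pipeline (an addition beyond the original task list; 'lr_scaled' is a name
# introduced here):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
lr_scaled = Pipeline([
    ('scaler', StandardScaler()),                            # standardize each feature
    ('clf', LogisticRegression(C=lr_gs.best_params_['C'])),  # reuse the tuned C
])
lr_scaled.fit(x_train, y_train.values.ravel())
print(lr_scaled.score(x_test, y_test))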
# (6) Compute each model's F1 score as an evaluation metric
# (7) Compute each model's accuracy
# (8) Compute each model's recall
# (9) Compute each model's confusion matrix
# (10) Compute each model's classification report
# (11) Plot each model's ROC curve
# (12) Compute each model's AUC and related metrics
# (13) Plot and analyze each model's learning curve
from sklearn.metrics import f1_score, accuracy_score, recall_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import learning_curve
models = [lr, rc]
names = ['lr', 'rc']
for i in range(len(models)):  # 0-based indexing; range(1, len(models)+1) would skip lr and overrun the list
    m = models[i]
    y_pred = m.predict(x_test)
    print(f'{names[i]} F1: {f1_score(y_test, y_pred)}')
    print(f'{names[i]} accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'{names[i]} recall: {recall_score(y_test, y_pred)}')
    print(f'{names[i]} confusion matrix:\n{confusion_matrix(y_test, y_pred)}')
    print(f'{names[i]} classification report:\n{classification_report(y_test, y_pred)}')
    fpr, tpr, th = roc_curve(y_test, m.predict_proba(x_test)[:, 1])
    plt.plot(fpr, tpr)
    plt.show()
    print(f'{names[i]} AUC: {roc_auc_score(y_test, m.predict_proba(x_test)[:, 1])}')
    # learning_curve's keyword is lowercase y; it returns train sizes plus train/validation scores
    train_size, train_scores, test_scores = learning_curve(estimator=m, X=x_train, y=y_train.values.ravel())
    plt.plot(train_size, train_scores.mean(axis=1))
    plt.plot(train_size, test_scores.mean(axis=1))
    plt.show()
# 6. Fitting and measurement with different training-set sizes
# Data subset selection:
# (1) Draw 200, 300, and 400 data points from the original training set as training sets of different sizes.
# Model training and evaluation:
# (2) Fit each model on each training-set size.
# (3) Compute and record the training time, prediction time, and train/test F1 scores for each case.
import time
train_s = [200, 300, 400]
models = [lr, rc]
names = ['lr', 'rc']
s = []
for i in range(len(models)):  # 0-based indexing, as above
    m = models[i]
    for size in train_s:
        x_tr, x_te, y_tr, y_te = train_test_split(x_train, y_train, train_size=size)
        train_start = time.time()  # start time
        m.fit(x_tr, y_tr.values.ravel())  # fit on the size-limited subset, not the full training set
        train_end = time.time()  # end time
        train_time = train_end - train_start  # elapsed = end - start
        pred_start = time.time()  # start time
        te_y_pred = m.predict(x_te)
        tr_y_pred = m.predict(x_tr)
        pred_end = time.time()  # end time
        # prediction time
        pred_time = pred_end - pred_start
        train_f1 = f1_score(y_tr, tr_y_pred)
        test_f1 = f1_score(y_te, te_y_pred)
        s.append(
            {
                'model': names[i],
                'train_time': train_time,
                'pred_time': pred_time,
                'train_f1': train_f1,
                'test_f1': test_f1
            }
        )
print(s)
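# The recorded results read more easily as a table; a minimal sketch
# ('results_df' is a name introduced here):
results_df = pd.DataFrame(s)
print(results_df)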