property test

# Credit-card approval analysis script:
# 1) load and inspect the data, 2) impute/encode features, 3) balance classes
# with SMOTE, 4) tune LogisticRegression and RandomForest via grid search,
# 5) evaluate both models, 6) time fits on 200/300/400-point training subsets.
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     learning_curve)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (f1_score, accuracy_score, recall_score,
                             confusion_matrix, classification_report,
                             roc_curve, roc_auc_score)
from imblearn.over_sampling import SMOTE

# --- 1. Data preparation -------------------------------------------------
df = pd.read_csv('Credit_card.csv')
print(df.head(5))
print(df.tail(5))
print(df.describe())
df.info()

# --- 2. Preprocessing ----------------------------------------------------
# (1) report whether any column has missing values
if df.isnull().sum().any():
    print(df.isnull().sum().any())
# (2) mean-fill the numeric columns
df['Annual_income'] = df['Annual_income'].fillna(df['Annual_income'].mean())
df['Birthday_count'] = df['Birthday_count'].fillna(df['Birthday_count'].mean())
# (3) mode-fill the categorical columns
df['GENDER'] = df['GENDER'].fillna(df['GENDER'].mode()[0])
df['Type_Occupation'] = df['Type_Occupation'].fillna(df['Type_Occupation'].mode()[0])
# (4) verify the imputation succeeded
if df.isnull().sum().any():
    print(df.isnull().sum().any())
else:
    print('没有缺失值')
# (5) class distribution of the target
df_vc = df['label'].value_counts()
print(df_vc)
# (6) drop the ID and phone/email columns (identifiers, no predictive value)
df.drop(columns=['Ind_ID', 'Mobile_phone', 'Work_Phone', 'Phone', 'EMAIL_ID'],
        inplace=True)
# (7)-(8) binarize the two-valued categorical columns
for col in ['GENDER', 'Car_Owner', 'Propert_Owner']:
    df[col] = LabelEncoder().fit_transform(df[col])
print(df.head())  # binarization succeeded
# (9)-(10) one-hot encode the multi-valued categorical columns
for col in ['Type_Income', 'EDUCATION', 'Marital_status', 'Housing_type',
            'Type_Occupation']:
    oht = OneHotEncoder(sparse_output=False)
    df_oe = pd.DataFrame(oht.fit_transform(df[[col]]),
                         columns=oht.get_feature_names_out())
    df = pd.concat([df.drop(columns=col), df_oe], axis=1)
print(df.head())
df.info()
# (11)-(13) demographic features vs. application outcome
# BUG FIX: the original plotted Birthday_count twice (the second call was
# truncated mid-statement); one plot per feature is kept here.
sns.countplot(x=df['GENDER'], hue=df['label'])
plt.show()
sns.countplot(x=df['Birthday_count'], hue=df['label'])
plt.show()

# --- 3. Feature selection and split --------------------------------------
# (1)-(2) pop the target off the frame
y, x = df.pop('label'), df
# (3)-(4) to numpy, as a column vector
y = np.array(y)
y = np.reshape(y, (-1, 1))
# (5)-(6) train/test split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=10010)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
# (7)-(8) the training labels are imbalanced
y_train = pd.DataFrame(y_train)
print(y_train.value_counts())
df_vc = y_train.value_counts().sort_index()
df_vc.index = [idx[0] for idx in df_vc.index]
print(df_vc.index)
# (9) BUG FIX: counts belong on the y-axis; the original passed them as hue=
sns.barplot(x=df_vc.index, y=df_vc.values)
plt.show()
# (10) oversample the minority class in the training data only
x_train, y_train = SMOTE().fit_resample(x_train, y_train)
print(y_train.value_counts())

# --- 4-5. Models and grid-search tuning ----------------------------------
lr = LogisticRegression()
rc = RandomForestClassifier()
# (1) tune regularization strength / tree depth
lr_gs = GridSearchCV(estimator=lr, param_grid={'C': [0.2, 0.4, 0.6, 0.7]})
rc_gs = GridSearchCV(estimator=rc, param_grid={'max_depth': [10, 20, 40, 60]})
# (2) fit the grids
lr_gs.fit(x_train, y_train)
rc_gs.fit(x_train, y_train)
# (3) best parameters
print(lr_gs.best_params_['C'])
print(rc_gs.best_params_['max_depth'])
# (4) held-out scores of the refit best estimators
print(lr_gs.score(x_test, y_test))
print(rc_gs.score(x_test, y_test))
# (5) retrain final models with the tuned hyperparameters
lr = LogisticRegression(C=lr_gs.best_params_['C'])
rc = RandomForestClassifier(max_depth=rc_gs.best_params_['max_depth'])
lr.fit(x_train, y_train)
rc.fit(x_train, y_train)

# --- (6)-(13) evaluation -------------------------------------------------
models = [lr, rc]
names = ['lr', 'rc']
# BUG FIX: the original iterated range(1, len(models)+1), which skips
# models[0] and raises IndexError at models[2].
for name, m in zip(names, models):
    y_pred = m.predict(x_test)
    # BUG FIX: the original printed the same label for every metric,
    # making the six outputs indistinguishable; each is now named.
    print(f'{name}的f1评估{f1_score(y_test, y_pred)}')
    print(f'{name}的accuracy评估{accuracy_score(y_test, y_pred)}')
    print(f'{name}的recall评估{recall_score(y_test, y_pred)}')
    print(f'{name}的confusion_matrix评估{confusion_matrix(y_test, y_pred)}')
    print(f'{name}的classification_report评估{classification_report(y_test, y_pred)}')
    # ROC curve and AUC (probability of the positive class)
    proba = m.predict_proba(x_test)[:, 1]
    fpr, tpr, th = roc_curve(y_test, proba)
    plt.plot(fpr, tpr)
    plt.show()
    print(f'{name}的auc评估{roc_auc_score(y_test, proba)}')
    # (13) learning curves. BUG FIX: the keyword is y=, not Y= (TypeError).
    train_size, tr_scores, te_scores = learning_curve(estimator=m,
                                                      X=x_train, y=y_train)
    plt.plot(train_size, tr_scores.mean(axis=1))
    plt.plot(train_size, te_scores.mean(axis=1))
    plt.show()

# --- 6. Fit/predict timing on training subsets of 200/300/400 points -----
train_s = [200, 300, 400]
s = []
for name, m in zip(names, models):
    for size in train_s:
        x_tr, x_te, y_tr, y_te = train_test_split(x_train, y_train,
                                                  train_size=size)
        train_start = time.time()
        # BUG FIX: the original called m.fit(x_train) — no labels (TypeError)
        # and the full set instead of the size-limited subset under study.
        m.fit(x_tr, y_tr)
        train_end = time.time()
        # BUG FIX: durations were computed as start - end (always negative).
        train_time = train_end - train_start
        pred_start = time.time()
        te_y_pred = m.predict(x_te)
        tr_y_pred = m.predict(x_tr)
        pred_end = time.time()
        pred_time = pred_end - pred_start
        s.append({
            '模型': m,
            '训练时间': train_time,
            '预测时间': pred_time,
            'train_f1': f1_score(y_tr, tr_y_pred),
            'test_f1': f1_score(y_te, te_y_pred),
        })
print(s)
最新发布
11-06
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值