# Python daily check-in (打卡), DAY 12

##注入所需库

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

import random

import numpy as np

import time

from sklearn.svm import SVC #支持向量机分类器

# from sklearn.neighbors import KNeighborsClassifier #K近邻分类器

# from sklearn.linear_model import LogisticRegression #逻辑回归分类器

import xgboost as xgb #XGBoost分类器

import lightgbm as lgb #LightGBM分类器

from sklearn.ensemble import RandomForestClassifier #随机森林分类器

# from catboost import CatBoostClassifier #CatBoost分类器

# from sklearn.tree import DecisionTreeClassifier #决策树分类器

# from sklearn.naive_bayes import GaussianNB #高斯朴素贝叶斯分类器

from skopt import BayesSearchCV

from skopt.space import Integer

from deap import base, creator, tools, algorithms

from sklearn.model_selection import StratifiedKFold, cross_validate # 引入分层 K 折和交叉验证工具

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # 用于评估分类器性能的指标

from sklearn.metrics import classification_report, confusion_matrix #用于生成分类报告和混淆矩阵

from sklearn.metrics import make_scorer#定义函数

import warnings #用于忽略警告信息

warnings.filterwarnings("ignore") # 忽略所有警告信息

## Configure a font for the Chinese plot labels and make minus signs render correctly
plt.rcParams['font.sans-serif'] = ['STHeiti']
# False makes Matplotlib draw negative numbers with the ASCII hyphen-minus instead
# of the Unicode minus sign (U+2212), a glyph CJK fonts frequently lack.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 100

## Load the data set and print a first overview of it
data = pd.read_csv(r'data.csv')

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called on its own; interpolating it into the f-string printed a stray "None".
data.info()
print(f'{data.isnull().sum()}\n{data.head()}\n{data.columns}\n{data["Credit Default"].value_counts()}')

##绘制图像

# plt.figure(figsize=(6,4))

# sns.boxplot(x=data['Annual Income'])

# plt.title(f'年收入箱线图')

# plt.xlabel('年收入')

# plt.tight_layout()

# plt.show()

# plt.figure(figsize=(6,4))

# sns.boxplot(x='Credit Default',y='Annual Income',data=data)

# plt.title('是否欠款年收入箱线图')

# plt.xlabel('是否欠款')

# plt.ylabel('金额')

# plt.xticks([0,1],['n','y'])

# plt.tight_layout()

# plt.show()

# sns.histplot(x='Annual Income',

# hue='Credit Default',

# hue_order=[0,1],

# data=data,

# kde=True,

# element='bars')

# plt.title('年收入直方图')

# plt.xlabel('年收入')

# plt.ylabel('金额')

# plt.legend(labels=['否','是'])

# plt.tight_layout()

# plt.show()

## Fill missing values: column mean for non-object columns, most frequent value otherwise
for column in data.columns:
    if not data[column].isnull().any():
        continue  # nothing to fill in this column

    if data[column].dtype != 'object':
        fill_value = data[column].mean()
    else:
        fill_value = data[column].mode()[0]

    # Assign the filled Series back to the frame. The chained form
    # `data[column].fillna(..., inplace=True)` modifies a temporary object; it is
    # deprecated in pandas 2.x and leaves `data` untouched under Copy-on-Write.
    data[column] = data[column].fillna(fill_value)

# Sanity check: every count printed here should now be zero
print(data.isnull().sum())

## Encode categorical features
# Ordinal code for job tenure. Note the scale is descending:
# the longest tenure maps to 0 and the shortest to 10.
mapping = {
    '10+ years': 0,
    '9 years': 1,
    '8 years': 2,
    '7 years': 3,
    '6 years': 4,
    '5 years': 5,
    '4 years': 6,
    '3 years': 7,
    '2 years': 8,
    '1 year': 9,
    '< 1 year': 10,
}
data['Years in current job'] = data['Years in current job'].map(mapping)

# Remember the column names before one-hot encoding so the generated dummy
# columns can be identified without reading the CSV from disk a second time.
original_columns = set(data.columns)
data = pd.get_dummies(data=data, drop_first=True)
dummies_list = [column for column in data.columns if column not in original_columns]

# Store the dummy columns as integers
data = data.astype({column: int for column in dummies_list})

# DataFrame.info() prints its own report and returns None, so it is called
# separately instead of being interpolated into the f-string.
data.info()
print(f'{data.head()}\n{data.columns}')

# ##绘制相关热力图

# continuous_features=['Annual Income', 'Years in current job', 'Tax Liens',

# 'Number of Open Accounts', 'Years of Credit History',

# 'Maximum Open Credit', 'Number of Credit Problems',

# 'Months since last delinquent', 'Bankruptcies', 'Current Loan Amount',

# 'Current Credit Balance', 'Monthly Debt', 'Credit Score',

# 'Credit Default']

# correlation_matrix=data[continuous_features].corr()

# plt.figure(figsize=(12,10))

# sns.heatmap(correlation_matrix,annot=True,cmap='coolwarm',vmin=-1,vmax=1)

# plt.title('相关热力图')

# plt.xticks(rotation=45,ha='right')

# plt.tight_layout()

# plt.show()

# #绘制子图

# Column names iterated by the (currently commented-out) 2x2 subplot examples below
features=['Annual Income','Years in current job','Tax Liens','Number of Open Accounts']

# fig,axes=plt.subplots(2,2,figsize=(6,4))

# for i,feature in enumerate(features):

# row=i//2

# col=i%2

# axes[row,col].boxplot(data[features])

# axes[row,col].set_title(f'boxplot of {feature}')

# axes[row,col].set_ylabel(feature)

# plt.tight_layout()

# plt.show()

# fig,axes=plt.subplots(2,2,figsize=(6,4))

# for i,feature in enumerate(features):

# row=i//2

# col=i%2

# axes[row,col].boxplot(data[feature])

# axes[row,col].set_title(f'boxplot of {feature}')

# axes[row,col].set_ylabel(feature)

# plt.tight_layout()

# plt.show()

# fig,axes=plt.subplots(2,2,figsize=(6,4))

# for i,feature in enumerate(features):

# row=i//2

# col=i%2

# sns.histplot(

# x=feature,

# hue='Credit Default',

# hue_order=[0,1],

# data=data,

# kde=True,

# element='bars',

# ax=axes[row,col]

# )

# axes[row,col].set_title(f'aa')

# axes[row,col].set_xlabel(feature)

# axes[row,col].set_ylabel(f'count')

# axes[row,col].legend(labels=['否','是'])

# plt.tight_layout()

# plt.show()

# fig,axes=plt.subplots(2,2,figsize=(6,4))

# for i,feature in enumerate(features):

# row=i//2

# col=i%2

# sns.histplot(

# x=feature,

# hue='Credit Default',

# hue_order=[0,1],

# data=data,

# element='bars',

# ax=axes[row,col]

# )

# axes[row,col].set_title(f'aa')

# axes[row,col].set_xlabel(feature)

# axes[row,col].set_ylabel(f'count')

# axes[row,col].legend(labels=['否','是'])

# plt.tight_layout()

# plt.show()

from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix
y = data['Credit Default']
x = data.drop(columns=['Credit Default'])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts
print(f'train:{x_train.shape}\ntest:{x_test.shape}')

# #svm

# print("--- 1. 默认参数SVM (训练集 -> 测试集) ---")

# start_time=time.time()

# svm_model=SVC(random_state=42)

# svm_model.fit(x_train,y_train)

# svm_pred=svm_model.predict(x_test)

# end_time=time.time()

# print(f'训练与预测耗时:{end_time - start_time:.4f}')

# print('\n SVM分类报告')

# print(classification_report(y_test,svm_pred))

# print('\n SVM混淆矩阵')

# print(confusion_matrix(y_test,svm_pred))

# Baseline: random forest with default hyper-parameters, trained on the
# training split and evaluated on the held-out test split.
print("--- 1. 默认参数随机森林 (训练集 -> 测试集) ---")

# The timer covers both fitting and prediction
start_time = time.time()
rf_model_default = RandomForestClassifier(random_state=42).fit(x_train, y_train)
rf_pred_default = rf_model_default.predict(x_test)
end_time = time.time()
print(f"默认模型训练与预测耗时: {end_time - start_time:.4f} 秒")

# Classification report and confusion matrix on the test split
default_report = classification_report(y_test, rf_pred_default)
default_matrix = confusion_matrix(y_test, rf_pred_default)
print("\n默认随机森林 在测试集上的分类报告:")
print(default_report)
print("默认随机森林 在测试集上的混淆矩阵:")
print(default_matrix)
print("-" * 50)

# print(f'训练与预测耗时:{end_time - start_time:.4f}')

# print('\n随机森林分类报告')

# print(classification_report(y_test,rf_pred))

# print('\n随机森林混淆矩阵')

# print(confusion_matrix(y_test,rf_pred))

## --- 2. Class-weighted random forest + cross-validation (run on the training split only) ---
print("--- 2. 带权重随机森林 + 交叉验证 (在训练集上进行) ---")

# Count the samples per class label to find the rarer and the more common label
counts = np.bincount(y_train)
minority_label = counts.argmin()
majority_label = counts.argmax()
print(f'训练集中各类别数量:{counts}')
print(f'少数类标签:{minority_label},多数类标签:{majority_label}')

# class_weight='balanced' lets scikit-learn derive the class weights from y
rf_model_weighted = RandomForestClassifier(class_weight='balanced', random_state=42)

# Shuffled, stratified 5-fold split with a fixed seed
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Overall accuracy plus precision / recall / F1 computed for the minority label
scoring = {'accuracy': 'accuracy'}
scoring['precision_minority'] = make_scorer(precision_score, pos_label=minority_label, zero_division=0)
scoring['recall_minority'] = make_scorer(recall_score, pos_label=minority_label)
scoring['f1_minority'] = make_scorer(f1_score, pos_label=minority_label)

print(f'开始进行{cv_strategy.get_n_splits()}折交叉验证..')
start_time_cv = time.time()
cv_results = cross_validate(
    rf_model_weighted,
    x_train,
    y_train,
    scoring=scoring,
    cv=cv_strategy,
    n_jobs=-1,  # use all available CPU cores
    return_train_score=False,
)
end_time_cv = time.time()
print(f"交叉验证耗时: {end_time_cv - start_time_cv:.4f} 秒")

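# Print the mean of every cross-validation metric. Also look at the per-fold
# scores to judge how stable the model is; a large standard deviation across
# folds can have the following causes.
# ### 1. Poor model stability
# A large standard deviation means the model's performance fluctuates strongly
# between data subsets, i.e. the model is very sensitive to changes in the
# training data. This instability can make the model behave inconsistently in
# practice and makes its performance on new data hard to predict.
# ### 2. Unevenly distributed data
# If the data distribution differs a lot between folds, especially with
# imbalanced classes, the metrics can still show high variance even though
# stratified sampling (StratifiedKFold) is used.
# ### 3. Risk of overfitting
# A high standard deviation can be a sign of overfitting: the model may have
# learned noise or specific patterns of the training data instead of the real
# regularities in the data.
# ### 4. Insufficient sample size
# When the overall sample is small, each cross-validation fold contains even
# fewer samples, which can lead to high variance in the metrics.
# ### 5. Influence of outliers
# Outliers can have a pronounced effect in some folds, making the metrics
# differ greatly from fold to fold.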

# Summarise the cross-validation scores as mean and standard deviation over the folds
print("\n带权重随机森林 交叉验证平均性能 (基于训练集划分):")
for metric_name, scores in cv_results.items():
    # Only the 'test_*' entries hold metric scores; skip everything else
    if not metric_name.startswith('test_'):
        continue
    clean_metric_name = metric_name.split('test_')[1]
    print(f'平均{clean_metric_name}:{np.mean(scores):.4f}(+/-{np.std(scores):.4f})')
print('-'*50)

# --- 3. Train the final class-weighted model on the whole training split and evaluate it on the test split ---
print("--- 3. 训练最终的带权重模型 (整个训练集) 并在测试集上评估 ---")

# The timer covers both fitting and prediction
start_time_final = time.time()
rf_model_weighted_final = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_model_weighted_final.fit(x_train, y_train)
rf_pred_weighted = rf_model_weighted_final.predict(x_test)
end_time_final = time.time()
print(f"最终带权重模型训练与预测耗时: {end_time_final - start_time_final:.4f} 秒")

# Classification report and confusion matrix on the test split
weighted_report = classification_report(y_test, rf_pred_weighted)
weighted_matrix = confusion_matrix(y_test, rf_pred_weighted)
print("\n带权重随机森林 在测试集上的分类报告:")
print(weighted_report)
print("带权重随机森林 在测试集上的混淆矩阵:")
print(weighted_matrix)
print("-" * 50)

# Simple side-by-side comparison: minority-class recall of both models on the test split
print("性能对比 (测试集上的少数类召回率 Recall):")
recall_default, recall_weighted = (
    recall_score(y_test, predictions, pos_label=minority_label)
    for predictions in (rf_pred_default, rf_pred_weighted)
)
print(f" 默认模型: {recall_default:.4f}")
print(f" 带权重模型: {recall_weighted:.4f}")



 

# Youden's index (J = sensitivity + specificity - 1), wrapped as a scikit-learn scorer
def _youden_index(y_true, y_pred):
    """Return Youden's J statistic for binary labels encoded as 0 and 1.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        sensitivity + specificity - 1. A rate whose class has no true samples
        in ``y_true`` is counted as 0.0 instead of dividing by zero.
    """
    # labels=[0, 1] pins the matrix to 2x2. Without it, data containing a single
    # class yields a 1x1 matrix and the four-way unpacking raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    positives = tp + fn
    negatives = tn + fp
    sensitivity = tp / positives if positives else 0.0
    specificity = tn / negatives if negatives else 0.0
    return sensitivity + specificity - 1


# Scorer object for use as `scoring=youden_score`. The metric function keeps its
# own private name so this assignment no longer rebinds the function itself.
youden_score = make_scorer(_youden_index)

# #网格搜索

# print("\n--- 2. 网格搜索优化随机森林 (训练集 -> 测试集) ---")

# from sklearn.model_selection import GridSearchCV

# param_grid={

# 'n_estimators':[50,100],

# 'max_depth':[10,20],

# 'min_samples_split':[2,5,10],

# 'min_samples_leaf':[1,2,4]

# }

# grid_search=GridSearchCV(estimator=RandomForestClassifier(random_state=42),

# param_grid=param_grid,

# cv=2,

# n_jobs=-1,

# scoring=youden_score)

# start_time=time.time()

# grid_search.fit(x_train,y_train)

# end_time=time.time()

# best_model=grid_search.best_estimator_

# best_pred=best_model.predict(x_test)

# print(f'网格搜索耗时:{end_time-start_time:.4f}秒')

# print('最佳参数:',grid_search.best_params_)

# print('\n网格搜索优化后的随机森林在测试集上的分类报告')

# print(classification_report(y_test,best_pred))

# print('网格搜索优化后的随机森林在测试集上的混淆矩阵')

# print(confusion_matrix(y_test,best_pred))

# #贝叶斯优化

# search_space={

# 'n_estimators':Integer(20,50),

# 'max_depth':Integer(1,10),

# 'min_samples_split':(2,5),

# 'min_samples_leaf':(1,5)

# }

# bayes_search=BayesSearchCV(

# estimator=RandomForestClassifier(random_state=42),

# search_spaces=search_space,

# n_iter=10,

# cv=2,

# n_jobs=-1,

# scoring=youden_score

# )

# start_time=time.time()

# bayes_search.fit(x_train,y_train)

# end_time=time.time()

# best_model=bayes_search.best_estimator_

# best_pred=best_model.predict(x_test)

# print(f'贝叶斯优化耗时:{end_time-start_time}')

# print('最佳参数',bayes_search.best_params_)

# print('\n贝叶斯优化后的随机森林在测试集上的分类报告')

# print(classification_report(y_test,best_pred))

# print('\n贝叶斯优化后的随机森林在测试集上的混淆矩阵')

# print(confusion_matrix(y_test,best_pred))


 

# #随机过采样

# from imblearn.over_sampling import RandomOverSampler

# ros=RandomOverSampler(random_state=42)

# x_train_ros,y_train_ros=ros.fit_resample(x_train,y_train)

# print('随机过采样后训练集的形状:',x_train_ros.shape,y_train_ros.shape)

# rf_model_ros=RandomForestClassifier(random_state=42)

# start_time_ros=time.time()

# rf_model_ros.fit(x_train_ros,y_train_ros)

# end_time_ros=time.time()

# rf_pred_ros=rf_model_ros.predict(x_test)

# print(f"随机过采样后训练与预测耗时: {end_time_ros - start_time_ros:.4f} 秒")

# print("\n随机过采样后随机森林 在测试集上的分类报告:")

# print(classification_report(y_test, rf_pred_ros))

# print("随机过采样后随机森林 在测试集上的混淆矩阵:")

# print(confusion_matrix(y_test, rf_pred_ros))

# #SMOTE过采样

# from imblearn.over_sampling import SMOTE

# smote=SMOTE(random_state=42)

# x_train_smote,y_train_smote=smote.fit_resample(x_train,y_train)

# print('smote过采样后训练集的形状:',x_train_smote.shape,y_train_smote)

# start_time=time.time()

# rf_model_smote=RandomForestClassifier(random_state=42)

# rf_model_smote.fit(x_train_smote,y_train_smote)

# rf_pred_smote=rf_model_smote.predict(x_test)

# end_time=time.time()

# print('\nSMOTE过采样后随机森林在测试集上的分类报告')

# print(classification_report(y_test,rf_pred_smote))

# print('SMOTE过采样后随机森林在测试集上的混淆矩阵')

# print(confusion_matrix(y_test,rf_pred_smote))

#带权重的交叉验证

import numpy as np # 引入 numpy 用于计算平均值等

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold, cross_validate # 引入分层 K 折和交叉验证工具

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

import time

import warnings

warnings.filterwarnings("ignore")

# NOTE: the text that followed here was blog-page UI residue (comment box and
# red-packet / balance payment widget); it was not part of the script.