##注入所需库
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import numpy as np
import time
from sklearn.svm import SVC #支持向量机分类器
# from sklearn.neighbors import KNeighborsClassifier #K近邻分类器
# from sklearn.linear_model import LogisticRegression #逻辑回归分类器
import xgboost as xgb #XGBoost分类器
import lightgbm as lgb #LightGBM分类器
from sklearn.ensemble import RandomForestClassifier #随机森林分类器
# from catboost import CatBoostClassifier #CatBoost分类器
# from sklearn.tree import DecisionTreeClassifier #决策树分类器
# from sklearn.naive_bayes import GaussianNB #高斯朴素贝叶斯分类器
from skopt import BayesSearchCV
from skopt.space import Integer
from deap import base, creator, tools, algorithms
from sklearn.model_selection import StratifiedKFold, cross_validate # 引入分层 K 折和交叉验证工具
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # 用于评估分类器性能的指标
from sklearn.metrics import classification_report, confusion_matrix #用于生成分类报告和混淆矩阵
from sklearn.metrics import make_scorer#定义函数
import warnings #用于忽略警告信息
warnings.filterwarnings("ignore") # 忽略所有警告信息
## Configure matplotlib so Chinese labels and minus signs render correctly
plt.rcParams['font.sans-serif'] = ['STHeiti']  # CJK-capable font (macOS)
# BUG FIX: this must be False. With a CJK font active, leaving unicode_minus
# as True makes axis tick labels render U+2212 as empty boxes; False tells
# matplotlib to substitute the ASCII hyphen-minus instead.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 100
## Load the dataset and inspect its basic structure
data = pd.read_csv(r'data.csv')
# BUG FIX: DataFrame.info() prints directly and returns None, so embedding it
# in an f-string emitted a stray "None" line; call it on its own instead.
data.info()
print(data.isnull().sum())
print(data.head())
print(data.columns)
print(data["Credit Default"].value_counts())
##绘制图像
# plt.figure(figsize=(6,4))
# sns.boxplot(x=data['Annual Income'])
# plt.title(f'年收入箱线图')
# plt.xlabel('年收入')
# plt.tight_layout()
# plt.show()
# plt.figure(figsize=(6,4))
# sns.boxplot(x='Credit Default',y='Annual Income',data=data)
# plt.title('是否欠款年收入箱线图')
# plt.xlabel('是否欠款')
# plt.ylabel('金额')
# plt.xticks([0,1],['n','y'])
# plt.tight_layout()
# plt.show()
# sns.histplot(x='Annual Income',
# hue='Credit Default',
# hue_order=[0,1],
# data=data,
# kde=True,
# element='bars')
# plt.title('年收入直方图')
# plt.xlabel('年收入')
# plt.ylabel('金额')
# plt.legend(labels=['否','是'])
# plt.tight_layout()
# plt.show()
## Fill missing values: column mean for numeric columns, mode for categorical
for col in data.columns:
    if data[col].isnull().sum() == 0:
        continue  # nothing to impute in this column
    if data[col].dtype != 'object':
        # Numeric column: impute with the column mean.
        # BUG FIX: fillna(..., inplace=True) on data[col] is chained
        # assignment on a possible copy (deprecated; a no-op under pandas
        # copy-on-write) — assign the result back to the column instead.
        data[col] = data[col].fillna(data[col].mean())
    else:
        # Categorical column: impute with the most frequent value.
        data[col] = data[col].fillna(data[col].mode()[0])
print(data.isnull().sum())
## Encode features
# Ordinal-encode job tenure; the original scheme (longer tenure -> smaller
# code) is kept for backward compatibility with downstream results.
mapping = {'10+ years': 0,
           '9 years': 1,
           '8 years': 2,
           '7 years': 3,
           '6 years': 4,
           '5 years': 5,
           '4 years': 6,
           '3 years': 7,
           '2 years': 8,
           '1 year': 9,
           '< 1 year': 10}
data['Years in current job'] = data['Years in current job'].map(mapping)
# PERF FIX: remember the column set before one-hot encoding so the newly
# created dummy columns can be identified without re-reading the CSV from
# disk (the earlier mapping/imputation did not change any column names, so
# this is equivalent to comparing against a fresh read of data.csv).
cols_before_dummies = set(data.columns)
data = pd.get_dummies(data=data, drop_first=True)
dummies_list = [c for c in data.columns if c not in cols_before_dummies]
# get_dummies yields bool columns; convert them to 0/1 integers in one pass.
data[dummies_list] = data[dummies_list].astype(int)
# BUG FIX: info() returns None, so it must not be embedded in an f-string.
data.info()
print(f'{data.head()}\n{data.columns}')
# ##绘制相关热力图
# continuous_features=['Annual Income', 'Years in current job', 'Tax Liens',
# 'Number of Open Accounts', 'Years of Credit History',
# 'Maximum Open Credit', 'Number of Credit Problems',
# 'Months since last delinquent', 'Bankruptcies', 'Current Loan Amount',
# 'Current Credit Balance', 'Monthly Debt', 'Credit Score',
# 'Credit Default']
# correlation_matrix=data[continuous_features].corr()
# plt.figure(figsize=(12,10))
# sns.heatmap(correlation_matrix,annot=True,cmap='coolwarm',vmin=-1,vmax=1)
# plt.title('相关热力图')
# plt.xticks(rotation=45,ha='right')
# plt.tight_layout()
# plt.show()
# #绘制子图
# Feature columns visualized by the (commented-out) subplot examples below.
features=['Annual Income','Years in current job','Tax Liens','Number of Open Accounts']
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# for i,feature in enumerate(features):
# row=i//2
# col=i%2
# axes[row,col].boxplot(data[features])
# axes[row,col].set_title(f'boxplot of {feature}')
# axes[row,col].set_ylabel(feature)
# plt.tight_layout()
# plt.show()
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# for i,feature in enumerate(features):
# row=i//2
# col=i%2
# axes[row,col].boxplot(data[feature])
# axes[row,col].set_title(f'boxplot of {feature}')
# axes[row,col].set_ylabel(feature)
# plt.tight_layout()
# plt.show()
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# for i,feature in enumerate(features):
# row=i//2
# col=i%2
# sns.histplot(
# x=feature,
# hue='Credit Default',
# hue_order=[0,1],
# data=data,
# kde=True,
# element='bars',
# ax=axes[row,col]
# )
# axes[row,col].set_title(f'aa')
# axes[row,col].set_xlabel(feature)
# axes[row,col].set_ylabel(f'count')
# axes[row,col].legend(labels=['否','是'])
# plt.tight_layout()
# plt.show()
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# for i,feature in enumerate(features):
# row=i//2
# col=i%2
# sns.histplot(
# x=feature,
# hue='Credit Default',
# hue_order=[0,1],
# data=data,
# element='bars',
# ax=axes[row,col]
# )
# axes[row,col].set_title(f'aa')
# axes[row,col].set_xlabel(feature)
# axes[row,col].set_ylabel(f'count')
# axes[row,col].legend(labels=['否','是'])
# plt.tight_layout()
# plt.show()
from sklearn.model_selection import train_test_split

# Separate features from the target, then hold out 20% of rows for the
# final evaluation (fixed seed keeps the split reproducible).
x = data.drop(['Credit Default'], axis=1)
y = data['Credit Default']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
print(f'train:{x_train.shape}\ntest:{x_test.shape}')
# #svm
# print("--- 1. 默认参数SVM (训练集 -> 测试集) ---")
# start_time=time.time()
# svm_model=SVC(random_state=42)
# svm_model.fit(x_train,y_train)
# svm_pred=svm_model.predict(x_test)
# end_time=time.time()
# print(f'训练与预测耗时:{end_time - start_time:.4f}')
# print('\n SVM分类报告')
# print(classification_report(y_test,svm_pred))
# print('\n SVM混淆矩阵')
# print(confusion_matrix(y_test,svm_pred))
print("--- 1. 默认参数随机森林 (训练集 -> 测试集) ---")
start_time = time.time()
rf_model_default = RandomForestClassifier(random_state=42)
rf_model_default.fit(x_train, y_train)
rf_pred_default = rf_model_default.predict(x_test)
end_time = time.time()
print(f"默认模型训练与预测耗时: {end_time - start_time:.4f} 秒")
print("\n默认随机森林 在测试集上的分类报告:")
print(classification_report(y_test, rf_pred_default))
print("默认随机森林 在测试集上的混淆矩阵:")
print(confusion_matrix(y_test, rf_pred_default))
print("-" * 50)
# print(f'训练与预测耗时:{end_time - start_time:.4f}')
# print('\n随机森林分类报告')
# print(classification_report(y_test,rf_pred))
# print('\n随机森林混淆矩阵')
# print(confusion_matrix(y_test,rf_pred))
## --- 2. Class-weighted random forest + cross-validation (training set only) ---
print("--- 2. 带权重随机森林 + 交叉验证 (在训练集上进行) ---")
# Identify minority/majority classes from the training labels.
counts = np.bincount(y_train)
minority_label = np.argmin(counts)
majority_label = np.argmax(counts)
print(f'训练集中各类别数量:{counts}')
print(f'少数类标签:{minority_label},多数类标签:{majority_label}')
# class_weight='balanced' re-weights samples inversely to class frequency,
# pushing the forest to pay more attention to the minority class.
rf_model_weighted = RandomForestClassifier(
    random_state=42,
    class_weight='balanced'
)
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Track overall accuracy plus minority-class precision/recall/F1.
# CONSISTENCY FIX: zero_division=0 is set on all three minority scorers
# (the original set it only on precision), so a fold with no predicted or
# true positives scores 0 instead of emitting an undefined-metric warning.
scoring = {
    'accuracy': 'accuracy',
    'precision_minority': make_scorer(precision_score, pos_label=minority_label, zero_division=0),
    'recall_minority': make_scorer(recall_score, pos_label=minority_label, zero_division=0),
    'f1_minority': make_scorer(f1_score, pos_label=minority_label, zero_division=0)
}
print(f'开始进行{cv_strategy.get_n_splits()}折交叉验证..')
start_time_cv = time.time()
cv_results = cross_validate(
    estimator=rf_model_weighted,
    X=x_train,
    y=y_train,
    cv=cv_strategy,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=False
)
end_time_cv = time.time()
print(f"交叉验证耗时: {end_time_cv - start_time_cv:.4f} 秒")
# NOTE(review): if the per-fold standard deviation is large, likely causes are:
# (1) an unstable model that is sensitive to its training subset,
# (2) uneven data distribution across folds (even with stratification,
#     class imbalance can leave per-fold metrics noisy),
# (3) overfitting to fold-specific noise rather than real structure,
# (4) too few samples per fold when the overall dataset is small,
# (5) outliers dominating individual folds.
print("\n带权重随机森林 交叉验证平均性能 (基于训练集划分):")
for metric_name, scores in cv_results.items():
    # cross_validate also reports fit_time/score_time; keep only test metrics.
    if metric_name.startswith('test_'):
        clean_metric_name = metric_name.split('test_')[1]
        print(f'平均{clean_metric_name}:{np.mean(scores):.4f}(+/-{np.std(scores):.4f})')
print('-'*50)
# --- 3. Refit the weighted model on the whole training set; evaluate on test ---
print("--- 3. 训练最终的带权重模型 (整个训练集) 并在测试集上评估 ---")
start_time_final = time.time()
rf_model_weighted_final = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
)
rf_model_weighted_final.fit(x_train, y_train)
rf_pred_weighted = rf_model_weighted_final.predict(x_test)
end_time_final = time.time()
elapsed_final = end_time_final - start_time_final
print(f"最终带权重模型训练与预测耗时: {elapsed_final:.4f} 秒")
print("\n带权重随机森林 在测试集上的分类报告:")
print(classification_report(y_test, rf_pred_weighted))
print("带权重随机森林 在测试集上的混淆矩阵:")
print(confusion_matrix(y_test, rf_pred_weighted))
print("-" * 50)
# Quick summary: minority-class recall on the test set, default vs. weighted.
print("性能对比 (测试集上的少数类召回率 Recall):")


def _minority_recall(preds):
    # Recall for the minority class identified earlier from the training set.
    return recall_score(y_test, preds, pos_label=minority_label)


recall_default = _minority_recall(rf_pred_default)
recall_weighted = _minority_recall(rf_pred_weighted)
print(f" 默认模型: {recall_default:.4f}")
print(f" 带权重模型: {recall_weighted:.4f}")
# Youden's J statistic (sensitivity + specificity - 1): ranges from -1 to 1,
# with 1 meaning perfect separation of the two classes.
def _youden_statistic(y_true, y_pred):
    """Return Youden's J = sensitivity + specificity - 1 for binary labels.

    Guards against zero denominators (y_true containing no positives or no
    negatives) by treating the undefined rate as 0 instead of raising
    ZeroDivisionError. Also fixes the original's 'spcificity' typo.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else 0.0
    return sensitivity + specificity - 1


# Scorer usable as `scoring=youden_score` in Grid/Bayes search below.
# (The original rebound the function's own name to the scorer, shadowing it;
# binding the scorer from a private helper keeps both accessible.)
youden_score = make_scorer(_youden_statistic)
# #网格搜索
# print("\n--- 2. 网格搜索优化随机森林 (训练集 -> 测试集) ---")
# from sklearn.model_selection import GridSearchCV
# param_grid={
# 'n_estimators':[50,100],
# 'max_depth':[10,20],
# 'min_samples_split':[2,5,10],
# 'min_samples_leaf':[1,2,4]
# }
# grid_search=GridSearchCV(estimator=RandomForestClassifier(random_state=42),
# param_grid=param_grid,
# cv=2,
# n_jobs=-1,
# scoring=youden_score)
# start_time=time.time()
# grid_search.fit(x_train,y_train)
# end_time=time.time()
# best_model=grid_search.best_estimator_
# best_pred=best_model.predict(x_test)
# print(f'网格搜索耗时:{end_time-start_time:.4f}秒')
# print('最佳参数:',grid_search.best_params_)
# print('\n网格搜索优化后的随机森林在测试集上的分类报告')
# print(classification_report(y_test,best_pred))
# print('网格搜索优化后的随机森林在测试集上的混淆矩阵')
# print(confusion_matrix(y_test,best_pred))
# #贝叶斯优化
# search_space={
# 'n_estimators':Integer(20,50),
# 'max_depth':Integer(1,10),
# 'min_samples_split':(2,5),
# 'min_samples_leaf':(1,5)
# }
# bayes_search=BayesSearchCV(
# estimator=RandomForestClassifier(random_state=42),
# search_spaces=search_space,
# n_iter=10,
# cv=2,
# n_jobs=-1,
# scoring=youden_score
# )
# start_time=time.time()
# bayes_search.fit(x_train,y_train)
# end_time=time.time()
# best_model=bayes_search.best_estimator_
# best_pred=best_model.predict(x_test)
# print(f'贝叶斯优化耗时:{end_time-start_time}')
# print('最佳参数',bayes_search.best_params_)
# print('\n贝叶斯优化后的随机森林在测试集上的分类报告')
# print(classification_report(y_test,best_pred))
# print('\n贝叶斯优化后的随机森林在测试集上的混淆矩阵')
# print(confusion_matrix(y_test,best_pred))
# #随机过采样
# from imblearn.over_sampling import RandomOverSampler
# ros=RandomOverSampler(random_state=42)
# x_train_ros,y_train_ros=ros.fit_resample(x_train,y_train)
# print('随机过采样后训练集的形状:',x_train_ros.shape,y_train_ros.shape)
# rf_model_ros=RandomForestClassifier(random_state=42)
# start_time_ros=time.time()
# rf_model_ros.fit(x_train_ros,y_train_ros)
# end_time_ros=time.time()
# rf_pred_ros=rf_model_ros.predict(x_test)
# print(f"随机过采样后训练与预测耗时: {end_time_ros - start_time_ros:.4f} 秒")
# print("\n随机过采样后随机森林 在测试集上的分类报告:")
# print(classification_report(y_test, rf_pred_ros))
# print("随机过采样后随机森林 在测试集上的混淆矩阵:")
# print(confusion_matrix(y_test, rf_pred_ros))
# #SMOTE过采样
# from imblearn.over_sampling import SMOTE
# smote=SMOTE(random_state=42)
# x_train_smote,y_train_smote=smote.fit_resample(x_train,y_train)
# print('smote过采样后训练集的形状:',x_train_smote.shape,y_train_smote)
# start_time=time.time()
# rf_model_smote=RandomForestClassifier(random_state=42)
# rf_model_smote.fit(x_train_smote,y_train_smote)
# rf_pred_smote=rf_model_smote.predict(x_test)
# end_time=time.time()
# print('\nSMOTE过采样后随机森林在测试集上的分类报告')
# print(classification_report(y_test,rf_pred_smote))
# print('SMOTE过采样后随机森林在测试集上的混淆矩阵')
# print(confusion_matrix(y_test,rf_pred_smote))
#带权重的交叉验证
import numpy as np # 引入 numpy 用于计算平均值等
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate # 引入分层 K 折和交叉验证工具
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import time
import warnings
warnings.filterwarnings("ignore")
# NOTE(review): removed stray text pasted from a blog page ("1029",
# "被折叠的 条评论", "为什么被折叠?") — it was not valid Python and made the
# whole file fail to parse.
