## Import the required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import numpy as np
import time
# import shap
# from sklearn.svm import SVC #支持向量机分类器
# # from sklearn.neighbors import KNeighborsClassifier #K近邻分类器
# # from sklearn.linear_model import LogisticRegression #逻辑回归分类器
# import xgboost as xgb #XGBoost分类器
# import lightgbm as lgb #LightGBM分类器
# from sklearn.ensemble import RandomForestClassifier #随机森林分类器
# # from catboost import CatBoostClassifier #CatBoost分类器
# # from sklearn.tree import DecisionTreeClassifier #决策树分类器
# # from sklearn.naive_bayes import GaussianNB #高斯朴素贝叶斯分类器
# from skopt import BayesSearchCV
# from skopt.space import Integer
# from deap import base, creator, tools, algorithms
# from sklearn.model_selection import StratifiedKFold, cross_validate # 引入分层 K 折和交叉验证工具
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # 用于评估分类器性能的指标
# from sklearn.metrics import classification_report, confusion_matrix #用于生成分类报告和混淆矩阵
from sklearn.metrics import make_scorer#定义函数
import warnings #用于忽略警告信息
warnings.filterwarnings("ignore") # 忽略所有警告信息
#聚类
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
#3D可视化
from mpl_toolkits.mplot3d import Axes3D
# Configure matplotlib so Chinese titles and axis labels render correctly.
plt.rcParams['font.sans-serif'] = ['STHeiti']
# Render the minus sign as an ASCII hyphen. Many CJK fonts have no glyph for
# the Unicode minus (U+2212), so leaving this at True (the default) can show
# negative tick labels as empty boxes.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 100
# Load the data set and print a basic overview of it.
data = pd.read_csv(r'data.csv')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called on its own; embedding it in the f-string would print a stray
# "None" in front of the null counts.
data.info()
print(f'{data.isnull().sum()}\n{data.columns}')
#绘制图像
# plt.figure(figsize=(6,4))
# sns.boxplot(x=data['Years of Credit History'])
# plt.title('欠款时间箱线图')
# plt.xlabel('Years of Credit History')
# plt.tight_layout()
# plt.show()
# plt.figure(figsize=(6,4))
# sns.boxplot(x='Credit Default',y='Years of Credit History',data=data)
# plt.title('欠款时间分类箱线图')
# plt.xlabel('是否逾期')
# plt.ylabel('欠款时间')
# plt.xticks([0,1],['n','y'])
# plt.tight_layout()
# plt.show()
# plt.figure(figsize=(6,4))
# sns.histplot(
# x='Years of Credit History',
# hue='Credit Default',
# hue_order=[0,1],
# data=data,
# kde=True,
# element='bars'
# )
# plt.title('欠款时间直方图')
# plt.xlabel('欠款时间')
# plt.ylabel('count')
# plt.legend(labels=['n','y'])
# plt.tight_layout()
# plt.show()
# #绘制箱线图子图
# features=['Annual Income','Current Credit Balance','Years of Credit History','Credit Score']
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# for i,feature in enumerate(features):
# row,col=i//2,i%2
# axes[row,col].boxplot(data[feature].dropna())
# axes[row,col].set_title(f'boxplot of {feature}')
# axes[row,col].set_ylabel(feature)
# plt.tight_layout()
# plt.show()
# #绘制分类箱线图子图
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# for i,feature in enumerate(features):
# row,col=i//2,i%2
# sns.boxplot(
# x='Credit Default',
# y=feature,
# data=data.dropna(),
# ax=axes[row,col]
# )
# axes[row,col].set_title(f'boxplot of {feature}')
# axes[row,col].set_xlabel('Credit Default')
# axes[row,col].set_ylabel('count')
# axes[row,col].set_xticks([0,1],['n','y'])
# plt.tight_layout()
# plt.show()
# #绘制分类直方图子图
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# for i,feature in enumerate(features):
# row,col=i//2,i%2
# sns.histplot(
# x=feature,
# hue='Credit Default',
# hue_order=(0,1),
# data=data.dropna(),
# kde=True,
# element='bars',
# ax=axes[row,col]
# )
# axes[row,col].set_title(f'histplot of{feature}')
# axes[row,col].set_xlabel(feature)
# axes[row,col].set_ylabel('count')
# axes[row,col].legend(labels=['n','y'])
# plt.tight_layout()
# plt.show()
# Missing-value imputation: numeric columns are filled with the column mean,
# every other column with its most frequent value (mode).
for column in data.columns:
    if data[column].isnull().sum() == 0:
        continue
    # is_numeric_dtype also routes string/category dtypes to the mode branch,
    # where comparing against 'object' alone would send them to mean().
    if pd.api.types.is_numeric_dtype(data[column]):
        fill_value = data[column].mean()
    else:
        fill_value = data[column].mode()[0]
    # Assign the result back instead of fillna(inplace=True) on data[column]:
    # the chained in-place form is deprecated and leaves the frame untouched
    # under pandas Copy-on-Write.
    data[column] = data[column].fillna(fill_value)

# Ordinal encoding of job tenure; longer tenure maps to a smaller code.
mapping = {
    '10+ years': 0,
    '9 years': 1,
    '8 years': 2,
    '7 years': 3,
    '6 years': 4,
    '5 years': 5,
    '4 years': 6,
    '3 years': 7,
    '2 years': 8,
    '1 year': 9,
    '< 1 year': 10,
}
data['Years in current job'] = data['Years in current job'].map(mapping)

# One-hot encode the remaining non-numeric columns.
data = pd.get_dummies(data=data, drop_first=True)

# Re-read the raw file to compare column names: any column that is not in
# the raw CSV was created by get_dummies.
data2 = pd.read_csv(r'data.csv')
dummies_list = [column for column in data.columns if column not in data2.columns]
# Cast the generated indicator columns to integers.
for column in dummies_list:
    data[column] = data[column].astype(int)

# info() prints its own report and returns None, so keep it out of the
# f-string to avoid printing a stray "None".
data.info()
print(f'{data.isnull().sum()}\n{data.columns}')
# #绘制相关热力图
# continuous_features=['Annual Income', 'Years in current job', 'Tax Liens',
# 'Number of Open Accounts', 'Years of Credit History',
# 'Maximum Open Credit', 'Number of Credit Problems',
# 'Months since last delinquent', 'Bankruptcies', 'Current Loan Amount',
# 'Current Credit Balance', 'Monthly Debt', 'Credit Score',]
# corr_matrix=data[continuous_features].corr()
# plt.figure(figsize=(12,10))
# sns.heatmap(corr_matrix,annot=True,cmap='coolwarm',vmin=-1,vmax=1)
# plt.title('相关热力图')
# plt.xticks(rotation=45,ha='right')
# plt.tight_layout()
# plt.show()
# Split the data into training and test sets (80% / 20%, fixed seed).
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is 'Credit Default'.
x = data.drop(columns=['Credit Default'])
y = data['Credit Default']
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
print(f'train:{x_train.shape}\ntest:{x_test.shape}')
# #模型训练
# #SVM
# print("--- 1. 默认参数SVM (训练集 -> 测试集) ---")
# start_time=time.time()
# svm_model=SVC(random_state=42,class_weight='balanced')
# svm_model.fit(x_train,y_train)
# svm_pred=svm_model.predict(x_test)
# end_time=time.time()
# print(f'训练与预测耗时:{end_time-start_time:.4f}')
# print('\nSVM分类报告')
# print(classification_report(y_test,svm_pred))
# print('\nSVM混淆矩阵')
# print(confusion_matrix(y_test,svm_pred))
# #随机森林
# print("--- 1. 默认参数随机森林 (训练集 -> 测试集) ---")
# start_time=time.time()
# rf_model=RandomForestClassifier(random_state=42,class_weight='balanced')
# rf_model.fit(x_train,y_train)
# rf_pred=rf_model.predict(x_test)
# end_time=time.time()
# print(f'训练与预测耗时:{end_time-start_time:.4f}')
# print('\n随机森林分类报告')
# print(classification_report(y_test,rf_pred))
# print('\n随机森林混淆矩阵')
# print(confusion_matrix(y_test,rf_pred))
# #定义约登指数
# def youden_score(y_true,y_pred):
# tn,fp,fn,tp=confusion_matrix(y_true,y_pred).ravel()
# sensitivity=tp/(tp+fn)
# specificity=tn/(tn+fp)
# return sensitivity+specificity-1
# youden_scorer=make_scorer(youden_score)
# #SMOTE过采样后带权重网格搜索优化的随机森林
# #SMOTE
# from imblearn.over_sampling import SMOTE
# smote=SMOTE(random_state=42)
# x_train_smote,y_train_smote=smote.fit_resample(x_train,y_train)
# #网格搜索&交叉验证
# from sklearn.model_selection import GridSearchCV
# cv_strategy=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
# param_grid={'n_estimators':[5,10,15],
# 'max_depth':[None,5,10],
# 'min_samples_split':[2,3,4],
# 'min_samples_leaf':[2,3,4]}
# grid_search=GridSearchCV(
# estimator=RandomForestClassifier(random_state=42,class_weight='balanced'),
# param_grid=param_grid,
# cv=cv_strategy,
# n_jobs=-1,
# scoring=youden_scorer
# )
# start_time=time.time()
# grid_search.fit(x_train_smote,y_train_smote)
# end_time=time.time()
# best_model=grid_search.best_estimator_
# best_pred=best_model.predict(x_test)
# print(f'网格搜索耗时:{end_time-start_time:.4f}秒')
# print('最佳参数:',grid_search.best_params_)
# print('\n带权重网格搜索优化后的随机森林在测试集上的分类报告')
# print(classification_report(y_test,best_pred))
# print('网格搜索优化后的随机森林在测试集上的混淆矩阵')
# print(confusion_matrix(y_test,best_pred))
# #SMOTE过采样后带权重的贝叶斯优化的随机森林
# #SMOTE过采样
# from imblearn.over_sampling import SMOTE
# smote=SMOTE(random_state=42)
# x_train_smote,y_train_smote=smote.fit_resample(x_train,y_train)
# #贝叶斯优化&交叉验证
# from sklearn.model_selection import GridSearchCV
# cv_strategy=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
# search_space={
# 'n_estimators':Integer(1,5),
# 'max_depth':Integer(1,5),
# 'min_samples_split':(2,6),
# 'min_samples_leaf':Integer(1,5)
# }
# bayes_search=BayesSearchCV(
# estimator=RandomForestClassifier(random_state=42,class_weight='balanced'),
# search_spaces=search_space,
# n_iter=5,
# cv=cv_strategy,
# n_jobs=-1,
# scoring=youden_scorer
# )
# start_time=time.time()
# bayes_search.fit(x_train_smote,y_train_smote)
# end_time=time.time()
# best_model=bayes_search.best_estimator_
# best_pred=best_model.predict(x_test)
# print(f'贝叶斯优化耗时:{end_time-start_time:.4f}秒')
# print('最佳参数',bayes_search.best_params_)
# print('\n贝叶斯优化后的随机森林在测试集上的分类报告')
# print(classification_report(y_test,best_pred))
# print('\n贝叶斯优化后的随机森林在测试集上的混淆矩阵')
# print(confusion_matrix(y_test,best_pred))
# #SHAP分析
# start_time=time.time()
# explainer=shap.TreeExplainer(best_model)
# shap_values=explainer.shap_values(x_test)
# end_time=time.time()
# print(f"shap分析耗时: {end_time - start_time:.4f} 秒")
# print('shap_values shape:',shap_values.shape)
# print('shap_values[0,:,:] shape:',shap_values[0,:,:].shape)## Equivalent to shap_values[0]: the contribution of every feature to every class for the first sample. Trailing axes may be omitted, leading ones may not.
# print('shap_values[:,:,0] shape:',shap_values[:,:,0].shape)
# print('x_test shape:',x_test.shape)
# ##SHAP特征重要性条形图 (Summary Plot - bar)
# print("--- 2. SHAP 特征重要性条形图 ---")
# shap.summary_plot(
# shap_values[:,:,0],
# x_test,
# plot_type='bar',
# show=False
# )
# plt.title('SHAP特征重要性条形图')
# plt.tight_layout()
# plt.show()
# # ##SHAP特征重要性蜂巢图
# print("--- 2. SHAP 特征重要性蜂巢图 ---")
# shap.summary_plot(
# shap_values[:,:,0],
# x_test,
# plot_type='violin',
# show=False
# )
# plt.tight_layout()
# plt.show()
# Standardize the feature matrix. Clustering here works on the features x
# only; the target y is not involved.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

# KMeans sweep: fit one model per candidate cluster count and record four
# quality metrics for each k.
k_range = range(2, 50)
intertia_values, silhouette_scores, ch_scores, db_scores = [], [], [], []
start_time = time.time()
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans_label = kmeans.fit_predict(x_scaled)

    # Score the resulting partition.
    silhouette = silhouette_score(x_scaled, kmeans_label)  # silhouette coefficient
    ch = calinski_harabasz_score(x_scaled, kmeans_label)  # Calinski-Harabasz index
    db = davies_bouldin_score(x_scaled, kmeans_label)  # Davies-Bouldin index

    # Record the metrics; inertia feeds the elbow method.
    intertia_values.append(kmeans.inertia_)
    silhouette_scores.append(silhouette)
    ch_scores.append(ch)
    db_scores.append(db)
    # print(f'k={k}\n 惯性:{kmeans.inertia_:.2f}\n轮廓系数:{silhouette:.3f}\n CH系数:{ch:.2f}\n DB{db:.3f}')
end_time = time.time()
print(f'聚类分析耗时:{end_time-start_time:.4f}秒')
# #绘制评估指标图
# plt.figure(figsize=(12,6))
# #肘部法则图(Inertia)
# plt.subplot(2,2,1)
# plt.plot(k_range,intertia_values,marker='o')
# plt.title('肘部法则确定最优聚类数 k(惯性,越小越好)')
# plt.xlabel('聚类数 (k)')
# plt.ylabel('惯性')
# plt.grid(True)
# #轮廓系数图
# plt.subplot(2,2,2)
# plt.plot(k_range,silhouette_scores,marker='o',color='orange')
# plt.title('轮廓系数确定最优聚类数 k(越大越好)')
# plt.xlabel('聚类数 (k)')
# plt.ylabel('轮廓系数')
# plt.grid(True)
# #CH指数图
# plt.subplot(2,2,3)
# plt.plot(k_range,ch_scores,marker='o',color='yellow')
# plt.title('Calinski-Harabasz 指数确定最优聚类数 k(越大越好)')
# plt.xlabel('聚类数 (k)')
# plt.ylabel('CH 指数')
# plt.grid(True)
# #DB指数图
# plt.subplot(2,2,4)
# plt.plot(k_range,db_scores,marker='o',color='red')
# plt.title('Davies-Bouldin 指数确定最优聚类数 k(越小越好)')
# plt.xlabel('聚类数 (k)')
# plt.ylabel('DB 指数')
# plt.grid(True)
# plt.tight_layout()
# plt.show()
# #选择K值进行聚类
# selected_k=20
# kmeans=KMeans(n_clusters=selected_k,random_state=42)
# kmeans_label=kmeans.fit_predict(x_scaled)
# x['KMeans_Cluster']=kmeans_label
# #PCA降维
# pca=PCA(n_components=2)
# x_pca=pca.fit_transform(x_scaled)
# #聚类可视化
# plt.figure(figsize=(6,5))
# sns.scatterplot(x=x_pca[:,0],y=x_pca[:,1],hue=kmeans_label,palette='viridis')
# plt.title(f'KMean Clustering with k={selected_k} (PCA Visualization)')
# plt.xlabel('PCA Component 1')
# plt.ylabel('PCA Component 2')
# plt.show()
# # 打印 KMeans 聚类标签的前几行
# print(f"KMeans Cluster labels (k={selected_k}) added to X:")
# print(x[['KMeans_Cluster']].value_counts())
# #3D可视化
# pca=PCA(n_components=3)
# x_pca=pca.fit_transform(x_scaled)
# fig=plt.figure(figsize=(8,6))
# ax=fig.add_subplot(111,projection='3d')
# scatter=ax.scatter(
# x_pca[:,0],x_pca[:,1],x_pca[:,2],
# c=kmeans_label,cmap='viridis',s=30,alpha=0.8
# )
# ax.set_title(f'KMeans Clustering with k={selected_k} (PCA 3D Visualization)')
# ax.set_xlabel('PCA Component 1')
# ax.set_ylabel('PCA Component 2')
# ax.set_zlabel('PCA Component 3')
# plt.colorbar(scatter, ax=ax, shrink=0.5)
# plt.show()