from a_model_preparation import *
# Configure matplotlib for Chinese fonts and correct minus-sign rendering.
plt.rcParams['font.sans-serif']=['STHeiti']
# BUG FIX: this must be False. With True, matplotlib uses the Unicode minus
# (U+2212), which most CJK fonts lack, so negative ticks render as boxes.
# False substitutes an ASCII hyphen and matches the stated intent above.
plt.rcParams['axes.unicode_minus']=False
plt.rcParams['figure.dpi']=100
# Load the raw credit-default dataset; 'Id' is a row identifier and
# 'Credit Default' is the prediction target, so both are excluded from x.
data=pd.read_csv(r'data.csv')
x=data.drop(['Id','Credit Default'],axis=1)
y=data['Credit Default']
# ---- Preprocessing pipeline definitions ----
# Column groups inferred from dtypes (kept for reference/debugging).
object_cols = x.select_dtypes(include=['object']).columns.tolist()
numeric_cols = x.select_dtypes(exclude=['object']).columns.tolist()

# Ordinal feature: job tenure, ordered shortest -> longest (11 levels).
ordinal_features = ['Years in current job']
ordinal_categories = [[
    '< 1 year', '1 year', '2 years', '3 years', '4 years',
    '5 years', '6 years', '7 years', '8 years', '9 years', '10+ years',
]]
# Impute the most frequent tenure, then encode by the explicit order;
# unseen categories map to -1 instead of raising.
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=ordinal_categories,
                               handle_unknown='use_encoded_value',
                               unknown_value=-1)),
])
print("有序特征处理 Pipeline 定义完成。")

# Nominal (unordered categorical) features: impute then one-hot encode;
# unknown categories at transform time are ignored (all-zero row).
nominal_features = ['Home Ownership', 'Purpose', 'Term']
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
print("标称特征处理 Pipeline 定义完成。")

# Everything that is neither ordinal nor nominal is treated as continuous
# and only mean-imputed (no scaling here; scaling happens later).
continuous_cols = x.columns.difference(ordinal_features + nominal_features).tolist()
continuous_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])
print("连续特征处理 Pipeline 定义完成。")

# --- Assemble the ColumnTransformer over the three column groups ---
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_transformer, ordinal_features),
        ('nominal', nominal_transformer, nominal_features),
        ('continuous', continuous_transformer, continuous_cols),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
print("\nColumnTransformer (预处理器) 定义完成。")
# ---- Build the full pipeline and preprocess the raw features ----
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
])
print("\n完整的 Pipeline 定义完成。")
print("\n开始对原始数据进行预处理...")
start_time = time.time()
# Fit the preprocessor on x and transform it in a single pass.
processed_array = pipeline.fit_transform(x)
end_time = time.time()
print(f"预处理完成,耗时: {end_time - start_time:.4f} 秒")
# Recover readable output column names and wrap the array as a DataFrame.
feature_names = preprocessor.get_feature_names_out()
x_processed_df = pd.DataFrame(processed_array, columns=feature_names)
print(x_processed_df.info())
# ---- Train/test split (80/20, fixed seed) ----
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x_processed_df, y, test_size=0.2, random_state=42,
)
# ---- SMOTE oversampling of the training split (for model training) ----
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)
# ---- Standardize the full processed matrix (used for clustering) ----
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_processed_df)
## ---- K-Means model selection: sweep k and record internal metrics ----
# BUG FIX: k_range was the tuple (2, 5), so the loop ran only k=2 and k=5
# and silently skipped k=3 and k=4; range(2, 5) evaluates every k in [2, 4].
k_range=range(2,5)
inertia_value=[]
silhouette_scores=[]
ch_scores=[]
db_scores=[]
start_time=time.time()
for k in k_range:
    # Fixed seed keeps each fit reproducible across runs.
    kmeans=KMeans(n_clusters=k,random_state=42)
    kmeans_label=kmeans.fit_predict(x_scaled)
    inertia_value.append(kmeans.inertia_)  # elbow criterion (lower is better)
    silhouette=silhouette_score(x_scaled,kmeans_label)  # higher is better
    silhouette_scores.append(silhouette)
    ch=calinski_harabasz_score(x_scaled,kmeans_label)  # higher is better
    ch_scores.append(ch)
    db=davies_bouldin_score(x_scaled,kmeans_label)  # lower is better
    db_scores.append(db)
# BUG FIX: end_time was never refreshed here, so the printed duration
# reused the stale timestamp captured after the preprocessing step.
end_time=time.time()
print(f'聚类分析耗时:{end_time-start_time:.4f}')
# #绘制评估指标图
# plt.figure(figsize=(12,6))
# ##肘部法则
# plt.subplot(2,2,1)
# plt.plot(k_range,inertia_value,marker='o')
# plt.title('肘部法则确定最优聚类数 k(惯性,越小越好)')
# plt.xlabel('聚类数 (k)')
# plt.ylabel('惯性')
# plt.grid(True)
# ##轮廓系数图
# plt.subplot(2,2,2)
# plt.plot(k_range,silhouette_scores,marker='o',color='yellow')
# plt.title('轮廓系数确定最优聚类数 k(越大越好)')
# plt.xlabel('聚类数 (k)')
# plt.ylabel('轮廓系数')
# plt.grid(True)
# ##CH系数图
# plt.subplot(2,2,3)
# plt.plot(k_range,ch_scores,marker='o',color='yellow')
# plt.title('Calinski-Harabasz 指数确定最优聚类数 k(越大越好)')
# plt.xlabel('聚类数 (k)')
# plt.ylabel('CH 指数')
# plt.grid(True)
# ##DB系数图
# plt.subplot(2,2,4)
# plt.plot(k_range,db_scores,marker='o',color='red')
# plt.title('DB 指数确定最优聚类数 k(越小越好)')
# plt.xlabel('聚类数 (k)')
# plt.ylabel('DB 指数')
# plt.grid(True)
# plt.tight_layout()
# plt.show()
### ---- Final clustering with the chosen k ----
selected_k=3
# BUG FIX: pass random_state=42 so the final labels are reproducible and
# consistent with the seeded selection sweep above (KMeans init is random).
kmeans=KMeans(n_clusters=selected_k,random_state=42)
kmeans_label=kmeans.fit_predict(x_scaled)
# Attach the cluster assignment to the raw feature frame for profiling.
x['KMeans_cluster']=kmeans_label
# ##PCA降维
# print(f"\n--- PCA 降维 ---")
# pca=PCA(n_components=3)
# x_pca=pca.fit_transform(x_scaled)
# #聚类可视化
# plt.figure(figsize=(6,5))
# df_pca_2d=pd.DataFrame({
# 'x':x_pca[:,0],
# 'y':x_pca[:,1],
# 'cluster':kmeans_label
# })
# sample_size_2d=min(1000,len(df_pca_2d))
# df_sample_2d=df_pca_2d.sample(sample_size_2d,random_state=42)
# sns.scatterplot(
# x='x',y='y',
# hue='cluster',
# data=df_sample_2d,
# palette='viridis'
# )
# plt.title(f'KMean Clustering with k={selected_k} (PCA Visualization)')
# plt.xlabel('PCA Component 1')
# plt.ylabel('PCA Component 2')
# plt.show()
# ##3D可视化
# df_pca=pd.DataFrame(x_pca)
# df_pca['cluster']=x['KMeans_cluster']
# sample_size_3d=min(1000,len(df_pca))
# df_sample_3d=df_pca.sample(sample_size_3d,random_state=42)
# fig=px.scatter_3d(
# df_sample_3d,x=0,y=1,z=2,
# color='cluster',
# color_discrete_sequence=px.colors.qualitative.Bold,
# title='3D可视化'
# )
# fig.update_layout(
# scene=dict(
# xaxis_title='pca_0',
# yaxis_title='pca_1',
# zaxis_title='pca_2'
# ),
# width=1200,
# height=1000
# )
# fig.show()
# print(f"\n---t-SNE 降维 ---")
# n_component_tsne=3
# tsne=TSNE(
# n_components=n_component_tsne,
# perplexity=1000,
# n_iter=250,
# learning_rate='auto',
# random_state=42,
# n_jobs=-1
# )
# print("正在对训练集进行 t-SNE fit_transform...")
# start_time=time.time()
# x_tsne=tsne.fit_transform(x_scaled)
# end_time=time.time()
# print(f"训练集 t-SNE耗时: {end_time - start_time:.2f} 秒")
# # ##3D可视化
# # ##准备数据
# df_tsne=pd.DataFrame(x_tsne)
# df_tsne['cluster']=x['KMeans_cluster']
# fig=px.scatter_3d(
# df_tsne,x=0,y=1,z=2,
# color='cluster',
# color_discrete_sequence=px.colors.qualitative.Bold,
# title='T-SNE特征选择的3D可视化'
# )
# fig.update_layout(
# scene=dict(
# xaxis_title='tsne_0',
# yaxis_title='tsne_1',
# zaxis_title='tsne_2'
# ),
# width=1200,
# height=1000
# )
# fig.show()
## ---- Report the KMeans cluster size distribution ----
# BUG FIX: the f-string was missing spaces and printed e.g. "k=3added to x".
print(f'KMeans Cluster labels (k={selected_k}, added to x):')
print(x[['KMeans_cluster']].value_counts())
# ##SHAP分析
# start_time=time.time()
# rf1_model=RandomForestClassifier(random_state=42,class_weight='balanced')
# rf1_model.fit(x_train_smote,y_train_smote)
# explainer=shap.TreeExplainer(rf1_model)
# shap_value=explainer.shap_values(x_test)
# print(shap_value.shape)
# end_time=time.time()
# print(f'SHAP分析耗时:{end_time-start_time:.4f}')
# # --- 1. SHAP 特征重要性蜂巢图 (Summary Plot - violin) ---
# print("--- 1. SHAP 特征重要性蜂巢图 ---")
# shap.summary_plot(shap_value[:,:,0],x_test,plot_type='violin',show=False)
# plt.title('shap feature importance (bar plot)')
# plt.tight_layout()
# plt.show()
# Key features (originally chosen from the commented-out SHAP analysis).
selected_features=['Credit Score','Current Loan Amount','Annual Income','Term_Long Term']
# fig,axes=plt.subplots(2,2,figsize=(10,8))
# axes=axes.flatten()
# for i,feature in enumerate(selected_features):
#     unique_count=x_processed_df[feature].nunique()
#     if unique_count<10:
#         sns.countplot(x=x_processed_df[feature],ax=axes[i])
#         axes[i].set_title(f'countplot of {feature}')
#         axes[i].set_xlabel(feature)
#         axes[i].set_ylabel('count')
#     else:
#         sns.histplot(x=x_processed_df[feature],ax=axes[i])
#         axes[i].set_xlabel(feature)
#         axes[i].set_ylabel('frequency')
# plt.tight_layout()
# plt.show()
# BUG FIX: value_counts was referenced without calling it, which printed
# the bound-method object instead of the per-cluster counts.
print(x[['KMeans_cluster']].value_counts())
# Per-cluster views of the preprocessed features, for cluster profiling.
x_cluster0=x_processed_df[x['KMeans_cluster']==0]
x_cluster1=x_processed_df[x['KMeans_cluster']==1]
x_cluster2=x_processed_df[x['KMeans_cluster']==2]
# ##簇0
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# axes=axes.flatten()
# for i,feature in enumerate(selected_features):
# unique_count=x_cluster0[feature].nunique()
# if unique_count<10:
# sns.countplot(x=x_cluster0[feature],ax=axes[i])
# axes[i].set_title(f'countplot of {feature}')
# axes[i].set_xlabel(feature)
# axes[i].set_ylabel('count')
# else:
# sns.histplot(x=x_cluster0[feature],ax=axes[i])
# axes[i].set_title(f'histplot of {feature}')
# axes[i].set_xlabel(feature)
# axes[i].set_ylabel('frequence')
# plt.tight_layout()
# plt.show()
# # #簇1
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# axes=axes.flatten()
# for i,feature in enumerate (selected_features):
# unique_count=x_cluster1[feature].nunique()
# if unique_count<10:
# sns.countplot(x=x_cluster1[feature],ax=axes[i])
# axes[i].set_title(f'countplot of {feature}')
# axes[i].set_xlabel(feature)
# axes[i].set_ylabel('count')
# else:
# sns.histplot(x=x_cluster1[feature],ax=axes[i])
# axes[i].set_title(f'histplot of {feature}')
# axes[i].set_xlabel(feature)
# axes[i].set_ylabel('frequence')
# plt.tight_layout()
# plt.show()
# #簇2
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# axes=axes.flatten()
# for i,feature in enumerate(selected_features):
#     unique_count=x_cluster2[feature].nunique()
# if unique_count<10:
# sns.countplot(x=x_cluster2[feature],ax=axes[i])
# axes[i].set_title(f'countplot of {feature}')
# axes[i].set_xlabel(feature)
# axes[i].set_ylabel('count')
# else:
# sns.histplot(x=x_cluster2[feature],ax=axes[i])
# axes[i].set_title(f'histplot of {feature}')
# axes[i].set_xlabel(feature)
# axes[i].set_ylabel('count')
# plt.tight_layout()
# plt.show()
print("--- 递归特征消除 (RFE) ---")
from sklearn.feature_selection import RFE
start_time = time.time()
# LightGBM as the RFE base estimator; class_weight balances the labels.
estimator = lgb.LGBMClassifier(random_state=42, class_weight='balanced')
rfe = RFE(estimator, n_features_to_select=3)
rfe.fit(x_train_smote, y_train_smote)
# Project both splits onto the selected 3-feature subset.
x_train_rfe = rfe.transform(x_train_smote)
x_test_rfe = rfe.transform(x_test)
# Map the boolean support mask back to the original column names.
selected_features_rfe = x_train.columns[rfe.support_]
print(f"RFE筛选后保留的特征数量: {len(selected_features_rfe)}")
print(f"保留的特征: {selected_features_rfe}")
end_time = time.time()
print(f'RFE分析耗时:{end_time-start_time:.4f}')
## ---- 3D scatter of the RFE-selected features, colored by cluster ----
x_selected = x_processed_df[selected_features_rfe]
df_viz = pd.DataFrame(x_selected)
df_viz['cluster'] = x['KMeans_cluster']
# Name the three selected columns once; they serve as plot axes and titles.
axis_x, axis_y, axis_z = selected_features_rfe[:3]
fig = px.scatter_3d(
    df_viz,
    x=axis_x, y=axis_y, z=axis_z,
    color='cluster',
    color_discrete_sequence=px.colors.qualitative.Bold,
    title='RFE特征选择的3D可视化',
)
fig.update_layout(
    scene=dict(
        xaxis_title=axis_x,
        yaxis_title=axis_y,
        zaxis_title=axis_z,
    ),
    width=1200,
    height=1000,
)
fig.show()
## ---- Train the final LightGBM model on the RFE-selected features ----
# BUG FIX: the original comment and report headers said "XGBOOST", but the
# estimator is LightGBM; the wording now matches the actual model. Also
# fixed the "mobel" typo in the block-local variable name.
lgb_model_rfe=lgb.LGBMClassifier(random_state=42,class_weight='balanced')
# Fit on the SMOTE-balanced training split restricted to RFE features,
# then evaluate on the untouched (non-resampled) test split.
lgb_model_rfe.fit(x_train_rfe,y_train_smote)
lgb_pred_rfe=lgb_model_rfe.predict(x_test_rfe)
print("\nRFE筛选后LightGBM在测试集上的分类报告:")
print(classification_report(y_test, lgb_pred_rfe))
print("RFE筛选后LightGBM在测试集上的混淆矩阵:")
print(confusion_matrix(y_test, lgb_pred_rfe))