from MLP_model import *
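# NOTE: the wildcard import is assumed to re-export everything used below
# (pd, np, plt, time, joblib, torch, nn, optim, plus the sklearn pieces:
# Pipeline, ColumnTransformer, SimpleImputer, OrdinalEncoder, OneHotEncoder,
# MinMaxScaler, KMeans, the clustering metrics, train_test_split,
# compute_class_weight, accuracy_score, confusion_matrix, classification_report).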
# Configure matplotlib: CJK-capable font & correct minus-sign rendering
plt.rcParams['font.sans-serif']=['STHeiti']
plt.rcParams['axes.unicode_minus']=False  # must be False so minus signs render correctly with a CJK font
plt.rcParams['figure.dpi']=100
torch.manual_seed(42)
## GPU (Apple Silicon MPS backend)
if torch.backends.mps.is_available():
    device=torch.device('mps')
    print("Training with MPS acceleration")
else:
    device=torch.device('cpu')
    print("MPS unavailable, training on CPU")
# ## CPU-only fallback (uncomment to force CPU)
# device=torch.device('cpu')
# =================== Data preprocessing =====================
data=pd.read_csv(r'data.csv')
x=data.drop(['Id','Credit Default'],axis=1)
y=data['Credit Default']
## Pipeline definitions & processing steps
object_cols=x.select_dtypes(include=['object']).columns.tolist()
numeric_cols=x.select_dtypes(exclude=['object']).columns.tolist()
print(f'object_cols:{object_cols}\nnumeric_cols:{numeric_cols}')
ordinal_features=['Years in current job']
ordinal_categories=[['< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
'6 years', '7 years', '8 years', '9 years', '10+ years']]
ordinal_transformer=Pipeline(steps=[
('imputer',SimpleImputer(strategy='most_frequent')),
('encoder',OrdinalEncoder(categories=ordinal_categories,handle_unknown='use_encoded_value',unknown_value=-1))
])
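# For illustration: with the category list above the encoder maps
# '< 1 year' -> 0.0 and '10+ years' -> 10.0, and any category unseen at
# fit time is encoded as -1 (handle_unknown='use_encoded_value').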
print("Ordinal feature Pipeline defined.")
nominal_features=['Home Ownership', 'Purpose', 'Term']
nominal_transformer=Pipeline(steps=[
('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore',sparse_output=False))  # dense output so the result can be wrapped in a DataFrame below (sklearn >= 1.2; older versions use sparse=False)
])
print("Nominal feature Pipeline defined.")
continuous_cols=x.columns.difference(ordinal_features+nominal_features).tolist()
continuous_transformer=Pipeline(steps=[
('imputer',SimpleImputer(strategy='mean'))
])
print("Continuous feature Pipeline defined.")
# --- Build the ColumnTransformer ---
preprocessor=ColumnTransformer(
transformers=[
('ordinal',ordinal_transformer,ordinal_features),
('nominal',nominal_transformer,nominal_features),
('continuous',continuous_transformer,continuous_cols)
],remainder='passthrough',verbose_feature_names_out=False
)
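# The transformed columns follow the declared transformer order: the ordinal
# feature first, then the one-hot columns, then the continuous columns;
# remainder='passthrough' appends any column not covered above.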
print("\nColumnTransformer (preprocessor) defined")
# Build the full pipeline
pipeline=Pipeline(steps=[
('preprocessor',preprocessor)
])
print("\nFull Pipeline defined.")
print("\nPreprocessing the raw data...")
start_time = time.time()
x_processed=pipeline.fit_transform(x)
end_time = time.time()
print(f"Preprocessing finished in {end_time - start_time:.4f} s")
feature_names=preprocessor.get_feature_names_out()
x_processed_df=pd.DataFrame(x_processed,columns=feature_names)
print(x_processed_df.info())
# =================== Clustering analysis & label generation =====================
x_scaled=MinMaxScaler().fit_transform(x_processed_df)
k_range=range(2,50)
inertia_values=[]
silhouette_scores=[]
ch_scores=[]
db_scores=[]
start_time=time.time()
for k in k_range:
kmeans=KMeans(n_clusters=k,random_state=42,max_iter=500)
kmeans_label=kmeans.fit_predict(x_scaled)
inertia_values.append(kmeans.inertia_)
silhouette_scores.append(silhouette_score(x_scaled,kmeans_label))
ch_scores.append(calinski_harabasz_score(x_scaled,kmeans_label))
db_scores.append(davies_bouldin_score(x_scaled,kmeans_label))
end_time = time.time()
print(f'Clustering sweep took {end_time-start_time:.4f} s')
# # Visualize the clustering metrics
# plt.figure(figsize=(12,6))
# plt.subplot(2,2,1)
# plt.plot(k_range,inertia_values,marker='o')
# plt.title('Elbow method (lower inertia is better)')
# plt.xlabel('k');plt.ylabel('inertia');plt.grid(True)
# plt.subplot(2,2,2)
# plt.plot(k_range, silhouette_scores, marker='o', color='orange')
# plt.title('Silhouette score (higher is better)')
# plt.xlabel('k'); plt.ylabel('Silhouette'); plt.grid(True)
# plt.subplot(2,2,3)
# plt.plot(k_range, ch_scores, marker='o', color='yellow')
# plt.title('Calinski-Harabasz index (higher is better)')
# plt.xlabel('k'); plt.ylabel('CH Score'); plt.grid(True)
# plt.subplot(2,2,4)
# plt.plot(k_range, db_scores, marker='o', color='red')
# plt.title('Davies-Bouldin index (lower is better)')
# plt.xlabel('k'); plt.ylabel('DB Score'); plt.grid(True)
# plt.tight_layout()
# plt.show()
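# A hedged programmatic aid alongside the plots: pick the k with the highest
# silhouette score (illustrative only; selected_k below was chosen by
# inspecting all four curves together, and this does not override it).
best_k_by_silhouette=k_range[int(np.argmax(silhouette_scores))]
print(f'k with the highest silhouette score: {best_k_by_silhouette}')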
# Final clustering and label assignment
selected_k=12
kmeans=KMeans(n_clusters=selected_k,random_state=42)
kmeans_label=kmeans.fit_predict(x_scaled)
x_with_cluster=pd.DataFrame(x_scaled,columns=feature_names)
x_with_cluster['KMeans_Cluster']=kmeans_label
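# Note: x_with_cluster (sample-level cluster labels) is kept for inspection
# only; the supervised pipeline below re-splits from x_processed_df.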
# =================== Feature-column clustering augmentation =====================
x_train,x_test,y_train,y_test=train_test_split(x_processed_df,y,test_size=0.2,shuffle=True,random_state=42,stratify=y)  # random_state for reproducibility; stratify preserves the class ratio
# Normalize the data: neural networks are sensitive to input scale, and the
# features have different units, so min-max scaling reduces the impact of scale.
scaler=MinMaxScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)  # transform only, using training-set statistics, to avoid leakage
x_train_T=x_train.T
kmeans_feature=KMeans(n_clusters=selected_k,random_state=42)
cluster_labels=kmeans_feature.fit_predict(x_train_T)
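# Transposing x_train makes each *feature* a KMeans sample, so cluster_labels
# has length n_features and groups similar columns together.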
# Build cluster-mean augmented features
def build_cluster_features(x_data,cluster_labels,k):
cluster_features=[]
for i in range(k):
idx=np.where(cluster_labels==i)[0]
group_feature=x_data[:,idx].mean(axis=1,keepdims=True)
cluster_features.append(group_feature)
return np.hstack(cluster_features)
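# Each cluster contributes one averaged column, so the output shape is
# (n_samples, k); e.g. for 7 features labelled [0,0,1,2,2,1,0] with k=3,
# the three columns average features {0,1,6}, {2,5} and {3,4} respectively.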
x_train_clustered=build_cluster_features(x_train,cluster_labels,selected_k)
x_test_clustered=build_cluster_features(x_test,cluster_labels,selected_k)
x_train_enhanced=np.hstack([x_train,x_train_clustered])
x_test_enhanced=np.hstack([x_test,x_test_clustered])
# Save the feature-clustering model for deployment
joblib.dump(kmeans_feature,'kmeans_feature_cluster.pkl')
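# At inference time the fitted clusterer can be restored with
# joblib.load('kmeans_feature_cluster.pkl') and its labels_ reused with
# build_cluster_features to reproduce the augmented columns.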
# =================== Build the Attention model =====================
# # Convert the data to PyTorch tensors and move them to the chosen device.
# # Cross-entropy loss for classification requires labels of type long.
# # Tensors expose .to(device) for moving them onto a given device.
# # PyTorch trains on tensors, so the data must be converted first.
# # y_train and y_test hold integers, so they are cast to long; left as
# # float32 they would come through as 1.0/0.0 and break CrossEntropyLoss.
try:
    print("\nConverting the full dataset to tensors...")
    # Convert to NumPy arrays first, then to tensors
x_train_np=np.array(x_train_enhanced)
x_test_np=np.array(x_test_enhanced)
y_train_np=np.array(y_train)
y_test_np=np.array(y_test)
x_train_tensor=torch.FloatTensor(x_train_np).to(device)
x_test_tensor=torch.FloatTensor(x_test_np).to(device)
y_train_tensor=torch.LongTensor(y_train_np).to(device)
y_test_tensor=torch.LongTensor(y_test_np).to(device)
    print("All data loaded")
except Exception as e:
    print(f"Error during data conversion: {e}")
import traceback
traceback.print_exc()
classes=np.unique(y_train_np)
class_weights=compute_class_weight('balanced',classes=classes,y=y_train_np)
class_weights_tensor=torch.FloatTensor(class_weights).to(device)
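# 'balanced' weights follow n_samples / (n_classes * bincount(y)); e.g. with
# 7000 negatives and 3000 positives that gives roughly [0.71, 1.67], so the
# minority class weighs more in the loss. (These counts are illustrative.)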
class AttentionMLP(nn.Module):
def __init__(self, input_dim,hidden_dim=16,num_classes=2):
super().__init__()
self.attention=nn.Sequential(
nn.Linear(input_dim,input_dim),
nn.Softmax(dim=1)
)
self.fc1=nn.Linear(input_dim,hidden_dim)
self.relu=nn.ReLU()
self.fc2=nn.Linear(hidden_dim,hidden_dim//2)
self.fc3=nn.Linear(hidden_dim//2,num_classes)
def forward(self,x):
attention_weights=self.attention(x)
x_weighted=x*attention_weights
x=self.relu(self.fc1(x_weighted))
x=self.relu(self.fc2(x))
x=self.fc3(x)
return x,attention_weights
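# A minimal shape check (hypothetical smoke test, not part of training):
# logits, attn = AttentionMLP(input_dim=5)(torch.randn(4, 5))
# -> logits: (4, 2); attn: (4, 5), each row summing to 1 via Softmax(dim=1).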
input_dim=x_train_tensor.shape[1]
model=AttentionMLP(input_dim).to(device)
print(f"Model moved to {device}")
criterion=nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer=optim.Adam(model.parameters(),lr=0.001,weight_decay=1e-4)
num_epochs=3000
# Early-stopping bookkeeping
best_test_loss=float('inf')
best_epoch=0
counter=0
patience=50
early_stopped=False
train_losses=[]
test_losses=[]
epochs=[]
start_time=time.time()
for epoch in range(num_epochs):
outputs,_=model(x_train_tensor)
train_loss=criterion(outputs,y_train_tensor)
optimizer.zero_grad()
train_loss.backward()
optimizer.step()
model.eval()
with torch.no_grad():
test_outputs,_=model(x_test_tensor)
test_loss=criterion(test_outputs,y_test_tensor)
model.train()
    # ============= early stopping ==========
if test_loss.item()<best_test_loss:
best_test_loss=test_loss.item()
best_epoch=epoch+1
counter=0
checkpoint={
'model_state_dict':model.state_dict(),
'optimizer_state_dict':optimizer.state_dict(),
'epoch':epoch+1,
'loss':test_loss.item()
}
        torch.save(checkpoint,'best_checkpoint.pth')  # .pth, the conventional PyTorch extension
else:
counter+=1
if counter>patience:
            print(f"Early stopping triggered at epoch {epoch+1}: test loss has not improved for {patience} epochs.")
            print(f"Best test loss {best_test_loss:.4f} occurred at epoch {best_epoch}")
early_stopped=True
break
#===========================================
train_losses.append(train_loss.item())
test_losses.append(test_loss.item())
epochs.append(epoch+1)
if (epoch+1) % 100 == 0:
print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss.item():.4f}, Test Loss: {test_loss.item():.4f}")
time_all = time.time() - start_time
print(f'Training time: {time_all:.2f} seconds')
# ===== Load the best checkpoint for final evaluation =====
if early_stopped:
    print(f"Loading the best model from epoch {best_epoch} for final evaluation...")
    checkpoint=torch.load('best_checkpoint.pth',map_location=device)  # map_location keeps device placement consistent
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    best_epoch=checkpoint['epoch']
    best_test_loss=checkpoint['loss']
# # =================== Visualize loss and weights =====================
# plt.figure(figsize=(10, 6))
# plt.plot(epochs, train_losses, label='Train Loss')
# plt.plot(epochs, test_losses, label='Test Loss')
# plt.xlabel('Epoch');plt.ylabel('Loss')
# plt.title('Training and Test Loss over Epochs')
# plt.legend();plt.grid(True)
# plt.tight_layout()
# plt.show()
# Visualize the weight distributions per layer
weight_data={}
for name,param in model.named_parameters():
if 'weight' in name:
weight_data[name]=param.detach().cpu().numpy()
print(name,param.shape)
fig,axes=plt.subplots(1,len(weight_data),figsize=(15,5))
fig.suptitle('Weight Distribution of Layers')
for i,(name,weights) in enumerate(weight_data.items()):
weights_flat=weights.flatten()
axes[i].hist(weights_flat,bins=50,alpha=0.7)
axes[i].set_title(name)
axes[i].set_xlabel('Weight Value')
axes[i].set_ylabel('Frequency')
axes[i].grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.subplots_adjust(top=0.85)
plt.show()
print("\n=== Weight statistics ===")
for name,weights in weight_data.items():
mean=np.mean(weights)
std=np.std(weights)
min_val=np.min(weights)
max_val=np.max(weights)
print(f"{name}:")
    print(f"  mean: {mean:.6f}")
    print(f"  std: {std:.6f}")
    print(f"  min: {min_val:.6f}")
    print(f"  max: {max_val:.6f}")
print("-" * 30)
# =================== Visualize attention weights (feature importance) =====================
with torch.no_grad():
_,attention_weights=model(x_test_tensor)
attention_avg=attention_weights.cpu().numpy().mean(axis=0)
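# Averaging the softmax attention over the test samples yields one weight per
# input feature, a rough global feature-importance proxy.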
# Build the augmented feature names (original features + cluster features)
enhanced_feature_names=list(feature_names)+[f'Cluster_{i}' for i in range(selected_k)]
plt.figure(figsize=(12, 6))
plt.bar(enhanced_feature_names, attention_avg)
plt.xticks(rotation=45,ha='right')
plt.xlabel("Feature")  # bars are labelled by feature name, not index
plt.ylabel("Average Attention Weight")
plt.title("Average Attention Weights (Feature Importance)")
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
model.eval()
with torch.no_grad():
outputs,_=model(x_test_tensor)
_,predicted=torch.max(outputs,1)
y_true=y_test_tensor.cpu().numpy()
y_pred=predicted.cpu().numpy()
acc=accuracy_score(y_true,y_pred)
print(f'Test set accuracy (accuracy_score): {acc * 100:.2f}%')
cm=confusion_matrix(y_true,y_pred)
print(f'Confusion matrix:\n{cm}')
print(f'Classification report:\n{classification_report(y_true,y_pred)}')