from MLP_model import *
# Plot configuration: use a CJK-capable font and make the minus sign render.
plt.rcParams['font.sans-serif'] = ['STHeiti']
# With unicode_minus enabled Matplotlib draws U+2212 for negative ticks, which
# CJK fonts frequently lack (it then shows as an empty box). Disabling it falls
# back to the ASCII hyphen-minus, which is what "display the minus sign
# correctly" requires here.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 100
# Seed PyTorch so weight initialisation is repeatable between runs.
torch.manual_seed(42)
# Device selection: prefer Apple's Metal backend (MPS), otherwise use the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print("使用MPS加速训练")
else:
    device = torch.device('cpu')
    print("MPS不可用,使用CPU训练")
# To force CPU execution instead, replace the selection above with:
# device = torch.device('cpu')
# =================== Data preprocessing ===================
# Load the raw table, then separate the predictors from the target column.
data = pd.read_csv("data.csv")
x = data.drop(['Id', 'Credit Default'], axis=1)
y = data['Credit Default']

# Report which columns pandas parsed as text versus numeric (diagnostic only).
object_cols = x.select_dtypes(include=['object']).columns.tolist()
numeric_cols = x.select_dtypes(exclude=['object']).columns.tolist()
print(f'object_cols:{object_cols}\nnumeric_cols:{numeric_cols}')

# Ordinal feature: categories are listed from shortest to longest tenure so the
# encoded integers keep that order. Values outside the list are encoded as -1.
ordinal_features = ['Years in current job']
ordinal_categories = [['< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
                       '6 years', '7 years', '8 years', '9 years', '10+ years']]
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=ordinal_categories,
                               handle_unknown='use_encoded_value', unknown_value=-1)),
])
print("有序特征处理 Pipeline 定义完成。")

# Nominal features: fill gaps with the mode, then one-hot encode. Categories
# not seen during fit are encoded as all zeros (handle_unknown='ignore').
nominal_features = ['Home Ownership', 'Purpose', 'Term']
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
print("标称特征处理 Pipeline 定义完成。")

# Every remaining column is treated as continuous and only imputed.
# NOTE(review): mode imputation is unusual for continuous values; consider
# strategy='median' — left unchanged here to keep the produced features stable.
continuous_cols = x.columns.difference(ordinal_features + nominal_features).tolist()
continuous_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
])
print("连续特征处理 Pipeline 定义完成。")

# --- Assemble the ColumnTransformer ---
# OneHotEncoder emits sparse output by default, and ColumnTransformer stacks the
# result as a sparse matrix whenever the overall density drops below
# sparse_threshold. A sparse result would break the dense DataFrame/ndarray
# handling that follows, so sparse_threshold=0 forces a dense array.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_transformer, ordinal_features),
        ('nominal', nominal_transformer, nominal_features),
        ('continuous', continuous_transform, continuous_cols),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
    sparse_threshold=0,
)
print("\nColumnTransformer (预处理器) 定义完成。")

# Wrap the preprocessor in a Pipeline (currently its only step).
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
])
print("\n完整的 Pipeline 定义完成。")

# NOTE(review): the pipeline is fitted on the full data set, before the
# train/test split performed later, so imputation statistics and encoder
# categories are learned from rows that end up in the test set.
print("\n开始对原始数据进行预处理...")
start_time = time.time()
x_processed = pipeline.fit_transform(x)
end_time = time.time()
print(f"预处理完成,耗时: {end_time - start_time:.4f} 秒")

# Recover the output column names so the matrix can be inspected as a DataFrame.
feature_names = preprocessor.get_feature_names_out()
x_processed_df = pd.DataFrame(x_processed, columns=feature_names)
print(x_processed_df.info())
# =================== Cluster analysis and label generation ===================
# Rescale every preprocessed column to [0, 1] before measuring distances.
x_scaled = MinMaxScaler().fit_transform(x_processed)

# Sweep k = 2..20 and record four quality measures for each clustering.
k_range = range(2, 21)
intertia_values, silhouette_scores, ch_scores, db_scores = [], [], [], []
label_metrics = (
    (silhouette_scores, silhouette_score),
    (ch_scores, calinski_harabasz_score),
    (db_scores, davies_bouldin_score),
)
start_time = time.time()
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans_label = kmeans.fit_predict(x_scaled)
    intertia_values.append(kmeans.inertia_)
    for score_list, metric in label_metrics:
        score_list.append(metric(x_scaled, kmeans_label))
end_time = time.time()
print(f'聚类分析耗时:{end_time-start_time:.4f}秒')

# Draw the four measures on a 2x2 grid, one panel per measure.
# A colour of None means "use Matplotlib's default line colour".
panels = (
    (intertia_values, '肘部法则 (惯性越小越好)', 'Inertia', None),
    (silhouette_scores, '轮廓系数 (越大越好)', 'Silhouette', 'orange'),
    (ch_scores, 'CH指数 (越大越好)', 'CH Score', 'yellow'),
    (db_scores, 'DB指数 (越小越好)', 'DB Score', 'red'),
)
plt.figure(figsize=(12, 6))
for position, (values, title, y_label, colour) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    line_kwargs = {'marker': 'o'}
    if colour is not None:
        line_kwargs['color'] = colour
    plt.plot(k_range, values, **line_kwargs)
    plt.title(title)
    plt.xlabel('k')
    plt.ylabel(y_label)
    plt.grid(True)
plt.tight_layout()
plt.show()

# Final clustering with the chosen k; attach the labels to the scaled features.
# NOTE(review): k=6 is hard-coded, presumably read off the plots above — confirm.
selected_k = 6
kmeans = KMeans(n_clusters=selected_k, random_state=42)
kmeans_label = kmeans.fit_predict(x_scaled)
x_with_cluster = pd.DataFrame(x_scaled, columns=feature_names)
x_with_cluster['KMeans_Cluster'] = kmeans_label
# =================== Feature-column clustering enhancement ===================
# Hold out 20% of the rows for testing. The split is seeded with the same value
# used for PyTorch and KMeans so that repeated runs see identical partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x_processed_df, y, test_size=0.2, shuffle=True, random_state=42
)
# Normalise the inputs: neural networks are sensitive to input scale, and the
# predictors have different units, so min-max scaling limits the influence of
# any single unit on the result.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
# Reuse the training-set minima/maxima for the test set. Refitting on the test
# rows would scale the two sets differently and leak test statistics.
x_test = scaler.transform(x_test)
# Cluster the *features* (columns): after transposing, each row of x_train_T
# is one feature described by its values across the training samples.
x_train_T = x_train.T
kmeans_feature = KMeans(n_clusters=selected_k, random_state=42)
cluster_labels = kmeans_feature.fit_predict(x_train_T)
# Build cluster-mean augmentation features.
def build_cluster_features(x_data, cluster_labels, k):
    """Return one column per feature cluster holding the per-row cluster mean.

    Args:
        x_data: 2-D array-like of shape (n_samples, n_features).
        cluster_labels: 1-D array-like of length n_features giving the cluster
            id assigned to each column of ``x_data``.
        k: Number of clusters; ids ``0 .. k-1`` each produce one output column.

    Returns:
        Array of shape (n_samples, k). Column ``i`` is the row-wise mean of the
        columns of ``x_data`` labelled ``i``. A cluster id with no member
        columns yields a column of zeros instead of the NaNs that averaging an
        empty slice would produce.
    """
    x_data = np.asarray(x_data)
    cluster_labels = np.asarray(cluster_labels)
    # Match the dtype that ``mean`` returns for floating input; other input
    # dtypes are averaged in float64.
    if np.issubdtype(x_data.dtype, np.floating):
        fill_dtype = x_data.dtype
    else:
        fill_dtype = np.float64
    cluster_features = []
    for i in range(k):
        idx = np.where(cluster_labels == i)[0]
        if idx.size:
            group_feature = x_data[:, idx].mean(axis=1, keepdims=True)
        else:
            # No feature was assigned to this cluster: emit a neutral column so
            # the output width stays k and no NaN reaches the model.
            group_feature = np.zeros((x_data.shape[0], 1), dtype=fill_dtype)
        cluster_features.append(group_feature)
    return np.hstack(cluster_features)
# Compute the cluster-mean columns for both splits using the feature grouping
# learned from the training data, then append them to the scaled features.
x_train_clustered, x_test_clustered = (
    build_cluster_features(split, cluster_labels, selected_k)
    for split in (x_train, x_test)
)
x_train_enhanced = np.hstack((x_train, x_train_clustered))
x_test_enhanced = np.hstack((x_test, x_test_clustered))
# Persist the fitted feature-clustering model for use at deployment time.
joblib.dump(kmeans_feature, 'kmeans_feature_cluster.pkl')
# =================== Build the Attention model ===================
# Convert the data to PyTorch tensors and move them to the selected device.
# - Cross-entropy loss for classification requires integer (long) labels; float
#   labels would come out as 1.0 / 0.0.
# - Tensor.to(device) moves a tensor onto the given device.
try:
    print("\n开始转换完整数据集为张量...")
    # Go through NumPy arrays first, then build the tensors from them.
    x_train_np = np.array(x_train_enhanced)
    x_test_np = np.array(x_test_enhanced)
    y_train_np = np.array(y_train)
    y_test_np = np.array(y_test)
    x_train_tensor = torch.FloatTensor(x_train_np).to(device)
    x_test_tensor = torch.FloatTensor(x_test_np).to(device)
    y_train_tensor = torch.LongTensor(y_train_np).to(device)
    y_test_tensor = torch.LongTensor(y_test_np).to(device)
    print("所有数据已加载完毕")
except Exception as e:
    print(f"数据转换过程中出错: {e}")
    # Nothing below can run without these tensors. Re-raise so the script stops
    # here with the real traceback instead of a later, unrelated NameError.
    raise
class AttentionMLP(nn.Module):
    """MLP classifier preceded by a per-feature soft attention gate.

    A small two-layer network produces one weight per input feature (softmax
    normalised, so the weights of each sample sum to 1). The input is multiplied
    element-wise by these weights and then passed through a three-layer MLP.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the first hidden layer; the second hidden layer
            has ``hidden_dim // 2`` units.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim=16, num_classes=2):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(input_dim, input_dim),
            nn.Tanh(),
            nn.Linear(input_dim, input_dim),
            # Normalise over the feature axis. dim=-1 equals dim=1 for the usual
            # (batch, features) input and also works for a single unbatched
            # sample of shape (features,), where dim=1 would raise.
            nn.Softmax(dim=-1),
        )
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, num_classes)

    def forward(self, x):
        """Return ``(logits, attention_weights)`` for input ``x``.

        ``attention_weights`` has the same shape as ``x``; ``logits`` has
        ``num_classes`` entries along the last axis (raw scores, no softmax).
        """
        attention_weights = self.attention(x)
        x_weighted = x * attention_weights
        x = self.relu(self.fc1(x_weighted))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x, attention_weights
# Instantiate the model for the enhanced feature width and set up training.
input_dim = x_train_tensor.shape[1]
model = AttentionMLP(input_dim).to(device)
print(f"模型已移动到{device}设备")
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
num_epochs = 5000

# Early-stopping state.
best_test_loss = float('inf')
best_epoch = 0
counter = 0        # consecutive epochs without a new best test loss
patience = 50      # stop once `counter` reaches this value
early_stopped = False

# Per-epoch history used for the loss curves.
train_losses = []
test_losses = []
epochs = []

# NOTE(review): the test split doubles as the early-stopping/validation set, so
# the final test metrics are optimistically biased; a separate validation split
# would avoid that.
start_time = time.time()
for epoch in range(num_epochs):
    # One full-batch optimisation step on the training set.
    outputs, _ = model(x_train_tensor)
    train_loss = criterion(outputs, y_train_tensor)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # Evaluate on the held-out set without tracking gradients.
    model.eval()
    with torch.no_grad():
        test_outputs, _ = model(x_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor)
    model.train()

    # Read each scalar once; every .item() call copies a value back from the
    # device.
    train_loss_value = train_loss.item()
    test_loss_value = test_loss.item()

    # Record before the early-stopping check so the epoch that triggers the
    # stop is still part of the plotted history.
    train_losses.append(train_loss_value)
    test_losses.append(test_loss_value)
    epochs.append(epoch + 1)

    # ===== Early stopping =====
    if test_loss_value < best_test_loss:
        best_test_loss = test_loss_value
        best_epoch = epoch + 1
        counter = 0
        checkpoint = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'epoch': epoch + 1,
            'loss': test_loss_value,
        }
        torch.save(checkpoint, 'best_checkpoint.pth')
    else:
        counter += 1
        # ">=" so training stops after exactly `patience` epochs without
        # improvement, matching the message below (">" waited one extra epoch).
        if counter >= patience:
            print(f"早停触发!在第{epoch+1}轮,测试集损失已有{patience}轮未改善。")
            print(f"最佳测试集损失出现在第{best_epoch}轮,损失值为{best_test_loss:.4f}")
            early_stopped = True
            break
    # ==========================

    if (epoch + 1) % 100 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss_value:.4f}, Test Loss: {test_loss_value:.4f}")
time_all = time.time() - start_time
print(f'Training time: {time_all:.2f} seconds')
# ===== Restore the best model for the final evaluation =====
# A checkpoint is written whenever the test loss improves, so restore it
# whenever this run recorded a best epoch — not only when early stopping fired.
# Otherwise a run that uses all epochs would be evaluated with its last weights
# rather than its best ones. best_epoch stays 0 if no checkpoint was written,
# which also avoids picking up a stale file from a previous run.
if best_epoch > 0:
    print(f"加载第{best_epoch}轮的最佳模型进行最终评估...")
    # map_location keeps the load working when the checkpoint was saved on a
    # different device than the one currently selected.
    checkpoint = torch.load('best_checkpoint.pth', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    best_epoch = checkpoint['epoch']
    best_test_loss = checkpoint['loss']
# ============================================================
# =================== Visualise losses and weights ===================
# Training and test loss per recorded epoch on a single set of axes.
loss_fig, loss_ax = plt.subplots(figsize=(10, 6))
for series, series_label in ((train_losses, 'Train Loss'), (test_losses, 'Test Loss')):
    loss_ax.plot(epochs, series, label=series_label)
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Test Loss over Epochs')
loss_ax.legend()
loss_ax.grid(True)
loss_fig.tight_layout()
plt.show()
# Visualise the weight tensors of the model.
# Collect every parameter whose name contains 'weight' as a NumPy array and
# print its name and shape.
weight_data = {}
for name, param in model.named_parameters():
    if 'weight' not in name:
        continue
    weight_data[name] = param.detach().cpu().numpy()
    print(name, param.shape)

# One histogram per collected tensor, laid out side by side.
fig, axes = plt.subplots(1, len(weight_data), figsize=(15, 5))
fig.suptitle('Weight Distribution of Layers')
for ax, (name, weights) in zip(axes, weight_data.items()):
    ax.hist(weights.flatten(), bins=50, alpha=0.7)
    ax.set_title(name)
    ax.set_xlabel('Weight Value')
    ax.set_ylabel('Frequency')
    ax.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
# Leave room above the panels for the figure-level title.
plt.subplots_adjust(top=0.85)
plt.show()

# Summary statistics (mean, std, min, max) for each collected tensor.
print("\n=== 权重统计信息 ===")
for name, weights in weight_data.items():
    summary = (
        ('均值', np.mean(weights)),
        ('标准差', np.std(weights)),
        ('最小值', np.min(weights)),
        ('最大值', np.max(weights)),
    )
    print(f"{name}:")
    for stat_label, stat_value in summary:
        print(f" {stat_label}: {stat_value:.6f}")
    print("-" * 30)
# =================== Visualise attention weights (feature importance) ===================
# Put the model in evaluation mode before inference, consistent with the final
# evaluation step; the training loop leaves it in training mode.
model.eval()
with torch.no_grad():
    _, attention_weights = model(x_test_tensor)
# Average each feature's attention weight over all test samples.
attention_avg = attention_weights.cpu().numpy().mean(axis=0)
# Names for the enhanced inputs: the original features followed by one
# cluster-mean feature per feature cluster.
enhanced_feature_names = list(feature_names) + [f'Cluster_{i}' for i in range(selected_k)]
plt.figure(figsize=(12, 6))
plt.bar(enhanced_feature_names, attention_avg)
plt.xlabel("Feature Index")
plt.ylabel("Average Attention Weight")
plt.title("Average Attention Weights (Feature Importance)")
plt.grid(True, linestyle='--', alpha=0.6)
# Single tick-label setting. The earlier rotation=90 call was dead code because
# this call immediately overrode it.
plt.xticks(rotation=30, ha='right')
plt.tight_layout()
plt.show()
# Final evaluation on the test set: predict the highest-scoring class for each
# sample, then report accuracy, the confusion matrix and per-class metrics.
model.eval()
with torch.no_grad():
    outputs, _ = model(x_test_tensor)
    predicted = torch.max(outputs, 1).indices
y_true = y_test_tensor.cpu().numpy()
y_pred = predicted.cpu().numpy()

acc = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
report = classification_report(y_true, y_pred)
print(f'测试集准确率 (accuracy_score): {acc * 100:.2f}%')
print(f'混淆矩阵:\n{cm}')
print(f'分类报告:\n{report}')
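# (Web-page residue from the original paste — a comment counter reading "809"
# and a "collapsed comments / why were they collapsed?" widget — was here.
# It is not part of the script and would prevent the file from parsing.)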



