from MLP_model import *
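# NOTE: the wildcard import is assumed to re-export everything used below
# (pd, np, plt, time, joblib, torch, nn, optim, plus the sklearn pieces:
# Pipeline, ColumnTransformer, SimpleImputer, OrdinalEncoder, OneHotEncoder,
# MinMaxScaler, KMeans, the clustering metrics, train_test_split,
# compute_class_weight, accuracy_score, confusion_matrix, classification_report).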
# Configure matplotlib: CJK-capable font & correct minus-sign rendering
plt.rcParams['font.sans-serif']=['STHeiti']
plt.rcParams['axes.unicode_minus']=False  # must be False so minus signs render correctly with a CJK font
plt.rcParams['figure.dpi']=100
torch.manual_seed(42)
## GPU (Apple Silicon MPS backend)
if torch.backends.mps.is_available():
    device=torch.device('mps')
    print("Training with MPS acceleration")
else:
    device=torch.device('cpu')
    print("MPS unavailable, training on CPU")
# ## CPU-only fallback (uncomment to force CPU)
# device=torch.device('cpu')
# =================== Data preprocessing =====================
data=pd.read_csv(r'data.csv')
x=data.drop(['Id','Credit Default'],axis=1)
y=data['Credit Default']
## Pipeline definitions & processing steps
object_cols=x.select_dtypes(include=['object']).columns.tolist()
numeric_cols=x.select_dtypes(exclude=['object']).columns.tolist()
print(f'object_cols:{object_cols}\nnumeric_cols:{numeric_cols}')
ordinal_features=['Years in current job']
ordinal_categories=[['< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
'6 years', '7 years', '8 years', '9 years', '10+ years']]
ordinal_transformer=Pipeline(steps=[
('imputer',SimpleImputer(strategy='most_frequent')),
('encoder',OrdinalEncoder(categories=ordinal_categories,handle_unknown='use_encoded_value',unknown_value=-1))
])
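# For illustration: with the category list above the encoder maps
# '< 1 year' -> 0.0 and '10+ years' -> 10.0, and any category unseen at
# fit time is encoded as -1 (handle_unknown='use_encoded_value').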
print("Ordinal feature Pipeline defined.")
nominal_features=['Home Ownership', 'Purpose', 'Term']
nominal_transformer=Pipeline(steps=[
('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore',sparse_output=False))  # dense output so the result can be wrapped in a DataFrame below (sklearn >= 1.2; older versions use sparse=False)
])
print("Nominal feature Pipeline defined.")
continuous_cols=x.columns.difference(ordinal_features+nominal_features).tolist()
continuous_transformer=Pipeline(steps=[
('imputer',SimpleImputer(strategy='mean'))
])
print("Continuous feature Pipeline defined.")
# --- Build the ColumnTransformer ---
preprocessor=ColumnTransformer(
transformers=[
('ordinal',ordinal_transformer,ordinal_features),
('nominal',nominal_transformer,nominal_features),
('continuous',continuous_transformer,continuous_cols)
],remainder='passthrough',verbose_feature_names_out=False
)
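# The transformed columns follow the declared transformer order: the ordinal
# feature first, then the one-hot columns, then the continuous columns;
# remainder='passthrough' appends any column not covered above.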
print("\nColumnTransformer (preprocessor) defined")
# Build the full pipeline
pipeline=Pipeline(steps=[
('preprocessor',preprocessor)
])
print("\nFull Pipeline defined.")
print("\nPreprocessing the raw data...")
start_time = time.time()
x_processed=pipeline.fit_transform(x)
end_time = time.time()
print(f"Preprocessing finished in {end_time - start_time:.4f} s")
feature_names=preprocessor.get_feature_names_out()
x_processed_df=pd.DataFrame(x_processed,columns=feature_names)
print(x_processed_df.info())
# =================== Clustering analysis & label generation =====================
x_scaled=MinMaxScaler().fit_transform(x_processed_df)
k_range=range(2,50)
inertia_values=[]
silhouette_scores=[]
ch_scores=[]
db_scores=[]
start_time=time.time()
for k in k_range:
kmeans=KMeans(n_clusters=k,random_state=42,max_iter=500)
kmeans_label=kmeans.fit_predict(x_scaled)
inertia_values.append(kmeans.inertia_)
silhouette_scores.append(silhouette_score(x_scaled,kmeans_label))
ch_scores.append(calinski_harabasz_score(x_scaled,kmeans_label))
db_scores.append(davies_bouldin_score(x_scaled,kmeans_label))
end_time = time.time()
print(f'Clustering sweep took {end_time-start_time:.4f} s')
# # Visualize the clustering metrics
# plt.figure(figsize=(12,6))
# plt.subplot(2,2,1)
# plt.plot(k_range,inertia_values,marker='o')
# plt.title('Elbow method (lower inertia is better)')
# plt.xlabel('k');plt.ylabel('inertia');plt.grid(True)
# plt.subplot(2,2,2)
# plt.plot(k_range, silhouette_scores, marker='o', color='orange')
# plt.title('Silhouette score (higher is better)')
# plt.xlabel('k'); plt.ylabel('Silhouette'); plt.grid(True)
# plt.subplot(2,2,3)
# plt.plot(k_range, ch_scores, marker='o', color='yellow')
# plt.title('Calinski-Harabasz index (higher is better)')
# plt.xlabel('k'); plt.ylabel('CH Score'); plt.grid(True)
# plt.subplot(2,2,4)
# plt.plot(k_range, db_scores, marker='o', color='red')
# plt.title('Davies-Bouldin index (lower is better)')
# plt.xlabel('k'); plt.ylabel('DB Score'); plt.grid(True)
# plt.tight_layout()
# plt.show()
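# A hedged programmatic aid alongside the plots: pick the k with the highest
# silhouette score (illustrative only; selected_k below was chosen by
# inspecting all four curves together, and this does not override it).
best_k_by_silhouette=k_range[int(np.argmax(silhouette_scores))]
print(f'k with the highest silhouette score: {best_k_by_silhouette}')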
# Final clustering and label assignment
selected_k=12
kmeans=KMeans(n_clusters=selected_k,random_state=42)
kmeans_label=kmeans.fit_predict(x_scaled)
x_with_cluster=pd.DataFrame(x_scaled,columns=feature_names)
x_with_cluster['KMeans_Cluster']=kmeans_label
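# Note: x_with_cluster (sample-level cluster labels) is kept for inspection
# only; the supervised pipeline below re-splits from x_processed_df.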
# =================== Feature-column clustering augmentation =====================
x_train,x_test,y_train,y_test=train_test_split(x_processed_df,y,test_size=0.2,shuffle=True,random_state=42,stratify=y)  # random_state for reproducibility; stratify preserves the class ratio
# Normalize the data: neural networks are sensitive to input scale, and the
# features have different units, so min-max scaling reduces the impact of scale.
scaler=MinMaxScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)  # transform only, using training-set statistics, to avoid leakage
x_train_T=x_train.T
kmeans_feature=KMeans(n_clusters=selected_k,random_state=42)
cluster_labels=kmeans_feature.fit_predict(x_train_T)
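# Transposing x_train makes each *feature* a KMeans sample, so cluster_labels
# has length n_features and groups similar columns together.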
# Build cluster-mean augmented features
def build_cluster_features(x_data,cluster_labels,k):
cluster_features=[]
for i in range(k):
idx=np.where(cluster_labels==i)[0]
group_feature=x_data[:,idx].mean(axis=1,keepdims=True)
cluster_features.append(group_feature)
return np.hstack(cluster_features)
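# Each cluster contributes one averaged column, so the output shape is
# (n_samples, k); e.g. for 7 features labelled [0,0,1,2,2,1,0] with k=3,
# the three columns average features {0,1,6}, {2,5} and {3,4} respectively.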
x_train_clustered=build_cluster_features(x_train,cluster_labels,selected_k)
x_test_clustered=build_cluster_features(x_test,cluster_labels,selected_k)
x_train_enhanced=np.hstack([x_train,x_train_clustered])
x_test_enhanced=np.hstack([x_test,x_test_clustered])
# Save the feature-clustering model for deployment
joblib.dump(kmeans_feature,'kmeans_feature_cluster.pkl')
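# At inference time the fitted clusterer can be restored with
# joblib.load('kmeans_feature_cluster.pkl') and its labels_ reused with
# build_cluster_features to reproduce the augmented columns.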
# =================== Build the Attention model =====================
# # Convert the data to PyTorch tensors and move them to the chosen device.
# # Cross-entropy loss for classification requires labels of type long.
# # Tensors expose .to(device) for moving them onto a given device.
# # PyTorch trains on tensors, so the data must be converted first.
# # y_train and y_test hold integers, so they are cast to long; left as
# # float32 they would come through as 1.0/0.0 and break CrossEntropyLoss.
try:
    print("\nConverting the full dataset to tensors...")
    # Convert to NumPy arrays first, then to tensors
x_train_np=np.array(x_train_enhanced)
x_test_np=np.array(x_test_enhanced)
y_train_np=np.array(y_train)
y_test_np=np.array(y_test)
x_train_tensor=torch.FloatTensor(x_train_np).to(device)
x_test_tensor=torch.FloatTensor(x_test_np).to(device)
y_train_tensor=torch.LongTensor(y_train_np).to(device)
y_test_tensor=torch.LongTensor(y_test_np).to(device)
    print("All data loaded")
except Exception as e:
    print(f"Error during data conversion: {e}")
import traceback
traceback.print_exc()
classes=np.unique(y_train_np)
class_weights=compute_class_weight('balanced',classes=classes,y=y_train_np)
class_weights_tensor=torch.FloatTensor(class_weights).to(device)
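# 'balanced' weights follow n_samples / (n_classes * bincount(y)); e.g. with
# 7000 negatives and 3000 positives that gives roughly [0.71, 1.67], so the
# minority class weighs more in the loss. (These counts are illustrative.)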
class AttentionMLP(nn.Module):
def __init__(self, input_dim,hidden_dim=16,num_classes=2):
super().__init__()
self.attention=nn.Sequential(
nn.Linear(input_dim,input_dim),
nn.Softmax(dim=1)
)
self.fc1=nn.Linear(input_dim,hidden_dim)
self.relu=nn.ReLU()
self.fc2=nn.Linear(hidden_dim,hidden_dim//2)
self.fc3=nn.Linear(hidden_dim//2,num_classes)
def forward(self,x):
attention_weights=self.attention(x)
x_weighted=x*attention_weights
x=self.relu(self.fc1(x_weighted))
x=self.relu(self.fc2(x))
x=self.fc3(x)
return x,attention_weights
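# A minimal shape check (hypothetical smoke test, not part of training):
# logits, attn = AttentionMLP(input_dim=5)(torch.randn(4, 5))
# -> logits: (4, 2); attn: (4, 5), each row summing to 1 via Softmax(dim=1).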
input_dim=x_train_tensor.shape[1]
model=AttentionMLP(input_dim).to(device)
print(f"Model moved to {device}")
criterion=nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer=optim.Adam(model.parameters(),lr=0.001,weight_decay=1e-4)
num_epochs=3000
# Early-stopping bookkeeping
best_test_loss=float('inf')
best_epoch=0
counter=0
patience=50
early_stopped=False
train_losses=[]
test_losses=[]
epochs=[]
start_time=time.time()
for epoch in range(num_epochs):
outputs,_=model(x_train_tensor)
train_loss=criterion(outputs,y_train_tensor)
optimizer.zero_grad()
train_loss.backward()
optimizer.step()
model.eval()
with torch.no_grad():
test_outputs,_=model(x_test_tensor)
test_loss=criterion(test_outputs,y_test_tensor)
model.train()
    # ============= early stopping ==========
if test_loss.item()<best_test_loss:
best_test_loss=test_loss.item()
best_epoch=epoch+1
counter=0
checkpoint={
'model_state_dict':model.state_dict(),
'optimizer_state_dict':optimizer.state_dict(),
'epoch':epoch+1,
'loss':test_loss.item()
}
        torch.save(checkpoint,'best_checkpoint.pth')  # .pth, the conventional PyTorch extension
else:
counter+=1
if counter>patience:
            print(f"Early stopping triggered at epoch {epoch+1}: test loss has not improved for {patience} epochs.")
            print(f"Best test loss {best_test_loss:.4f} occurred at epoch {best_epoch}")
early_stopped=True
break
#===========================================
train_losses.append(train_loss.item())
test_losses.append(test_loss.item())
epochs.append(epoch+1)
if (epoch+1) % 100 == 0:
print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss.item():.4f}, Test Loss: {test_loss.item():.4f}")
time_all = time.time() - start_time
print(f'Training time: {time_all:.2f} seconds')
# ===== Load the best checkpoint for final evaluation =====
if early_stopped:
    print(f"Loading the best model from epoch {best_epoch} for final evaluation...")
    checkpoint=torch.load('best_checkpoint.pth',map_location=device)  # map_location keeps device placement consistent
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    best_epoch=checkpoint['epoch']
    best_test_loss=checkpoint['loss']
# # =================== Visualize loss and weights =====================
# plt.figure(figsize=(10, 6))
# plt.plot(epochs, train_losses, label='Train Loss')
# plt.plot(epochs, test_losses, label='Test Loss')
# plt.xlabel('Epoch');plt.ylabel('Loss')
# plt.title('Training and Test Loss over Epochs')
# plt.legend();plt.grid(True)
# plt.tight_layout()
# plt.show()
# Visualize the weight distributions per layer
weight_data={}
for name,param in model.named_parameters():
if 'weight' in name:
weight_data[name]=param.detach().cpu().numpy()
print(name,param.shape)
fig,axes=plt.subplots(1,len(weight_data),figsize=(15,5))
fig.suptitle('Weight Distribution of Layers')
for i,(name,weights) in enumerate(weight_data.items()):
weights_flat=weights.flatten()
axes[i].hist(weights_flat,bins=50,alpha=0.7)
axes[i].set_title(name)
axes[i].set_xlabel('Weight Value')
axes[i].set_ylabel('Frequency')
axes[i].grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.subplots_adjust(top=0.85)
plt.show()
print("\n=== Weight statistics ===")
for name,weights in weight_data.items():
mean=np.mean(weights)
std=np.std(weights)
min_val=np.min(weights)
max_val=np.max(weights)
print(f"{name}:")
    print(f"  mean: {mean:.6f}")
    print(f"  std: {std:.6f}")
    print(f"  min: {min_val:.6f}")
    print(f"  max: {max_val:.6f}")
print("-" * 30)
# =================== Visualize attention weights (feature importance) =====================
with torch.no_grad():
_,attention_weights=model(x_test_tensor)
attention_avg=attention_weights.cpu().numpy().mean(axis=0)
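# Averaging the softmax attention over the test samples yields one weight per
# input feature, a rough global feature-importance proxy.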
# Build the augmented feature names (original features + cluster features)
enhanced_feature_names=list(feature_names)+[f'Cluster_{i}' for i in range(selected_k)]
plt.figure(figsize=(12, 6))
plt.bar(enhanced_feature_names, attention_avg)
plt.xticks(rotation=45,ha='right')
plt.xlabel("Feature")  # bars are labelled by feature name, not index
plt.ylabel("Average Attention Weight")
plt.title("Average Attention Weights (Feature Importance)")
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
model.eval()
with torch.no_grad():
outputs,_=model(x_test_tensor)
_,predicted=torch.max(outputs,1)
y_true=y_test_tensor.cpu().numpy()
y_pred=predicted.cpu().numpy()
acc=accuracy_score(y_true,y_pred)
print(f'Test set accuracy (accuracy_score): {acc * 100:.2f}%')
cm=confusion_matrix(y_true,y_pred)
print(f'Confusion matrix:\n{cm}')
print(f'Classification report:\n{classification_report(y_true,y_pred)}')