python打卡DAY31-CSDN博客

本文链接：https://blog.csdn.net/Bugabooo/article/details/148242542

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import time

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

from sklearn.impute import SimpleImputer

import torch

import torch.nn as nn

import torch.optim as optim

from torch.utils.data import TensorDataset, DataLoader

# Device selection: use the MPS backend when this PyTorch build exposes it and
# reports it as available; otherwise fall back to the CPU.
# `torch.backends.mps` is looked up defensively: on a PyTorch build that does
# not define that attribute, a direct access would raise AttributeError before
# the CPU fallback below could ever be reached.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
    print("使用MPS加速训练")
else:
    device = torch.device('cpu')
    print("MPS不可用,使用CPU训练")

# To force CPU execution regardless of hardware, override the choice:
# device = torch.device('cpu')

# Load the dataset from the working directory, then separate the label column
# from the predictors.
data = pd.read_csv(r'data.csv')

# Target: the 'Credit Default' column.
y = data['Credit Default']

# Predictors: every column except 'Id' and the target itself.
x = data.drop(columns=['Id', 'Credit Default'])

# --- Column groups and the sub-pipeline for ordinal features ---

# Column names split by dtype: object columns vs. everything else.
object_cols = list(x.select_dtypes(include=['object']).columns)
numeric_cols = list(x.select_dtypes(exclude=['object']).columns)

# 'Years in current job' has a natural order; the category list below spells it
# out from shortest to longest tenure. OrdinalEncoder assigns codes following
# this order, starting at 0.
ordinal_features = ['Years in current job']
ordinal_categories = [
    ['< 1 year', '1 year']
    + [f'{years} years' for years in range(2, 10)]
    + ['10+ years']
]

# Fill missing values with the most frequent category, then encode by rank.
# Categories not present in the list above are encoded as -1.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'encoder',
        OrdinalEncoder(
            categories=ordinal_categories,
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        ),
    ),
])
print("有序特征处理 Pipeline 定义完成。")

# --- Sub-pipeline for nominal (unordered categorical) features ---
nominal_features = ['Home Ownership', 'Purpose', 'Term']

# Fill missing values with the most frequent category, then one-hot encode.
# Categories unseen during fitting are ignored at transform time.
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
print("标称特征处理 Pipeline 定义完成。")

# --- Sub-pipeline for the remaining (continuous) features ---
# Every column of `x` that is neither ordinal nor nominal is treated as
# continuous; missing values are replaced by the column mean.
continuous_cols = list(
    x.columns.difference([*ordinal_features, *nominal_features])
)
continuous_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])
print("连续特征处理 Pipeline 定义完成。")

# --- Assemble the ColumnTransformer ---
# Each entry routes one group of columns through its own sub-pipeline.
column_routes = [
    ('ordinal', ordinal_transformer, ordinal_features),
    ('nominal', nominal_transformer, nominal_features),
    ('continuous', continuous_transformer, continuous_cols),
]

# Columns not listed in any route are passed through unchanged, and output
# feature names are not prefixed with the transformer name.
preprocessor = ColumnTransformer(
    transformers=column_routes,
    remainder='passthrough',
    verbose_feature_names_out=False,
)
print("\nColumnTransformer (预处理器) 定义完成。")

# --- Wrap the preprocessor in a single-step Pipeline ---
pipeline = Pipeline([('processor', preprocessor)])
print("\n完整的 Pipeline 定义完成。")

# Fit the preprocessing pipeline on the raw feature frame and time the run.
# NOTE(review): the imputers/encoders are fitted on the full dataset before the
# train/test split below, so test rows contribute to the fitted statistics —
# confirm this is acceptable for the intended evaluation.
print("\n开始对原始数据进行预处理...")
start_time = time.time()
x_preocessed = pipeline.fit_transform(x)
end_time = time.time()
print(f"预处理完成，耗时: {end_time - start_time:.4f} 秒")

# Rebuild a labelled DataFrame from the transformed matrix using the output
# feature names reported by the fitted ColumnTransformer.
feature_names = preprocessor.get_feature_names_out()
x_preocessed_df = pd.DataFrame(x_preocessed, columns=feature_names)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
x_preocessed_df.info()

# 80/20 shuffled split. No random_state is given, so the split differs from
# run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x_preocessed_df, y, test_size=0.2, shuffle=True
)

# Min-max scale the features. The inputs have different units and ranges, and
# neural networks are sensitive to input scale, so each feature is rescaled.
# The scaler is fitted on the training split only and then applied to both
# splits. (MinMaxScaler is already imported at the top of the file.)
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Convert the train/test splits to PyTorch tensors and move them to `device`.
# Features are stored as float32. Labels are stored as int64 (long), which is
# the dtype nn.CrossEntropyLoss expects for class-index targets; float labels
# would show up as 1.0 / 0.0 and be rejected by the loss.
try:
    print("\n开始转换完整数据集为张量...")

    # Go through NumPy first so DataFrame/Series inputs become plain arrays.
    x_train_np = np.array(x_train)
    y_train_np = np.array(y_train)
    x_test_np = np.array(x_test)
    y_test_np = np.array(y_test)
    print("NumPy数组转换完成")

    # The dtype is fixed while the tensor is created (on the CPU) and only
    # then moved to the target device.
    x_train_tensor = torch.tensor(x_train_np, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train_np, dtype=torch.long).to(device)
    x_test_tensor = torch.tensor(x_test_np, dtype=torch.float32).to(device)
    y_test_tensor = torch.tensor(y_test_np, dtype=torch.long).to(device)
    print("所有数据已加载完毕")
except Exception as e:
    # Everything after this point needs the tensors above. Swallowing the
    # error would only defer the failure to a confusing NameError later, so
    # report it and re-raise; the interpreter prints the full traceback.
    print(f"数据转换过程中出错: {e}")
    raise

class MLP(nn.Module):
    """Fully connected classifier: Linear -> ReLU -> Linear -> ReLU -> Linear.

    The layer sizes are constructor parameters so the network can follow the
    number of features actually produced by preprocessing; the defaults
    reproduce the original fixed 34 -> 50 -> 5 -> 2 architecture, so ``MLP()``
    behaves exactly as before and existing state_dict keys are unchanged.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the first hidden layer.
        bottleneck_dim: Width of the second hidden layer.
        num_classes: Number of output logits (one per class).
    """

    def __init__(
        self,
        input_dim: int = 34,
        hidden_dim: int = 50,
        bottleneck_dim: int = 5,
        num_classes: int = 2,
    ) -> None:
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, bottleneck_dim)
        self.fc3 = nn.Linear(bottleneck_dim, num_classes)

    def forward(self, x):
        """Return raw class logits for a batch ``x`` whose last dim is ``input_dim``."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        # No activation on the output: nn.CrossEntropyLoss applies log-softmax.
        return self.fc3(x)

# Build the network and place its parameters on the selected device.
# MLP subclasses nn.Module, so it provides .to(device); for modules the move
# happens in place.
model = MLP()
model.to(device)
print(f"模型已移动到{device}设备")

# Cross-entropy loss over class logits, optimised with Adam (lr = 0.001).
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# Training length and early-stopping tolerance.
num_epochs = 500
patience = 10  # number of consecutive epochs without test-loss improvement tolerated

# Early-stopping bookkeeping: best loss seen so far, the epoch it occurred in,
# the count of non-improving epochs, and whether training was cut short.
best_test_loss = float('inf')
best_epoch = counter = 0
early_stopped = False

# Per-epoch history used for the loss curves.
train_losses, test_losses, epochs = [], [], []

# Wall-clock reference for the total training time.
start_time = time.time()

# Full-batch training loop with per-epoch evaluation and early stopping.
for epoch in range(num_epochs):
    epoch_number = epoch + 1  # 1-based index used in logs and checkpoints

    # --- one optimisation step over the whole training set ---
    outputs = model(x_train_tensor)
    train_loss = criterion(outputs, y_train_tensor)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # --- evaluate on the test set without tracking gradients ---
    model.eval()
    with torch.no_grad():
        test_outputs = model(x_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor)
    model.train()

    # --- early stopping ---
    current_test_loss = test_loss.item()
    if current_test_loss < best_test_loss:
        # New best: remember it, reset the patience counter, and persist the
        # model and optimizer state.
        best_test_loss = current_test_loss
        best_epoch = epoch_number
        counter = 0
        checkpoint = dict(
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
            epoch=epoch_number,
            loss=current_test_loss,
        )
        torch.save(checkpoint, "best_checkpoint.pth")
    else:
        counter += 1
        if counter >= patience:
            print(f"早停触发！在第{epoch_number}轮，测试集损失已有{patience}轮未改善。")
            print(f"最佳测试集损失出现在第{best_epoch}轮，损失值为{best_test_loss:.4f}")
            early_stopped = True
            break  # leave the training loop; this epoch is not added to the history

    # --- record history for the loss curves ---
    train_losses.append(train_loss.item())
    test_losses.append(current_test_loss)
    epochs.append(epoch_number)

    # Progress report every 100 epochs.
    if epoch_number % 100 == 0:
        print(f"Epoch [{epoch_number}/{num_epochs}], Train Loss: {train_loss.item():.4f}, Test Loss: {current_test_loss:.4f}")

time_all = time.time() - start_time
print(f'Training time: {time_all:.2f} seconds')

# When training stopped early, restore the best checkpoint written during
# training so the final evaluation uses those weights rather than the ones
# from the last epoch that ran.
if early_stopped:
    print(f"加载第{best_epoch}轮的最佳模型进行最终评估...")
    checkpoint = torch.load('best_checkpoint.pth')

    # Restore network weights and optimizer state from the saved dict.
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # Re-read the bookkeeping values stored alongside the weights.
    best_epoch, best_test_loss = checkpoint['epoch'], checkpoint['loss']
    print(f"加载第{best_epoch}轮的最佳模型进行最终评估...")

# Plot the recorded training and test loss against the epoch number.
plt.figure(figsize=(10, 6))
for loss_series, curve_label in (
    (train_losses, 'Train Loss'),
    (test_losses, 'Test Loss'),
):
    plt.plot(epochs, loss_series, label=curve_label)

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Test Loss over Epochs')
plt.legend()  # show which curve is which
plt.grid(True)
plt.show()

# Visualise the weight distribution of each layer.

# Collect every parameter whose name contains 'weight' as a NumPy array,
# detached from the graph and copied to the CPU.
weight_data = {
    name: param.detach().cpu().numpy()
    for name, param in model.named_parameters()
    if 'weight' in name
}

# One histogram per weight tensor, side by side.
fig, axes = plt.subplots(1, len(weight_data), figsize=(15, 5))
fig.suptitle('Weight Distribution of Layers')

# The drawing code had been commented out, which left this figure created and
# titled but empty and never shown; it is restored here.
# plt.subplots returns a bare Axes (not an array) when there is only one
# column, so normalise to a 1-D sequence before pairing axes with tensors.
for ax, (name, weights) in zip(np.atleast_1d(axes), weight_data.items()):
    ax.hist(weights.flatten(), bins=50, alpha=0.7)
    ax.set_title(name)
    ax.set_xlabel('Weight Value')
    ax.set_ylabel('Frequency')
    ax.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.subplots_adjust(top=0.85)  # leave room for the figure-level title
plt.show()

# Print summary statistics (mean, standard deviation, min, max) for every
# collected weight tensor, followed by a separator line.
print("\n=== 权重统计信息 ===")
for name, weights in weight_data.items():
    mean, std = weights.mean(), weights.std()
    min_val, max_val = weights.min(), weights.max()

    print(f"{name}:")
    print(f" 均值: {mean:.6f}")
    print(f" 标准差: {std:.6f}")
    print(f" 最小值: {min_val:.6f}")
    print(f" 最大值: {max_val:.6f}")
    print("-" * 30)

# Final inference on the test set: switch to evaluation mode, run a forward
# pass without gradient tracking, and take the index of the largest logit in
# each row as the predicted class.
model.eval()
with torch.no_grad():
    outputs = model(x_test_tensor)
    _, predicted = outputs.max(dim=1)

# Bring labels and predictions back to the CPU as NumPy arrays.
y_pred = predicted.cpu().numpy()
y_true = y_test_tensor.cpu().numpy()

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report