import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
# Select the compute device: prefer Apple-Silicon MPS, then CUDA, and fall
# back to the CPU when no accelerator is available.
# getattr() guards PyTorch builds that do not ship the torch.backends.mps module.
_mps_backend = getattr(torch.backends, 'mps', None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
    print("使用MPS加速训练")
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print("使用CUDA加速训练")
else:
    device = torch.device('cpu')
    print("MPS不可用,使用CPU训练")
# To force CPU training, override the selection by uncommenting:
# device = torch.device('cpu')
# Load the raw dataset and separate the target column from the predictors.
data = pd.read_csv('data.csv')
y = data['Credit Default']
# 'Id' is dropped together with the target, so it is not used as a feature.
x = data.drop(columns=['Id', 'Credit Default'])
# ---- Preprocessing pipeline definitions ----
# Column names grouped by dtype. They are not used further in this script;
# the transformers below select their columns explicitly.
object_cols = x.select_dtypes(include=['object']).columns.tolist()
numeric_cols = x.select_dtypes(exclude=['object']).columns.tolist()

# Ordinal feature: fill missing values with the most frequent one, then encode
# with the explicit category order below. OrdinalEncoder maps the listed
# categories to 0..10 in order; any value outside the list is encoded as -1.
ordinal_features = ['Years in current job']
ordinal_categories = [[
    '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
    '6 years', '7 years', '8 years', '9 years', '10+ years',
]]
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(
        categories=ordinal_categories,
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    )),
])
print("有序特征处理 Pipeline 定义完成。")

# Nominal features: most-frequent imputation followed by one-hot encoding.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
nominal_features = ['Home Ownership', 'Purpose', 'Term']
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
print("标称特征处理 Pipeline 定义完成。")

# Every remaining column is treated as continuous and mean-imputed.
continuous_cols = x.columns.difference(ordinal_features + nominal_features).tolist()
continuous_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])
print("连续特征处理 Pipeline 定义完成。")

# --- Build the ColumnTransformer ---
# sparse_threshold=0 forces a dense result. With the default (0.3) the stacked
# output becomes a scipy sparse matrix whenever the overall density drops
# below the threshold (the one-hot block is mostly zeros), and the
# pd.DataFrame(..., columns=feature_names) call below would then fail.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_transformer, ordinal_features),
        ('nominal', nominal_transformer, nominal_features),
        ('continuous', continuous_transformer, continuous_cols),
    ],
    remainder='passthrough',
    sparse_threshold=0,
    verbose_feature_names_out=False,
)
print("\nColumnTransformer (预处理器) 定义完成。")

# Wrap the preprocessor in a Pipeline (currently its only step).
pipeline = Pipeline(steps=[
    ('processor', preprocessor),
])
print("\n完整的 Pipeline 定义完成。")

# NOTE(review): the pipeline is fitted on the full dataset, i.e. before the
# train/test split, so imputation statistics and category sets also see the
# rows that later become the test set. Consider fitting on the training split
# only.
print("\n开始对原始数据进行预处理...")
start_time = time.time()
x_preocessed = pipeline.fit_transform(x)
end_time = time.time()
print(f"预处理完成,耗时: {end_time - start_time:.4f} 秒")

# Rebuild a DataFrame so the transformed columns keep readable names.
feature_names = preprocessor.get_feature_names_out()
x_preocessed_df = pd.DataFrame(x_preocessed, columns=feature_names)
print(x_preocessed_df.info())
# Hold out 20% of the rows as the test set. A fixed random_state makes the
# split (and every metric computed from it) reproducible between runs, and
# stratifying on the label keeps the class ratio equal in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x_preocessed_df,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=y,
)

# Min-max scale the features: the columns have different units and neural
# networks are sensitive to input scale. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
# Convert the train/test splits to PyTorch tensors and move them to `device`.
# Features are stored as float32; labels are stored as int64 (long) because
# the labels are integer class indices used with a cross-entropy loss.
try:
    print("\n开始转换完整数据集为张量...")
    # Go through NumPy first so pandas objects and arrays are handled alike.
    x_train_np = np.asarray(x_train)
    y_train_np = np.asarray(y_train)
    x_test_np = np.asarray(x_test)
    y_test_np = np.asarray(y_test)
    print("NumPy数组转换完成")

    # torch.tensor copies the data and applies the requested dtype on the
    # host; the result is then transferred to the target device.
    x_train_tensor = torch.tensor(x_train_np, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train_np, dtype=torch.long).to(device)
    x_test_tensor = torch.tensor(x_test_np, dtype=torch.float32).to(device)
    y_test_tensor = torch.tensor(y_test_np, dtype=torch.long).to(device)
    print("所有数据已加载完毕")
except Exception as e:
    print(f"数据转换过程中出错: {e}")
    # Re-raise: the rest of the script cannot run without these tensors, and
    # continuing would only fail later with a less helpful NameError.
    raise
class MLP(nn.Module):
    """Fully connected classifier with two ReLU hidden layers.

    Args:
        input_dim: Number of input features. Defaults to 34.
        hidden_dims: Pair ``(first, second)`` with the widths of the two
            hidden layers. Defaults to ``(50, 5)``.
        num_classes: Number of output logits. Defaults to 2.

    The layers are exposed as ``fc1``, ``fc2`` and ``fc3``, so state-dict keys
    are the same whatever sizes are chosen.
    """

    def __init__(self, input_dim: int = 34, hidden_dims=(50, 5), num_classes: int = 2):
        super().__init__()
        first_hidden, second_hidden = hidden_dims
        self.fc1 = nn.Linear(input_dim, first_hidden)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.fc3 = nn.Linear(second_hidden, num_classes)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for input batch ``x``."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        # No activation on the last layer: the logits are returned as-is.
        return self.fc3(x)
# Build the network and move its parameters to the selected device
# (MLP subclasses nn.Module, which provides the .to(device) method).
model = MLP().to(device)
print(f"模型已移动到{device}设备")

# Loss function, optimizer and the upper bound on training epochs.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 500

# Early-stopping bookkeeping.
patience = 10  # number of epochs without test-loss improvement that is tolerated
best_test_loss = float('inf')
best_epoch = 0
counter = 0
early_stopped = False

# Per-epoch history: training losses, test losses and the epoch numbers.
train_losses, test_losses, epochs = [], [], []

start_time = time.time()
# Full-batch training with early stopping driven by the test-set loss.
# NOTE(review): the same test tensors are used both for early stopping here
# and for the final evaluation, so the final metrics are optimistic; a
# separate validation split would avoid that.
for epoch in range(num_epochs):
    # --- one optimisation step over the whole training set ---
    outputs = model(x_train_tensor)
    train_loss = criterion(outputs, y_train_tensor)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # --- evaluate on the test set without tracking gradients ---
    model.eval()
    with torch.no_grad():
        test_outputs = model(x_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor)
    model.train()

    # Read each scalar once instead of calling .item() repeatedly.
    train_loss_value = train_loss.item()
    test_loss_value = test_loss.item()

    # Record the history before the early-stopping check so that the epoch
    # which triggers the stop is still part of the loss curves.
    train_losses.append(train_loss_value)
    test_losses.append(test_loss_value)
    epochs.append(epoch + 1)

    # ===== Early stopping =====
    if test_loss_value < best_test_loss:
        # New best test loss: remember it, reset the patience counter and
        # save a checkpoint of the current model/optimizer state.
        best_test_loss = test_loss_value
        best_epoch = epoch + 1
        counter = 0
        checkpoint = {
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "epoch": epoch + 1,
            "loss": test_loss_value,
        }
        torch.save(checkpoint, "best_checkpoint.pth")
    else:
        counter += 1
        if counter >= patience:
            print(f"早停触发!在第{epoch+1}轮,测试集损失已有{patience}轮未改善。")
            print(f"最佳测试集损失出现在第{best_epoch}轮,损失值为{best_test_loss:.4f}")
            early_stopped = True
            break  # leave the training loop
    # ==========================

    if (epoch + 1) % 100 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss_value:.4f}, Test Loss: {test_loss_value:.4f}")

time_all = time.time() - start_time
print(f'Training time: {time_all:.2f} seconds')
# ===== Restore the best checkpoint for the final evaluation =====
# best_epoch > 0 means at least one checkpoint was written during this run.
# Restoring on that condition (instead of only after an early stop) also
# covers runs that used every epoch, where the last weights are not
# necessarily the best, and it never loads a stale or missing file when no
# epoch improved on the initial loss.
if best_epoch > 0:
    print(f"加载第{best_epoch}轮的最佳模型进行最终评估...")
    # map_location keeps the restored tensors on the device in use.
    checkpoint = torch.load('best_checkpoint.pth', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    best_epoch = checkpoint['epoch']
    best_test_loss = checkpoint['loss']
# ================================================================
# Plot the recorded training and test losses against the epoch number.
loss_fig, loss_ax = plt.subplots(figsize=(10, 6))
loss_ax.plot(epochs, train_losses, label='Train Loss')
loss_ax.plot(epochs, test_losses, label='Test Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Test Loss over Epochs')
loss_ax.legend()  # show which curve is which
loss_ax.grid(True)
plt.show()
# ---- Weight inspection ----
# Collect every parameter whose name contains 'weight' (biases are skipped)
# as a NumPy array on the CPU.
weight_data = {
    name: param.detach().cpu().numpy()
    for name, param in model.named_parameters()
    if 'weight' in name
}

# Histograms are disabled by default, so no figure is created unless it is
# actually drawn and shown. Set the flag to True to plot one histogram per
# weight tensor.
PLOT_WEIGHT_HISTOGRAMS = False
if PLOT_WEIGHT_HISTOGRAMS and weight_data:
    # squeeze=False keeps `axes` two-dimensional even for a single layer.
    fig, axes = plt.subplots(1, len(weight_data), figsize=(15, 5), squeeze=False)
    fig.suptitle('Weight Distribution of Layers')
    for ax, (name, weights) in zip(axes[0], weight_data.items()):
        ax.hist(weights.flatten(), bins=50, alpha=0.7)
        ax.set_title(name)
        ax.set_xlabel('Weight Value')
        ax.set_ylabel('Frequency')
        ax.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.subplots_adjust(top=0.85)  # leave room for the figure title
    plt.show()

# Print summary statistics for each collected weight tensor.
print("\n=== 权重统计信息 ===")
for name, weights in weight_data.items():
    mean = np.mean(weights)
    std = np.std(weights)
    min_val = np.min(weights)
    max_val = np.max(weights)
    print(f"{name}:")
    print(f" 均值: {mean:.6f}")
    print(f" 标准差: {std:.6f}")
    print(f" 最小值: {min_val:.6f}")
    print(f" 最大值: {max_val:.6f}")
    print("-" * 30)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Final evaluation on the test set: take the class with the largest logit as
# the prediction and compare it with the true labels.
model.eval()
with torch.no_grad():
    outputs = model(x_test_tensor)
    predicted = torch.max(outputs, dim=1).indices

# scikit-learn metrics work on NumPy arrays, so bring both tensors to the CPU.
y_pred = predicted.cpu().numpy()
y_true = y_test_tensor.cpu().numpy()

acc = accuracy_score(y_true, y_pred)
print(f'测试集准确率 (accuracy_score): {acc * 100:.2f}%')

cm = confusion_matrix(y_true, y_pred)
print("混淆矩阵:")
print(cm)

# Print the per-class classification report.
print("\n分类报告:")
print(classification_report(y_true, y_pred))