import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
# Device selection: use the MPS backend when PyTorch reports it as available,
# otherwise fall back to the CPU.
use_mps = torch.backends.mps.is_available()
device = torch.device('mps' if use_mps else 'cpu')
print("使用MPS加速训练" if use_mps else "MPS不可用,使用CPU训练")
# To force CPU-only training, replace the selection above with:
# device = torch.device('cpu')
# Load the raw table, then separate the target column from the predictors.
# 'Id' is dropped from the predictors together with the target itself.
data = pd.read_csv(r'data.csv')
y = data['Credit Default']
x = data.drop(columns=['Id', 'Credit Default'])
# Preprocessing definitions: one sub-pipeline per kind of column, combined
# in a ColumnTransformer and wrapped in a single Pipeline.

# Column names grouped by dtype (object-typed vs. everything else).
object_cols = x.select_dtypes(include=['object']).columns.tolist()
numeric_cols = x.select_dtypes(exclude=['object']).columns.tolist()

# Ordered categorical feature. The category list runs from the shortest to
# the longest tenure; values outside the list are encoded as -1.
ordinal_features = ['Years in current job']
ordinal_categories = [[
    '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
    '6 years', '7 years', '8 years', '9 years', '10+ years',
]]
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(
        categories=ordinal_categories,
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    )),
])
print("有序特征处理 Pipeline 定义完成。")

# Unordered categorical features: fill gaps with the mode, then one-hot
# encode; categories unseen during fit are ignored at transform time.
nominal_features = ['Home Ownership', 'Purpose', 'Term']
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
print("标称特征处理 Pipeline 定义完成。")

# Every column of x that is neither ordinal nor nominal is treated as
# continuous and only has its missing values filled with the mode.
categorical_features = ordinal_features + nominal_features
continuous_cols = x.columns.difference(categorical_features).tolist()
continuous_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])
print("连续特征处理 Pipeline 定义完成。")

# --- Build the ColumnTransformer ---
# The three groups together cover every column of x, so the passthrough
# remainder is expected to be empty.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_transformer, ordinal_features),
        ('nominal', nominal_transformer, nominal_features),
        ('continuous', continuous_transformer, continuous_cols),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
print("\nColumnTransformer (预处理器) 定义完成。")

# Full pipeline (currently the preprocessor is its only step).
# NOTE(review): the step name 'preprecessor' looks like a typo of
# 'preprocessor'; it is kept unchanged because it is a runtime key.
pipeline = Pipeline([
    ('preprecessor', preprocessor),
])
print("\n完整的 Pipeline 定义完成。")
# Run the preprocessing pipeline on the raw predictors and time it.
# NOTE(review): the pipeline is fitted on the full dataset before the
# train/test split below, so the imputation statistics and the one-hot
# categories are also learned from rows that end up in the test split.
print("\n开始对原始数据进行预处理...")
start_time = time.time()
x_processed = pipeline.fit_transform(x)
end_time = time.time()
print(f"预处理完成,耗时: {end_time - start_time:.4f} 秒")

# Rebuild a DataFrame so the transformed columns keep readable names.
feature_names = preprocessor.get_feature_names_out()
x_processed_df = pd.DataFrame(x_processed, columns=feature_names)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
x_processed_df.info()

# Hold out 20% of the rows (shuffled) as the test split.
x_train, x_test, y_train, y_test = train_test_split(
    x_processed_df, y, test_size=0.2, shuffle=True
)
# Normalise the features. The network is sensitive to the scale of its
# inputs and the columns use different units, so each one is min-max scaled.
# The scaler is fitted on the training split only and then applied to the
# test split. (MinMaxScaler is already imported at the top of the file.)
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Convert the data to PyTorch tensors and move them to the selected device.
# CrossEntropyLoss expects integer class labels, hence LongTensor for y;
# the features are stored as 32-bit floats.
try:
    print("\n开始转换完整数据集为张量...")
    # Go through NumPy arrays first, then build the tensors from them.
    x_train_np = np.array(x_train)
    x_test_np = np.array(x_test)
    y_train_np = np.array(y_train)
    y_test_np = np.array(y_test)
    x_train_tensor = torch.FloatTensor(x_train_np).to(device)
    x_test_tensor = torch.FloatTensor(x_test_np).to(device)
    y_train_tensor = torch.LongTensor(y_train_np).to(device)
    y_test_tensor = torch.LongTensor(y_test_np).to(device)
    print("所有数据已加载完毕")
except Exception as e:
    print(f"数据转换过程中出错: {e}")
    # Re-raise instead of continuing: the rest of the script needs these
    # tensors, and swallowing the error only postpones the failure to a
    # less informative NameError. The interpreter prints the traceback.
    raise
class MLP(nn.Module):
    """Fully connected classifier with two ReLU hidden layers.

    The layer sizes are configurable; the defaults reproduce the original
    fixed architecture (34 -> 8 -> 5 -> 2), so ``MLP()`` builds the same
    network and produces the same ``state_dict`` keys (fc1, fc2, fc3).

    Args:
        input_dim: Number of input features per sample.
        hidden_dim1: Width of the first hidden layer.
        hidden_dim2: Width of the second hidden layer.
        num_classes: Number of output logits (one per class).
    """

    def __init__(self, input_dim: int = 34, hidden_dim1: int = 8,
                 hidden_dim2: int = 5, num_classes: int = 2) -> None:
        super().__init__()
        # Sub-modules are registered in the same order as before so that
        # parameter ordering and initialisation order are unchanged.
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw class logits for ``x`` (last dimension = input_dim)."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        # No activation on the output layer: the logits are returned as-is.
        return self.fc3(x)
# Instantiate the network and move it to the selected device (MLP inherits
# .to(device) from nn.Module).
model = MLP().to(device)
print(f"模型已移动到{device}设备")

# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001, weight_decay=1e-4)
num_epochs = 5000

# Early-stopping state: the best test loss seen so far, the epoch it occurred
# in, and the number of consecutive epochs without improvement.
best_test_loss, best_epoch = float('inf'), 0
counter, patience = 0, 50
early_stopped = False

# Per-epoch loss history.
train_losses, test_losses, epochs = [], [], []
# Full-batch training with early stopping driven by the test-set loss.
# NOTE(review): the test split is used both to pick the best checkpoint and
# for the final metrics, so those metrics are likely optimistic.
start_time = time.time()
for epoch in range(num_epochs):
    # One optimisation step over the whole training set.
    outputs = model(x_train_tensor)
    train_loss = criterion(outputs, y_train_tensor)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # Test loss with the updated weights; no gradients are tracked.
    model.eval()
    with torch.no_grad():
        test_outputs = model(x_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor)
    model.train()

    # Read the scalar once and reuse it for the checks below.
    current_test_loss = test_loss.item()

    # ===== Early stopping =====
    if current_test_loss < best_test_loss:
        # New best: reset the patience counter and persist a checkpoint.
        best_test_loss = current_test_loss
        best_epoch = epoch + 1
        counter = 0
        checkpoint = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'epoch': best_epoch,
            'loss': current_test_loss,
        }
        torch.save(checkpoint, 'best_checkpoint.pth')
    else:
        counter += 1
        if counter >= patience:
            print(f"早停触发!在第{epoch+1}轮,测试集损失已有{patience}轮未改善。")
            print(f"最佳测试集损失出现在第{best_epoch}轮,损失值为{best_test_loss:.4f}")
            early_stopped = True
            break
    # ==========================

    # Record the history (skipped for the epoch that triggers early stopping).
    train_losses.append(train_loss.item())
    test_losses.append(current_test_loss)
    epochs.append(epoch + 1)

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss.item():.4f}, Test Loss: {test_loss.item():.4f}")

time_all = time.time() - start_time
print(f'Training time: {time_all:.2f} seconds')
# ===== Restore the best checkpoint for the final evaluation =====
# The weights left in `model` after training are those of the last epoch,
# which may be worse than the best epoch whether or not early stopping
# fired. best_epoch stays 0 until the training loop has saved a checkpoint,
# so it doubles as an "a checkpoint from this run exists" flag.
if best_epoch > 0:
    print(f"加载第{best_epoch}轮的最佳模型进行最终评估...")
    # map_location keeps the restored tensors on the device in use.
    checkpoint = torch.load('best_checkpoint.pth', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    best_epoch = checkpoint['epoch']
    best_test_loss = checkpoint['loss']
# ================================
# # 可视化损失曲线
# plt.figure(figsize=(10, 6))
# plt.plot(epochs, train_losses, label='Train Loss') # 原始代码已有
# plt.plot(epochs, test_losses, label='Test Loss') # 新增:测试集损失曲线
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.title('Training and Test Loss over Epochs')
# plt.legend() # 新增:显示图例
# plt.grid(True)
# plt.show()
# Visualise the weight matrices of the model.
# Collect every parameter whose name contains 'weight' as a NumPy array.
weight_data = {}
for name, param in model.named_parameters():
    if 'weight' not in name:
        continue
    weight_data[name] = param.detach().cpu().numpy()
    print(name, param.shape)

# One histogram per collected weight matrix, side by side.
fig, axes = plt.subplots(1, len(weight_data), figsize=(15, 5))
fig.suptitle('Weight Distribution of Layers')
for ax, (name, weights) in zip(axes, weight_data.items()):
    ax.hist(weights.flatten(), bins=50, alpha=0.7)
    ax.set_title(name)
    ax.set_xlabel('Weight Value')
    ax.set_ylabel('Frequency')
    ax.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
# Leave room at the top for the figure-level title.
plt.subplots_adjust(top=0.85)
plt.show()

# Summary statistics for each collected weight matrix.
print("\n=== 权重统计信息 ===")
for name, weights in weight_data.items():
    mean, std = weights.mean(), weights.std()
    min_val, max_val = weights.min(), weights.max()
    print(f"{name}:")
    print(f" 均值: {mean:.6f}")
    print(f" 标准差: {std:.6f}")
    print(f" 最小值: {min_val:.6f}")
    print(f" 最大值: {max_val:.6f}")
    print("-" * 30)
# Final evaluation on the test split.
model.eval()
with torch.no_grad():
    outputs = model(x_test_tensor)
    # The predicted class is the index of the largest logit in each row.
    predicted = torch.max(outputs, 1).indices

# Move labels and predictions to the CPU as NumPy arrays for the metrics.
y_true = y_test_tensor.cpu().numpy()
predicted = predicted.cpu().numpy()
y_pred = predicted

acc = accuracy_score(y_true, y_pred)
print(f'测试集准确率 (accuracy_score): {acc * 100:.2f}%')
cm = confusion_matrix(y_true, y_pred)
print(f'混淆矩阵:\n{cm}')
print(f'分类报告:\n{classification_report(y_true,y_pred)}')