# python打卡DAY32 (Python daily check-in, Day 32)

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import time

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.impute import SimpleImputer

import torch

import torch.nn as nn

import torch.optim as optim

from torch.utils.data import TensorDataset, DataLoader

# Device selection: use the MPS backend when this PyTorch build reports it as
# available, otherwise fall back to the CPU. `device` is used below to place
# the tensors and the model.
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print("使用MPS加速训练")
else:
    device = torch.device('cpu')
    print("MPS不可用,使用CPU训练")

# ##CPU

# device=torch.device('cpu')

# Load the raw table. 'Credit Default' is the prediction target; it and the
# 'Id' column are excluded from the feature frame.
data = pd.read_csv(r'data.csv')
y = data['Credit Default']
x = data.drop(columns=['Id', 'Credit Default'])

# Column groups and the per-group preprocessing pipelines.

# Columns split by dtype (object vs. everything else).
object_cols = list(x.select_dtypes(include=['object']).columns)
numeric_cols = list(x.select_dtypes(exclude=['object']).columns)

# Ordinal feature: categories are listed in ascending order of tenure, and the
# encoder assigns codes by position in this list. Values not in the list are
# encoded as -1.
ordinal_features = ['Years in current job']
ordinal_categories = [
    ['< 1 year', '1 year']
    + [f'{years} years' for years in range(2, 10)]
    + ['10+ years']
]
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'encoder',
        OrdinalEncoder(
            categories=ordinal_categories,
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        ),
    ),
])
print("有序特征处理 Pipeline 定义完成。")

# Nominal features: mode imputation followed by one-hot encoding; categories
# unseen during fit are ignored at transform time.
nominal_features = ['Home Ownership', 'Purpose', 'Term']
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
print("标称特征处理 Pipeline 定义完成。")

# Every remaining column is treated as continuous and only imputed (with the
# most frequent value, the same strategy as the other groups).
continuous_cols = x.columns.difference(
    [*ordinal_features, *nominal_features]
).tolist()
continuous_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])
print("连续特征处理 Pipeline 定义完成。")

# --- Build the ColumnTransformer ---
# Routes each column group to its own pipeline. Output feature names keep the
# plain column names (no "<transformer>__" prefix).
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_transformer, ordinal_features),
        ('nominal', nominal_transformer, nominal_features),
        ('continuous', continuous_transformer, continuous_cols),
    ],
    remainder='passthrough',
    # The one-hot branch produces a sparse matrix, so with the default
    # threshold the combined output type depends on the overall density.
    # The result is wrapped in a pandas DataFrame right after fitting, which
    # needs a dense array; 0 forces dense output regardless of density.
    sparse_threshold=0,
    verbose_feature_names_out=False,
)
print("\nColumnTransformer (预处理器) 定义完成。")

# Full pipeline: currently only the preprocessing stage.
# NOTE(review): the step name is spelled 'preprecessor'; kept unchanged in
# case it is referenced by name elsewhere.
pipeline = Pipeline(steps=[
    ('preprecessor', preprocessor),
])
print("\n完整的 Pipeline 定义完成。")

# Fit the preprocessing pipeline on the raw features and time it.
print("\n开始对原始数据进行预处理...")
start_time = time.time()
x_processed = pipeline.fit_transform(x)
end_time = time.time()
print(f"预处理完成,耗时: {end_time - start_time:.4f} 秒")

# Rebuild a labelled DataFrame from the transformed array.
feature_names = preprocessor.get_feature_names_out()
x_processed_df = pd.DataFrame(x_processed, columns=feature_names)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
x_processed_df.info()

# 80/20 train/test split. A fixed random_state makes the split, and therefore
# the reported metrics, reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_processed_df, y, test_size=0.2, shuffle=True, random_state=42
)

# Min-max scale the features: the inputs have different units and the network
# is sensitive to input scale. The scaler is fitted on the training split only
# and then applied to the test split. MinMaxScaler is already imported at the
# top of the file, so the duplicate import that sat here was removed.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Convert the train/test splits to PyTorch tensors and move them to `device`.
# Features become float32 tensors. Labels become long tensors because
# nn.CrossEntropyLoss expects integer class indices (as float32 they would
# read 1.0 / 0.0).
try:
    print("\n开始转换完整数据集为张量...")

    # Go through NumPy first, then build the tensors.
    x_train_np = np.array(x_train)
    x_test_np = np.array(x_test)
    y_train_np = np.array(y_train)
    y_test_np = np.array(y_test)

    x_train_tensor = torch.FloatTensor(x_train_np).to(device)
    x_test_tensor = torch.FloatTensor(x_test_np).to(device)
    y_train_tensor = torch.LongTensor(y_train_np).to(device)
    y_test_tensor = torch.LongTensor(y_test_np).to(device)

    print("所有数据已加载完毕")
except Exception as e:
    # Report the failure, then re-raise. Everything below needs these tensors,
    # so continuing would only fail later with a less helpful NameError. The
    # re-raised exception carries the full traceback.
    print(f"数据转换过程中出错: {e}")
    raise

class MLP(nn.Module):
    """Fully connected classifier with two hidden ReLU layers.

    The layer sizes default to the values that were previously hard-coded
    (34 -> 8 -> 5 -> 2), so ``MLP()`` builds the same network as before.
    The attribute names ``fc1``/``relu``/``fc2``/``fc3`` are unchanged, so
    state-dict keys stay the same.

    Args:
        input_dim: Number of input features.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim=34, hidden1=8, hidden2=5, num_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden1)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, num_classes)

    def forward(self, x):
        """Return the raw class logits for the input batch ``x``.

        No softmax is applied here; the output of ``fc3`` is returned as-is.
        """
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

# Instantiate the network and move its parameters to the selected device
# (MLP subclasses nn.Module, which provides .to(device)).
model = MLP().to(device)
print(f"模型已移动到{device}设备")

# Loss function and optimiser.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, weight_decay=1e-4)

# Upper bound on the number of training epochs.
num_epochs = 5000

# Early-stopping state: best test loss seen so far, the epoch it occurred in,
# the number of consecutive epochs without improvement, and the limit on that
# number before training stops.
best_test_loss, best_epoch = float('inf'), 0
counter, patience = 0, 50
early_stopped = False

# Per-epoch history for plotting.
train_losses, test_losses, epochs = [], [], []

start_time = time.time()

# Full-batch training loop with per-epoch evaluation and early stopping.
for epoch in range(num_epochs):
    # Forward pass on the whole training set, then one optimiser step.
    outputs = model(x_train_tensor)
    train_loss = criterion(outputs, y_train_tensor)

    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # Evaluate on the test split without tracking gradients.
    model.eval()
    with torch.no_grad():
        test_outputs = model(x_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor)
    model.train()

    # Read each scalar loss once per epoch instead of calling .item() at
    # every use.
    train_loss_value = train_loss.item()
    test_loss_value = test_loss.item()

    # ===== Early stopping =====
    if test_loss_value < best_test_loss:
        # New best: reset the patience counter and persist a checkpoint.
        best_test_loss = test_loss_value
        best_epoch = epoch + 1
        counter = 0
        checkpoint = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'epoch': epoch + 1,
            'loss': test_loss_value,
        }
        torch.save(checkpoint, 'best_checkpoint.pth')
    else:
        counter += 1
        if counter >= patience:
            print(f"早停触发!在第{epoch+1}轮,测试集损失已有{patience}轮未改善。")
            print(f"最佳测试集损失出现在第{best_epoch}轮,损失值为{best_test_loss:.4f}")
            early_stopped = True
            break
    # ==========================

    # Record history (skipped for the epoch that triggers early stopping,
    # because of the break above).
    train_losses.append(train_loss_value)
    test_losses.append(test_loss_value)
    epochs.append(epoch + 1)

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss_value:.4f}, Test Loss: {test_loss_value:.4f}")

time_all = time.time() - start_time
print(f'Training time: {time_all:.2f} seconds')

# ===== 新增:加载最佳模型用于最终评估 =====

# When training stopped early, restore the best checkpoint saved during
# training so the final evaluation uses the best weights rather than the
# last ones.
if early_stopped:
    print(f"加载第{best_epoch}轮的最佳模型进行最终评估...")
    # map_location keeps the restored tensors on the device the model lives on.
    checkpoint = torch.load('best_checkpoint.pth', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    best_epoch = checkpoint['epoch']
    best_test_loss = checkpoint['loss']
    # The identical "loading..." message used to be printed a second time
    # here; the duplicate was removed.

# ================================

# # 可视化损失曲线

# plt.figure(figsize=(10, 6))

# plt.plot(epochs, train_losses, label='Train Loss') # 原始代码已有

# plt.plot(epochs, test_losses, label='Test Loss') # 新增:测试集损失曲线

# plt.xlabel('Epoch')

# plt.ylabel('Loss')

# plt.title('Training and Test Loss over Epochs')

# plt.legend() # 新增:显示图例

# plt.grid(True)

# plt.show()

# Visualise the weight distribution of every layer.

# Collect each parameter whose name contains 'weight' as a NumPy array.
# NOTE(review): the original indentation was lost; the shape print is assumed
# to belong with the weight parameters only. Confirm.
weight_data = {}
for name, param in model.named_parameters():
    if 'weight' in name:
        weight_data[name] = param.detach().cpu().numpy()
        print(name, param.shape)

# One histogram per collected weight matrix. squeeze=False makes subplots()
# always return an array, so indexing also works when there is a single
# layer; ravel() turns it back into the 1-D sequence used before.
fig, axes = plt.subplots(1, len(weight_data), figsize=(15, 5), squeeze=False)
axes = axes.ravel()
fig.suptitle('Weight Distribution of Layers')

for ax, (name, weights) in zip(axes, weight_data.items()):
    ax.hist(weights.flatten(), bins=50, alpha=0.7)
    ax.set_title(name)
    ax.set_xlabel('Weight Value')
    ax.set_ylabel('Frequency')
    ax.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
# Leave room at the top for the figure title.
plt.subplots_adjust(top=0.85)
plt.show()


 

# Print mean / standard deviation / min / max for each collected weight array,
# followed by a separator line per layer.
print("\n=== 权重统计信息 ===")
for name, weights in weight_data.items():
    mean = np.mean(weights)
    std = np.std(weights)
    min_val = np.min(weights)
    max_val = np.max(weights)

    print(f"{name}:")
    print(f" 均值: {mean:.6f}")
    print(f" 标准差: {std:.6f}")
    print(f" 最小值: {min_val:.6f}")
    print(f" 最大值: {max_val:.6f}")
    print("-" * 30)

# Final evaluation on the test split.
model.eval()
with torch.no_grad():
    outputs = model(x_test_tensor)
    # Predicted class = index of the largest logit along dimension 1.
    _, predicted = torch.max(outputs, 1)

# Move labels and predictions to the CPU as NumPy arrays for scikit-learn.
# `predicted` is rebound to the NumPy array, as it was before.
y_true = y_test_tensor.cpu().numpy()
predicted = predicted.cpu().numpy()
y_pred = predicted

acc = accuracy_score(y_true, y_pred)
print(f'测试集准确率 (accuracy_score): {acc * 100:.2f}%')

cm = confusion_matrix(y_true, y_pred)
print(f'混淆矩阵:\n{cm}')

print(f'分类报告:\n{classification_report(y_true, y_pred)}')

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值