Python check-in DAY33

from MLP_model import *

# Set a CJK-capable font and make minus signs render correctly
plt.rcParams['font.sans-serif'] = ['STHeiti']
plt.rcParams['axes.unicode_minus'] = False  # must be False so minus signs display correctly with CJK fonts
plt.rcParams['figure.dpi'] = 100

torch.manual_seed(42)

## Device selection: prefer Apple MPS, fall back to CPU
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print("Training with MPS acceleration")
else:
    device = torch.device('cpu')
    print("MPS unavailable, training on CPU")

# ## Force CPU
# device = torch.device('cpu')

# =================== Data Preprocessing =====================

data = pd.read_csv("data.csv")

x = data.drop(['Id', 'Credit Default'], axis=1)

y = data['Credit Default']

# Define the Pipeline components and processing steps

object_cols = x.select_dtypes(include=['object']).columns.tolist()

numeric_cols = x.select_dtypes(exclude=['object']).columns.tolist()

print(f'object_cols:{object_cols}\nnumeric_cols:{numeric_cols}')

ordinal_features = ['Years in current job']
ordinal_categories = [['< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
                       '6 years', '7 years', '8 years', '9 years', '10+ years']]
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1))
])
print("Ordinal feature pipeline defined.")

nominal_features = ['Home Ownership', 'Purpose', 'Term']
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
print("Nominal feature pipeline defined.")

continuous_cols = x.columns.difference(ordinal_features + nominal_features).tolist()
continuous_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])
print("Continuous feature pipeline defined.")

# --- Build the ColumnTransformer ---

preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_transformer, ordinal_features),
        ('nominal', nominal_transformer, nominal_features),
        ('continuous', continuous_transform, continuous_cols)
    ], remainder='passthrough', verbose_feature_names_out=False
)
print("\nColumnTransformer (preprocessor) defined.")

# Build the full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])
print("\nFull pipeline defined.")

print("\nPreprocessing the raw data...")

start_time = time.time()

x_processed = pipeline.fit_transform(x)

end_time = time.time()

print(f"Preprocessing finished in {end_time - start_time:.4f} s")

feature_names = preprocessor.get_feature_names_out()

x_processed_df = pd.DataFrame(x_processed, columns=feature_names)

x_processed_df.info()  # info() prints directly; wrapping it in print() would also output "None"

# =================== Clustering Analysis & Label Generation =====================

x_scaled = MinMaxScaler().fit_transform(x_processed)

k_range = range(2, 21)

inertia_values = []

silhouette_scores = []

ch_scores = []

db_scores = []

start_time = time.time()
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans_label = kmeans.fit_predict(x_scaled)
    inertia_values.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(x_scaled, kmeans_label))
    ch_scores.append(calinski_harabasz_score(x_scaled, kmeans_label))
    db_scores.append(davies_bouldin_score(x_scaled, kmeans_label))
end_time = time.time()
print(f'Clustering analysis took {end_time - start_time:.4f} s')
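# A minimal sketch for picking k automatically instead of eyeballing the plots
# below (my assumption: rank candidates purely by silhouette score):
# best_k = k_range[int(np.argmax(silhouette_scores))]
# print(f"k with the best silhouette score: {best_k}")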

# Visualize the clustering evaluation metrics
plt.figure(figsize=(12, 6))

plt.subplot(2, 2, 1)
plt.plot(k_range, inertia_values, marker='o')
plt.title('Elbow method (lower inertia is better)')
plt.xlabel('k'); plt.ylabel('Inertia'); plt.grid(True)

plt.subplot(2, 2, 2)
plt.plot(k_range, silhouette_scores, marker='o', color='orange')
plt.title('Silhouette score (higher is better)')
plt.xlabel('k'); plt.ylabel('Silhouette'); plt.grid(True)

plt.subplot(2, 2, 3)
plt.plot(k_range, ch_scores, marker='o', color='yellow')
plt.title('Calinski-Harabasz index (higher is better)')
plt.xlabel('k'); plt.ylabel('CH Score'); plt.grid(True)

plt.subplot(2, 2, 4)
plt.plot(k_range, db_scores, marker='o', color='red')
plt.title('Davies-Bouldin index (lower is better)')
plt.xlabel('k'); plt.ylabel('DB Score'); plt.grid(True)

plt.tight_layout()
plt.show()

# Final clustering with the chosen k, then attach the cluster labels

selected_k = 6

kmeans = KMeans(n_clusters=selected_k, random_state=42)

kmeans_label = kmeans.fit_predict(x_scaled)

x_with_cluster = pd.DataFrame(x_scaled, columns=feature_names)

x_with_cluster['KMeans_Cluster'] = kmeans_label

# =================== Feature-Column Clustering Enhancement =====================

x_train, x_test, y_train, y_test = train_test_split(x_processed_df, y, test_size=0.2, shuffle=True, random_state=42)

# Normalize the data: neural networks are sensitive to input scale, and with
# several features on different scales, min-max normalization is the most
# common way to limit the influence of units on the result.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)  # transform only: the test set must reuse the training min/max to avoid leakage

# Cluster the feature columns: transposing makes each feature a "sample"
x_train_T = x_train.T
kmeans_feature = KMeans(n_clusters=selected_k, random_state=42)
cluster_labels = kmeans_feature.fit_predict(x_train_T)

# Build cluster-mean enhancement features: average the columns within each feature cluster
def build_cluster_features(x_data, cluster_labels, k):
    cluster_features = []
    for i in range(k):
        idx = np.where(cluster_labels == i)[0]
        group_feature = x_data[:, idx].mean(axis=1, keepdims=True)
        cluster_features.append(group_feature)
    return np.hstack(cluster_features)

x_train_clustered = build_cluster_features(x_train, cluster_labels, selected_k)
x_test_clustered = build_cluster_features(x_test, cluster_labels, selected_k)

x_train_enhanced = np.hstack([x_train, x_train_clustered])

x_test_enhanced = np.hstack([x_test, x_test_clustered])

# Save the feature-clustering model for deployment

joblib.dump(kmeans_feature, 'kmeans_feature_cluster.pkl')
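# A minimal deployment sketch (hypothetical: new_x is a new, already-scaled
# matrix with the same columns as x_train; labels_ holds the column-cluster
# assignments learned above):
# kmeans_loaded = joblib.load('kmeans_feature_cluster.pkl')
# new_clustered = build_cluster_features(new_x, kmeans_loaded.labels_, selected_k)
# new_enhanced = np.hstack([new_x, new_clustered])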

# =================== Build the Attention Model =====================

# Convert the data to PyTorch tensors and move them to the chosen device.
# CrossEntropyLoss expects long-typed class labels, so y_train / y_test
# (integers) go through LongTensor rather than FloatTensor; a float32 target
# would come out as 1.0 / 0.0. Tensors are moved to the device with .to(device).
try:
    print("\nConverting the full dataset to tensors...")
    # Convert to NumPy arrays first, then to tensors
    x_train_np = np.array(x_train_enhanced)
    x_test_np = np.array(x_test_enhanced)
    y_train_np = np.array(y_train)
    y_test_np = np.array(y_test)
    x_train_tensor = torch.FloatTensor(x_train_np).to(device)
    x_test_tensor = torch.FloatTensor(x_test_np).to(device)
    y_train_tensor = torch.LongTensor(y_train_np).to(device)
    y_test_tensor = torch.LongTensor(y_test_np).to(device)
    print("All data loaded")
except Exception as e:
    print(f"Error while converting data: {e}")
    import traceback
    traceback.print_exc()

class AttentionMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=16, num_classes=2):
        super().__init__()
        # Feature-level attention: produces one softmax weight per input feature
        self.attention = nn.Sequential(
            nn.Linear(input_dim, input_dim),
            nn.Tanh(),
            nn.Linear(input_dim, input_dim),
            nn.Softmax(dim=1)
        )
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, num_classes)

    def forward(self, x):
        attention_weights = self.attention(x)
        x_weighted = x * attention_weights  # reweight each input feature
        x = self.relu(self.fc1(x_weighted))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x, attention_weights
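# Quick shape sanity check for the model above (illustrative values only):
# _m = AttentionMLP(input_dim=10)
# _logits, _w = _m(torch.randn(4, 10))
# print(_logits.shape, _w.shape)  # expect torch.Size([4, 2]) and torch.Size([4, 10])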

input_dim = x_train_tensor.shape[1]
model = AttentionMLP(input_dim).to(device)
print(f"Model moved to the {device} device")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

num_epochs = 5000

# Early-stopping state
best_test_loss = float('inf')
best_epoch = 0
counter = 0
patience = 50
early_stopped = False

train_losses = []
test_losses = []
epochs = []

start_time = time.time()

for epoch in range(num_epochs):
    # Full-batch forward and backward pass on the training set
    outputs, _ = model(x_train_tensor)
    train_loss = criterion(outputs, y_train_tensor)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # Evaluate on the test set
    model.eval()
    with torch.no_grad():
        test_outputs, _ = model(x_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor)
    model.train()

    # ===== Early-stopping logic =====
    if test_loss.item() < best_test_loss:
        best_test_loss = test_loss.item()
        best_epoch = epoch + 1
        counter = 0
        checkpoint = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'epoch': epoch + 1,
            'loss': test_loss.item()
        }
        torch.save(checkpoint, 'best_checkpoint.pth')
    else:
        counter += 1
        if counter > patience:
            print(f"Early stopping triggered at epoch {epoch+1}: test loss has not improved for {patience} epochs.")
            print(f"Best test loss {best_test_loss:.4f} occurred at epoch {best_epoch}.")
            early_stopped = True
            break
    # ===========================================

    train_losses.append(train_loss.item())
    test_losses.append(test_loss.item())
    epochs.append(epoch + 1)

    if (epoch + 1) % 100 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss.item():.4f}, Test Loss: {test_loss.item():.4f}")

time_all = time.time() - start_time

print(f'Training time: {time_all:.2f} seconds')

# ===== Load the best checkpoint for final evaluation =====
if early_stopped:
    print(f"Loading the best model from epoch {best_epoch} for final evaluation...")
    checkpoint = torch.load('best_checkpoint.pth', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    best_epoch = checkpoint['epoch']
    best_test_loss = checkpoint['loss']
    print(f"Best model loaded (epoch {best_epoch}, test loss {best_test_loss:.4f}).")
# ================================

# =================== Visualize Losses & Weights =====================

plt.figure(figsize=(10, 6))

plt.plot(epochs, train_losses, label='Train Loss')

plt.plot(epochs, test_losses, label='Test Loss')

plt.xlabel('Epoch')

plt.ylabel('Loss')

plt.title('Training and Test Loss over Epochs')

plt.legend()

plt.grid(True)

plt.tight_layout()

plt.show()

# Visualize the weight distribution of each layer
weight_data = {}
for name, param in model.named_parameters():
    if 'weight' in name:
        weight_data[name] = param.detach().cpu().numpy()
        print(name, param.shape)

fig, axes = plt.subplots(1, len(weight_data), figsize=(15, 5))
fig.suptitle('Weight Distribution of Layers')
for i, (name, weights) in enumerate(weight_data.items()):
    weights_flat = weights.flatten()
    axes[i].hist(weights_flat, bins=50, alpha=0.7)
    axes[i].set_title(name)
    axes[i].set_xlabel('Weight Value')
    axes[i].set_ylabel('Frequency')
    axes[i].grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.subplots_adjust(top=0.85)
plt.show()


print("\n=== Weight statistics ===")
for name, weights in weight_data.items():
    mean = np.mean(weights)
    std = np.std(weights)
    min_val = np.min(weights)
    max_val = np.max(weights)
    print(f"{name}:")
    print(f"  mean: {mean:.6f}")
    print(f"  std: {std:.6f}")
    print(f"  min: {min_val:.6f}")
    print(f"  max: {max_val:.6f}")
    print("-" * 30)

# =================== Visualize Attention Weights (Feature Importance) =====================

model.eval()  # switch to eval mode before inference
with torch.no_grad():
    _, attention_weights = model(x_test_tensor)
attention_avg = attention_weights.cpu().numpy().mean(axis=0)

# Build the enhanced feature names (original features + cluster features)
enhanced_feature_names = list(feature_names) + [f'Cluster_{i}' for i in range(selected_k)]

plt.figure(figsize=(12, 6))
plt.bar(enhanced_feature_names, attention_avg)
plt.xlabel("Feature")
plt.ylabel("Average Attention Weight")
plt.title("Average Attention Weights (Feature Importance)")
plt.grid(True, linestyle='--', alpha=0.6)
plt.xticks(rotation=30, ha='right')
plt.tight_layout()
plt.show()
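# A follow-up sketch: list the top-10 features by average attention weight
# (assumption: a larger average weight is read as higher importance):
# top_idx = np.argsort(attention_avg)[::-1][:10]
# for i in top_idx:
#     print(f"{enhanced_feature_names[i]}: {attention_avg[i]:.4f}")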


# Final evaluation on the test set
model.eval()
with torch.no_grad():
    outputs, _ = model(x_test_tensor)
    _, predicted = torch.max(outputs, 1)

y_true = y_test_tensor.cpu().numpy()
y_pred = predicted.cpu().numpy()
acc = accuracy_score(y_true, y_pred)
print(f'Test accuracy (accuracy_score): {acc * 100:.2f}%')
cm = confusion_matrix(y_true, y_pred)
print(f'Confusion matrix:\n{cm}')
print(f'Classification report:\n{classification_report(y_true, y_pred)}')
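# An optional extra metric (a sketch; assumes a binary target with class 1 as
# the positive class): ROC-AUC from the softmax probability of class 1.
# from sklearn.metrics import roc_auc_score
# probs = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
# print(f'Test ROC-AUC: {roc_auc_score(y_true, probs):.4f}')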
