Is this code complete? If not, please complete it:

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import os
import json
import math
import time
import warnings
import pickle
import random
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from typing import Dict, List, Optional
from sklearn.preprocessing import StandardScaler, LabelEncoder, QuantileTransformer
from sklearn.mixture import GaussianMixture
from scipy import stats
from scipy.stats import ks_2samp, wasserstein_distance
warnings.filterwarnings('ignore')
# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)
# ================================ Improved data loading & preprocessing ================================
class BankDataset(Dataset):
    def __init__(self, data_dir: str, split: str = 'train',
                 label_encoders: Optional[List[LabelEncoder]] = None,
                 num_scalers: Optional[List[QuantileTransformer]] = None):
        self.split = split
        self.data_dir = data_dir
        self.label_encoders = label_encoders
        self.num_scalers = num_scalers
        self.feature_stats = []
        self._load_metadata()
        self._load_numpy_data()
        self._align_samples()
        self._preprocess()

    def _load_metadata(self):
        """Load the metadata."""
        with open(f'{self.data_dir}/info.json', 'r') as f:
            info = json.load(f)
        self.n_num_features = info['n_num_features']
        self.n_cat_features = info['n_cat_features']
        self.n_classes = info['n_classes']
        self.train_size = info.get('train_size', 0)
        self.val_size = info.get('val_size', 0)
        self.test_size = info.get('test_size', 0)
        with open(f'{self.data_dir}/domain.json', 'r') as f:
            self.domain = json.load(f)
        self.num_boundaries = []
        self.cat_cardinalities = []
        for i in range(1, self.n_num_features + 1):
            self.num_boundaries.append(self.domain[f'num_attr_{i}'])
        for i in range(1, self.n_cat_features + 1):
            self.cat_cardinalities.append(self.domain[f'cat_attr_{i}'])
        print(f"[{self.split.upper()}] Features: numerical={self.n_num_features}, categorical={self.n_cat_features}")
    def _load_numpy_data(self):
        """Load the numpy data files."""
        try:
            self.X_num = np.load(f'{self.data_dir}/X_num_{self.split}.npy')
            self.X_cat = np.load(f'{self.data_dir}/X_cat_{self.split}.npy', allow_pickle=True)
            self.y = np.load(f'{self.data_dir}/y_{self.split}.npy').flatten()
            print(f"  Loaded: X_num={self.X_num.shape}, X_cat={self.X_cat.shape}")
        except FileNotFoundError as e:
            print(f"Error: file not found - {e}")
            raise

    def _align_samples(self):
        """Make sure numerical features, categorical features and labels have the same number of samples."""
        min_samples = min(len(self.X_num), len(self.X_cat), len(self.y))
        if len(self.X_num) != min_samples:
            print(f"  Aligning numerical features: {len(self.X_num)} -> {min_samples}")
            self.X_num = self.X_num[:min_samples]
        if len(self.X_cat) != min_samples:
            print(f"  Aligning categorical features: {len(self.X_cat)} -> {min_samples}")
            self.X_cat = self.X_cat[:min_samples]
        if len(self.y) != min_samples:
            print(f"  Aligning labels: {len(self.y)} -> {min_samples}")
            self.y = self.y[:min_samples]
    def _preprocess(self):
        """Preprocess the data."""
        # Numerical features
        self.X_num = np.nan_to_num(self.X_num, nan=0.0)
        self.X_num_raw = self.X_num.copy()  # keep the raw data
        if self.split == 'train' or self.num_scalers is None:
            self.num_scalers = []
            self.feature_stats = []  # per-feature statistics
            for i in range(self.n_num_features):
                col = self.X_num[:, i].astype(float)
                # Per-feature statistics (do not name this dict `stats`,
                # that would shadow the imported scipy.stats module)
                feat_stat = {
                    'min': np.min(col),
                    'max': np.max(col),
                    'mean': np.mean(col),
                    'std': np.std(col),
                    'q1': np.percentile(col, 25),
                    'q3': np.percentile(col, 75),
                    'skew': stats.skew(col),
                    'kurtosis': stats.kurtosis(col),
                    'gmm_components': self._estimate_gmm_components(col)
                }
                self.feature_stats.append(feat_stat)
                # QuantileTransformer preserves the shape of the distribution
                qt = QuantileTransformer(n_quantiles=500, output_distribution='uniform', random_state=42)
                qt.fit(col.reshape(-1, 1))
                self.num_scalers.append(qt)
        else:
            # Validation/test splits reuse the scalers fitted on the training split
            pass
        # Apply the transforms
        self.X_num_norm = np.zeros_like(self.X_num, dtype=float)
        for i in range(self.n_num_features):
            qt = self.num_scalers[i]
            self.X_num_norm[:, i] = qt.transform(self.X_num[:, i].reshape(-1, 1)).flatten()
        # Categorical feature processing unchanged
        # ...
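        # --- Sketch (assumption): the elided categorical processing is not shown above.
        # One plausible completion fits LabelEncoders on the training split and reuses them
        # on val/test; names like X_cat_encoded are introduced here, not from the original.
        if self.split == 'train' or self.label_encoders is None:
            self.label_encoders = []
            for i in range(self.n_cat_features):
                le = LabelEncoder()
                le.fit(self.X_cat[:, i].astype(str))
                self.label_encoders.append(le)
        self.X_cat_encoded = np.zeros(self.X_cat.shape, dtype=int)
        for i in range(self.n_cat_features):
            le = self.label_encoders[i]
            col = self.X_cat[:, i].astype(str)
            # Map categories unseen at training time to the first known class (assumption)
            known = set(le.classes_)
            col = np.array([c if c in known else le.classes_[0] for c in col])
            self.X_cat_encoded[:, i] = le.transform(col)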
    def _estimate_gmm_components(self, col):
        """Estimate the best number of GMM components via BIC."""
        n_components = np.arange(1, 5)
        best_bic = np.inf
        best_n = 1
        for n in n_components:
            gmm = GaussianMixture(n_components=n, random_state=42)
            try:
                gmm.fit(col.reshape(-1, 1))
                bic = gmm.bic(col.reshape(-1, 1))
                if bic < best_bic:
                    best_bic = bic
                    best_n = n
            except Exception:
                continue
        return best_n

    # Remaining methods unchanged
    # ...
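    # --- Sketch (assumption): minimal __len__/__getitem__ so the Dataset works with DataLoader
    # and with get_max_seq_len below. The original snippet does not show how samples are
    # tokenized before reaching collate_fn, so that wiring is still left open here.
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X_num_norm[idx], self.X_cat_encoded[idx], self.y[idx]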
# ================================ Improved tokenizer ================================
class EnhancedFeatureTokenizer:
    def __init__(self, domain_info: Dict, feature_stats: List[Dict], num_scalers: List[QuantileTransformer]):
        self.domain = domain_info
        self.feature_stats = feature_stats
        self.num_scalers = num_scalers
        # Count features by type
        self.n_num_features = len([k for k in domain_info.keys() if k.startswith('num_attr')])
        self.n_cat_features = len([k for k in domain_info.keys() if k.startswith('cat_attr')])
        # Special tokens
        self.BOS_TOKEN = 0
        self.EOS_TOKEN = 1
        self.PAD_TOKEN = 2
        self.MASK_TOKEN = 3
        # Token id ranges
        self.num_bins = 100  # number of bins per numerical feature
        self.NUM_START_ID = 4
        self.CAT_START_ID = self.NUM_START_ID + (self.n_num_features * self.num_bins)
        self.LABEL_START_ID = self.CAT_START_ID + sum(
            self.domain[f'cat_attr_{i + 1}'] for i in range(self.n_cat_features))
        self.vocab_size = self.LABEL_START_ID + 2
        # Bin edges per numerical feature
        self.num_bin_edges = []
        for i in range(self.n_num_features):
            col_min = self.feature_stats[i]['min']
            col_max = self.feature_stats[i]['max']
            # Binning strategy based on the data distribution
            if self.feature_stats[i]['skew'] > 1.0:
                # Log-spaced bins for heavily skewed features
                min_val = max(col_min, 1e-6)
                log_min = np.log10(min_val)
                log_max = np.log10(col_max)
                edges = np.logspace(log_min, log_max, self.num_bins + 1)
            else:
                # Linear bins otherwise
                edges = np.linspace(col_min, col_max, self.num_bins + 1)
            # Make sure the edges cover the min and max values
            edges[0] = col_min
            edges[-1] = col_max
            self.num_bin_edges.append(edges)
        print(f"[Tokenizer] vocab size: {self.vocab_size}")

    def numerical_to_token(self, value: float, feature_idx: int) -> int:
        """Convert a numerical value to a token id."""
        edges = self.num_bin_edges[feature_idx]
        value = float(value)
        bin_idx = np.searchsorted(edges, value, side='right') - 1
        bin_idx = np.clip(bin_idx, 0, len(edges) - 2)
        token_id = self.NUM_START_ID + (feature_idx * self.num_bins) + bin_idx
        return int(token_id)

    # Remaining methods unchanged
    # ...
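    # --- Sketch (assumption): one plausible version of the elided tokenize_sample, consistent
    # with the token id layout above: [BOS, numerical tokens..., categorical tokens..., label, EOS].
    def tokenize_sample(self, x_num, x_cat, y):
        tokens = [self.BOS_TOKEN]
        for i in range(self.n_num_features):
            tokens.append(self.numerical_to_token(float(x_num[i]), i))
        offset = self.CAT_START_ID
        for i in range(self.n_cat_features):
            tokens.append(offset + int(x_cat[i]))
            offset += self.domain[f'cat_attr_{i + 1}']
        tokens.append(self.LABEL_START_ID + int(y))
        tokens.append(self.EOS_TOKEN)
        return torch.tensor(tokens, dtype=torch.long)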
# ================================ Improved Transformer model ================================
class NumericalFocusedTransformer(nn.Module):
    def __init__(self, vocab_size: int, n_num_features: int, n_cat_features: int,
                 d_model: int = 256, num_heads: int = 8, num_layers: int = 6,
                 d_ff: int = 1024, max_seq_len: int = 30, dropout: float = 0.1):
        super(NumericalFocusedTransformer, self).__init__()
        # Model parameters
        self.vocab_size = vocab_size
        self.n_num_features = n_num_features
        self.n_cat_features = n_cat_features
        self.d_model = d_model
        # Embedding layers
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_encoding = PositionalEncoding(d_model, max_len=max_seq_len)
        # Two-stream attention that focuses on the numerical features
        self.num_features_attention = nn.ModuleList([
            FeatureAwareAttention(d_model, num_heads, n_num_features, n_cat_features, dropout)
            for _ in range(num_layers)
        ])
        # Generic attention layers
        self.common_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers // 2)
        ])
        # Output layers
        self.output_norm = nn.LayerNorm(d_model)
        self.output_layer = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)
        # Numerical reconstruction head
        self.numerical_recon_head = nn.Sequential(
            nn.Linear(d_model, 256),
            nn.ReLU(),
            nn.Linear(256, n_num_features)
        )
        self._init_weights()

    def _init_weights(self):
        # The original snippet calls _init_weights() without defining it; a minimal
        # Xavier initialization is assumed here.
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    def forward(self, x, vocab_info, return_attention=False):
        # Embedding and positional encoding
        token_emb = self.token_embedding(x)
        pos_emb = self.position_encoding(token_emb)
        x_embed = self.dropout(pos_emb)
        attention_weights = []
        # Layers that focus on the numerical features
        for attn_layer in self.num_features_attention:
            x_embed, attn = attn_layer(x_embed, x, vocab_info)
            if return_attention:
                attention_weights.append(attn)
        # Generic layers
        for layer in self.common_layers:
            x_embed, attn = layer(x_embed, x, vocab_info)
            if return_attention:
                attention_weights.append(attn)
        # Token predictions
        x_embed_norm = self.output_norm(x_embed)
        token_logits = self.output_layer(x_embed_norm)
        # Auxiliary reconstruction of the numerical features (auxiliary loss)
        num_position = self.n_num_features + 1  # BOS + numerical feature positions
        num_embeddings = x_embed[:, 1:num_position]  # skip BOS
        recon_numerical = self.numerical_recon_head(num_embeddings.mean(dim=1))
        if return_attention:
            return token_logits, recon_numerical, attention_weights
        return token_logits, recon_numerical
    # The generation function stays similar, but adds numerical constraints
    def _apply_numerical_generation_constraints(self, logits, current_pos):
        """Apply numerical constraints during generation."""
        # Boost special values (e.g. -1 placeholders)
        special_values = [self.vocab_info['pad_token'],
                          self.vocab_info['mask_token']]
        for tok in special_values:
            if 0 <= tok < self.vocab_size:
                logits[..., tok] += 5.0  # increase selection probability
        # For the numerical positions (1..n_num_features), penalize extreme values
        if 1 <= current_pos <= self.n_num_features:
            feature_idx = current_pos - 1
            feature_stat = self.feature_stats[feature_idx]
            # Down-weight tokens more than 2 standard deviations from the mean
            mean_val = feature_stat['mean']
            std_val = max(feature_stat['std'], 0.001)
            # Bin center of each token
            base_id = self.vocab_info['num_start_id'] + (feature_idx * self.vocab_info['num_bins'])
            for bin_idx in range(self.vocab_info['num_bins']):
                token_id = base_id + bin_idx
                bin_low = self.num_bin_edges[feature_idx][bin_idx]
                bin_high = self.num_bin_edges[feature_idx][bin_idx + 1]
                bin_center = (bin_low + bin_high) / 2.0
                # Penalize tokens far from the mean
                if abs(bin_center - mean_val) > 2 * std_val:
                    logits[..., token_id] -= 5.0
        return logits
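    # --- Sketch (assumption): a minimal autoregressive generate() matching the comment above.
    # It assumes vocab_info, feature_stats and num_bin_edges have been attached to the model
    # (see main() below); BOS_TOKEN is 0 in the tokenizer above.
    @torch.no_grad()
    def generate(self, n_samples, max_len, temperature=1.0, device='cpu'):
        seqs = torch.zeros((n_samples, 1), dtype=torch.long, device=device)  # start from BOS (= 0)
        for pos in range(1, max_len):
            token_logits, _ = self.forward(seqs, self.vocab_info)
            logits = token_logits[:, -1, :] / temperature
            logits = self._apply_numerical_generation_constraints(logits, pos)
            probs = F.softmax(logits, dim=-1)
            next_tok = torch.multinomial(probs, 1)
            seqs = torch.cat([seqs, next_tok], dim=1)
        return seqs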
# ================================ Improved training loop ================================
def train_epoch(model, train_loader, token_criterion, num_criterion, optimizer,
                vocab_info, feature_stats, config, device, epoch, num_epochs):
    model.train()
    total_token_loss = 0
    total_num_loss = 0
    total_steps = 0
    alpha = config.get('alpha', 0.5)  # weight of the numerical reconstruction loss
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        # Forward pass
        token_logits, recon_numerical = model(inputs, vocab_info)
        # Main token prediction loss
        token_loss = token_criterion(token_logits.view(-1, vocab_info['vocab_size']),
                                     targets.view(-1))
        # Numerical reconstruction loss:
        # take the numerical token positions of the input sequence (positions 1..n_num_features)
        num_tokens = inputs[:, 1:model.n_num_features + 1]
        num_features = model.token_to_numerical_matrix(num_tokens)
        num_loss = num_criterion(recon_numerical, num_features)
        # Combined loss
        total_loss = (1 - alpha) * token_loss + alpha * num_loss
        # Backward pass
        optimizer.zero_grad()
        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), config['grad_clip'])
        optimizer.step()
        total_token_loss += token_loss.item()
        total_num_loss += num_loss.item()
        total_steps += 1
        if batch_idx % 100 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, Batch {batch_idx}, '
                  f'Token Loss: {token_loss.item():.4f}, '
                  f'Num Loss: {num_loss.item():.4f}')
    return total_token_loss / total_steps, total_num_loss / total_steps
# Helper method added to the model class below
def token_to_numerical_matrix(self, token_ids):
    """Convert token ids back to (approximate) numerical values via their bin centers."""
    batch_size, num_positions = token_ids.shape
    numerical_matrix = torch.zeros((batch_size, num_positions), device=token_ids.device)
    for i in range(batch_size):
        for j in range(num_positions):
            token = token_ids[i, j].item()
            if self.vocab_info['num_start_id'] <= token < self.vocab_info['cat_start_id']:
                feature_idx = (token - self.vocab_info['num_start_id']) // self.vocab_info['num_bins']
                bin_idx = (token - self.vocab_info['num_start_id']) % self.vocab_info['num_bins']
                bin_low = self.num_bin_edges[feature_idx][bin_idx]
                bin_high = self.num_bin_edges[feature_idx][bin_idx + 1]
                numerical_matrix[i, j] = (bin_low + bin_high) / 2.0
    return numerical_matrix

# Attach the helper to the model class
NumericalFocusedTransformer.token_to_numerical_matrix = token_to_numerical_matrix
class PositionalEncoding(nn.Module):
    """Positional encoding layer."""
    def __init__(self, d_model, max_len=50):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.transpose(0, 1))  # shape: (1, max_len, d_model)

    def forward(self, x):
        # x: (batch, seq_len, d_model)
        x = x + self.pe[:, :x.size(1)]
        return x
class FeatureAwareAttention(nn.Module):
    """Multi-head attention that is aware of the feature type at each position."""
    def __init__(self, d_model, num_heads, n_num_feat, n_cat_feat, dropout=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        # batch_first=True because the rest of the code uses (batch, seq, d_model) tensors
        self.attention = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
        # Feature-type embedding: 0 = numerical, 1 = categorical, 2 = label
        self.feat_type_emb = nn.Embedding(3, d_model)

    def forward(self, x, tokens, vocab_info):
        # Feature-type mask (0 = numerical, 1 = categorical, 2 = label)
        feat_type_mask = self._get_feature_type_mask(tokens, vocab_info)
        # Add the feature-type embedding
        x = x + self.feat_type_emb(feat_type_mask)
        # Self-attention
        attn_output, attn_weights = self.attention(
            x, x, x,
            key_padding_mask=(tokens == vocab_info['pad_token'])
        )
        return attn_output, attn_weights

    def _get_feature_type_mask(self, tokens, vocab_info):
        """Build the feature-type mask from the token id ranges."""
        num_mask = (vocab_info['num_start_id'] <= tokens) & (tokens < vocab_info['cat_start_id'])
        cat_mask = (vocab_info['cat_start_id'] <= tokens) & (tokens < vocab_info['label_start_id'])
        lab_mask = tokens >= vocab_info['label_start_id']
        feat_type = torch.zeros_like(tokens, dtype=torch.long)
        feat_type[num_mask] = 0  # numerical features
        feat_type[cat_mask] = 1  # categorical features
        feat_type[lab_mask] = 2  # label
        return feat_type
class TransformerEncoderLayer(nn.Module):
    """Generic Transformer encoder layer."""
    def __init__(self, d_model, num_heads, d_ff=1024, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.GELU()

    def forward(self, src, tokens, vocab_info):
        # Self-attention
        attn_output, attn_weights = self.self_attn(
            src, src, src,
            key_padding_mask=(tokens == vocab_info['pad_token'])
        )
        src = src + self.dropout(attn_output)
        src = self.norm1(src)
        # Feed-forward network
        ff_output = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout(ff_output)
        src = self.norm2(src)
        return src, attn_weights
# ================================ Full training pipeline ================================
def main():
    # Configuration
    config = {
        'data_dir': 'data/bank_data',
        'batch_size': 64,
        'd_model': 256,
        'num_heads': 8,
        'num_layers': 6,
        'd_ff': 1024,
        'lr': 1e-4,
        'num_epochs': 50,
        'grad_clip': 1.0,
        'alpha': 0.6  # weight of the numerical reconstruction loss
    }
    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    # Datasets
    train_dataset = BankDataset(
        config['data_dir'],
        split='train',
        label_encoders=None,
        num_scalers=None
    )
    val_dataset = BankDataset(
        config['data_dir'],
        split='val',
        label_encoders=train_dataset.label_encoders,
        num_scalers=train_dataset.num_scalers
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        collate_fn=collate_fn
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=config['batch_size'],
        collate_fn=collate_fn
    )
    # Tokenizer
    tokenizer = EnhancedFeatureTokenizer(
        train_dataset.domain,
        train_dataset.feature_stats,
        train_dataset.num_scalers
    )
    # Model
    vocab_info = {
        'vocab_size': tokenizer.vocab_size,
        'num_start_id': tokenizer.NUM_START_ID,
        'cat_start_id': tokenizer.CAT_START_ID,
        'label_start_id': tokenizer.LABEL_START_ID,
        'pad_token': tokenizer.PAD_TOKEN,
        'mask_token': tokenizer.MASK_TOKEN,
        'num_bins': tokenizer.num_bins
    }
    model = NumericalFocusedTransformer(
        vocab_size=tokenizer.vocab_size,
        n_num_features=train_dataset.n_num_features,
        n_cat_features=train_dataset.n_cat_features,
        d_model=config['d_model'],
        num_heads=config['num_heads'],
        num_layers=config['num_layers'],
        d_ff=config['d_ff'],
        max_seq_len=get_max_seq_len(train_dataset, tokenizer)
    )
    model.to(device)
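    # Assumption: token_to_numerical_matrix and the generation constraints above read
    # vocab_info / feature_stats / num_bin_edges from the model, so attach them here
    # (the original snippet never shows this step).
    model.vocab_info = vocab_info
    model.feature_stats = train_dataset.feature_stats
    model.num_bin_edges = tokenizer.num_bin_edges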
    # Losses and optimizer
    token_criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.PAD_TOKEN)
    num_criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=5
    )
    # Training loop
    for epoch in range(config['num_epochs']):
        train_token_loss, train_num_loss = train_epoch(
            model, train_loader, token_criterion, num_criterion,
            optimizer, vocab_info, train_dataset.feature_stats, config, device, epoch, config['num_epochs']
        )
        val_token_loss, val_num_loss = evaluate(
            model, val_loader, token_criterion, num_criterion,
            vocab_info, device
        )
        scheduler.step(val_token_loss)
        print(f"Epoch {epoch + 1}/{config['num_epochs']}")
        print(f"Train loss - Token: {train_token_loss:.4f}, Num: {train_num_loss:.4f}")
        print(f"Val loss   - Token: {val_token_loss:.4f}, Num: {val_num_loss:.4f}")
        # Save a checkpoint every 5 epochs
        if (epoch + 1) % 5 == 0:
            save_model(model, tokenizer, config, epoch)
# Utility functions
def collate_fn(batch):
    """Collate function for variable-length token sequences."""
    tokenized_seqs = [item[0] for item in batch]
    padded_seqs = torch.nn.utils.rnn.pad_sequence(
        tokenized_seqs, batch_first=True, padding_value=2  # 2 = PAD_TOKEN
    )
    # Targets are token sequences as well, so pad them the same way
    # (CrossEntropyLoss skips the padding via ignore_index=PAD_TOKEN)
    targets = torch.nn.utils.rnn.pad_sequence(
        [item[1] for item in batch], batch_first=True, padding_value=2
    )
    return padded_seqs, targets
def get_max_seq_len(dataset, tokenizer):
    """Find the longest token sequence in the dataset."""
    max_len = 0
    for i in range(len(dataset)):
        seq = tokenizer.tokenize_sample(*dataset[i])
        if len(seq) > max_len:
            max_len = len(seq)
    return max_len
def evaluate(model, dataloader, token_criterion, num_criterion, vocab_info, device):
    """Evaluate the model."""
    model.eval()
    total_token_loss = 0
    total_num_loss = 0
    total_steps = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            token_logits, recon_numerical = model(inputs, vocab_info)
            token_loss = token_criterion(
                token_logits.view(-1, vocab_info['vocab_size']),
                targets.view(-1)
            )
            # Numerical reconstruction loss
            num_tokens = inputs[:, 1:model.n_num_features + 1]
            num_features = model.token_to_numerical_matrix(num_tokens)
            num_loss = num_criterion(recon_numerical, num_features)
            total_token_loss += token_loss.item()
            total_num_loss += num_loss.item()
            total_steps += 1
    return total_token_loss / total_steps, total_num_loss / total_steps
def save_model(model, tokenizer, config, epoch):
    """Save the model and its configuration."""
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'tokenizer_config': {
            'domain': tokenizer.domain,
            'feature_stats': tokenizer.feature_stats,
            'num_bins': tokenizer.num_bins,
            'num_bin_edges': tokenizer.num_bin_edges
        },
        'epoch': epoch,
        'config': config
    }
    torch.save(checkpoint, f"model_epoch_{epoch + 1}.pth")
    print(f"Model saved: model_epoch_{epoch + 1}.pth")

if __name__ == "__main__":
    main()