import gc
import time
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from skopt import gp_minimize
from skopt.space import Real, Categorical, Integer
import warnings
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import boxcox
# 设置中文显示
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.switch_backend('TkAgg')
# 设置路径
Path(r"D:\result2").mkdir(parents=True, exist_ok=True)
Path("model_results/").mkdir(parents=True, exist_ok=True)
# 检查GPU可用性
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用 {device} 进行训练")
# 设置随机种子保证可重复性
torch.manual_seed(42)
np.random.seed(42)
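# Optional: also seed CUDA so GPU runs are more repeatable (full determinism would
# additionally require fixing the cuDNN flags, which is out of scope here).
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)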
# 1. Data preprocessing module
def load_and_preprocess_data():
"""加载并预处理数据(内存安全版)"""
chunksize = 10000 # 每次处理1万行
dfs = []
datetimes_list = []
location_codes_list = []
# 指定列数据类型以减少内存使用
dtype_dict = {
'damage_count': 'float32',
'damage_depth': 'float32',
}
for chunk in pd.read_csv(
r"D:\my_data\clean\locationTransfer.csv",
chunksize=chunksize,
dtype=dtype_dict
):
# 保存非数值列
datetimes_list.append(chunk['datetime'].copy())
location_codes_list.append(chunk['locationCode'].copy())
# 只处理数值列
numeric_cols = chunk.select_dtypes(include=[np.number]).columns
chunk = chunk[numeric_cols]
chunk = chunk.dropna(subset=['damage_count'])
chunk = chunk[pd.to_numeric(chunk['damage_count'], errors='coerce').notna()]
        chunk = chunk.ffill().bfill()  # forward/backward fill remaining gaps (fillna(method=...) is deprecated)
dfs.append(chunk)
if len(dfs) > 10: # 测试时限制块数
break
# 合并数据块
df = pd.concat(dfs, ignore_index=True)
def create_lag_features(df, lags=3):
for lag in range(1, lags + 1):
df[f'damage_count_lag_{lag}'] = df['damage_count'].shift(lag)
return df.dropna()
    df = create_lag_features(df)  # add lag features once the chunks are concatenated
df = df.dropna(subset=['damage_count'])
datetimes = pd.concat(datetimes_list, ignore_index=True)
location_codes = pd.concat(location_codes_list, ignore_index=True)
# 确保长度一致
min_length = min(len(df), len(datetimes), len(location_codes))
df = df.iloc[:min_length]
datetimes = datetimes.iloc[:min_length]
location_codes = location_codes.iloc[:min_length]
# 检查是否存在 NaN 值
nan_check = df.isnull().sum().sum()
    inf_check = df.isin([np.inf, -np.inf]).sum().sum()
if nan_check > 0 or inf_check > 0:
# 处理 NaN 值或者无穷大值
# 填充缺失值为均值
df = df.fillna(df.mean())
# 删除包含 NaN 值或者无穷大值的行
df = df.dropna()
# 结构化特征
X_structured = df.drop(columns=['damage_count', 'damage_depth', 'damage_db',
'asset_code_mapping', 'pile_longitude',
'pile_latitude', 'locationCode', 'datetime',
'locationCode_encoded','damage_count_lag_1','damage_count_lag_2','damage_count_lag_3'], errors='ignore')
# 填充缺失值
numeric_cols = X_structured.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
X_structured[col] = X_structured[col].fillna(X_structured[col].mean())
# 标准化数据
scaler = RobustScaler() # 替换StandardScaler,更抗异常值
X_structured = pd.DataFrame(scaler.fit_transform(X_structured),
columns=X_structured.columns)
# 确保X_structured是DataFrame
if not isinstance(X_structured, pd.DataFrame):
X_structured = pd.DataFrame(X_structured, columns=[f"feature_{i}" for i in range(X_structured.shape[1])])
# X_structured = X_structured.values # 将DataFrame转换为NumPy数组
# 修改后的目标变量处理部分
y = df[['damage_count']].values.astype(np.float32)
# 添加数据缩放
y_scaler = RobustScaler()
    y = y_scaler.fit_transform(y)  # robust-scale the target instead of a log transform
y = np.clip(y, -1e6, 1e6) # 设置合理的上下界
# 添加数据检查
assert not np.any(np.isinf(y)), "y中包含无限值"
assert not np.any(np.isnan(y)), "y中包含NaN值"
# 数据检查
print("原始数据统计:")
print(f"最小值: {y.min()}, 最大值: {y.max()}, NaN数量: {np.isnan(y).sum()}")
print("处理后y值范围:", np.min(y), np.max(y))
print("无限值数量:", np.isinf(y).sum())
# 清理内存
del df, chunk, dfs
gc.collect()
torch.cuda.empty_cache()
return datetimes, X_structured, y, location_codes, scaler, y_scaler
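# Note on the return values: the target is scaled, so predictions must be mapped back to
# original units with the returned y_scaler, e.g. (illustrative)
#     y_pred_original = y_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()
# evaluate_model() below applies exactly this inverse transform when a y_scaler is passed in.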
# 2. Time-series dataset class
class TimeSeriesDataset(Dataset):
"""自定义时间序列数据集类"""
def __init__(self, X, y, timesteps):
# 确保输入是NumPy数组
if isinstance(X, pd.DataFrame):
X = X.values
if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
y = y.values
assert X.ndim == 2, f"X应为2维,实际为{X.ndim}维"
assert y.ndim == 2, f"y应为2维,实际为{y.ndim}维"
# 添加维度调试信息
print(f"数据形状 - X: {X.shape}, y: {y.shape}")
self.X = torch.FloatTensor(X).unsqueeze(-1) # [samples, timesteps, 1]
self.y = torch.FloatTensor(y)
self.timesteps = timesteps
# 验证形状
if len(self.X) != len(self.y):
raise ValueError("X和y的长度不匹配")
def __len__(self):
return len(self.X) - self.timesteps
def __getitem__(self, idx):
# [seq_len, num_features]
x_window = self.X[idx:idx + self.timesteps]
y_target = self.y[idx + self.timesteps - 1]
return x_window.permute(1, 0), y_target # 调整维度顺序
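# Optional sanity check for the dataset/loader shapes. This is an illustrative sketch only:
# the helper name and sizes are made up, and it is never called by the pipeline.
def _check_window_shapes(n_samples=32, n_features=4, timesteps=5):
    X_demo = np.random.rand(n_samples, n_features).astype(np.float32)
    y_demo = np.random.rand(n_samples, 1).astype(np.float32)
    ds = TimeSeriesDataset(X_demo, y_demo, timesteps)
    window, target = ds[0]
    assert window.shape == (timesteps, n_features)
    assert target.shape == (1,)
    batch_x, batch_y = next(iter(DataLoader(ds, batch_size=8)))
    assert batch_x.shape == (8, timesteps, n_features)  # [batch, seq_len, features]
    assert batch_y.shape == (8, 1)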
def select_features_by_importance(X, y, n_features, feature_names=None):
"""使用随机森林选择特征(支持NumPy数组和DataFrame)"""
# 确保X是二维数组
if isinstance(X, pd.DataFrame):
feature_names = X.columns.tolist()
X = X.values
elif feature_names is None:
feature_names = [f"feature_{i}" for i in range(X.shape[1])]
# 处理y的维度
y = np.ravel(np.asarray(y))
y = np.nan_to_num(y, nan=np.nanmean(y))
# 检查特征数
if X.shape[1] < n_features:
n_features = X.shape[1]
print(f"警告: 特征数少于请求数,使用所有 {n_features} 个特征")
# 训练随机森林
rf = RandomForestRegressor(n_estimators=250, random_state=42, n_jobs=-1)
rf.fit(X, y)
# 获取特征重要性
feature_importances = rf.feature_importances_
indices = np.argsort(feature_importances)[::-1][:n_features]
# 返回选中的特征数据和重要性
return X[:, indices], feature_importances[indices], [feature_names[i] for i in indices]
# 3. LSTM model definition
class LSTMModel(nn.Module):
"""LSTM回归模型"""
def __init__(self, input_size, hidden_size=64, num_layers=2, dropout=0.4):
super().__init__()
# 确保hidden_size*2能被num_heads整除
if (hidden_size * 2) % 4 != 0:
hidden_size = ((hidden_size * 2) // 4) * 4 // 2 # 调整到最近的合规值
print(f"调整hidden_size为{hidden_size}以满足整除条件")
self.lstm = nn.LSTM(
input_size=input_size,
hidden_size=hidden_size,
num_layers=num_layers,
batch_first=True,
dropout=dropout if num_layers > 1 else 0,
bidirectional=True # 添加双向结构
)
# 添加维度检查的初始化
def weights_init(m):
if isinstance(m, nn.Linear):
if m.weight.dim() < 2:
m.weight.data = m.weight.data.unsqueeze(0) # 确保至少2维
nn.init.xavier_uniform_(m.weight)
if m.bias is not None:
nn.init.constant_(m.bias, 0.0)
self.bn = nn.BatchNorm1d(hidden_size * 2)
        # Multi-head attention over the BiLSTM outputs
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size * 2, num_heads=4)
        # Deeper output head
self.fc = nn.Sequential(
nn.Linear(hidden_size*2, hidden_size),
nn.ReLU(),
nn.Dropout(dropout),
nn.Linear(hidden_size, 1)
)
# 应用初始化
self.apply(weights_init)
def forward(self, x):
lstm_out, _ = self.lstm(x) # [batch, seq_len, hidden*2]
lstm_out = lstm_out.permute(1, 0, 2) # [seq_len, batch, features]
attn_out, _ = self.attention(lstm_out, lstm_out, lstm_out)
attn_out = attn_out.permute(1, 0, 2) # 恢复为[batch, seq_len, features]
return self.fc(attn_out[:, -1, :]).squeeze()
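# Optional shape smoke test for the model. Illustrative sketch only (hypothetical helper,
# arbitrary sizes); it assumes batch_first inputs of shape [batch, seq_len, input_size].
def _smoke_test_lstm(input_size=4, seq_len=5, batch=8):
    m = LSTMModel(input_size=input_size, hidden_size=32, num_layers=1, dropout=0.2)
    with torch.no_grad():
        out = m(torch.randn(batch, seq_len, input_size))
    assert out.shape == (batch,)  # one prediction per sample after squeeze()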
def plot_feature_importance(feature_names, importance_values, save_path):
"""绘制特征重要性图"""
# 验证输入
if len(feature_names) == 0 or len(importance_values) == 0:
print("警告: 无特征重要性数据可绘制")
return
if len(feature_names) != len(importance_values):
print(f"警告: 特征名数量({len(feature_names)})与重要性值数量({len(importance_values)})不匹配")
# 取较小值
min_len = min(len(feature_names), len(importance_values))
feature_names = feature_names[:min_len]
importance_values = importance_values[:min_len]
# 按重要性排序
indices = np.argsort(importance_values)[::-1]
sorted_features = [feature_names[i] for i in indices]
sorted_importance = importance_values[indices]
plt.figure(figsize=(12, 8))
plt.bar(range(len(sorted_features)), sorted_importance, align="center")
plt.xticks(range(len(sorted_features)), sorted_features, rotation=90)
plt.xlabel("特征")
plt.ylabel("重要性得分")
plt.title("特征重要性排序")
plt.tight_layout()
# 确保保存路径存在
save_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(save_path, dpi=300)
plt.close()
def evaluate(model, val_loader, criterion):
model.eval()
val_loss = 0.0
with torch.no_grad():
for inputs, targets in val_loader:
inputs, targets = inputs.to(device), targets.to(device)
outputs = model(inputs)
loss = criterion(outputs, targets.squeeze())
val_loss += loss.item()
return val_loss / len(val_loader)
# 4. Model training function
def train_model(model, train_loader, val_loader, criterion, optimizer,
scheduler=None, epochs=100, patience=30):
"""训练模型并返回最佳模型和训练历史"""
best_loss = float('inf')
history = {'train_loss': [], 'val_loss': []}
# 添加梯度累积
accumulation_steps = 5 # 每4个batch更新一次参数
for epoch in range(epochs):
# 训练阶段
model.train()
train_loss = 0.0
optimizer.zero_grad()
for inputs, targets in train_loader:
inputs, targets = inputs.to(device), targets.to(device)
scaler = torch.cuda.amp.GradScaler() # 在训练循环中添加
with torch.cuda.amp.autocast():
outputs = model(inputs)
loss = criterion(outputs, targets.squeeze())
scaler.scale(loss).backward()
for batch_idx, (inputs, targets) in enumerate(train_loader):
inputs, targets = inputs.to(device), targets.to(device)
# 前向传播
outputs = model(inputs)
loss = criterion(outputs, targets.squeeze())
# 梯度累积
loss = loss / accumulation_steps
scaler.scale(loss).backward()
if (batch_idx + 1) % accumulation_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()
train_loss += loss.item() * accumulation_steps
# 验证
val_loss = evaluate(model, val_loader, criterion)
if scheduler:
scheduler.step(val_loss) # 根据验证损失调整学习率
# 记录历史
avg_train_loss = train_loss / len(train_loader)
history['train_loss'].append(avg_train_loss)
history['val_loss'].append(val_loss)
# 早停逻辑
if val_loss < best_loss * 0.99:# 相对改进阈值
best_loss = val_loss
best_epoch = epoch
torch.save(model.state_dict(), 'best_model.pth')
print(f"Epoch {epoch + 1}/{epochs} - 训练损失: {avg_train_loss :.4f} - 验证损失: {val_loss:.4f}")
# 早停判断
if epoch - best_epoch >= patience:
print(f"早停触发,最佳epoch: {best_epoch+1}")
break
# 加载最佳模型
model.load_state_dict(torch.load('best_model.pth'))
return model, history
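# train_model() steps a ReduceLROnPlateau scheduler with the validation loss and any other
# scheduler once per epoch. A minimal sketch of the plateau variant (illustrative values),
# assuming `optimizer` is the AdamW instance built in main():
#     scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
#                                                      factor=0.5, patience=5)
#     model, history = train_model(model, train_loader, val_loader, criterion,
#                                  optimizer, scheduler=scheduler)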
# 5. Bayesian hyperparameter optimization
def optimize_hyperparameters(X_train, y_train, input_size):
"""使用贝叶斯优化寻找最佳超参数"""
# 自定义评分函数
def score_fn(params):
"""内部评分函数"""
try:
params = adjust_hidden_size(params) # 调整参数
hidden_size, num_layers, dropout, lr, batch_size, timesteps = params
# 确保参数有效
batch_size = max(32, min(256, int(batch_size)))
timesteps = max(3, min(10, int(timesteps)))
dropout = min(0.5, max(0.1, float(dropout)))
lr = min(0.01, max(1e-5, float(lr)))
# 检查数据是否足够
if len(X_train) < 2 * timesteps+1: # 至少需要2倍时间步长的数据
return float('inf')
# 创建模型
model = LSTMModel(
input_size=input_size,
hidden_size=int(hidden_size),
num_layers=min(3, int(num_layers)),
dropout=min(0.5, float(dropout))
).to(device)
# 初始化权重
# for name, param in model.named_parameters():
# if 'weight' in name:
# nn.init.xavier_normal_(param)
# elif 'bias' in name:
# nn.init.constant_(param, 0.1)
# 损失函数和优化器
criterion = nn.HuberLoss(delta=1.0)
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-3)
# 创建数据加载器
dataset = TimeSeriesDataset(X_train, y_train, timesteps=int(timesteps))
# 简化验证流程
train_size = int(0.8 * len(dataset))
train_dataset = torch.utils.data.Subset(dataset, range(train_size))
val_dataset = torch.utils.data.Subset(dataset, range(train_size, len(dataset)))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# 简单训练和验证
model.train()
for epoch in range(15): # 减少epoch数以加快评估
for inputs, targets in train_loader:
inputs, targets = inputs.to(device), targets.to(device)
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, targets.squeeze())
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
# 验证
model.eval()
val_loss = 0.0
with torch.no_grad():
for inputs, targets in val_loader:
outputs = model(inputs.to(device))
loss = criterion(outputs, targets.squeeze().to(device))
if torch.isnan(loss) or torch.isinf(loss):
return float('inf')
val_loss += loss.item()
return val_loss / len(val_loader)
except Exception as e:
print(f"参数评估失败: {str(e)}")
return float('inf')
# 定义搜索空间
search_spaces = [
Integer(32, 128, name='hidden_size'),
Integer(1, 3, name='num_layers'),
Real(0.2, 0.5, name='dropout'),
Real(5e-4, 1e-3, prior='log-uniform', name='lr'),
Categorical([64, 128, 256], name='batch_size'),
Integer(3, 10, name='timesteps') # 优化时间步长
]
def adjust_hidden_size(params):
"""确保hidden_size*2能被4整除"""
hs = params[0]
params[0] = ((hs * 2) // 4) * 4 // 2
return params
result = gp_minimize(
score_fn,
search_spaces,
n_calls=50,
random_state=42,
verbose=True,
        n_jobs=1  # candidates are evaluated sequentially; each call trains a small model
)
# 提取最佳参数
best_params = {
'hidden_size': result.x[0],
'num_layers': result.x[1],
'dropout': result.x[2],
'lr': result.x[3],
'batch_size': result.x[4],
'timesteps': result.x[5]
}
print("优化完成,最佳参数:", best_params)
return best_params
def collate_fn(batch):
    """Training-time batching with light Gaussian-noise augmentation."""
    inputs, targets = zip(*batch)
    inputs = torch.stack(inputs)    # [batch, timesteps, features]
    targets = torch.stack(targets)  # [batch, 1]
    # data augmentation: add small Gaussian noise to the (already scaled) inputs
    inputs = inputs + torch.randn_like(inputs) * 0.05
    return inputs, targets
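# Usage sketch for the augmenting collate function (illustrative; assumes the train_dataset
# built in main()). The dataset already yields [timesteps, features] windows, so no extra
# permute is needed here, and keeping collate_fn at module level keeps it picklable if
# DataLoader workers are ever enabled:
#     train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True,
#                               collate_fn=collate_fn, num_workers=0)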
# 6. Evaluation function
def evaluate_model(model, test_loader, criterion, test_indices, y_scaler=None):
"""评估模型性能"""
model.eval()
test_loss = 0.0
y_true = []
y_pred = []
all_indices = []
with torch.no_grad():
for batch_idx, (inputs, targets) in enumerate(test_loader):
inputs, targets = inputs.to(device), targets.to(device)
outputs = model(inputs)
if outputs.dim() == 1:
outputs = outputs.unsqueeze(1)
loss = criterion(outputs, targets)
test_loss += loss.item() * inputs.size(0)
# 收集预测结果
y_true.extend(targets.cpu().numpy())
y_pred.extend(outputs.cpu().numpy())
# 获取原始数据集中的索引
current_indices = test_indices[batch_idx * test_loader.batch_size:
(batch_idx + 1) * test_loader.batch_size]
all_indices.extend(current_indices)
y_true = np.array(y_true).reshape(-1)
y_pred = np.array(y_pred).reshape(-1)
if y_scaler is not None:
y_true = y_scaler.inverse_transform(y_true.reshape(-1, 1)).flatten()
y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()
# 基础指标
metrics = {
'MSE': mean_squared_error(y_true, y_pred),
'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
'MAE': mean_absolute_error(y_true, y_pred),
'R2': r2_score(y_true, y_pred),
        'MAPE': np.mean(np.abs((y_true - y_pred) / (np.abs(y_true) + 1e-8))) * 100,  # avoid division by zero
'indices': all_indices, # 添加原始索引
'y_true_original': y_true,
'y_pred_original': y_pred,
'test_loss': test_loss
}
# 可视化误差分布
errors = y_true - y_pred
plt.figure(figsize=(12, 6))
sns.histplot(errors, kde=True, bins=50)
plt.title('Error Distribution')
plt.savefig('error_distribution.png')
plt.close()
return metrics, y_true, y_pred
# 7. Main function
def main():
# 1. 加载和预处理数据
print("正在加载和预处理数据...")
datetimes, X_structured, y, location_codes, scaler , y_scaler= load_and_preprocess_data()
# 2. 特征选择
print('正在进行特征选择')
    # keep the top ~15% most important features (at least one)
    n_features = max(1, int(X_structured.shape[1] * 0.15))
X_selected, feature_importances, top_features = select_features_by_importance(
X_structured, y, n_features
)
X_selected = X_structured[top_features]
print(f"选择后的特征及其重要性:")
for feature, importance in zip(top_features, feature_importances):
print(f"{feature}: {importance:.4f}")
print(X_selected)
# 绘制特征重要性图
plot_feature_importance(top_features, feature_importances, Path("feature_importance.png"))
# 3. 创建时间序列数据集
print("正在创建时间序列数据集...")
timesteps = 5
dataset = TimeSeriesDataset(X_selected, y, timesteps)
# 4. 数据划分
train_size = int(0.8 * len(dataset))
train_indices = list(range(train_size))
test_indices = list(range(train_size, len(dataset)))
train_dataset = torch.utils.data.Subset(dataset, train_indices)
test_dataset = torch.utils.data.Subset(dataset, test_indices)
# 5. 贝叶斯优化超参数
print("正在进行贝叶斯优化...")
try:
best_params = optimize_hyperparameters(
X_selected.iloc[:train_size],
y[:train_size].copy(),
input_size=X_selected.shape[1]
)
print("最佳参数:", best_params)
    except Exception as e:
        print(f"贝叶斯优化失败: {str(e)}")
        # fall back to sensible defaults so the rest of the pipeline can still run
        best_params = {'hidden_size': 64, 'num_layers': 2, 'dropout': 0.3,
                       'lr': 1e-3, 'batch_size': 64, 'timesteps': timesteps}
# 6. 使用最佳参数训练最终模型
torch.cuda.empty_cache() # 清理 GPU 缓存
print("\n使用最佳参数训练模型...")
# 获取并验证batch_size
batch_size = int(best_params.get('batch_size'))
print(f"实际使用的batch_size类型: {type(batch_size)}, 值: {batch_size}") # 调试输出
model = LSTMModel(
input_size=X_selected.shape[1],
hidden_size=int(best_params['hidden_size']),
num_layers=int(best_params['num_layers']),
dropout=float(best_params['dropout'])
).to(device)
    # data loaders; num_workers=0 avoids worker-spawning/pickling issues on Windows
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,               # shuffle the training windows
        collate_fn=collate_fn,      # adds Gaussian-noise augmentation
        num_workers=0,
        pin_memory=True
    )
    val_loader = DataLoader(
        test_dataset,
        batch_size=batch_size * 2,  # larger batches are fine for evaluation
        shuffle=False,              # keep the test order fixed
        num_workers=0,
        pin_memory=True
    )
# 损失函数和优化器
criterion = nn.HuberLoss(delta=1.0)
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)
# 训练模型
model, history = train_model(
model, train_loader, val_loader, criterion, optimizer,
scheduler=scheduler, epochs=200, patience=15)
torch.cuda.empty_cache() # 清理 GPU 缓存
# 7. 评估模型
print("\n评估模型性能...")
metrics, y_true, y_pred = evaluate_model(model, val_loader, criterion, test_indices, y_scaler)
print(f"测试集 MSE: {metrics['MSE']:.4f}, MAE: {metrics['MAE']:.4f}, R2: {metrics['R2']:.4f}")
# 8. 保存所有结果
print("\n保存所有结果...")
output_dir = Path(r"D:\result2")
output_dir.mkdir(parents=True, exist_ok=True)
# 保存评估指标
metrics_df = pd.DataFrame({
'Metric': ['MSE', 'MAE', 'R2', 'MAPE', 'Test Loss'],
'Value': [metrics['MSE'], metrics['MAE'], metrics['R2'], metrics['MAPE'], metrics['test_loss']]
})
metrics_df.to_csv(output_dir / 'evaluation_metrics.csv', index=False)
# 保存训练历史
history_df = pd.DataFrame(history)
history_df.to_csv(output_dir / 'training_history.csv', index=False)
# 保存预测结果与原始数据
pred_indices = [i + timesteps - 1 for i in metrics['indices']] # 调整索引以匹配原始数据
# 确保我们有足够的datetime和locationCode数据
if len(datetimes) > max(pred_indices) and len(location_codes) > max(pred_indices):
y_true = y_true.flatten() # 确保是一维
y_pred = y_pred.flatten() # 确保是一维
result_df = pd.DataFrame({
'datetime': datetimes.iloc[pred_indices].values,
'locationCode': location_codes.iloc[pred_indices].values,
'true_value': y_true,
'predicted_value': y_pred
})
        # the model currently outputs a single value per sample, so y_pred is 1-D;
        # only add quantile columns if a multi-output (quantile) head is used
        if y_pred.ndim > 1 and y_pred.shape[1] > 2:
            result_df['predicted_lower'] = y_pred[:, 0]  # 10% quantile
            result_df['predicted_upper'] = y_pred[:, 2]  # 90% quantile
# 添加其他特征
for i, feature in enumerate(X_selected.columns):
result_df[feature] = X_selected.iloc[pred_indices, i].values
result_df.to_csv(output_dir / 'predictions_with_metadata.csv', index=False)
else:
print("警告: datetime或locationCode数据不足,无法完全匹配预测结果")
# 保存基础预测结果
pd.DataFrame({
'true_value': y_true.flatten(),
'predicted_value': y_pred.flatten()
}).to_csv(output_dir / 'predictions.csv', index=False)
# 9. 可视化结果
plt.figure(figsize=(12, 6))
plt.plot(history['train_loss'], label='训练损失')
plt.plot(history['val_loss'], label='验证损失')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('训练过程')
plt.legend()
plt.savefig(output_dir / 'training_process.png', dpi=300)
plt.close()
# 添加预测结果可视化
plt.figure(figsize=(15, 6))
plt.plot(y_true[:200], label='真实值')
plt.plot(y_pred[:200], label='预测值') # 只使用中位数预测
plt.title('预测结果对比')
plt.legend()
plt.savefig(output_dir / 'prediction_comparison.png', dpi=300)
plt.show()
    # error distribution (y_pred is 1-D: one point prediction per sample)
    errors = y_true - y_pred
plt.hist(errors, bins=50)
plt.title('预测误差分布')
plt.savefig(output_dir / 'error_distribution.png', dpi=300) # 保存图像
plt.close()
# # 添加分位数预测可视化
# plt.figure(figsize=(15, 6))
# plt.plot(y_true[:100], label='真实值')
# plt.plot(y_pred[:100, 0], label='10%分位数')
# plt.plot(y_pred[:100, 1], label='中位数')
# plt.plot(y_pred[:100, 2], label='90%分位数')
# plt.legend()
# plt.savefig(output_dir / 'quantile_predictions.png', dpi=300) # 保存图像
# plt.close()
    # 10. save the model if it performs well enough
    if metrics['R2'] > 0.8:
model_path = output_dir / 'best_model.pth'
torch.save(model.state_dict(), model_path)
print(f"模型保存成功: {model_path}")
print(f"所有结果已保存到 {output_dir}")
if __name__ == "__main__":
warnings.filterwarnings('ignore')
start_time = time.time()
main()
print(f"总运行时间: {(time.time() - start_time) / 60:.2f}分钟")import gc
print(f"总运行时间: {(time.time() - start_time) / 60:.2f}分钟")参数评估失败: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 3 is not equal to len(dims) = 2
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.0120
Function value obtained: inf
Current minimum: inf
Iteration No: 6 started. Evaluating function at random point.
数据形状 - X: (21168, 3), y: (21168, 1)
参数评估失败: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 3 is not equal to len(dims) = 2
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 0.0170
Function value obtained: inf
Current minimum: inf
Iteration No: 7 started. Evaluating function at random point.
数据形状 - X: (21168, 3), y: (21168, 1)
参数评估失败: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 3 is not equal to len(dims) = 2
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 0.0126
Function value obtained: inf
Current minimum: inf
Iteration No: 8 started. Evaluating function at random point.
数据形状 - X: (21168, 3), y: (21168, 1)
参数评估失败: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 3 is not equal to len(dims) = 2
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 0.0126
Function value obtained: inf
Current minimum: inf
Iteration No: 9 started. Evaluating function at random point.
数据形状 - X: (21168, 3), y: (21168, 1)
参数评估失败: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 3 is not equal to len(dims) = 2
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 0.0085
Function value obtained: inf
Current minimum: inf
Iteration No: 10 started. Evaluating function at random point.
数据形状 - X: (21168, 3), y: (21168, 1)
参数评估失败: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 3 is not equal to len(dims) = 2
贝叶斯优化失败: Input y contains infinity or a value too large for dtype('float64').

Question: based on the code above, analyze why the Bayesian optimization fails, and fix it.
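Analysis (based only on the code and the log above): the per-candidate failure
"permute(sparse_coo): ... input.dim() = 3 is not equal to len(dims) = 2" means a 3-D window
tensor was permuted with only two target dimensions inside TimeSeriesDataset.__getitem__,
which happens when the feature matrix is given an extra trailing dimension (e.g. an
unsqueeze(-1) on X) before the windows are sliced. Because score_fn catches the exception
and returns float('inf') for every candidate, gp_minimize only ever sees infinite objective
values and aborts with "Input y contains infinity or a value too large for dtype('float64')".
With TimeSeriesDataset keeping X 2-D and returning [timesteps, features] windows, as defined
above, score_fn produces finite losses and the optimization can proceed.

A minimal verification sketch (illustrative only: random data, arbitrary sizes, CPU) that the
window pipeline yields a finite loss:

def _verify_optimization_inputs(n_samples=200, n_features=3, timesteps=5):
    X_demo = np.random.rand(n_samples, n_features).astype(np.float32)
    y_demo = np.random.rand(n_samples, 1).astype(np.float32)
    ds = TimeSeriesDataset(X_demo, y_demo, timesteps)
    loader = DataLoader(ds, batch_size=16, shuffle=False)
    model = LSTMModel(input_size=n_features, hidden_size=32, num_layers=1, dropout=0.2)
    criterion = nn.HuberLoss(delta=1.0)
    with torch.no_grad():
        inputs, targets = next(iter(loader))
        loss = criterion(model(inputs), targets.squeeze())
    assert torch.isfinite(loss), "loss should be finite once the window shapes are correct"
    print("batch shape:", tuple(inputs.shape), "loss:", float(loss))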