import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor
from sklearn.covariance import EllipticEnvelope
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import warnings
# NOTE(review): blanket-silencing ALL warnings hides sklearn/torch
# deprecation and convergence messages — consider narrowing the filter.
warnings.filterwarnings('ignore')
# CJK-capable font first so Chinese labels render; DejaVu Sans as fallback.
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
# Keep the minus sign rendering correctly when a non-ASCII font is active.
plt.rcParams['axes.unicode_minus'] = False
# ===================== 0. Configuration =====================
TARGET_COLUMN = '疟疾发病人数'  # target column: weekly malaria case counts
FILE_PATH = r"C:\Users\86150\Desktop\疟疾发病人数--meta五因素暴露指数-气象因素.xlsx"
# All pipeline tunables in one place.
HYPERPARAMS = {
    # Data parameters
    'seq_len': 12,  # lag window length; 12 steps to capture seasonality
    'validation_split': 0.2,
    # Model parameters
    'adaboost_n_estimators': 300,
    'adaboost_learning_rate': 0.03,
    'et_n_estimators': 150,
    'et_max_depth': 15,
    'et_min_samples_split': 2,
    'lstm_hidden_size': 256,
    'lstm_num_layers': 2,
    'lstm_dropout': 0.2,
    'lstm_epochs': 300,
    'lstm_batch_size': 16,
    'lstm_learning_rate': 0.0005,
    # Preprocessing switches
    'remove_outliers': True,
    'target_lag': True,  # append lagged target values to the flat features
}
# ===================== 1. Data loading & cleaning =====================
print("【1】加载数据并清洗")
data_raw = pd.read_excel(FILE_PATH, sheet_name='Sheet1')
# Keep numeric columns only and drop exact duplicate rows.
data = data_raw.select_dtypes(include=[np.number]).drop_duplicates()
# Clip the target to its 1st-99th percentile range to drop extreme records.
q1 = data[TARGET_COLUMN].quantile(0.01)
q99 = data[TARGET_COLUMN].quantile(0.99)
print(f" 保留范围: [{q1:.2f}, {q99:.2f}]")
data = data[(data[TARGET_COLUMN] >= q1) & (data[TARGET_COLUMN] <= q99)]
# Optional robust outlier removal on the target alone (fit_predict == 1 keeps inliers).
# NOTE(review): dropping rows from what appears to be a weekly series leaves
# gaps that the lag features below silently bridge — confirm this is intended.
if HYPERPARAMS['remove_outliers']:
    ee = EllipticEnvelope(contamination=0.03, random_state=42, support_fraction=0.9)
    y_reshaped = data[TARGET_COLUMN].values.reshape(-1, 1)
    data = data[ee.fit_predict(y_reshaped) == 1]
# Restore chronological (index) order after the boolean filters.
data = data.sort_index()
y_raw = data[TARGET_COLUMN].values.astype(np.float32)
feature_cols = [col for col in data.columns if col != TARGET_COLUMN]
X_raw = data[feature_cols].values.astype(np.float32)
print(f"✅ 最终数据: X={X_raw.shape}, y={y_raw.shape}")
# ===================== 2. Train/test split =====================
print("\n【2】划分训练/测试集 (80/20)")
# Chronological 80/20 split — no shuffling for time-series data.
train_size = int(len(X_raw) * 0.8)
X_train_raw, X_test_raw = X_raw[:train_size], X_raw[train_size:]
y_train_raw, y_test_raw = y_raw[:train_size], y_raw[train_size:]
print(f" 训练集: {X_train_raw.shape}, 测试集: {X_test_raw.shape}")
# ===================== 3. 特征工程与强制对齐 =====================
print("\n【3】特征工程与强制对齐")
def create_enhanced_features(X, y, seq_len, add_target_lag=True):
    """Build a flat feature matrix of current plus lagged features.

    For each step t >= seq_len the row is
    [X[t], X[t-1], ..., X[t-seq_len]] and, when ``add_target_lag`` is
    True, the lagged targets [y[t-1], ..., y[t-seq_len]] are appended.

    Returns:
        (features, targets) as numpy arrays; targets[i] pairs with row i.
    """
    total = X.shape[0]
    rows, targets = [], []
    for t in range(seq_len, total):
        # Current step first, then lags in increasing order.
        parts = [X[t, :]]
        parts.extend(X[t - k, :] for k in range(1, seq_len + 1))
        if add_target_lag:
            parts.append(np.asarray([y[t - k] for k in range(1, seq_len + 1)]))
        rows.append(np.concatenate(parts))
        targets.append(y[t])
    return np.array(rows), np.array(targets)
def create_sequence_data(X, y, seq_len):
    """Slice (X, y) into rolling windows for sequence models.

    Returns X_seq of shape (N, seq_len, n_features), where window i
    covers rows [i, i + seq_len) and its paired target is y[i + seq_len].
    """
    stops = range(seq_len, len(X))
    windows = [X[stop - seq_len:stop] for stop in stops]
    labels = [y[stop] for stop in stops]
    return np.array(windows), np.array(labels)
# ---- Generate both feature views (3-D sequences for the LSTM, flat lag
# features for AdaBoost) from the same raw splits ----
X_train_seq, y_train_seq = create_sequence_data(X_train_raw, y_train_raw, HYPERPARAMS['seq_len'])
X_test_seq, y_test_seq = create_sequence_data(X_test_raw, y_test_raw, HYPERPARAMS['seq_len'])
X_train_lag, y_train_lag = create_enhanced_features(X_train_raw, y_train_raw, HYPERPARAMS['seq_len'], HYPERPARAMS['target_lag'])
X_test_lag, y_test_lag = create_enhanced_features(X_test_raw, y_test_raw, HYPERPARAMS['seq_len'], HYPERPARAMS['target_lag'])
# ---- Force both views to the same sample count so fused predictions align ----
N_train = min(len(y_train_seq), len(y_train_lag))
N_test = min(len(y_test_seq), len(y_test_lag))
X_train_seq, y_train_seq = X_train_seq[:N_train], y_train_seq[:N_train]
X_train_lag, y_train_lag = X_train_lag[:N_train], y_train_lag[:N_train]
X_test_seq, y_test_seq = X_test_seq[:N_test], y_test_seq[:N_test]
X_test_lag, y_test_lag = X_test_lag[:N_test], y_test_lag[:N_test]
print(f" 对齐后 - 训练: {N_train}, 测试: {N_test}")
# ===================== 4. Validation split =====================
print("\n【4】划分验证集")
# BUG FIX: guard against val_size == 0 — `arr[:-0]` is an EMPTY slice, which
# would silently wipe out the entire training set for small datasets.
val_size = max(1, int(N_train * HYPERPARAMS['validation_split']))
# LSTM view: hold out the last val_size samples chronologically.
X_lstm_train, y_lstm_train = X_train_seq[:-val_size], y_train_seq[:-val_size]
X_lstm_val, y_lstm_val = X_train_seq[-val_size:], y_train_seq[-val_size:]
# AdaBoost view: identical chronological split so both models share one
# validation period.
X_ada_train, y_ada_train = X_train_lag[:-val_size], y_train_lag[:-val_size]
X_ada_val, y_ada_val = X_train_lag[-val_size:], y_train_lag[-val_size:]
# ===================== 5. LSTM模型 =====================
print("\n【5】训练LSTM模型")
class OptimizedLSTM(nn.Module):
    """Bidirectional LSTM regressor with a BatchNorm + MLP head.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden units per direction.
        num_layers: number of stacked LSTM layers.
        dropout: dropout rate used in the LSTM and the head.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.hidden_size = hidden_size
        # bidirectional=True doubles the output feature dimension.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=dropout, bidirectional=True)
        self.bn = nn.BatchNorm1d(hidden_size * 2)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout / 2),
            nn.Linear(hidden_size // 2, 1),
        )

    def forward(self, x):
        """x: (batch, seq_len, input_size) -> predictions of shape (batch,)."""
        # Explicit zero initial states; 2 * num_layers because bidirectional.
        h0 = torch.zeros(self.lstm.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.lstm.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        lstm_out, _ = self.lstm(x, (h0, c0))
        out = lstm_out[:, -1, :]  # last time step, both directions concatenated
        out = self.bn(out)
        out = self.fc(out)
        # BUG FIX: squeeze ONLY the feature axis. A bare squeeze() also
        # collapsed the batch axis when batch_size == 1, returning a 0-d
        # tensor and breaking downstream loss/flatten code.
        return out.squeeze(-1)
class TimeSeriesDataset(Dataset):
    """Minimal in-memory dataset wrapping (features, targets) as float32 tensors."""

    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)

    def __len__(self):
        # Number of samples equals the leading dimension of the features.
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f" 使用设备: {device}")
# ---- Feature scaling: fit on the training split only, then apply the SAME
# fitted scaler to validation and test (no leakage) ----
scaler_X_lstm = RobustScaler()
X_lstm_train_2d = X_lstm_train.reshape(-1, X_lstm_train.shape[-1])
X_lstm_test_2d = X_test_seq.reshape(-1, X_test_seq.shape[-1])
X_lstm_train_scaled = scaler_X_lstm.fit_transform(X_lstm_train_2d).reshape(X_lstm_train.shape)
X_lstm_test_scaled = scaler_X_lstm.transform(X_lstm_test_2d).reshape(X_test_seq.shape)
# BUG FIX: the validation set was previously fed to the model UNSCALED while
# the model trained on scaled inputs, so the validation loss was meaningless
# and drove early stopping (and later the fusion weights) off the rails.
X_lstm_val_scaled = scaler_X_lstm.transform(
    X_lstm_val.reshape(-1, X_lstm_val.shape[-1])).reshape(X_lstm_val.shape)
# ---- Data loaders ----
lstm_train_dataset = TimeSeriesDataset(X_lstm_train_scaled, y_lstm_train)
lstm_train_loader = DataLoader(lstm_train_dataset, batch_size=HYPERPARAMS['lstm_batch_size'], shuffle=True)
lstm_val_dataset = TimeSeriesDataset(X_lstm_val_scaled, y_lstm_val)
lstm_val_loader = DataLoader(lstm_val_dataset, batch_size=HYPERPARAMS['lstm_batch_size'])
# ---- Model / loss / optimizer / LR schedule ----
lstm_input_size = X_lstm_train.shape[-1]
lstm_model = OptimizedLSTM(lstm_input_size, HYPERPARAMS['lstm_hidden_size'], HYPERPARAMS['lstm_num_layers'], HYPERPARAMS['lstm_dropout']).to(device)
criterion = nn.HuberLoss(delta=1.0)
optimizer = optim.AdamW(lstm_model.parameters(), lr=HYPERPARAMS['lstm_learning_rate'], weight_decay=1e-5)
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=HYPERPARAMS['lstm_learning_rate'] * 10, steps_per_epoch=len(lstm_train_loader), epochs=HYPERPARAMS['lstm_epochs'])
# ---- Training loop with early stopping on validation loss ----
best_val_loss = float('inf')
patience_counter = 0
best_model_state = None
print(" 开始训练...")
for epoch in range(HYPERPARAMS['lstm_epochs']):
    lstm_model.train()
    train_loss = 0
    for X_batch, y_batch in lstm_train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = lstm_model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        # Gradient clipping keeps the bidirectional LSTM stable.
        torch.nn.utils.clip_grad_norm_(lstm_model.parameters(), max_norm=1.0)
        optimizer.step()
        # BUG FIX: OneCycleLR was constructed but never stepped, so the
        # learning rate never followed the one-cycle policy. It must be
        # stepped once per BATCH.
        scheduler.step()
        train_loss += loss.item()
    lstm_model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in lstm_val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = lstm_model(X_batch)
            val_loss += criterion(outputs, y_batch).item()
    # Track the best snapshot; stop after 30 epochs without improvement.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        best_model_state = lstm_model.state_dict().copy()
    else:
        patience_counter += 1
        if patience_counter >= 30:
            print(f" 早停触发: epoch {epoch+1}, 最佳验证损失: {best_val_loss:.6f}")
            break
    if (epoch + 1) % 20 == 0:
        print(f" Epoch {epoch+1}: 训练损失={train_loss/len(lstm_train_loader):.6f}, 验证损失={val_loss/len(lstm_val_loader):.6f}")
lstm_model.load_state_dict(best_model_state)
# ---- In-sample and out-of-sample predictions from the best snapshot ----
lstm_model.eval()
with torch.no_grad():
    X_lstm_test_tensor = torch.FloatTensor(X_lstm_test_scaled).to(device)
    lstm_pred_test = lstm_model(X_lstm_test_tensor).cpu().numpy().flatten()
    X_lstm_train_tensor = torch.FloatTensor(X_lstm_train_scaled).to(device)
    lstm_pred_train = lstm_model(X_lstm_train_tensor).cpu().numpy().flatten()
print(f" ✅ LSTM训练完成")
# ===================== 6. AdaBoost model =====================
print("\n【6】训练AdaBoost-ExtraTrees")
# ---- Feature scaling (fit on the training split only) ----
scaler_X_lag = RobustScaler()
X_ada_train_2d = X_ada_train.reshape(-1, X_ada_train.shape[-1]) if X_ada_train.ndim > 2 else X_ada_train
X_ada_test_2d = X_test_lag.reshape(-1, X_test_lag.shape[-1]) if X_test_lag.ndim > 2 else X_test_lag
X_ada_train_scaled = scaler_X_lag.fit_transform(X_ada_train_2d).reshape(X_ada_train.shape)
X_ada_test_scaled = scaler_X_lag.transform(X_ada_test_2d).reshape(X_test_lag.shape)
# BUG FIX: early stopping previously scored the UNSCALED validation features
# against a model trained on scaled inputs, so the "best" iteration was
# chosen from garbage R² values.
X_ada_val_scaled = scaler_X_lag.transform(X_ada_val)
# ---- Model: AdaBoost over a bagged ExtraTrees base learner ----
base_estimator = ExtraTreesRegressor(
    n_estimators=HYPERPARAMS['et_n_estimators'],
    max_depth=HYPERPARAMS['et_max_depth'],
    min_samples_split=HYPERPARAMS['et_min_samples_split'],
    random_state=42,
    n_jobs=-1,
    bootstrap=True,
    max_features=0.8,
    min_samples_leaf=1,
    min_impurity_decrease=0.001,
)
adaboost_model = AdaBoostRegressor(
    estimator=base_estimator,
    n_estimators=HYPERPARAMS['adaboost_n_estimators'],
    learning_rate=HYPERPARAMS['adaboost_learning_rate'],
    loss='exponential',
    random_state=42
)
# ---- Early stopping via staged_predict: fit ONCE, then score every boosting
# stage on the validation set. (The previous loop refit the entire ensemble
# for each candidate iteration count — O(n^2) tree fits.) ----
adaboost_model.fit(X_ada_train_scaled, y_ada_train)
best_val_r2 = -np.inf
best_iter = 0
for i, stage_pred in enumerate(adaboost_model.staged_predict(X_ada_val_scaled), start=1):
    val_r2_iter = r2_score(y_ada_val, stage_pred)
    if val_r2_iter > best_val_r2:
        best_val_r2 = val_r2_iter
        best_iter = i
    if i - best_iter > 20:  # patience: stop 20 stages past the best
        break
# Refit at the selected size so predict() uses exactly best_iter stages.
adaboost_model.set_params(n_estimators=best_iter)
adaboost_model.fit(X_ada_train_scaled, y_ada_train)
adaboost_pred_train = adaboost_model.predict(X_ada_train_scaled).flatten()
adaboost_pred_test = adaboost_model.predict(X_ada_test_scaled).flatten()
print(f" ✅ AdaBoost-ExtraTrees训练完成 (最优迭代={best_iter})")
# ===================== 7. Model fusion =====================
print("\n【7】执行动态权重融合")
# ---- Validation-set R² for each model.
# BUG FIX: both models were scored on RAW validation features although they
# were trained on scaled inputs; the resulting R² values (and thus the fusion
# weights) were meaningless. Transform with the already-fitted scalers.
X_lstm_val_in = scaler_X_lstm.transform(
    X_lstm_val.reshape(-1, X_lstm_val.shape[-1])).reshape(X_lstm_val.shape)
lstm_model.eval()
with torch.no_grad():
    lstm_val_pred = lstm_model(torch.FloatTensor(X_lstm_val_in).to(device)).detach().cpu().numpy().flatten()
lstm_r2_val = r2_score(y_lstm_val, lstm_val_pred)
adaboost_r2_val = r2_score(y_ada_val, adaboost_model.predict(scaler_X_lag.transform(X_ada_val)))
print(f" 验证集R²: LSTM={lstm_r2_val:.4f}, AdaBoost={adaboost_r2_val:.4f}")
# ---- Dynamic weights proportional to validation R², floored at 0.1 so a
# weak model never gets a zero/negative weight ----
total_r2 = max(lstm_r2_val, 0.1) + max(adaboost_r2_val, 0.1)
lstm_weight = max(lstm_r2_val, 0.1) / total_r2
ada_weight = max(adaboost_r2_val, 0.1) / total_r2
print(f" 动态权重: LSTM={lstm_weight:.3f}, AdaBoost={ada_weight:.3f}")
# ---- Weighted fusion of the two models' predictions ----
train_pred = lstm_weight * lstm_pred_train + ada_weight * adaboost_pred_train
test_pred = lstm_weight * lstm_pred_test + ada_weight * adaboost_pred_test
# ===================== 8. Final alignment check =====================
print("\n【8】最终对齐检查")
# Truncate actuals and fused predictions to a common length per split.
train_len_final = min(len(y_lstm_train), len(train_pred))
test_len_final = min(len(y_test_seq), len(test_pred))
y_lstm_train_final = y_lstm_train[:train_len_final]
train_pred_final = train_pred[:train_len_final]
y_test_seq_final = y_test_seq[:test_len_final]
test_pred_final = test_pred[:test_len_final]
print(f" 最终训练长度: {train_len_final}, 最终测试长度: {test_len_final}")
# ===================== 9. Evaluation =====================
print("\n【9】性能评估")
train_r2 = r2_score(y_lstm_train_final, train_pred_final)
test_r2 = r2_score(y_test_seq_final, test_pred_final)
# Fallback: if either R² is below 0.9, stack the two base models' predictions
# with a Ridge meta-learner fitted on the TRAINING predictions.
# NOTE(review): reporting test R² from a stacker selected because the first
# score was poor is mild selection leakage — confirm this is acceptable.
if train_r2 < 0.9 or test_r2 < 0.9:
    print("⚠️ R²未达标,执行最终线性校正...")
    from sklearn.linear_model import Ridge
    ridge = Ridge(alpha=1.0)
    ridge.fit(np.column_stack([lstm_pred_train, adaboost_pred_train]), y_lstm_train)
    train_pred_final = ridge.predict(np.column_stack([lstm_pred_train, adaboost_pred_train]))
    test_pred_final = ridge.predict(np.column_stack([lstm_pred_test, adaboost_pred_test]))
    train_r2 = r2_score(y_lstm_train, train_pred_final)
    test_r2 = r2_score(y_test_seq, test_pred_final)
print(f"\n{'='*60}")
print(f"**耦合模型性能结果**")
print(f"训练R²: {train_r2:.6f}")
print(f"测试R²: {test_r2:.6f}")
print(f"{'='*60}")
# ===================== 10. Visualization =====================
print("\n【10】可视化")
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
# --- Time-series fit plot ---
ax1 = axes[0, 0]
# Map the aligned sample counts back onto raw-series positions: the first
# usable training sample sits seq_len rows into the raw data.
# NOTE(review): y_lstm_train_final excludes the held-out validation tail, so
# this panel stops val_size samples short of the train/test boundary — that
# is why the training curve appears to "end early".
train_start_idx = HYPERPARAMS['seq_len']
train_end_idx = train_start_idx + train_len_final
# The test block likewise starts seq_len rows into the raw test split.
test_start_idx = train_size + HYPERPARAMS['seq_len']
test_end_idx = test_start_idx + test_len_final
# Independent x-index arrays for the two segments.
train_indices = np.arange(train_start_idx, train_end_idx)
test_indices = np.arange(test_start_idx, test_end_idx)
# Training actual vs predicted.
ax1.plot(train_indices, y_lstm_train_final, 'b-', label='训练实际值', linewidth=2.5, alpha=0.8)
ax1.plot(train_indices, train_pred_final, 'r--', label=f'训练预测 (R²={train_r2:.3f})', linewidth=2)
# Test actual vs predicted.
ax1.plot(test_indices, y_test_seq_final, 'c-', label='测试实际值', linewidth=2.5, alpha=0.8)
ax1.plot(test_indices, test_pred_final, 'g:', label=f'测试预测 (R²={test_r2:.3f})', linewidth=2)
# Mark the train/test boundary and shade the two regions.
ax1.axvline(x=train_size + HYPERPARAMS['seq_len'], color='gray', linestyle=':', linewidth=2)
ax1.axvspan(train_start_idx, train_end_idx, alpha=0.08, color='red', label='训练区域')
ax1.axvspan(test_start_idx, test_end_idx, alpha=0.08, color='green', label='测试区域')
# Title/labels; extra pad keeps the title clear of the plot area.
ax1.set_title('Adaboost-ExtraTrees-LSTM拟合数据与真实值对比图',
              fontsize=14, fontweight='bold', pad=20)
ax1.set_xlabel('样本索引(周)', fontsize=12)
ax1.set_ylabel('疟疾发病人数', fontsize=12)
# Pin the legend where it does not cover the curves.
ax1.legend(loc='upper left', bbox_to_anchor=(0.02, 0.98), fontsize=10)
ax1.grid(True, alpha=0.3)
# --- Predicted-vs-actual scatter ---
ax2 = axes[0, 1]
# Thin the training points so at most ~1000 are drawn.
scatter_step = max(1, train_len_final // 1000)
ax2.scatter(y_lstm_train_final[::scatter_step], train_pred_final[::scatter_step],
            alpha=0.6, color='blue', s=40, label='训练集')
ax2.scatter(y_test_seq_final, test_pred_final, alpha=0.7, color='green', s=40, label='测试集')
# Identity line spanning the combined value range.
min_val = min(np.min(y_lstm_train_final), np.min(y_test_seq_final),
              np.min(train_pred_final), np.min(test_pred_final))
max_val = max(np.max(y_lstm_train_final), np.max(y_test_seq_final),
              np.max(train_pred_final), np.max(test_pred_final))
ax2.plot([min_val, max_val], [min_val, max_val], 'k--', lw=2, alpha=0.8)
ax2.text(0.05, 0.95, f'训练R²={train_r2:.4f}', transform=ax2.transAxes,
         fontsize=11, verticalalignment='top', color='blue')
ax2.text(0.05, 0.88, f'测试R²={test_r2:.4f}', transform=ax2.transAxes,
         fontsize=11, verticalalignment='top', color='green')
ax2.set_title('预测值 vs 实际值', fontsize=14, fontweight='bold', pad=20)
ax2.set_xlabel('实际值', fontsize=12)
ax2.set_ylabel('预测值', fontsize=12)
ax2.legend()
ax2.grid(True, alpha=0.3)
# --- Residual histograms ---
ax3 = axes[1, 0]
train_resid = y_lstm_train_final - train_pred_final
test_resid = y_test_seq_final - test_pred_final
# Cap the number of training residuals drawn.
ax3.hist(train_resid[:2000], bins=50, alpha=0.6, color='blue',
         label=f'训练残差 (σ={np.std(train_resid):.2f})', density=True)
ax3.hist(test_resid, bins=50, alpha=0.6, color='green',
         label=f'测试残差 (σ={np.std(test_resid):.2f})', density=True)
ax3.axvline(x=0, color='red', linestyle='--', alpha=0.8)
ax3.set_title('残差分布', fontsize=14, fontweight='bold', pad=20)
ax3.set_xlabel('残差', fontsize=12)
ax3.set_ylabel('密度', fontsize=12)
ax3.legend()
ax3.grid(True, alpha=0.3)
# --- Per-model R² comparison (LSTM alone, AdaBoost alone, fused) ---
ax4 = axes[1, 1]
models = ['LSTM', 'AdaBoost-ET', '耦合模型']
train_r2s = [
    r2_score(y_lstm_train_final, lstm_pred_train[:train_len_final]),
    r2_score(y_lstm_train_final, adaboost_pred_train[:train_len_final]),
    train_r2
]
test_r2s = [
    r2_score(y_test_seq_final, lstm_pred_test[:test_len_final]),
    r2_score(y_test_seq_final, adaboost_pred_test[:test_len_final]),
    test_r2
]
x = np.arange(len(models))
width = 0.35
ax4.bar(x - width/2, train_r2s, width, label='训练R²', color='blue', alpha=0.7)
ax4.bar(x + width/2, test_r2s, width, label='测试R²', color='green', alpha=0.7)
ax4.set_ylabel('R²分数', fontsize=12)
ax4.set_title('各模型R²对比', fontsize=14, fontweight='bold', pad=20)
ax4.set_xticks(x)
ax4.set_xticklabels(models)
ax4.legend()
ax4.grid(True, alpha=0.3, axis='y')
ax4.set_ylim([min(0.7, min(test_r2s) - 0.1), 1.0])
plt.tight_layout(pad=3.0)
plt.savefig('adaboost_extratrets_lstm_fit_comparison.png', dpi=150, bbox_inches='tight')
plt.show()
# ===================== 11. Save results =====================
print("\n【11】保存结果")
# Per-split prediction tables (actual vs fused/stacked prediction).
train_results_df = pd.DataFrame({
    'actual': y_lstm_train_final,
    'pred': train_pred_final,
    'type': 'train'
})
test_results_df = pd.DataFrame({
    'actual': y_test_seq_final,
    'pred': test_pred_final,
    'type': 'test'
})
# One sheet per split in a single workbook.
with pd.ExcelWriter('coupled_model_predictions.xlsx', engine='openpyxl') as writer:
    train_results_df.to_excel(writer, sheet_name='train', index=False)
    test_results_df.to_excel(writer, sheet_name='test', index=False)
print(f"✅ 预测结果已保存至 coupled_model_predictions.xlsx")
print(f" - 训练集: {len(train_results_df)} 条记录")
print(f" - 测试集: {len(test_results_df)} 条记录")
# Run metadata plus final metrics, dumped as JSON via a one-row DataFrame.
results = {
    '模型': 'AdaBoost-ExtraTrees-LSTM耦合(终极修复版)',
    '训练集大小': len(y_lstm_train_final),
    '测试集大小': len(y_test_seq_final),
    'seq_len': HYPERPARAMS['seq_len'],
    'lstm_hidden_size': HYPERPARAMS['lstm_hidden_size'],
    '动态权重': {'LSTM': lstm_weight, 'AdaBoost': ada_weight},
    'adaboost_best_iter': best_iter,
    'R²': {'训练': float(train_r2), '测试': float(test_r2)},
}
pd.DataFrame([results]).to_json('coupled_model_results.json', orient='records', indent=2)
print("✅ 性能指标已保存至 coupled_model_results.json")
# ===================== 12. Performance check =====================
print("\n【12】性能检查")
# Report whether both splits reached the R² >= 0.9 target.
if train_r2 >= 0.9 and test_r2 >= 0.9:
    print("🎉 成功!模型R²均达到0.9以上")
else:
    # BUG FIX: stray pasted prose after this statement made the whole file a
    # SyntaxError; removed.
    print(f"⚠️ 警告:R²未达标 (训练={train_r2:.3f}, 测试={test_r2:.3f})")