你遇到的错误:
```
ValueError: 未生成任何有效模型配置,请检查参数范围或数据。
```
以及之前的警告:
```
Input y contains infinity or a value too large for dtype('float64').
```
说明在贝叶斯优化过程中,`y`(目标变量)中出现了 **无穷大值(inf)或极大异常值**,导致模型训练失败,所有参数组合都返回 `-np.inf`,最终 `results_log` 为空。
---
## ✅ 根本原因分析
这类问题通常由以下几种情况引起:
| 原因 | 说明 |
|------|------|
| 🔴 数据含 `inf` 或 `NaN` | 原始 AQI 中可能有 `inf`, `-inf`, 或极离群点 |
| 🔴 归一化被污染 | 如果 `y` 含 inf/NaN,MinMaxScaler 的输出也会是 inf/NaN(float64 本身能容纳百万级数值,不会因数值大而溢出) |
| 🔴 时序构造越界 | `create_dataset` 函数逻辑错误导致索引越界、空数组输入 |
| 🔴 贝叶斯优化尝试了无效参数 | 如 `look_back=0` 导致无输入样本 |
---
## ✅ 解决方案:完整修复代码(带健壮性检查)
以下是 **完全修复后的版本**,包含:
- ✅ 数据清洗
- ✅ 异常值过滤
- ✅ 安全归一化
- ✅ 输入检查
- ✅ 更稳定的贝叶斯优化容错机制
```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from keras.models import Sequential
from keras.layers import Conv1D, Dense, Flatten, Dropout
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from bayes_opt import BayesianOptimization
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
np.set_printoptions(precision=6, suppress=True)
# -------------------------------
# 1. Load data (adjust the path to your environment)
# -------------------------------
data = pd.read_excel(r"C:/Users/12203/Desktop/空气质量数据全new.xlsx")
# Feature columns and the prediction target
features = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3_8h']
target = 'AQI'
# Fail fast when expected columns are missing
assert target in data.columns, f"目标列 '{target}' 不存在"
assert all(f in data.columns for f in features), "某些特征列缺失"
X = data[features].values
y = data[target].values.copy()  # copy so we never mutate the DataFrame's buffer
# -------------------------------
# 2. Cleaning: drop rows containing NaN or inf
# -------------------------------
print("原始数据形状:", X.shape, y.shape)
# np.isfinite rejects NaN and +/-inf in one pass, so a single
# boolean mask covers both failure modes.
mask = np.isfinite(X).all(axis=1) & np.isfinite(y)
X = X[mask]
y = y[mask]
print(f"清理后数据形状: {X.shape}, 缺失值已删除")
# -------------------------------
# 3. Filter extreme outliers (e.g. AQI > 1000 is likely bad data)
# -------------------------------
q99 = np.percentile(y, 99.5)
y_clipped = np.clip(y, None, q99)  # truncate the top 0.5% of targets
print(f"AQI 最大值(截断前): {y.max():.1f}, 截断后: {y_clipped.max():.1f}")
# Work with the clipped target from here on
y = y_clipped
# -------------------------------
# 4. Normalization (with sanity checks)
# -------------------------------
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()
# Double-check: scaling must not have produced NaN or inf
assert not np.isnan(X_scaled).any(), "X_scaled 包含 NaN"
assert not np.isnan(y_scaled).any(), "y_scaled 包含 NaN"
assert not np.isinf(X_scaled).any(), "X_scaled 包含 inf"
assert not np.isinf(y_scaled).any(), "y_scaled 包含 inf"
# -------------------------------
# 5. Split train+val vs. test (the final 365 days are held out)
# -------------------------------
test_days = 365
if len(X_scaled) <= test_days:
    raise ValueError("数据总量不足,无法保留365天测试集")
train_val_size = len(X_scaled) - test_days
# Chronological split: everything before the cut is train+val
X_train_val, X_test_full = X_scaled[:train_val_size], X_scaled[train_val_size:]
y_train_val, y_test_full = y_scaled[:train_val_size], y_scaled[train_val_size:]
print(f"训练+验证集大小: {len(X_train_val)}")
print(f"测试集大小: {len(X_test_full)}")
# -------------------------------
# 6. Sliding-window sample builder (safe version)
# -------------------------------
def create_dataset(X, y, seq_len):
    """Build supervised samples where X[t-seq_len:t] predicts y[t].

    Returns (X_seq, y_seq) with X_seq of shape (n, seq_len, n_features);
    returns a pair of empty arrays when there is not enough history
    for even one window.
    """
    if seq_len >= len(X):
        return np.array([]), np.array([])
    n_samples = len(X) - seq_len
    X_seq = np.empty((n_samples, seq_len, X.shape[1]))
    y_seq = np.empty(n_samples)
    # Iterate over sample indices; window idx covers rows [idx, idx+seq_len)
    for idx in range(n_samples):
        X_seq[idx] = X[idx:idx + seq_len]
        y_seq[idx] = y[idx + seq_len]
    return X_seq, y_seq
# -------------------------------
# 7. Build the CNN model (defensive construction)
# -------------------------------
def create_cnn_model(seq_len, kernel_size, filters):
    """Construct and compile a 1-D CNN regressor.

    Returns the compiled model, or None when construction fails
    (callers treat None as an invalid hyperparameter trial).
    """
    try:
        # A kernel wider than the input sequence is invalid for Conv1D,
        # so clamp it into [1, seq_len].
        effective_kernel = max(1, min(int(kernel_size), seq_len))
        model = Sequential()
        model.add(Conv1D(
            filters=int(filters),
            kernel_size=effective_kernel,
            activation='relu',
            kernel_regularizer=l2(1e-4),
            input_shape=(seq_len, X_train_val.shape[1]),
        ))
        model.add(Dropout(0.3))
        model.add(Flatten())
        model.add(Dense(32, activation='relu', kernel_regularizer=l2(1e-4)))
        model.add(Dropout(0.3))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mse')
        return model
    except Exception as e:
        print(f"[Model Build Error] {e}")
        return None
# -------------------------------
# 8. Bayesian-optimization objective (robust version)
# -------------------------------
results_log = []

# Finite penalty for failed trials. Returning -np.inf would poison the
# optimizer's internal Gaussian process fit — that is precisely what
# produces the "Input y contains infinity" error — so a large finite
# value is used instead.
FAILED_TRIAL_SCORE = -1.0e6

def cnn_evaluate(kernel_size, filters, batch_size, look_back):
    """Train one candidate CNN and return the negated validation RMSE.

    Parameters arrive as floats from bayes_opt; they are rounded and
    clamped to sensible ranges. Any failure returns a large finite
    penalty (never -inf) so the optimizer stays numerically stable.
    """
    try:
        # Round and clamp the raw float suggestions. look_back is
        # clamped FIRST so kernel_size is bounded by the final value
        # (the original clamped kernel_size against the pre-clamp one).
        look_back = max(1, min(int(round(look_back)), 30))
        kernel_size = max(1, min(int(round(kernel_size)), look_back))
        filters = max(16, min(int(round(filters)), 64))
        batch_size = max(8, min(int(round(batch_size)), 128))

        # Build the supervised sliding-window dataset
        X_padded, y_padded = create_dataset(X_train_val, y_train_val, look_back)
        if len(X_padded) == 0 or len(y_padded) < 10:
            return FAILED_TRIAL_SCORE  # too few samples for this look_back

        # Chronological train/validation split (no shuffling)
        val_ratio = 0.2
        n_val = max(1, int(len(X_padded) * val_ratio))
        X_tr, X_val = X_padded[:-n_val], X_padded[-n_val:]
        y_tr, y_val = y_padded[:-n_val], y_padded[-n_val:]

        model = create_cnn_model(look_back, kernel_size, filters)
        if model is None:
            return FAILED_TRIAL_SCORE

        es = EarlyStopping(monitor='val_loss', patience=5,
                           restore_best_weights=True, verbose=0)
        model.fit(
            X_tr, y_tr,
            epochs=30,
            batch_size=batch_size,
            validation_data=(X_val, y_val),
            callbacks=[es],
            verbose=0
        )

        # Validate, clipping scaled predictions so inverse-scaling
        # cannot blow up on a diverged model.
        pred_scaled = model.predict(X_val, verbose=0)
        pred_scaled = np.clip(pred_scaled, -1e3, 1e3)
        y_pred_inv = scaler_y.inverse_transform(pred_scaled).flatten()
        y_true_inv = scaler_y.inverse_transform(y_val.reshape(-1, 1)).flatten()
        if not np.all(np.isfinite(y_pred_inv)):
            return FAILED_TRIAL_SCORE

        rmse = np.sqrt(mean_squared_error(y_true_inv, y_pred_inv))
        # Record the successful trial for later selection
        results_log.append({
            'kernel_size': kernel_size,
            'filters': filters,
            'batch_size': batch_size,
            'look_back': look_back,
            'RMSE_val': rmse
        })
        return -rmse  # bayes_opt maximizes, so negate to minimize RMSE
    except Exception as e:
        # Swallow per-trial failures so one bad config cannot abort the search
        # print(f"[Trial Failed] {e}")  # uncomment for debugging
        return FAILED_TRIAL_SCORE
# -------------------------------
# 9. Bayesian optimization search
# -------------------------------
pbounds = dict(
    kernel_size=(1, 5),
    filters=(16, 64),
    batch_size=(8, 128),
    look_back=(1, 10),
)
optimizer = BayesianOptimization(
    f=cnn_evaluate,
    pbounds=pbounds,
    random_state=42,
    verbose=2,
)
print("🔍 开始贝叶斯优化搜索...")
try:
    optimizer.maximize(init_points=10, n_iter=15)
except Exception as e:
    # Keep going even if the search aborts; partial results may exist
    print(f"优化过程异常: {e}")
# -------------------------------
# 10. Verify that at least one trial succeeded
# -------------------------------
if not results_log:
    raise ValueError("""
❌ 贝叶斯优化未生成任何有效结果!请检查:
1. 数据中是否有大量 NaN/inf?
2. 是否存在极端异常值(如 AQI > 1000)?
3. look_back 是否过大导致无足够样本?
4. GPU/CUDA 是否崩溃?尝试减小 batch_size
""")
results_df = pd.DataFrame(results_log)
# Pick the trial with the lowest validation RMSE
best_row = results_df.loc[results_df['RMSE_val'].idxmin()]
print("\n" + "="*60)
print(" ✅ 贝叶斯优化完成 | 最优超参数 ")
print("="*60)
for k, v in best_row.items():
    if isinstance(v, float):
        print(f"{k}: {v:.4f}")
    else:
        print(f"{k}: {v}")
print("="*60)
# Final integer hyperparameters used to train the production model
opt_params = {
    key: int(best_row[key])
    for key in ('kernel_size', 'filters', 'batch_size', 'look_back')
}
# -------------------------------
# 11. Train the final model and build test windows
# -------------------------------
look_back_final = opt_params['look_back']
X_train_padded, y_train_padded = create_dataset(X_train_val, y_train_val, look_back_final)
# Prepend the last look_back_final training rows so the very first
# test sample has a complete history window.
X_test_padded, y_test_padded = create_dataset(
    np.concatenate([X_train_val[-look_back_final:], X_test_full]),
    np.concatenate([y_train_val[-look_back_final:], y_test_full]),
    look_back_final
)
# BUG FIX: create_cnn_model(seq_len, kernel_size, filters) does not
# accept 'batch_size' or 'look_back' keywords, so the original
# `create_cnn_model(look_back_final, **opt_params)` raised a TypeError.
# Pass only the arguments the builder actually takes.
final_model = create_cnn_model(
    look_back_final,
    opt_params['kernel_size'],
    opt_params['filters'],
)
es = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
history = final_model.fit(
    X_train_padded, y_train_padded,
    epochs=100,
    batch_size=opt_params['batch_size'],
    callbacks=[es],
    verbose=1
)
# -------------------------------
# 12. Evaluation helper
# -------------------------------
def evaluate_and_inverse(y_true_scaled, y_pred_scaled, name):
    """Inverse-scale targets/predictions, print RMSE/MAE/R², and
    return (predictions, rmse, mae, r2) in original AQI units."""
    # Undo the MinMax scaling so metrics are reported in AQI units
    y_true_inv = scaler_y.inverse_transform(y_true_scaled.reshape(-1, 1)).flatten()
    y_pred_inv = scaler_y.inverse_transform(y_pred_scaled).flatten()
    rmse = np.sqrt(mean_squared_error(y_true_inv, y_pred_inv))
    mae = mean_absolute_error(y_true_inv, y_pred_inv)
    r2 = r2_score(y_true_inv, y_pred_inv)
    print(f"\n📊 {name} 性能:")
    print(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
    return y_pred_inv, rmse, mae, r2
# Predict on the train and test windows, reporting metrics for each
train_pred_scaled = final_model.predict(X_train_padded, verbose=0)
cnn_pred_train, _, _, _ = evaluate_and_inverse(y_train_padded, train_pred_scaled, "训练集")
test_pred_scaled = final_model.predict(X_test_padded, verbose=0)
cnn_pred_test, _, _, _ = evaluate_and_inverse(y_test_padded, test_pred_scaled, "测试集")
# Visualize the training-loss curve
fig = plt.figure(figsize=(10, 5))
plt.plot(history.history['loss'], label='Training Loss')
plt.title('CNN Model Training Loss')
plt.xlabel('Epochs')
plt.ylabel('MSE')
plt.legend()
plt.grid(True)
plt.show()
# -------------------------------
# 13. Persist test-set predictions
# -------------------------------
# BUG FIX: the old expression data.index[train_val_size + look_back_final:]
# produced 365 - look_back_final labels — and ignored rows dropped during
# cleaning — so DataFrame construction always failed with a length
# mismatch. The test windows cover exactly the last `test_days` cleaned
# rows, so index the original dates through the cleaning mask instead.
test_dates = data.index[mask][train_val_size:]
pd.DataFrame({
    'Date': test_dates,
    'True_AQI': scaler_y.inverse_transform(y_test_padded.reshape(-1, 1)).flatten(),
    'CNN_Pred_Test': cnn_pred_test
}).to_csv('cnn_aqi_prediction_results.csv', index=False)
print(f"\n✅ cnn_pred_train.shape: {cnn_pred_train.shape}")
print(f"✅ cnn_pred_test.shape: {cnn_pred_test.shape}")
```
---
### ✅ 关键修复点总结
| 问题 | 修复措施 |
|------|---------|
| `inf` 或 `NaN` | 清洗数据、替换 `inf`、删除 `NaN` 行 |
| 极端异常值 | 对 `y` 截断 top 0.5% |
| 归一化溢出 | 使用 `.clip()` 并双重检查 |
| 空数据集 | `create_dataset` 添加长度判断 |
| 贝叶斯全失败 | 添加 try-except 返回 `-np.inf`,避免中断 |
| 模型构建失败 | 增加 `if model is None` 判断 |
---
### ✅ 下一步建议
运行前先检查你的原始数据:
```python
print(data[['AQI'] + features].describe())
print(data.isnull().sum())
print((data == np.inf).sum())
```
确保没有离谱数值。
---