GRU预测COVID-19感染规模

最新推荐文章于 2025-05-29 11:43:08 发布
overFitBrain
最新推荐文章于 2025-05-29 11:43:08 发布
阅读量231
点赞数 2
文章标签： gru 深度学习人工智能
本文链接：https://blog.youkuaiyun.com/iamfromhuoxing/article/details/146404908
版权
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import joblib
plt.rcParams['font.sans-serif'] = ['SimHei']  # 使用黑体
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题
matplotlib.use('Qt5Agg')
# 删除NaN行
# ['France', 'Germany', 'North America', 'South Korea']
# 加载数据
data = pd.read_csv("../TSPM-ML-0/prediction_results.csv")
data = data.loc[data['Entity'].isin(['France', 'Germany', 'North America', 'South Korea'])]
data = data.dropna()
# 删除NaN行
print(data.head())

# 创建滞后特征：前1-7天的感染规模
for i in range(1, 15):
    data[f'real_{i}'] = data.iloc[:,3].shift(i)

# 设置目标变量为real_1（未来一天的实际感染规模）
data['target'] = data.iloc[:,3].shift(-1)
# 特征矩阵包含当天和前7天的感染规模
feature_cols = ['estimated_actual_infections'] + [f'real_{i}' for i in range(1, 8)]
data = data.dropna()
X = data[feature_cols]
y = data['target']

# 将数据集划分为训练集和测试集,注意：不能随机划分，会打乱时间顺序
test_size = 60
print(X.shape)
print(y.shape)
X_train = X[:-test_size]
X_test = X[-test_size:]
y_train = y[:-test_size]
y_test = y[-test_size:]


# 数据预处理
# 对特征进行标准化
scaler_X = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
# 保存缩放器
joblib.dump(scaler_X, 'scaler_X.pkl')
# 对目标变量进行标准化
scaler_y = MinMaxScaler(feature_range=(0, 1))
joblib.dump(scaler_y, 'scaler_y.pkl')
# 将 y_train 和 y_test 转换为 NumPy 数组并重塑为二维数组
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))
# 重塑数据为 GRU 格式 [samples(样本数量), time_steps(时间步), features(每个时间步的特征数量)]
# 我们将滞后特征视为时间步
n_features = X_train.shape[1]
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], 8, 1)
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], 8, 1)

print("X_train_scaled.shape is --------------------------->",X_train_scaled.shape)
print("X_train_scaled.shape is --------------------------->",X_test_scaled.shape)


# 构建GRU模型
def build_gru_model(gru_units=128, dropout_rate=0.4, lr=0.001):
    model = Sequential()

    # GRU层，注意input_shape改为(time_steps, n_features)
    model.add(GRU(units=gru_units,
                  input_shape=(8, 1),  # 8个时间步，每步1个特征
                  activation='tanh',  # or__'relu'
                  return_sequences=False))  # 表示 GRU 只返回最后一个时间步的输出 h_T，而不是每个时间步的 h_t。h_T是各时刻的隐藏状态，与输入X_t共同决定要记忆前面的的多少信息

    # 其他层保持不变
    model.add(Dropout(dropout_rate))  # dropout_rate=0.2：防止过拟合，训练时随机丢弃 20% 的神经元。
    model.add(Dense(1))

    optimizer = Adam(learning_rate=lr, clipnorm=1.0)  # 添加梯度裁剪
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

    return model


# 创建模型实例
gru_model = build_gru_model(gru_units=128, dropout_rate=0.2, lr=0.001)
print(gru_model.summary())

# 设置回调函数
early_stop = EarlyStopping(monitor='val_loss', patience=50, verbose=1, mode='min',
                           restore_best_weights=True)  # 如果 val_loss 连续 15 轮不下降，停止训练，并恢复最佳权重，避免过拟合。
checkpoint = ModelCheckpoint('best_gru_covid_model.keras', monitor='val_loss', save_best_only=True,
                             verbose=1)  # 只保存验证损失最小的模型权重。
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1,
                              min_lr=0.0001)  # 5 轮内 val_loss 没有改善，学习率减半，最小 0.0001

# 训练模型
history = gru_model.fit(
    X_train_reshaped,
    y_train_scaled,
    epochs=1000,
    batch_size=16,
    validation_split=0.2,  # 分出20%作为验证集
    callbacks=[early_stop, checkpoint, reduce_lr],
    verbose=1
)

# 评估模型
# 载入最佳模型
gru_model.load_weights('best_gru_covid_model.keras')

# 预测
train_predict = gru_model.predict(X_train_reshaped)
test_predict = gru_model.predict(X_test_reshaped)

# 反归一化
train_predict = scaler_y.inverse_transform(train_predict)
test_predict = scaler_y.inverse_transform(test_predict)
y_train_actual = scaler_y.inverse_transform(y_train_scaled)
y_test_actual = scaler_y.inverse_transform(y_test_scaled)

# 计算评估指标
train_rmse = np.sqrt(mean_squared_error(y_train_actual, train_predict))
test_rmse = np.sqrt(mean_squared_error(y_test_actual, test_predict))
train_mae = mean_absolute_error(y_train_actual, train_predict)
test_mae = mean_absolute_error(y_test_actual, test_predict)
train_r2 = r2_score(y_train_actual, train_predict)
test_r2 = r2_score(y_test_actual, test_predict)
MAPE = (np.mean(np.abs((y_test_actual - test_predict) / y_test_actual))) * 100
print(f"MAPE: {MAPE}")
print('训练集 RMSE: %.4f' % train_rmse)
print('测试集 RMSE: %.4f' % test_rmse)
print('训练集 MAE: %.4f' % train_mae)
print('测试集 MAE: %.4f' % test_mae)
print('训练集 R²: %.4f' % train_r2)
print('测试集 R²: %.4f' % test_r2)

# 可视化训练历史
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='训练损失')
plt.plot(history.history['val_loss'], label='验证损失')
plt.title('模型损失')
plt.xlabel('轮次')
plt.ylabel('损失')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['mae'], label='训练MAE')
plt.plot(history.history['val_mae'], label='验证MAE')
plt.title('平均绝对误差')
plt.xlabel('轮次')
plt.ylabel('MAE')
plt.legend()
plt.tight_layout()
plt.show()

# 可视化预测结果
plt.figure(figsize=(12, 6))
plt.subplot(2, 1, 1)
plt.plot(y_train_actual, label='实际值')
plt.plot(train_predict, label='预测值')
plt.title('训练集: 实际感染规模 vs 预测感染规模')
plt.xlabel('时间点')
plt.ylabel('感染规模')
plt.legend()

plt.subplot(2, 1, 2)
plt.plot(y_test_actual, label='实际值')
plt.plot(test_predict, label='预测值')
plt.title('测试集: 实际感染规模 vs 预测感染规模')
plt.xlabel('时间点')
plt.ylabel('感染规模')
plt.legend()
plt.tight_layout()
plt.show()


# 超参数调优函数
def tune_hyperparameters():
    # 定义超参数网格
    gru_units_list = [32, 64, 128]  # GRU单元数量
    dropout_rate_list = [0.2, 0.3, 0.4]  # Dropout率
    learning_rate_list = [0.01, 0.001, 0.0001]  # 学习率
    batch_size_list = [16, 32, 64]  # Batch大小

    # 存储结果
    results = []
    best_val_loss = float('inf')
    best_params = {}

    # 迭代所有组合
    for gru_units in gru_units_list:
        for dropout_rate in dropout_rate_list:
            for lr in learning_rate_list:
                for batch_size in batch_size_list:
                    print(f"\n测试参数: units={gru_units}, dropout={dropout_rate}, lr={lr}, batch_size={batch_size}")

                    # 构建模型
                    model = build_gru_model(gru_units=gru_units, dropout_rate=dropout_rate, lr=lr)

                    # 提前停止回调
                    early_stop_tune = EarlyStopping(monitor='val_loss', patience=10, verbose=0,
                                                    restore_best_weights=True)

                    # 训练模型
                    hist = model.fit(
                        X_train_reshaped,
                        y_train_scaled,
                        epochs=50,
                        batch_size=batch_size,
                        validation_split=0.2,
                        callbacks=[early_stop_tune],
                        verbose=0
                    )

                    # 记录最佳验证损失
                    min_val_loss = min(hist.history['val_loss'])
                    results.append({
                        'gru_units': gru_units,
                        'dropout_rate': dropout_rate,
                        'learning_rate': lr,
                        'batch_size': batch_size,
                        'val_loss': min_val_loss
                    })

                    print(f"验证损失: {min_val_loss:.6f}")

                    # 更新最佳参数
                    if min_val_loss < best_val_loss:
                        best_val_loss = min_val_loss
                        best_params = {
                            'gru_units': gru_units,
                            'dropout_rate': dropout_rate,
                            'learning_rate': lr,
                            'batch_size': batch_size
                        }
                        print(f"【新最佳参数】: {best_params}")

    # 排序结果
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('val_loss')

    print("\n超参数调优结果 (前5名):")
    print(results_df.head())

    print("\n最佳参数组合:")
    print(best_params)

    return best_params


# # 取消下面注释运行超参数调优
# best_params = tune_hyperparameters()
# #
# # 使用最佳参数训练最终模型
# best_model = build_gru_model(
#     gru_units=best_params['gru_units'],
#     dropout_rate=best_params['dropout_rate'],
#     lr=best_params['learning_rate']
# )
# best_model.fit(
#     X_train_reshaped,
#     y_train_scaled,
#     epochs=100,
#     batch_size=best_params['batch_size'],
#     validation_split=0.2,
#     callbacks=[early_stop, checkpoint, reduce_lr],
#     verbose=1
# )
#
# # 模型保存与加载
# best_model.save('covid_gru_final.keras')
# # from tensorflow.keras.models import load_model
# # loaded_model = load_model('covid_gru_final.h5')
数据来源：our data in world