# 作业:手动构造类似的数据集(如 cos(x) 数据),观察不同机器学习模型之间的差异
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
# Seed NumPy's global RNG so the noise drawn below is reproducible.
np.random.seed(42)

# Synthetic time series: a cosine wave on top of a linear trend, plus
# Gaussian noise (mean 0, std 0.5) drawn once for all 1000 points.
x = np.linspace(0, 100, num=1000)
noise = np.random.normal(loc=0, scale=0.5, size=1000)
y = np.cos(x) + 0.1 * x + noise

# Min-max scale the series into [0, 1]. The scaler works on a 2-D column,
# so reshape on the way in and flatten back to 1-D on the way out.
# NOTE(review): the scaler is fitted on the full series, i.e. before the
# train/test split performed later in this script - confirm that this is
# acceptable for the exercise.
scaler = MinMaxScaler(feature_range=(0, 1))
y_column = y.reshape(-1, 1)
scaler.fit(y_column)
scaled_y = scaler.transform(y_column).flatten()
# Build sliding-window samples from a series.
def create_sequences(data, seq_length):
    """Split a series into fixed-length input windows and next-step targets.

    Sample ``i`` consists of ``data[i:i + seq_length]`` as the input window
    and ``data[i + seq_length]`` as the value to predict.

    Args:
        data: Sequence or array indexed along its first axis.
        seq_length: Number of consecutive observations per input window.
            Must be at least 1.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(len(data) - seq_length, seq_length, ...)`` and ``y`` has shape
        ``(len(data) - seq_length, ...)``. When the series is not longer
        than ``seq_length`` both arrays are empty but keep those shapes
        (zero rows), so downstream reshapes still work.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        # A non-positive window would silently produce meaningless slices.
        raise ValueError("seq_length must be a positive integer")

    series = np.asarray(data)
    n_windows = len(series) - seq_length
    if n_windows <= 0:
        tail_shape = series.shape[1:]
        empty_X = np.empty((0, seq_length) + tail_shape, dtype=series.dtype)
        empty_y = np.empty((0,) + tail_shape, dtype=series.dtype)
        return empty_X, empty_y

    X = np.stack([series[start:start + seq_length] for start in range(n_windows)])
    # The target of window i is the element right after it, i.e. the series
    # shifted by seq_length; copy so the result does not alias the input.
    y = series[seq_length:].copy()
    return X, y
# Window length: each sample uses the previous 30 points to predict the next.
seq_length = 30

# Bind the windowed targets to their own name. Unpacking into `y` here would
# overwrite the raw series, which the plotting code further down still needs
# (it plots `y` as the original data and sizes its overlay arrays from it).
X, y_seq = create_sequences(scaled_y, seq_length)

# Chronological 80/20 split: no shuffling, the test set is the tail of the
# series.
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y_seq[:train_size], y_seq[train_size:]
# Lay every window out as one feature row per sample; -1 lets NumPy infer
# the number of feature columns.
n_samples_train, n_samples_test = X_train.shape[0], X_test.shape[0]
X_train_rf = X_train.reshape((n_samples_train, -1))
X_test_rf = X_test.reshape((n_samples_test, -1))

# Random forest: 100 trees, fixed seed for reproducibility.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train_rf, y_train)
train_predict_rf, test_predict_rf = (
    rf_model.predict(features) for features in (X_train_rf, X_test_rf)
)

# LightGBM: library defaults apart from the fixed seed. It is trained on the
# same flattened features as the random forest.
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(X_train_rf, y_train)
train_predict_lgb, test_predict_lgb = (
    lgb_model.predict(features) for features in (X_train_rf, X_test_rf)
)
def _to_original_scale(values):
    """Undo the min-max scaling; the result is an (n, 1) column array."""
    return scaler.inverse_transform(values.reshape(-1, 1))


def _rmse(actual, predicted):
    """Root mean squared error between two equally shaped arrays."""
    return np.sqrt(mean_squared_error(actual, predicted))


# Map predictions and targets back to the units of the raw series so the
# errors below are reported on the original scale.
train_predict_rf = _to_original_scale(train_predict_rf)
test_predict_rf = _to_original_scale(test_predict_rf)
train_predict_lgb = _to_original_scale(train_predict_lgb)
test_predict_lgb = _to_original_scale(test_predict_lgb)
y_train_orig = _to_original_scale(y_train)
y_test_orig = _to_original_scale(y_test)

# RMSE per model and per split.
rf_train_rmse = _rmse(y_train_orig, train_predict_rf)
rf_test_rmse = _rmse(y_test_orig, test_predict_rf)
lgb_train_rmse = _rmse(y_train_orig, train_predict_lgb)
lgb_test_rmse = _rmse(y_test_orig, test_predict_lgb)
# Visualise both models against the raw series and report their RMSE.
#
# The raw series is rebuilt from the scaled copy instead of being read from
# `y`: earlier in the script `y` is rebound to the windowed targets, which
# are scaled and `seq_length` points shorter. Plotting that array drew the
# wrong curve, and sizing the overlays from it made the test-set assignment
# fail with a shape mismatch.
raw_series = scaler.inverse_transform(scaled_y.reshape(-1, 1)).flatten()


def _on_timeline(values, start):
    """Return a NaN-filled array as long as the raw series, holding
    `values` from index `start` onwards.

    NaN entries are not drawn by matplotlib, so each overlay only shows up
    over the time steps it actually covers.
    """
    timeline = np.full(len(raw_series), np.nan)
    flat_values = np.ravel(values)
    timeline[start:start + len(flat_values)] = flat_values
    return timeline


plt.figure(figsize=(15, 7))
plt.plot(raw_series, label='原始数据', color='gray', alpha=0.5)

# The first prediction belongs to time step `seq_length` (the first window
# consumes the points before it); test predictions follow the train ones.
# Random forest overlays
train_predict_plot_rf = _on_timeline(train_predict_rf, seq_length)
test_predict_plot_rf = _on_timeline(
    test_predict_rf, len(train_predict_rf) + seq_length
)
# LightGBM overlays
train_predict_plot_lgb = _on_timeline(train_predict_lgb, seq_length)
test_predict_plot_lgb = _on_timeline(
    test_predict_lgb, len(train_predict_lgb) + seq_length
)

plt.plot(train_predict_plot_rf, label='随机森林训练集预测值', color='blue', linestyle='--')
plt.plot(test_predict_plot_rf, label='随机森林测试集预测值', color='red', linestyle='--')
plt.plot(train_predict_plot_lgb, label='LightGBM 训练集预测值', color='green', linestyle=':')
plt.plot(test_predict_plot_lgb, label='LightGBM 测试集预测值', color='orange', linestyle=':')
plt.title('时间序列预测结果对比')
plt.xlabel('时间步')
plt.ylabel('值')
plt.legend()
plt.grid(True)
plt.show()

# RMSE summary, on the original (unscaled) value range.
print(f"随机森林训练集 RMSE: {rf_train_rmse:.4f}")
print(f"随机森林测试集 RMSE: {rf_test_rmse:.4f}")
print(f"LightGBM 训练集 RMSE: {lgb_train_rmse:.4f}")
print(f"LightGBM 测试集 RMSE: {lgb_test_rmse:.4f}")
@浙大疏锦行
6145

被折叠的 条评论
为什么被折叠?



