量化思路介绍:
EEMD分解 + LSTM预测
- 将股票数据的开盘价与收盘价分别进行EEMD分解, 每组数据分解为6个IMFs, 这6个IMFs由2个残差项, 2个周期项和2个趋势项组成, 共12个IMFs
- 模型输入数据为滞后7天的开盘价与收盘价分解后的两个IMFs, 输出数据为下一天的收盘价, 总共构建6个LSTM模型
- LSTM模型使用较为简单的双层lstm, 每层50个神经元, Adam优化器, 共100个神经元
- 得到6个IMFs的预测结果, 求和便是预测的收盘价结果
数据方面: 分为训练集, 验证集与测试集
测试集为题目指定的2022年至2024年4月30日数据, 即后562条数据
验证集为后762至562条数据, 主要目的是控制模型早停, 避免过拟合
训练集为前2717条数据, 即去掉验证集与测试集的数据
主要原理:
- EEMD分解可以帮助提取时间序列数据中不同尺度的特征信息,将原始数据分解成多个固有模态函数(IMFs),这些IMFs反映了不同尺度上的波动和趋势, 使得模型更好地捕捉到时间序列数据的内在规律和周期性
- LSTM神经网络能够解决传统RNN存在的梯度消失或梯度爆炸等问题,同时能够更好地捕捉长期依赖关系, 能够提高模型的预测性能和泛化能力,从而更准确地预测股票价格序列的走势
- 考虑了使用区间型股票价格数据以及区间型数据的相互作用对预测精度的提高, 利用开盘价与收盘价反映波动,有效捕捉了单日股价的真实波动。
一. 下载导入必要库
安装EEMD分解的python库--EMD-signal
! pip install -i https://mirrors.aliyun.com/pypi/simple/ EMD-signal
#导入各种库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import mpl
from PyEMD import EEMD
import datetime
二. 数据读取与预处理
1.数据读取
df = pd.read_csv("/home/mw/input/stock9243/data_00300.csv")
2.数据类型转换
# Parse the timezone-aware timestamp strings, keep only the calendar date,
# and use it as the DataFrame index.
parsed = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S%z')
df['date'] = parsed.dt.date
df = df.set_index('date')
# Quick sanity check of dtypes and the first rows.
df.info()
df.head()
三. EEMD分解
# Decompose the opening-price series with EEMD into at most 6 components
# (up to 5 IMFs plus residue), then plot the raw signal above each component.
eemd = EEMD()
eIMFs_open = eemd.eemd(df['open'].values, max_imf=5)

n_rows = len(eIMFs_open) + 1
x_axis = range(0, len(df['open']))
plt.figure(figsize=(12, 9))
plt.subplot(n_rows, 1, 1)
plt.plot(x_axis, df['open'].values, 'r')
for row, component in enumerate(eIMFs_open, start=2):
    plt.subplot(n_rows, 1, row)
    plt.plot(x_axis, component, 'g')
plt.tight_layout()
plt.show()
# Initialize a fresh EEMD object and decompose the closing-price series.
eemd = EEMD()
eIMFs_close = eemd.eemd(df['close'].values, max_imf=5)
# Plot the raw close series followed by each of its own components.
# BUG FIX: the original iterated `eIMFs_open` here (and sized the grid from
# it), so the close-price figure actually displayed the OPEN-price IMFs.
plt.figure(figsize=(12, 9))
plt.subplot(len(eIMFs_close) + 1, 1, 1)
plt.plot(range(0, len(df['close'])), df['close'].values, 'r')
for i, eIMF in enumerate(eIMFs_close):
    plt.subplot(len(eIMFs_close) + 1, 1, i + 2)
    plt.plot(range(0, len(df['close'])), eIMF, 'g')
plt.tight_layout()
plt.show()
# Date-based split (used only to report set sizes below).
train_start_date = datetime.date(2010, 1, 1)
train_end_date = datetime.date(2021, 12, 31)
test_start_date = datetime.date(2022, 1, 1)
test_end_date = datetime.date(2024, 4, 30)
train_set = df[train_start_date:train_end_date]
test_set = df[test_start_date:test_end_date]
print("训练集大小:", len(train_set))
print("测试集大小:", len(test_set))

# Positional split on the open/close columns actually used for modelling:
#   training   = everything except the last 762 rows
#   validation = rows [-762, -562)  (200 rows, used for early stopping)
#   test       = the final 562 rows (2022-01-01 .. 2024-04-30)
df_ = df[['open', 'close']].copy()
train_data = df_[:-762]
val_data = df_[-762:-562]
test_data = df_[-562:]
四. 训练并保存LSTM模型
import torch
import torch.nn as nn
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
# LSTM regressor: a stacked LSTM over the input window, with a linear head
# applied to the hidden state of the final time step.
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a (batch, seq_len, input_size) tensor to (batch, output_size)."""
        batch = x.size(0)
        # Fresh zero-initialized hidden and cell states per forward pass.
        h0 = torch.zeros(self.num_layers, batch, self.hidden_size, device=x.device)
        c0 = torch.zeros(self.num_layers, batch, self.hidden_size, device=x.device)
        sequence_out, _ = self.lstm(x, (h0, c0))
        # Only the last time step feeds the linear head.
        return self.fc(sequence_out[:, -1, :])
# ---- Hyper-parameters --------------------------------------------------
input_size = 14          # 7 lagged days x 2 series (close IMF + open IMF)
hidden_size = 50         # units per LSTM layer
num_layers = 2           # stacked LSTM depth
output_size = 1          # one-step-ahead prediction
learning_rate = 0.001
epochs_num = 1000
patience_num = 10        # patience for the plateau scheduler

# Train on GPU when available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Per-IMF pairing: close-price IMFs are the prediction target, the matching
# open-price IMFs act as the auxiliary regressor.
imfs = eIMFs_close
x2_imfs = eIMFs_open
lstm_re = []             # collects each IMF's out-of-sample predictions
predict_column = 'close'
var_x2 = 'open'
# Train one LSTM per IMF pair. BUG FIX: the original created a
# ReduceLROnPlateau scheduler (misnamed `early_stopping`) but never called
# `.step()` on it and implemented no early stopping at all, so every model
# always ran the full 1000 epochs at a fixed LR — contradicting the stated
# design ("验证集主要目的是控制模型早停"). This version steps the scheduler
# on validation loss and stops after `patience_num` epochs without
# improvement, restoring the best weights before predicting.
for i in range(6):
    window_size = 7   # use the previous 7 days as input lags
    n_steps = 1       # predict 1 time step ahead

    # Pair the i-th close-price IMF (target) with the i-th open-price IMF.
    # (Use a local name instead of shadowing the global `df`.)
    imf_df = pd.DataFrame(
        np.concatenate([imfs[i].reshape(-1, 1), x2_imfs[i].reshape(-1, 1)], axis=1),
        columns=[predict_column, var_x2])

    # Build lagged features: each row holds the flattened (close, open)
    # values of the preceding `window_size` days.
    windowed_data = [imf_df.iloc[j - window_size:j, [0, 1]].values.flatten()
                     for j in range(window_size, len(imf_df))]
    new_df = pd.DataFrame(windowed_data,
                          columns=[f'x{k}' for k in range(1, 2 * window_size + 1)])
    new_df['target'] = imf_df.iloc[window_size:, 0].values

    # Feature tensor of shape (samples, n_steps, 2*window_size) and targets.
    X = np.array([new_df.iloc[j:j + n_steps, :-1].values
                  for j in range(len(new_df) - n_steps + 1)])
    y = new_df['target']

    # Standardize features and target. NOTE(review): scalers are fit on the
    # full series as in the original notebook — this leaks test statistics.
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_reshaped = X.reshape(X.shape[0] * X.shape[1], X.shape[2])
    X_scaled_reshaped = scaler_X.fit_transform(X_reshaped).reshape(X.shape)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))

    # Chronological split: train / validation (200 rows); the last 762 rows
    # (validation + test) are predicted at the end.
    X_train = X_scaled_reshaped[:-762]
    X_val = X_scaled_reshaped[-762:-562]
    y_train = y_scaled[:-762]
    y_val = y_scaled[-762:-562]
    test_x = X_scaled_reshaped[-762:]

    model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=patience_num)

    X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train_t = torch.tensor(y_train, dtype=torch.float32).to(device)
    X_val_t = torch.tensor(X_val, dtype=torch.float32).to(device)
    y_val_t = torch.tensor(y_val, dtype=torch.float32).to(device)

    print(f'--------第{i+1}个imf--------')
    best_val_loss = float('inf')
    best_state = None
    epochs_no_improve = 0
    for epoch in range(epochs_num):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_t)
        loss = criterion(outputs, y_train_t)
        loss.backward()
        optimizer.step()

        # Validation-driven LR decay and early stopping.
        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(X_val_t), y_val_t).item()
        scheduler.step(val_loss)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
        if epochs_no_improve >= patience_num:
            print(f'Early stopping at epoch {epoch+1}, best val loss {best_val_loss:.4f}')
            break
        if epoch % 100 == 0:
            print(f'Epoch [{epoch+1}/{epochs_num}], Loss: {loss.item():.4f}')

    # Restore the best-validation weights before predicting.
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    with torch.no_grad():
        y_pred = model(torch.tensor(test_x, dtype=torch.float32).to(device)).cpu().numpy().flatten()
    # Undo the target scaling so predictions are back in price units.
    y_pred = scaler_y.inverse_transform(y_pred.reshape(-1, 1)).flatten()
    lstm_re.append(y_pred)
    torch.save(model.state_dict(), f'lstm_model{i+1}.pth')
    print(f'------------------------')
# The predicted close price is the element-wise sum of the per-IMF predictions.
lstm_re_dataframes = [pd.DataFrame(data) for data in lstm_re]
lstm_result = lstm_re_dataframes[0]
for frame in lstm_re_dataframes[1:]:
    lstm_result = lstm_result.add(frame, fill_value=0)
lstm_result.columns = [predict_column]
lstm_result = lstm_result[predict_column]

# First 200 predictions align with the validation window, the rest with test.
lstm_val = lstm_result[:200]
lstm_test = lstm_result[200:]

# MSE / RMSE / MAPE on validation and test sets.
val_true = val_data[predict_column].values
lstm_val_mse = mean_squared_error(val_true, lstm_val)
lstm_val_rmse = np.sqrt(lstm_val_mse)
lstm_val_mape = np.mean(np.abs((val_true - lstm_val) / val_true)) * 100

tst_true = test_data[predict_column].values
lstm_tst_mse = mean_squared_error(tst_true, lstm_test)
lstm_tst_rmse = np.sqrt(lstm_tst_mse)
lstm_tst_mape = np.mean(np.abs((tst_true - lstm_test) / tst_true)) * 100

lstm_variables = [lstm_val_mse, lstm_val_rmse, lstm_val_mape, lstm_tst_mse, lstm_tst_rmse, lstm_tst_mape]
lstm_variable_names = ['lstm_val_mse', 'lstm_val_rmse', 'lstm_val_mape', 'lstm_tst_mse', 'lstm_tst_rmse', 'lstm_tst_mape']
for name, value in zip(lstm_variable_names, lstm_variables):
    print(f"{name}: {value}")
# Plot predictions vs. ground truth: validation (top) and test (bottom).
plt.figure(figsize=(10, 6))
plt.subplot(2, 1, 1)
plt.plot(range(1, len(lstm_val) + 1), lstm_val, label='Validation set prediction value')
plt.plot(range(1, len(val_data[predict_column].values) + 1), val_data[predict_column].values, label='true value')
plt.title('Prediction after combining LSTM')
# BUG FIX: labels were set on this subplot but no legend was ever drawn.
plt.legend()
plt.subplot(2, 1, 2)
plt.plot(range(1, len(lstm_test) + 1), lstm_test, label='Test set prediction value')
plt.plot(range(1, len(test_data[predict_column].values) + 1), test_data[predict_column].values, label='true value')
plt.legend()
plt.show()
模型在验证集上的mape为0.61, 测试集上的mape为0.66, 从图中也可以看出模型预测效果很好, 很好的预测到股价的走向与趋势
# Compare each IMF with its LSTM prediction over the last 762 held-out rows.
# FIX: the subplot grid is sized from the actual number of IMFs instead of a
# hard-coded 6, so the figure stays valid if EEMD returns fewer components.
n_imfs = len(imfs)
plt.figure(figsize=(10, 6))
for i in range(1, n_imfs + 1):
    plt.subplot(n_imfs, 1, i)
    plt.plot(range(1, len(imfs[i-1][-762:]) + 1), imfs[i-1][-762:], label=f'imf {i}')
    plt.plot(range(1, len(lstm_re[i-1]) + 1), lstm_re[i-1], label=f'lstm {i}')
    plt.legend()
plt.show()
进一步观察模型在每个IMF上的预测, 可以发现LSTM模型在趋势项上预测的最为精确, 在残差项上预测的较差