# 请帮我检查并优化代码,尤其关注避免数据泄露,调整灵活权重等问题:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import TimeSeriesSplit
import joblib
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Layer
from tensorflow.keras.optimizers import Adam
import warnings
import tensorflow as tf
warnings.filterwarnings('ignore')
# ===== 自定义注意力层 =====
class AttentionLayer(Layer):
    """Additive attention that pools a sequence into a single context vector.

    Each time step of the input is scored with ``tanh(x @ W + b) @ V``; the
    scores are normalised with a softmax across time steps and used to form a
    weighted sum of the inputs.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable weights W, b and V from the input shape."""
        self.time_steps = input_shape[1]
        self.feature_dim = input_shape[2]
        dim = self.feature_dim
        # (attribute, weight name, shape, initializer) -- created in this order
        weight_specs = (
            ('W', 'att_weight', (dim, dim), 'glorot_uniform'),
            ('b', 'att_bias', (dim,), 'zeros'),
            ('V', 'att_v', (dim, 1), 'glorot_uniform'),
        )
        for attr, weight_name, shape, initializer in weight_specs:
            setattr(self, attr, self.add_weight(
                name=weight_name,
                shape=shape,
                initializer=initializer,
                trainable=True
            ))
        super().build(input_shape)

    def call(self, inputs):
        """Return the attention-weighted sum of ``inputs`` over axis 1."""
        hidden = tf.tanh(tf.matmul(inputs, self.W) + self.b)
        # One scalar score per time step
        score = tf.squeeze(tf.matmul(hidden, self.V), axis=-1)
        # Normalise across time steps so the weights of a sequence sum to 1
        alpha = tf.nn.softmax(score, axis=1)
        weighted = tf.expand_dims(alpha, axis=-1) * inputs
        return tf.reduce_sum(weighted, axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the batch and feature dimensions, dropping axis 1."""
        return (input_shape[0], input_shape[2])
# ===== 模型构建函数 =====
# 改进的模型结构
def build_model(input_shape, red_weight=0.15, blue_weight=0.1, learning_rate=0.001):
    """Build a bidirectional-LSTM + attention model with seven softmax heads.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.
        red_weight: Loss weight applied to each of the six red heads.
        blue_weight: Loss weight applied to the blue head.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled Keras model whose outputs are ``red_0`` .. ``red_5``
        (33-way softmax each) followed by ``blue_output`` (16-way softmax).
        All heads use categorical cross-entropy, so every target must be
        one-hot encoded.
    """
    # Imported locally: the file-level imports do not bring Bidirectional in.
    from tensorflow.keras.layers import Bidirectional

    inputs = Input(shape=input_shape)
    lstm_out = Bidirectional(LSTM(128, return_sequences=True))(inputs)
    att_out = AttentionLayer()(lstm_out)

    # One independent branch per red-ball position
    red_outputs = []
    for i in range(6):
        branch = Dense(64, activation='relu')(att_out)
        red_outputs.append(Dense(33, activation='softmax', name=f'red_{i}')(branch))

    blue_branch = Dense(32, activation='relu')(att_out)
    blue_output = Dense(16, activation='softmax', name='blue_output')(blue_branch)

    model = Model(inputs, red_outputs + [blue_output])

    # Per-head weights are configurable instead of being fixed constants
    loss_weights = {f'red_{i}': red_weight for i in range(6)}
    loss_weights['blue_output'] = blue_weight
    model.compile(optimizer=Adam(learning_rate),
                  loss='categorical_crossentropy',
                  loss_weights=loss_weights)
    return model
def build_attention_lstm_model(input_shape, red_weight=0.7, blue_weight=0.3,
                               learning_rate=0.001):
    """Build the LSTM + attention model with one red and one blue head.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.
        red_weight: Loss weight of the ``red_output`` head.
        blue_weight: Loss weight of the ``blue_output`` head.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled Keras model with outputs ``red_output`` (33 sigmoid units)
        and ``blue_output`` (16 sigmoid units), both trained with binary
        cross-entropy and reporting ``binary_accuracy``.
    """
    inputs = Input(shape=input_shape)

    lstm_out = LSTM(128, return_sequences=True)(inputs)
    lstm_out = Dropout(0.3)(lstm_out)
    attention_out = AttentionLayer()(lstm_out)

    # A single 33-unit head named 'red_output': this is the name the compile
    # dictionaries below (and the training targets) are keyed by.
    red_branch = Dense(64, activation='relu')(attention_out)
    red_branch = Dropout(0.2)(red_branch)
    red_output = Dense(33, activation='sigmoid', name='red_output')(red_branch)

    blue_branch = Dense(32, activation='relu')(attention_out)
    blue_branch = Dropout(0.2)(blue_branch)
    blue_output = Dense(16, activation='sigmoid', name='blue_output')(blue_branch)

    model = Model(inputs=inputs, outputs=[red_output, blue_output])
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss={'red_output': 'binary_crossentropy', 'blue_output': 'binary_crossentropy'},
        metrics={'red_output': 'binary_accuracy', 'blue_output': 'binary_accuracy'},
        # Dictionary form so each weight is tied to its head by name
        loss_weights={'red_output': red_weight, 'blue_output': blue_weight}
    )
    model.summary()
    return model
# ===== 数据预处理函数 =====
#窗口特征计算前必须进行时间序列分割
#建议将create_features中的目标构建移至prepare_data
def step1_format_data():
    """Copy the numeric rows of columns 1 and 3 of '01hand.xlsx' to '01hand2.xlsx'."""
    print("===== 步骤1: 格式化原始数据 =====")
    raw = pd.read_excel('01hand.xlsx', sheet_name='Sheet1', header=None)

    # Non-numeric cells are coerced to NaN; rows containing one are dropped
    picked = {
        label: pd.to_numeric(raw.iloc[:, position], errors='coerce')
        for label, position in (('A', 0), ('B', 2))
    }
    cleaned = pd.DataFrame(picked).dropna()

    cleaned.to_excel('01hand2.xlsx', index=False, header=False)
    print(f"新表格 '01hand2.xlsx' 创建成功! 包含 {len(cleaned)} 行数据")
def step2_process_data():
    """De-duplicate the draws in '01hand2.xlsx' and write them sorted by date.

    The sheet is flattened row by row and cut into groups of eight values
    (date followed by seven numbers). Groups that cannot be parsed as numbers
    are skipped and counted.

    Returns:
        The de-duplicated draws sorted by ascending date with a fresh
        0-based index, i.e. the same rows that are written to
        '02resultB.xlsx'. Chronological order matters to callers that treat
        the following row as the next draw.
    """
    print("\n===== 步骤2: 数据去重和排序 =====")
    input_file = "01hand2.xlsx"
    output_file1 = "02resultA.xlsx"  # descending by date
    output_file2 = "02resultB.xlsx"  # ascending by date

    df = pd.read_excel(input_file, header=None)
    all_values = df.stack().dropna().astype(str).tolist()

    # Only complete groups of eight can be interpreted
    valid_length = len(all_values) - (len(all_values) % 8)
    if len(all_values) != valid_length:
        print(f"警告: 数据总量 {len(all_values)} 不符合8的倍数, 截断至 {valid_length} 个元素")
        all_values = all_values[:valid_length]

    new_data = []
    skipped = 0
    for i in range(0, len(all_values), 8):
        group = all_values[i:i + 8]
        try:
            # int(float(...)) also accepts values stringified as '2023001.0',
            # for the date as well as for the seven numbers
            new_data.append([int(float(value)) for value in group])
        except (ValueError, OverflowError):
            skipped += 1
    if skipped:
        print(f"警告: 跳过 {skipped} 组无法解析的数据")

    columns = ['日期', '数字1', '数字2', '数字3', '数字4', '数字5', '数字6', '数字7']
    df = pd.DataFrame(new_data, columns=columns)
    df = df.drop_duplicates(subset='日期').dropna()

    df.sort_values('日期', ascending=False).to_excel(output_file1, index=False)
    print(f"降序文件保存至: {output_file1}")

    ascending = df.sort_values('日期', ascending=True).reset_index(drop=True)
    ascending.to_excel(output_file2, index=False)
    print(f"升序文件保存至: {output_file2}")
    print(f"最终数据维度: {ascending.shape}")
    return ascending
# ===== 特征工程函数 =====
def create_features(df, save_features=True):
    """Build per-draw features and next-draw targets.

    Rows are sorted by '日期' first, so every feature of a row depends only on
    that row and earlier ones, and the target of a row is the draw that
    follows it.

    Args:
        df: Draw history with columns '日期', '数字1'..'数字6' (red) and
            '数字7' (blue).
        save_features: When True, write the feature column names to
            'feature_columns.pkl' and the feature table to
            '04_features.xlsx'. When False nothing is written to disk.

    Returns:
        ``(features, red_targets, blue_targets)``. ``features`` still holds
        the '日期' column. ``red_targets[k]`` is a 33-element 0/1 vector and
        ``blue_targets[k]`` a 16-element 0/1 vector describing the draw that
        comes right after ``features`` row ``k``.

    Raises:
        ValueError: If ``df`` has too few rows to yield a single sample.
    """
    print("\n===== 步骤3: 特征工程 =====")
    red_cols = ['数字1', '数字2', '数字3', '数字4', '数字5', '数字6']
    window_size = 10
    if len(df) < window_size + 2:
        raise ValueError(
            f"create_features needs at least {window_size + 2} draws, got {len(df)}"
        )

    # Chronological order and a positional index are required by the
    # shift/rolling features and by the "next row" targets.
    df = df.sort_values('日期').reset_index(drop=True)
    reds = df[red_cols]
    red_values = reds.to_numpy()
    positions = np.arange(len(df))

    features = df[['日期']].copy()
    features['红球和值'] = reds.sum(axis=1)
    features['蓝球值'] = df['数字7']
    features['奇偶比'] = (reds % 2).sum(axis=1)
    features['大小比'] = (reds > 16).sum(axis=1)

    # Draws elapsed since each red number last appeared (0 = in this draw).
    # Before the first appearance, the number of draws seen so far is used.
    missing = {}
    for num in range(1, 34):
        hit = (red_values == num).any(axis=1)
        last_seen = pd.Series(np.where(hit, positions, np.nan)).ffill().to_numpy()
        missing[f'red_{num}_missing'] = np.where(
            np.isnan(last_seen), positions + 1, positions - last_seen
        )
    features = pd.concat(
        [features, pd.DataFrame(missing, index=features.index)], axis=1
    )

    prime_nums = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31]
    features['prime_ratio'] = reds.isin(prime_nums).sum(axis=1)

    # Trailing-window statistics (window ends at the current row)
    for col in ['红球和值', '奇偶比', '大小比']:
        rolling = features[col].rolling(window=window_size)
        features[f'{col}_MA{window_size}'] = rolling.mean()
        features[f'{col}_STD{window_size}'] = rolling.std()

    # Raw numbers of the previous 1..9 draws
    lagged = {
        f'{col}_lag{lag}': df[col].shift(lag)
        for lag in range(1, 10)
        for col in red_cols + ['数字7']
    }
    features = pd.concat([features, pd.DataFrame(lagged)], axis=1)

    # targets[i] encodes draw i + 1
    next_reds = red_values[1:]
    next_blue = df['数字7'].to_numpy()[1:]
    red_targets = (next_reds[:, :, None] == np.arange(1, 34)).any(axis=1).astype(int)
    blue_targets = (next_blue[:, None] == np.arange(1, 17)).astype(int)

    # Drop the warm-up rows (incomplete windows) and the last row (no next
    # draw). Targets are cut at the same offset so row k is labelled with
    # draw k + 1, never with its own draw.
    features = features.iloc[window_size:-1].reset_index(drop=True)
    red_targets = red_targets[window_size:]
    blue_targets = blue_targets[window_size:]

    feature_columns = features.drop(columns=['日期']).columns.tolist()
    if save_features:
        joblib.dump(feature_columns, 'feature_columns.pkl')
        print(f"特征列名已保存: {len(feature_columns)}个特征")
        features.to_excel('04_features.xlsx', index=False)
    print(f"特征工程完成, 维度: {features.shape}")
    return features, red_targets, blue_targets
# ===== 数据准备函数 =====
def prepare_data(features, red_targets, blue_targets, time_steps=10, fit_fraction=0.8):
    """Scale the features and cut them into fixed-length training windows.

    Args:
        features: Feature table containing a '日期' column, which is dropped.
        red_targets: Per-row red targets; entry ``k`` must describe the draw
            that follows feature row ``k``.
        blue_targets: Per-row blue targets, aligned like ``red_targets``.
        time_steps: Number of consecutive rows per input window.
        fit_fraction: Leading share of rows used to fit the scaler. Keeping
            it below 1 stops the most recent rows from influencing the
            scaling statistics; 1.0 fits on every row.

    Returns:
        ``(X_seq, y_red_seq, y_blue_seq, scaler_X)``. ``X_seq[j]`` holds rows
        ``j .. j + time_steps - 1`` and its targets are those of the last row
        in the window.

    Raises:
        ValueError: On mismatched lengths, too few rows, or a
            ``fit_fraction`` outside ``(0, 1]``.
    """
    print("\n===== 步骤4: 数据准备 =====")
    feature_values = features.drop(columns=['日期'])
    n_rows = len(feature_values)
    if len(red_targets) != n_rows or len(blue_targets) != n_rows:
        raise ValueError("features and targets must have the same number of rows")
    if n_rows < time_steps:
        raise ValueError(f"need at least {time_steps} rows, got {n_rows}")
    if not 0 < fit_fraction <= 1:
        raise ValueError("fit_fraction must be in (0, 1]")

    # Fit on the earliest rows only, then apply the same transform everywhere
    n_fit = max(1, int(n_rows * fit_fraction))
    scaler_X = MinMaxScaler()
    scaler_X.fit(feature_values.iloc[:n_fit])
    X_scaled = scaler_X.transform(feature_values)

    joblib.dump(scaler_X, 'scaler_X.save')
    print("特征缩放器已保存")

    X_seq, y_red_seq, y_blue_seq = [], [], []
    # `end` is exclusive; running up to n_rows keeps the most recent window
    for end in range(time_steps, n_rows + 1):
        X_seq.append(X_scaled[end - time_steps:end, :])
        # Target of the window's last row, i.e. the draw right after it
        y_red_seq.append(red_targets[end - 1])
        y_blue_seq.append(blue_targets[end - 1])
    X_seq = np.array(X_seq)
    y_red_seq = np.array(y_red_seq)
    y_blue_seq = np.array(y_blue_seq)
    print(f"时间序列数据形状: X={X_seq.shape}, y_red={y_red_seq.shape}, y_blue={y_blue_seq.shape}")

    # Latest scaled window, stored for later prediction
    joblib.dump(X_scaled[-time_steps:], 'historical_data.pkl')
    print("历史数据已保存用于预测")
    return X_seq, y_red_seq, y_blue_seq, scaler_X
# ===== 模型训练函数 =====
# Imports used by the training code.
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Suggested tweaks, to be applied INSIDE train_model where `X` and `callbacks`
# exist. At module level those names are undefined and the statements raise
# NameError on import, so they are kept here as notes only:
#   X_padded = pad_sequences(X, maxlen=10, padding='pre', dtype='float32')
#   callbacks.append(EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True))
def train_model(X, y_red, y_blue, n_splits=3, epochs=100, batch_size=32):
    """Train one model per time-series fold.

    Args:
        X: Input windows, indexed along the first axis in time order.
        y_red: Red targets aligned with ``X``.
        y_blue: Blue targets aligned with ``X``.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        epochs: Maximum number of epochs per fold.
        batch_size: Mini-batch size.

    Returns:
        The list of trained models, one per fold. Each model is also saved
        to 'best_model_fold{n}.h5', and the lowest validation loss of every
        fold is written to 'fold_val_losses.pkl' so that an ensemble can
        weight the models by validation performance.
    """
    # Imported locally so the function does not depend on file-level imports
    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

    print("\n===== 步骤5: 模型训练 =====")
    best_models = []
    fold_val_losses = []
    # Every fold validates on samples that come after its training samples
    tscv = TimeSeriesSplit(n_splits=n_splits)

    for fold, (train_index, val_index) in enumerate(tscv.split(X), start=1):
        print(f"\n===== 训练 Fold {fold}/{n_splits} =====")
        X_train, X_val = X[train_index], X[val_index]
        y_red_train, y_red_val = y_red[train_index], y_red[val_index]
        y_blue_train, y_blue_val = y_blue[train_index], y_blue[val_index]

        model = build_attention_lstm_model((X_train.shape[1], X_train.shape[2]))
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1),
            ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=7, min_lr=1e-6, verbose=1)
        ]
        history = model.fit(
            X_train,
            {'red_output': y_red_train, 'blue_output': y_blue_train},
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val, {'red_output': y_red_val, 'blue_output': y_blue_val}),
            callbacks=callbacks,
            verbose=1
        )

        model.save(f'best_model_fold{fold}.h5')
        best_models.append(model)
        fold_val_losses.append(float(min(history.history['val_loss'])))
        plot_training_history(history, fold)

    joblib.dump(fold_val_losses, 'fold_val_losses.pkl')
    return best_models
def plot_training_history(history, fold):
    """Save a 2x2 figure of the training curves to 'training_history_fold{fold}.png'.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        fold: Fold number used in the titles and in the file name.

    Curves whose key is absent from ``history.history`` are skipped instead
    of raising ``KeyError``.
    """
    logs = history.history
    # The learning-rate series is logged as 'lr' or 'learning_rate'
    # depending on the Keras version.
    lr_key = next((key for key in ('lr', 'learning_rate') if key in logs), 'lr')

    # (title, y-label, [(history key, legend label), ...]) per subplot
    panels = [
        (f'Fold {fold} - 总损失曲线', '损失',
         [('loss', '训练损失'), ('val_loss', '验证损失')]),
        (f'Fold {fold} - 红球准确率', '准确率',
         [('red_output_binary_accuracy', '红球训练准确率'),
          ('val_red_output_binary_accuracy', '红球验证准确率')]),
        (f'Fold {fold} - 蓝球准确率', '准确率',
         [('blue_output_binary_accuracy', '蓝球训练准确率'),
          ('val_blue_output_binary_accuracy', '蓝球验证准确率')]),
        (f'Fold {fold} - 学习率变化', '学习率',
         [(lr_key, '学习率')]),
    ]

    plt.figure(figsize=(15, 10))
    for position, (title, ylabel, series) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        drawn = False
        for key, label in series:
            if key in logs:
                plt.plot(logs[key], label=label)
                drawn = True
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('Epoch')
        # legend() on an axis without labelled lines only emits a warning
        if drawn:
            plt.legend()

    plt.tight_layout()
    plt.savefig(f'training_history_fold{fold}.png')
    plt.close()
# ===== 预测准备函数 =====
def prepare_prediction_input(df, features, scaler_X):
    """Build the scaled feature row that describes the latest draw in ``df``.

    Args:
        df: Draw history in ascending date order with columns '日期',
            '数字1'..'数字6' and '数字7'; the last row is the latest draw.
        features: Kept for backward compatibility and not read. Window
            statistics are computed from ``df`` so that they include the
            latest draw.
        scaler_X: Fitted scaler exposing ``transform``.

    Returns:
        The result of ``scaler_X.transform`` on a one-row frame whose columns
        follow the order stored in 'feature_columns.pkl'. Columns this
        function cannot compute are filled with 0.

    Raises:
        ValueError: If ``df`` is empty.
    """
    print("\n===== 准备预测输入 =====")
    if len(df) == 0:
        raise ValueError("df must contain at least one draw")

    feature_columns = joblib.load('feature_columns.pkl')
    print(f"预期特征数量: {len(feature_columns)}")

    red_cols = ['数字1', '数字2', '数字3', '数字4', '数字5', '数字6']
    prime_nums = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31]
    window_size = 10
    reds = df[red_cols]
    n_draws = len(df)

    # Per-draw base series; the last element belongs to the latest draw
    base = pd.DataFrame({
        '红球和值': reds.sum(axis=1),
        '奇偶比': (reds % 2).sum(axis=1),
        '大小比': (reds > 16).sum(axis=1),
    })

    row = {col: base[col].iloc[-1] for col in base.columns}
    row['蓝球值'] = df['数字7'].iloc[-1]
    row['prime_ratio'] = int(reds.iloc[-1].isin(prime_nums).sum())

    # Trailing-window statistics ending at the latest draw
    for col in base.columns:
        recent = base[col].iloc[-window_size:]
        row[f'{col}_MA{window_size}'] = recent.mean()
        row[f'{col}_STD{window_size}'] = recent.std()

    # lag k = value k draws before the latest one
    for lag in range(1, 10):
        for col in red_cols + ['数字7']:
            if n_draws > lag:
                row[f'{col}_lag{lag}'] = df[col].iloc[-lag - 1]
            else:
                # Not enough history: fall back to the column mean
                row[f'{col}_lag{lag}'] = df[col].mean()

    # Draws elapsed since each red number last appeared (0 = latest draw)
    red_values = reds.to_numpy()
    for num in range(1, 34):
        seen_at = np.flatnonzero((red_values == num).any(axis=1))
        row[f'red_{num}_missing'] = (n_draws - 1 - seen_at[-1]) if seen_at.size else n_draws

    # Align with the stored column order; anything still missing becomes 0
    prediction_features = pd.DataFrame([row]).reindex(columns=feature_columns)
    prediction_features = prediction_features.fillna(0).astype(float)

    X_pred = scaler_X.transform(prediction_features)
    print(f"预测输入形状: {X_pred.shape}")
    return X_pred
# ===== 预测函数 =====
def predict_next_period(models, weights=None, time_steps=10):
    """Predict the next draw with a weighted ensemble of trained models.

    Args:
        models: Non-empty list of trained models. ``model.predict`` must
            return a list of head outputs whose last element is the blue
            head; all preceding heads are treated as red heads.
        weights: Optional ensemble weight per model. When omitted, the
            weights are the inverse validation losses stored in
            'fold_val_losses.pkl' if that file exists and has one entry per
            model; otherwise all models are weighted equally.
        time_steps: Number of most recent feature rows fed to the models.

    Returns:
        ``(red_numbers, red_scores, blue_numbers, blue_scores)``: the six
        best red numbers and three best blue numbers (1-based) with their
        ensemble scores, best first.

    Raises:
        ValueError: If ``models`` is empty, the weights are unusable, or the
            history is too short to form one input window.
    """
    if not models:
        raise ValueError("models must contain at least one trained model")

    scaler = joblib.load('scaler_X.save')
    history = pd.read_excel('02resultB.xlsx')

    # create_features discards the final input row because it has no
    # following draw. Appending a placeholder "future" row makes the latest
    # real draw keep its feature row; the placeholder itself is discarded.
    placeholder = history.iloc[[-1]].copy()
    placeholder['日期'] = history['日期'].max() + 1
    extended = pd.concat([history, placeholder], ignore_index=True)
    processed = create_features(extended, save_features=False)[0]
    if len(processed) < time_steps:
        raise ValueError(
            f"need at least {time_steps} feature rows, got {len(processed)}"
        )

    X_all = scaler.transform(processed.drop(columns=['日期']))
    batch = np.expand_dims(X_all[-time_steps:], axis=0)

    if weights is None:
        weights = np.ones(len(models))
        try:
            losses = np.asarray(joblib.load('fold_val_losses.pkl'), dtype=float)
            if losses.shape == (len(models),):
                # Lower validation loss -> larger say in the ensemble
                weights = 1.0 / np.maximum(losses, 1e-8)
        except (OSError, ValueError):
            pass  # no usable loss file: keep equal weights
    weights = np.asarray(weights, dtype=float)
    if weights.shape != (len(models),) or not np.all(weights >= 0) or weights.sum() <= 0:
        raise ValueError("weights must be non-negative, one per model, and not all zero")
    weights = weights / weights.sum()

    red_scores = 0.0
    blue_scores = 0.0
    for model, weight in zip(models, weights):
        *red_heads, blue_head = model.predict(batch, verbose=0)
        # Several red heads are averaged into one score per number
        red = np.mean([np.asarray(head)[0] for head in red_heads], axis=0)
        red_scores = red_scores + weight * red
        blue_scores = blue_scores + weight * np.asarray(blue_head)[0]

    red_indices = np.argsort(red_scores)[::-1][:6]
    blue_indices = np.argsort(blue_scores)[::-1][:3]
    return (
        [int(i) + 1 for i in red_indices],
        [float(red_scores[i]) for i in red_indices],
        [int(i) + 1 for i in blue_indices],
        [float(blue_scores[i]) for i in blue_indices]
    )
# ===== 主函数 =====
def main():
    """Run the full pipeline: clean data, build features, train, predict, report."""
    # Data cleaning
    step1_format_data()
    df = step2_process_data()

    # Feature engineering
    features, red_targets, blue_targets = create_features(df)

    # Training windows
    X, y_red, y_blue, scaler_X = prepare_data(features, red_targets, blue_targets)

    # Training
    models = train_model(X, y_red, y_blue)

    # Prediction
    red_nums, red_probs, blue_nums, blue_probs = predict_next_period(models)

    print("\n" + "="*50)
    print("双色球下一期预测结果")
    print("="*50)
    print("\n红球预测 (前6个):")
    for num, prob in zip(red_nums, red_probs):
        print(f"号码 {num:2d} : 概率 {prob:.4f}")
    print("\n蓝球预测 (前3个):")
    for num, prob in zip(blue_nums, blue_probs):
        print(f"号码 {num:2d} : 概率 {prob:.4f}")

    # The red and blue lists differ in length. Wrapping them in Series lets
    # pandas pad the shorter columns with NaN instead of raising ValueError.
    result_df = pd.DataFrame({
        '红球预测': pd.Series(red_nums),
        '红球概率': pd.Series(red_probs),
        '蓝球预测': pd.Series(blue_nums),
        '蓝球概率': pd.Series(blue_probs)
    })
    result_df.to_excel('prediction_results.xlsx', index=False)
    print("\n预测结果已保存至: prediction_results.xlsx")


if __name__ == "__main__":
    main()
def coverage_rate(y_true, y_pred, k=6):
    """Mean number of drawn numbers found among the top-``k`` predictions.

    Args:
        y_true: 0/1 indicator tensor with one row per sample, 1 marking the
            numbers that were actually drawn.
        y_pred: Score tensor with the same layout as ``y_true``.
        k: How many highest-scoring numbers count as "predicted".

    Returns:
        A scalar tensor: the overlap count between the ``k`` best-scored
        numbers and the drawn numbers, averaged over the batch.
    """
    y_true = tf.cast(y_true, y_pred.dtype)
    # Indices of the k highest scores in every row
    top_idx = tf.math.top_k(y_pred, k=k).indices
    # Pick the true indicator at those indices: 1 for a hit, 0 for a miss
    hits = tf.gather(y_true, top_idx, batch_dims=1)
    return tf.reduce_mean(tf.reduce_sum(hits, axis=-1))
# main.py
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Layer
from tensorflow.keras.optimizers import Adam
# 内联定义自定义层(避免导入问题)
class AttentionLayer(Layer):
    """Custom attention layer: softmax-weighted sum of the inputs over axis 1."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Allocate the projection matrix, bias and scoring vector."""
        self.time_steps = input_shape[1]
        self.feature_dim = input_shape[2]

        def new_weight(name, shape, initializer):
            return self.add_weight(
                name=name,
                shape=shape,
                initializer=initializer,
                trainable=True
            )

        self.W = new_weight('att_weight', (self.feature_dim, self.feature_dim), 'glorot_uniform')
        self.b = new_weight('att_bias', (self.feature_dim,), 'zeros')
        self.V = new_weight('att_v', (self.feature_dim, 1), 'glorot_uniform')
        super().build(input_shape)

    def call(self, inputs):
        """Score each step, normalise the scores, and pool the inputs."""
        projected = tf.tanh(tf.matmul(inputs, self.W) + self.b)
        logits = tf.squeeze(tf.matmul(projected, self.V), axis=-1)
        # After the squeeze the last axis indexes the steps of the sequence
        weights = tf.expand_dims(tf.nn.softmax(logits, axis=-1), axis=-1)
        return tf.reduce_sum(weights * inputs, axis=1)

    def compute_output_shape(self, input_shape):
        """Axis 1 is summed away; batch and feature dimensions remain."""
        return (input_shape[0], input_shape[2])
# 模型构建函数
def build_attention_lstm_model(input_shape, red_weight=0.7, blue_weight=0.3,
                               learning_rate=0.001):
    """Build the LSTM + attention model with a red and a blue sigmoid head.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.
        red_weight: Loss weight of the ``red_output`` head.
        blue_weight: Loss weight of the ``blue_output`` head.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled Keras model with outputs ``red_output`` (33 units) and
        ``blue_output`` (16 units), trained with binary cross-entropy and
        reporting ``binary_accuracy`` for each head.
    """
    inputs = Input(shape=input_shape)

    lstm_out = LSTM(128, return_sequences=True)(inputs)
    lstm_out = Dropout(0.3)(lstm_out)
    attention_out = AttentionLayer()(lstm_out)

    red_branch = Dense(64, activation='relu')(attention_out)
    red_output = Dense(33, activation='sigmoid', name='red_output')(red_branch)

    blue_branch = Dense(32, activation='relu')(attention_out)
    blue_output = Dense(16, activation='sigmoid', name='blue_output')(blue_branch)

    model = Model(inputs=inputs, outputs=[red_output, blue_output])
    model.compile(
        optimizer=Adam(learning_rate),
        loss={'red_output': 'binary_crossentropy', 'blue_output': 'binary_crossentropy'},
        # Per-head accuracy so the training history carries
        # '<head>_binary_accuracy' series
        metrics={'red_output': 'binary_accuracy', 'blue_output': 'binary_accuracy'},
        # Dictionary form ties each weight to its head by name
        loss_weights={'red_output': red_weight, 'blue_output': blue_weight}
    )
    return model
# 测试模型构建
if __name__ == "__main__":
    # Smoke test: build the model for a (10, 20) input and print its layout
    model = build_attention_lstm_model((10, 20))
    model.summary()
    print("模型构建成功!")
# 最新发布