tf.squeeze(output[:, idx, :], axis=[1])

A few key points:

The slice output[:, idx, :]

Assume output has shape [batch_size, seq_len, hidden_dim].

output[:, idx, :] takes the token representation at sequence position idx.

If the indexing keeps that axis — e.g. when it is written as a length-1 slice, output[:, idx:idx+1, :] — the result has shape [batch_size, 1, hidden_dim]: a length-1 dimension is left where seq_len used to be. (Note that with a plain Python integer idx, TensorFlow's slicing actually drops that axis and returns [batch_size, hidden_dim] directly, in which case the squeeze would be unnecessary; the pattern below assumes the length-1 axis is still present.)
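A quick sketch to verify the two behaviors (a minimal example assuming TF 2.x; the tensor sizes are made up for illustration):

import tensorflow as tf

output = tf.zeros([4, 10, 768])  # [batch_size, seq_len, hidden_dim], illustrative sizes
idx = 3

print(output[:, idx, :].shape)          # (4, 768)    -- integer index drops the axis
print(output[:, idx:idx + 1, :].shape)  # (4, 1, 768) -- length-1 slice keeps it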

tf.squeeze(..., axis=[1])

squeeze removes the specified dimensions, provided each of them has size 1.

axis=[1] means: remove only dimension 1 (the seq_len axis).

The shape goes from [batch_size, 1, hidden_dim] → [batch_size, hidden_dim].

So the purpose of writing squeeze(axis=[1]) here is:
strip the redundant length-1 dimension and get a clean [batch_size, hidden_dim] tensor.

If you don't specify axis, i.e. tf.squeeze(output[:, idx, :]), you also end up with [batch_size, hidden_dim] — but the difference is that without axis, squeeze removes every dimension of size 1. If batch_size happens to be 1 (common at inference time), the batch axis gets squeezed away as well, leaving just [hidden_dim] and silently breaking downstream shape assumptions. Specifying axis=[1] removes exactly one known dimension, and raises an error if that dimension is not size 1, which is the safer choice.
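A minimal sketch of that pitfall (again assuming TF 2.x and made-up shapes, with the slice written so the length-1 axis survives):

import tensorflow as tf

out = tf.zeros([1, 10, 768])  # batch_size happens to be 1
tok = out[:, 3:4, :]          # shape (1, 1, 768)

print(tf.squeeze(tok, axis=[1]).shape)  # (1, 768) -- only the seq axis is removed
print(tf.squeeze(tok).shape)            # (768,)   -- the batch axis vanishes too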


# -*- coding: utf-8 -*- """ DKT-DSC for Assistment2012 (完整可运行版) 最后更新: 2024-07-01 """ import os import sys import numpy as np import tensorflow.compat.v1 as tf os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = "0" config = tf.ConfigProto() config.gpu_options.allow_growth = True tf.disable_v2_behavior() try: import psutil HAS_PSUTIL = True except ImportError: HAS_PSUTIL = False print("警告: psutil模块未安装,内存监控功能受限") from scipy.sparse import coo_matrix from tensorflow.contrib import rnn import pandas as pd from tqdm import tqdm from sklearn.metrics import mean_squared_error, r2_score, roc_curve, auc import math import random from datetime import datetime import warnings # 忽略警告 warnings.filterwarnings('ignore') # ==================== 配置部分 ==================== DATA_BASE_PATH = './data/' data_name = 'Assist_2012' # 模拟知识图谱路径(实际使用时替换为真实路径) KNOWLEDGE_GRAPH_PATHS = { 'graphml': './output_assist2012_gat_improved/knowledge_graph.graphml', 'nodes': './output_assist2012_gat_improved/graph_nodes.csv', 'edges': './output_assist2012_gat_improved/graph_edges.csv' } # 创建模拟数据路径 os.makedirs(DATA_BASE_PATH, exist_ok=True) os.makedirs(os.path.dirname(KNOWLEDGE_GRAPH_PATHS['nodes']), exist_ok=True) # ==================== 模拟数据生成 ==================== def generate_mock_data(): """生成模拟数据用于测试""" # 生成模拟训练数据 (300条记录) train_data = pd.DataFrame({ 'user_id': np.repeat(range(10), 30), 'problem_id': np.random.randint(1, 100, 300), 'correct': np.random.randint(0, 2, 300), 'start_time': np.arange(300) # 使用简单递增数字模拟时间戳 }) train_data.to_csv(os.path.join(DATA_BASE_PATH, f'{data_name}_train.csv'), index=False) # 生成模拟测试数据 (100条记录) test_data = pd.DataFrame({ 'user_id': np.repeat(range(5), 20), 'problem_id': np.random.randint(1, 100, 100), 'correct': np.random.randint(0, 2, 100), 'start_time': np.arange(100) + 300 # 时间戳接续训练数据 }) test_data.to_csv(os.path.join(DATA_BASE_PATH, f'{data_name}_test.csv'), index=False) # 生成模拟知识图谱节点数据 node_ids = [f'problem_{i}' for i in range(1, 101)] + \ [f'concept_{i}' for i in range(1, 21)] node_types = ['problem'] * 100 + ['concept'] * 20 mock_node_data = pd.DataFrame({ 'node_id': node_ids, 'type': node_types, 'difficulty': np.random.rand(120), 'avg_accuracy': np.random.rand(120), 'total_attempts': np.random.randint(100, 1000, 120), 'avg_confidence': np.random.rand(120) }) mock_node_data.to_csv(KNOWLEDGE_GRAPH_PATHS['nodes'], index=False) # 生成模拟边数据 sources = np.random.choice(node_ids, 500) targets = np.random.choice(node_ids, 500) weights = np.random.rand(500) mock_edge_data = pd.DataFrame({ 'source': sources, 'target': targets, 'weight': weights }) mock_edge_data.to_csv(KNOWLEDGE_GRAPH_PATHS['edges'], index=False) # 检查并生成模拟数据 if not os.path.exists(os.path.join(DATA_BASE_PATH, f'{data_name}_train.csv')): print("[系统] 检测到缺少数据文件,正在生成模拟数据...") generate_mock_data() # ==================== Flags配置 ==================== tf.flags.DEFINE_float("epsilon", 1e-8, "Adam优化器的epsilon值") tf.flags.DEFINE_float("l2_lambda", 0.003, "L2正则化系数") tf.flags.DEFINE_float("learning_rate", 2e-4, "学习率") tf.flags.DEFINE_float("max_grad_norm", 5.0, "梯度裁剪阈值") tf.flags.DEFINE_float("keep_prob", 0.7, "Dropout保留概率") tf.flags.DEFINE_integer("hidden_layer_num", 2, "隐藏层数量") tf.flags.DEFINE_integer("hidden_size", 64, "隐藏层大小") tf.flags.DEFINE_integer("evaluation_interval", 1, "评估间隔周期数") tf.flags.DEFINE_integer("batch_size", 32, "批次大小") # 减小批次大小以便在模拟数据上运行 tf.flags.DEFINE_integer("problem_len", 20, "问题序列长度") tf.flags.DEFINE_integer("epochs", 5, "训练周期数") # 减少epoch以便快速测试 tf.flags.DEFINE_boolean("allow_soft_placement", 
True, "允许软设备放置") tf.flags.DEFINE_boolean("log_device_placement", False, "记录设备放置信息") tf.flags.DEFINE_string("train_data_path", os.path.join(DATA_BASE_PATH, f'{data_name}_train.csv'), "训练数据路径") tf.flags.DEFINE_string("test_data_path", os.path.join(DATA_BASE_PATH, f'{data_name}_test.csv'), "测试数据路径") FLAGS = tf.flags.FLAGS # 焦点损失参数 FOCAL_LOSS_GAMMA = 2.0 FOCAL_LOSS_ALPHA = 0.25 # 学习率衰减参数 DECAY_STEPS = 100 DECAY_RATE = 0.97 # 早停参数 EARLY_STOP_PATIENCE = 3 def memory_usage(): if HAS_PSUTIL: try: process = psutil.Process(os.getpid()) return process.memory_info().rss / (1024 ** 2) except: return 0.0 return 0.0 # ==================== 时间戳处理工具函数 ==================== def parse_timestamp(timestamp_str): """尝试多种格式解析时间戳""" if isinstance(timestamp_str, (int, float, np.number)): return float(timestamp_str) if isinstance(timestamp_str, str): timestamp_str = timestamp_str.strip('"\' ') # 尝试常见时间格式 for fmt in ('%Y-%m-%d %H:%M:%S', '%m/%d/%Y %H:%M', '%Y-%m-%d', '%s'): try: if fmt == '%s': # Unix时间戳 return float(timestamp_str) dt = datetime.strptime(timestamp_str, fmt) return dt.timestamp() except ValueError: continue return np.nan # ==================== 知识图谱加载器 ==================== class KnowledgeGraphLoader: def __init__(self): self.node_features = None self.adj_matrix = None self.problem_to_node = {} self.node_id_map = {} self.static_node_count = 0 self._rows = None self._cols = None def load(self): print("\n[KG] 加载知识图谱...") try: if not os.path.exists(KNOWLEDGE_GRAPH_PATHS['nodes']): raise FileNotFoundError(f"节点文件未找到: {KNOWLEDGE_GRAPH_PATHS['nodes']}") if not os.path.exists(KNOWLEDGE_GRAPH_PATHS['edges']): raise FileNotFoundError(f"边文件未找到: {KNOWLEDGE_GRAPH_PATHS['edges']}") node_df = pd.read_csv(KNOWLEDGE_GRAPH_PATHS['nodes']) self.static_node_count = len(node_df) print(f"[KG] 总节点数: {self.static_node_count}") # 处理空值 print("[KG] 处理特征空值...") feature_cols = [col for col in node_df.columns if col not in ['node_id', 'type']] for col in feature_cols: if node_df[col].isna().any(): if 'accuracy' in col or 'confidence' in col: median_val = node_df[col].median() node_df[col] = node_df[col].fillna(median_val) else: for node_type in ['problem', 'concept']: mask = node_df['type'] == node_type type_median = node_df.loc[mask, col].median() node_df.loc[mask, col] = node_df.loc[mask, col].fillna(type_median) # 特征标准化 raw_features = node_df[feature_cols].values raw_features = np.nan_to_num(raw_features) feature_mean = np.mean(raw_features, axis=0) feature_std = np.std(raw_features, axis=0) + 1e-8 self.node_features = np.array( (raw_features - feature_mean) / feature_std, dtype=np.float32 ) # 创建映射 self.node_id_map = {row['node_id']: idx for idx, row in node_df.iterrows()} # 创建问题映射 self.problem_to_node = {} problem_count = 0 for idx, row in node_df.iterrows(): if row['type'] == 'problem': try: problem_id = int(row['node_id'].split('_')[1]) self.problem_to_node[problem_id] = idx problem_count += 1 except (IndexError, ValueError): continue print(f"[KG] 已加载 {problem_count} 个问题节点映射") # 加载边数据 edge_df = pd.read_csv(KNOWLEDGE_GRAPH_PATHS['edges']) rows, cols, data = [], [], [] grouped = edge_df.groupby('source') for src, group in tqdm(grouped, total=len(grouped), desc="处理边数据"): src_idx = self.node_id_map.get(src, -1) if src_idx == -1: continue neighbors = [] for _, row in group.iterrows(): tgt_idx = self.node_id_map.get(row['target'], -1) if tgt_idx != -1: neighbors.append((tgt_idx, row['weight'])) neighbors.sort(key=lambda x: x[1], reverse=True) top_k = min(100, len(neighbors)) for i in range(top_k): rows.append(src_idx) 
cols.append(neighbors[i][0]) data.append(neighbors[i][1]) # 添加自环 for i in range(self.static_node_count): rows.append(i) cols.append(i) data.append(1.0) # 创建稀疏矩阵 adj_coo = coo_matrix( (data, (rows, cols)), shape=(self.static_node_count, self.static_node_count), dtype=np.float32 ) self.adj_matrix = adj_coo.tocsc() self._rows = np.array(rows) self._cols = np.array(cols) except Exception as e: print(f"知识图谱加载失败: {str(e)}") raise # ==================== 图注意力层 ==================== class GraphAttentionLayer: def __init__(self, input_dim, output_dim, kg_loader, scope=None): self.kg_loader = kg_loader self.node_count = kg_loader.static_node_count self._rows = kg_loader._rows self._cols = kg_loader._cols with tf.variable_scope(scope or "GAT"): self.W = tf.get_variable( "W", [input_dim, output_dim], initializer=tf.initializers.variance_scaling( scale=0.1, mode='fan_avg', distribution='uniform') ) self.attn_kernel = tf.get_variable( "attn_kernel", [output_dim * 2, 1], initializer=tf.initializers.variance_scaling( scale=0.1, mode='fan_avg', distribution='uniform') ) self.bias = tf.get_variable( "bias", [output_dim], initializer=tf.zeros_initializer() ) def __call__(self, inputs): inputs = tf.clip_by_value(inputs, -5, 5) h = tf.matmul(inputs, self.W) h = tf.clip_by_value(h, -5, 5) h_src = tf.gather(h, self._rows) h_dst = tf.gather(h, self._cols) h_concat = tf.concat([h_src, h_dst], axis=1) edge_logits = tf.squeeze(tf.matmul(h_concat, self.attn_kernel), axis=1) edge_logits = tf.clip_by_value(edge_logits, -10, 10) edge_attn = tf.nn.leaky_relu(edge_logits, alpha=0.2) edge_indices = tf.constant(np.column_stack((self._rows, self._cols)), dtype=tf.int64) sparse_attn = tf.SparseTensor( indices=edge_indices, values=edge_attn, dense_shape=[self.node_count, self.node_count] ) sparse_attn_weights = tf.sparse_softmax(sparse_attn) output = tf.sparse_tensor_dense_matmul(sparse_attn_weights, h) output = tf.clip_by_value(output, -5, 5) output += self.bias output = tf.nn.elu(output) return output # ==================== 学生知识追踪模型 ==================== class StudentModel: def __init__(self, is_training, config): self.batch_size = config.batch_size # 添加这行 self.batch_size_tensor = tf.placeholder(tf.int32, [], name='batch_size_placeholder') self.num_skills = config.num_skills self.num_steps = config.num_steps self.current = tf.placeholder(tf.int32, [None, self.num_steps], name='current') self.next = tf.placeholder(tf.int32, [None, self.num_steps], name='next') self.target_id = tf.placeholder(tf.int32, [None], name='target_ids') self.target_correctness = tf.placeholder(tf.float32, [None], name='target_correctness') with tf.device('/gpu:0'), tf.variable_scope("KnowledgeGraph", reuse=tf.AUTO_REUSE): kg_loader = KnowledgeGraphLoader() kg_loader.load() kg_node_features = tf.constant(kg_loader.node_features, dtype=tf.float32) # 增强GAT结构 gat_output = kg_node_features for i in range(2): with tf.variable_scope(f"GAT_Layer_{i + 1}"): dim = 64 if i == 0 else 32 gat_layer = GraphAttentionLayer( input_dim=gat_output.shape[1] if i > 0 else kg_node_features.shape[1], output_dim=dim, kg_loader=kg_loader ) gat_output = gat_layer(gat_output) gat_output = tf.nn.leaky_relu(gat_output, alpha=0.1) self.skill_embeddings = gat_output with tf.variable_scope("FeatureProcessing"): # 使用实际batch_size的placeholder batch_size = tf.shape(self.next)[0] # 初始化方法1:使用tf.zeros_like和tile dummy_vector = tf.zeros([1, 1], dtype=tf.float32) history_init = tf.tile(dummy_vector, [batch_size, 1]) elapsed_init = tf.tile(dummy_vector, [batch_size, 1]) # 或者初始化方法2:直接使用tf.fill # 
history_init = tf.fill([batch_size, 1], 0.0) # elapsed_init = tf.fill([batch_size, 1], 0.0) current_indices = tf.minimum(self.current, kg_loader.static_node_count - 1) current_embed = tf.nn.embedding_lookup(self.skill_embeddings, current_indices) inputs = [] valid_mask = tf.cast(tf.not_equal(self.current, 0), tf.float32) answers_float = tf.cast(self.next, tf.float32) # 初始化历史和耗时特征 history = history_init elapsed_time = elapsed_init for t in range(self.num_steps): if t > 0: past_answers = answers_float[:, :t] past_valid_mask = valid_mask[:, :t] correct_count = tf.reduce_sum(past_answers * past_valid_mask, axis=1, keepdims=True) total_valid = tf.reduce_sum(past_valid_mask, axis=1, keepdims=True) history = correct_count / (total_valid + 1e-8) elapsed_time = tf.fill([batch_size, 1], tf.cast(t, tf.float32)) with tf.variable_scope(f"feature_extraction_t{t}"): # 基础特征 current_feat = current_embed[:, t, :] # 知识图谱特征 difficulty_feature = tf.gather( kg_loader.node_features[:, 0], tf.minimum(self.current[:, t], kg_loader.static_node_count - 1) ) difficulty_feature = tf.reshape(difficulty_feature, [-1, 1]) # 情感特征 affect_features = [] for i in range(1, 3): try: affect_feature = tf.gather( kg_loader.node_features[:, i], tf.minimum(self.current[:, t], kg_loader.static_node_count - 1) ) affect_feature = tf.reshape(affect_feature, [-1, 1]) affect_features.append(affect_feature) except Exception as e: tf.logging.warning(f"情感特征{i}提取失败: {str(e)}") affect_features.append(tf.zeros_like(difficulty_feature)) # 确保所有特征都是2维的 features_to_concat = [current_feat, history, elapsed_time, difficulty_feature] + affect_features features_to_concat = [ f if len(f.shape) == 2 else tf.reshape(f, [-1, 1]) for f in features_to_concat ] # 调试信息(可选) if is_training: features_to_concat = [ tf.Print(f, [tf.shape(f)], message=f"Feature {i} shape at step {t}: ") for i, f in enumerate(features_to_concat) ] combined = tf.concat(features_to_concat, axis=1) inputs.append(combined) # 增强RNN结构 with tf.variable_scope("RNN"): cells = [] for i in range(2): cell = rnn.LSTMCell( FLAGS.hidden_size, initializer=tf.orthogonal_initializer(), forget_bias=1.0 ) if is_training and FLAGS.keep_prob < 1.0: cell = rnn.DropoutWrapper(cell, output_keep_prob=FLAGS.keep_prob) cells.append(cell) stacked_cell = rnn.MultiRNNCell(cells) outputs, _ = tf.nn.dynamic_rnn( stacked_cell, tf.stack(inputs, axis=1), dtype=tf.float32 ) output = tf.reshape(outputs, [-1, FLAGS.hidden_size]) with tf.variable_scope("Output"): hidden = tf.layers.dense( output, units=32, activation=tf.nn.relu, kernel_initializer=tf.initializers.glorot_uniform() ) logits = tf.layers.dense( hidden, units=1, kernel_initializer=tf.initializers.glorot_uniform() ) self._all_logits = tf.clip_by_value(logits, -20, 20) selected_logits = tf.gather(tf.reshape(self._all_logits, [-1]), self.target_id) self.pred = tf.clip_by_value(tf.sigmoid(selected_logits), 1e-8, 1 - 1e-8) with tf.variable_scope("Loss"): labels = tf.clip_by_value(self.target_correctness, 0.05, 0.95) pos_weight = tf.reduce_sum(1.0 - labels) / (tf.reduce_sum(labels) + 1e-8) bce_loss = tf.nn.weighted_cross_entropy_with_logits( targets=labels, logits=selected_logits, pos_weight=pos_weight ) confidence_penalty = tf.reduce_mean( tf.square(tf.sigmoid(selected_logits) - 0.5) ) loss = tf.reduce_mean(bce_loss) + 0.1 * confidence_penalty l2_loss = tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name ]) * FLAGS.l2_lambda self.cost = loss + l2_loss # ==================== 数据加载 ==================== def read_data_from_csv_file(path, 
kg_loader, is_training=False): students = [] student_ids = [] max_skill = 0 missing_problems = set() if not os.path.exists(path): print(f"❌ 文件不存在: {path}") return [], [], [], 0, 0, 0 try: print(f"[数据] 加载数据文件: {path}") try: data_df = pd.read_csv(path) except Exception as e: print(f"CSV读取失败: {str(e)}") encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252'] for encoding in encodings: try: data_df = pd.read_csv(path, encoding=encoding) break except: continue if 'data_df' not in locals(): return [], [], [], 0, 0, 0 # 列名标准化 possible_columns = { 'user_id': ['user_id', 'userid', 'student_id', 'studentid'], 'problem_id': ['problem_id', 'problemid', 'skill_id', 'skillid'], 'correct': ['correct', 'correctness', 'answer', 'accuracy'], 'start_time': ['start_time', 'timestamp', 'time', 'date'] } actual_columns = {} for col_type, possible_names in possible_columns.items(): found = False for name in possible_names: if name in data_df.columns: actual_columns[col_type] = name found = True break if not found: print(f"❌ 错误: 找不到 {col_type} 列") return [], [], [], 0, 0, 0 data_df = data_df.rename(columns={ actual_columns['user_id']: 'user_id', actual_columns['problem_id']: 'problem_id', actual_columns['correct']: 'correct', actual_columns['start_time']: 'start_time' }) # 时间戳转换 print("[数据] 转换时间戳...") timestamp_col = data_df['start_time'] if isinstance(timestamp_col.iloc[0], str): try: data_df['start_time'] = timestamp_col.astype(float) except ValueError: parsed_times = timestamp_col.apply(parse_timestamp) nan_count = parsed_times.isna().sum() if nan_count > 0: print(f"⚠️ 警告: {nan_count}个时间戳无法解析,将设为0") parsed_times = parsed_times.fillna(0) data_df['start_time'] = parsed_times else: data_df['start_time'] = timestamp_col.astype(float) # 按学生分组 grouped = data_df.groupby('user_id') for user_id, group in tqdm(grouped, total=len(grouped), desc="处理学生数据"): try: group = enhanced_data_validation(group, kg_loader) if group is None: continue problems = group['problem_id'].values answers = group['correct'].values.astype(int) timestamps = group['start_time'].values.astype(float) valid_data = [] invalid_count = 0 for i, (p, a) in enumerate(zip(problems, answers)): if p in kg_loader.problem_to_node and a in (0, 1): valid_data.append((p, a)) else: invalid_count += 1 if p != 0 and p not in missing_problems: missing_problems.add(p) if len(valid_data) < 2: continue problems, answers = zip(*valid_data) n_split = (len(problems) + FLAGS.problem_len - 1) // FLAGS.problem_len for k in range(n_split): start = k * FLAGS.problem_len end = (k + 1) * FLAGS.problem_len seg_problems = list(problems[start:end]) seg_answers = list(answers[start:end]) if len(seg_problems) < FLAGS.problem_len: pad_len = FLAGS.problem_len - len(seg_problems) seg_problems += [0] * pad_len seg_answers += [0] * pad_len mapped_problems = [kg_loader.problem_to_node.get(p, 0) for p in seg_problems] students.append(([user_id, k], mapped_problems, seg_answers)) max_skill = max(max_skill, max(mapped_problems)) student_ids.append(user_id) except Exception as e: print(f"处理学生 {user_id} 时出错: {str(e)}") continue except Exception as e: print(f"数据加载失败: {str(e)}") return [], [], [], 0, 0, 0 return students, [], student_ids, max_skill, 0, 0 def enhanced_data_validation(group, kg_loader): """增强数据验证""" problems = group['problem_id'].values timestamps = group['start_time'].values.astype(float) valid_indices = np.where(~np.isnan(timestamps))[0] if len(valid_indices) > 1: time_diffs = np.diff(timestamps[valid_indices]) if np.any(time_diffs < 0): sort_idx = np.argsort(timestamps) group = 
group.iloc[sort_idx].reset_index(drop=True) valid_mask = [p in kg_loader.problem_to_node for p in problems] if not any(valid_mask): return None return group[valid_mask] # ==================== 训练流程 ==================== def run_epoch(session, model, data, run_type, eval_op, verbose=False): """执行一个epoch的训练或评估 Args: session: TF会话 model: 模型对象 data: 输入数据 run_type: '训练'或'测试' eval_op: 训练op或tf.no_op() verbose: 是否显示详细进度 Returns: dict: 包含loss, auc, rmse, r2的字典 """ preds = [] labels = [] total_loss = 0.0 processed_count = 0 # 禁用TF调试信息 tf.logging.set_verbosity(tf.logging.ERROR) index = 0 batch_size = model.batch_size # 可选:使用tqdm进度条(verbose模式下) iterator = tqdm(range(0, len(data), batch_size), desc=f"{run_type}处理中") if verbose else range(0, len(data), batch_size) for start in iterator: end = min(start + batch_size, len(data)) batch_data = data[start:end] # 准备批次数据 current_batch, next_batch, target_ids, target_correctness = [], [], [], [] for idx, (stu_id, problems, answers) in enumerate(batch_data): valid_length = sum(1 for p in problems if p != 0) if valid_length < 1: continue current_batch.append(problems) next_batch.append(answers) last_step = valid_length - 1 target_ids.append(idx * model.num_steps + last_step) target_correctness.append(answers[last_step]) if not current_batch: continue actual_batch_size = len(current_batch) feed_dict = { model.current: np.array(current_batch, dtype=np.int32), model.next: np.array(next_batch, dtype=np.int32), model.target_id: np.array(target_ids, dtype=np.int32), model.target_correctness: np.array(target_correctness, dtype=np.float32) } try: if eval_op != tf.no_op(): _, pred, loss = session.run( [eval_op, model.pred, model.cost], feed_dict=feed_dict ) else: pred, loss = session.run( [model.pred, model.cost], feed_dict=feed_dict ) preds.extend(pred.flatten().tolist()) labels.extend(target_correctness) total_loss += loss * actual_batch_size processed_count += actual_batch_size except Exception as e: print(f"\n{run_type}错误 (批次 {start}-{end}): {str(e)}", file=sys.stderr) continue # 计算指标 if processed_count == 0: return None avg_loss = total_loss / processed_count # 确保标签和预测值在有效范围内 labels = np.clip(np.array(labels), 1e-7, 1 - 1e-7) preds = np.clip(np.array(preds), 1e-7, 1 - 1e-7) metrics = { 'loss': avg_loss, 'auc': roc_auc_score(labels, preds) if len(set(labels)) > 1 else 0.5, 'rmse': np.sqrt(mean_squared_error(labels, preds)), 'r2': r2_score(labels, preds) } return metrics def main(_): """主训练流程""" # 1. 加载配置和数据 config = ModelConfig() # 假设已定义 train_data, test_data = load_data() # 假设已定义 # 2. 构建模型 with tf.variable_scope("Model", reuse=False): train_model = StudentModel(is_training=True, config=config) with tf.variable_scope("Model", reuse=True): test_model = StudentModel(is_training=False, config=config) # 3. 创建会话 sess_config = tf.ConfigProto() sess_config.gpu_options.allow_growth = True with tf.Session(config=sess_config) as sess: # 4. 初始化变量 sess.run(tf.global_variables_initializer()) # 5. 训练循环 best_auc = 0.0 for epoch in range(1, FLAGS.max_epochs + 1): # 训练阶段 train_metrics = run_epoch( sess, train_model, train_data, '训练', train_op, # train_op应已定义 verbose=(epoch % FLAGS.display_freq == 0) ) # 测试阶段 test_metrics = run_epoch( sess, test_model, test_data, '测试', tf.no_op(), verbose=False ) # 6. 
输出关键指标 print(f"Epoch {epoch}") print( f"训练集 - 损失: {train_metrics['loss']:.4f}, RMSE: {train_metrics['rmse']:.4f}, AUC: {train_metrics['auc']:.4f}, R²: {train_metrics['r2']:.4f}") print( f"测试集 - 损失: {test_metrics['loss']:.4f}, RMSE: {test_metrics['rmse']:.4f}, AUC: {test_metrics['auc']:.4f}, R²: {test_metrics['r2']:.4f}") sys.stdout.flush() # 7. 保存最佳模型 if test_metrics['auc'] > best_auc: best_auc = test_metrics['auc'] saver.save(sess, FLAGS.model_path) # saver应已定义 print("训练完成!") print(f"最佳测试AUC: {best_auc:.4f}") if __name__ == "__main__": # 生成模拟数据(仅当真实数据不存在时) if not os.path.exists(FLAGS.train_data_path) or not os.path.exists(FLAGS.test_data_path): generate_mock_data() tf.app.run() 在这个基础上修改得到的完整代码,你给的不完整,不要省略!!!!
07-02
# -*- coding: utf-8 -*- """ DKT-DSC for Assistment2012 (优化版) - 修复数据泄露问题 最后更新: 2024-07-01 """ import os import sys import numpy as np import tensorflow.compat.v1 as tf os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = "0" config = tf.ConfigProto() config.gpu_options.allow_growth = True tf.disable_v2_behavior() # 安全导入psutil模块 try: import psutil HAS_PSUTIL = True except ImportError: HAS_PSUTIL = False print("警告: psutil模块未安装,内存监控功能受限") from scipy.sparse import coo_matrix from tensorflow.contrib import rnn import pandas as pd from tqdm import tqdm from sklearn.metrics import mean_squared_error, r2_score, roc_curve, auc import math import random # ==================== 配置部分 ==================== # 使用实际数据路径 DATA_BASE_PATH = '/home/yhh/students/jianglu/DKT2/DKT/data/' data_name = 'Assist_2012' # 修正数据集名称 KNOWLEDGE_GRAPH_PATHS = { 'graphml': './output_assist2012_gat_improved/knowledge_graph.graphml', 'nodes': './output_assist2012_gat_improved/graph_nodes.csv', 'edges': './output_assist2012_gat_improved/graph_edges.csv' } # ==================== Flags配置 ==================== tf.flags.DEFINE_float("epsilon", 1e-8, "Adam优化器的epsilon值") tf.flags.DEFINE_float("l2_lambda", 0.005, "L2正则化系数") # 减小正则化强度 tf.flags.DEFINE_float("learning_rate", 1e-4, "学习率") tf.flags.DEFINE_float("max_grad_norm", 3.0, "梯度裁剪阈值") # 更严格的梯度裁剪 tf.flags.DEFINE_float("keep_prob", 0.8, "Dropout保留概率") # 减小dropout tf.flags.DEFINE_integer("hidden_layer_num", 1, "隐藏层数量") tf.flags.DEFINE_integer("hidden_size", 48, "隐藏层大小") # 增加隐藏层大小 tf.flags.DEFINE_integer("evaluation_interval", 2, "评估间隔周期数") tf.flags.DEFINE_integer("batch_size", 128, "批次大小") tf.flags.DEFINE_integer("problem_len", 15, "问题序列长度") # 增加序列长度 tf.flags.DEFINE_integer("epochs", 100, "训练周期数") tf.flags.DEFINE_boolean("allow_soft_placement", True, "允许软设备放置") tf.flags.DEFINE_boolean("log_device_placement", False, "记录设备放置信息") tf.flags.DEFINE_string("train_data_path", f'{DATA_BASE_PATH}{data_name}_train.csv', "训练数据路径") tf.flags.DEFINE_string("test_data_path", f'{DATA_BASE_PATH}{data_name}_test.csv', "测试数据路径") FLAGS = tf.flags.FLAGS # 焦点损失参数 FOCAL_LOSS_GAMMA = 1.5 # 调整焦点损失参数 FOCAL_LOSS_ALPHA = 0.3 # 学习率衰减参数 DECAY_STEPS = 2000 DECAY_RATE = 0.95 # 学习率预热步数 WARMUP_STEPS = 2000 # 内存监控函数 def memory_usage(): """增强的内存监控函数,处理psutil缺失情况""" if HAS_PSUTIL: try: process = psutil.Process(os.getpid()) return process.memory_info().rss / (1024 ** 2) except: return 0.0 return 0.0 # ==================== 知识图谱加载器 ==================== class KnowledgeGraphLoader: def __init__(self): self.node_features = None self.adj_matrix = None self.problem_to_node = {} self.node_id_map = {} self.static_node_count = 0 self._rows = None self._cols = None def load(self): """加载知识图谱数据并进行严格的数据验证""" print("\n[KG] 加载知识图谱...") try: if not os.path.exists(KNOWLEDGE_GRAPH_PATHS['nodes']): raise FileNotFoundError(f"节点文件未找到: {KNOWLEDGE_GRAPH_PATHS['nodes']}") if not os.path.exists(KNOWLEDGE_GRAPH_PATHS['edges']): raise FileNotFoundError(f"边文件未找到: {KNOWLEDGE_GRAPH_PATHS['edges']}") node_df = pd.read_csv(KNOWLEDGE_GRAPH_PATHS['nodes']) self.static_node_count = len(node_df) print(f"[KG] 总节点数: {self.static_node_count}") # 处理空值 - 根据验证报告中的发现 print("[KG] 处理特征空值...") feature_cols = [col for col in node_df.columns if col not in ['node_id', 'type']] # 特别处理total_attempts特征 if 'total_attempts' in feature_cols: # 概念节点使用概念节点中位数填充 concept_mask = node_df['type'] == 'concept' concept_median = node_df.loc[concept_mask, 'total_attempts'].median() # 处理NaN值 if pd.isna(concept_median): concept_median = 0.0 node_df.loc[concept_mask, 
'total_attempts'] = node_df.loc[concept_mask, 'total_attempts'].fillna(concept_median) # 问题节点使用问题节点中位数填充 problem_mask = node_df['type'] == 'problem' problem_median = node_df.loc[problem_mask, 'total_attempts'].median() # 处理NaN值 if pd.isna(problem_median): problem_median = 0.0 node_df.loc[problem_mask, 'total_attempts'] = node_df.loc[problem_mask, 'total_attempts'].fillna(problem_median) print(f" 填充 total_attempts 缺失值: 概念节点={concept_median}, 问题节点={problem_median}") # 处理其他数值特征 other_cols = [col for col in feature_cols if col != 'total_attempts'] for col in other_cols: # 分类型填充 if 'confidence' in col or 'affect' in col: # 情感特征使用全局平均值填充 global_mean = node_df[col].mean() # 处理NaN值 if pd.isna(global_mean): global_mean = 0.0 node_df[col] = node_df[col].fillna(global_mean) print(f" 填充 {col} 缺失值: 全局均值={global_mean:.4f}") else: # 其他特征按问题类型分组填充 problem_mask = node_df['type'] == 'problem' problem_mean = node_df.loc[problem_mask, col].mean() # 处理NaN值 if pd.isna(problem_mean): problem_mean = 0.0 node_df.loc[problem_mask, col] = node_df.loc[problem_mask, col].fillna(problem_mean) concept_mask = node_df['type'] == 'concept' concept_mean = node_df.loc[concept_mask, col].mean() # 处理NaN值 if pd.isna(concept_mean): concept_mean = 0.0 node_df.loc[concept_mask, col] = node_df.loc[concept_mask, col].fillna(concept_mean) print(f" 填充 {col} 缺失值: 问题节点={problem_mean:.4f}, 概念节点={concept_mean:.4f}") print("\n[KG诊断] 特征分析...") if feature_cols: raw_features = node_df[feature_cols].values nan_count = np.isnan(raw_features).sum() inf_count = np.isinf(raw_features).sum() print(f" 总特征值数: {raw_features.size}") print(f" NaN特征数: {nan_count}") print(f" Inf特征数: {inf_count}") if nan_count > 0 or inf_count > 0: print(f"⚠️ 警告: 节点特征包含 {nan_count} 个NaN和 {inf_count} 个Inf值,将被替换为0") raw_features = np.nan_to_num(raw_features) # 标准化特征并确保为float32类型 feature_mean = np.mean(raw_features, axis=0) feature_std = np.std(raw_features, axis=0) + 1e-8 self.node_features = np.array( (raw_features - feature_mean) / feature_std, dtype=np.float32 # 显式指定为float32 ) self.node_features = np.nan_to_num(self.node_features) # 再次确保无NaN else: print("警告: 节点文件中没有特征列") self.node_features = np.zeros((self.static_node_count, 1), dtype=np.float32) # 创建节点ID映射 self.node_id_map = {} for idx, row in node_df.iterrows(): self.node_id_map[row['node_id']] = idx # 创建问题ID到节点索引的映射 self.problem_to_node = {} problem_count = 0 for idx, row in node_df.iterrows(): if row['type'] == 'problem': try: parts = row['node_id'].split('_') if len(parts) < 2: continue problem_id = int(parts[1]) self.problem_to_node[problem_id] = idx problem_count += 1 except: continue print(f"[KG] 已加载 {problem_count} 个问题节点映射") # 加载边数据并进行优化 edge_df = pd.read_csv(KNOWLEDGE_GRAPH_PATHS['edges']) print("[KG] 优化邻接矩阵(保留每个节点的前100个邻居)...") rows, cols, data = [], [], [] valid_edge_count = 0 invalid_edge_count = 0 # 限制每个节点的邻居数量以提高效率 grouped = edge_df.groupby('source') for src, group in tqdm(grouped, total=len(grouped), desc="处理边数据"): src_idx = self.node_id_map.get(src, -1) if src_idx == -1: invalid_edge_count += len(group) continue neighbors = [] for _, row in group.iterrows(): tgt_idx = self.node_id_map.get(row['target'], -1) if tgt_idx != -1: neighbors.append((tgt_idx, row['weight'])) # 根据权重排序并取Top 100 neighbors.sort(key=lambda x: x[1], reverse=True) top_k = min(100, len(neighbors)) # 限制邻居数量 for i in range(top_k): tgt_idx, weight = neighbors[i] rows.append(src_idx) cols.append(tgt_idx) data.append(weight) valid_edge_count += 1 # 添加自环 for i in range(self.static_node_count): rows.append(i) cols.append(i) data.append(1.0) 
valid_edge_count += 1 # 创建稀疏邻接矩阵 adj_coo = coo_matrix( (data, (rows, cols)), shape=(self.static_node_count, self.static_node_count), dtype=np.float32 ) self.adj_matrix = adj_coo.tocsc() self._rows = np.array(rows) self._cols = np.array(cols) print(f"[KG] 邻接矩阵构建完成 | 节点: {self.static_node_count} | 边: {len(data)}") print(f"[KG优化] 最大行索引: {np.max(self._rows)} | 最大列索引: {np.max(self._cols)}") except Exception as e: import traceback print(f"知识图谱加载失败: {str(e)}") traceback.print_exc() raise RuntimeError(f"知识图谱加载失败: {str(e)}") from e # ==================== 图注意力层 ==================== class GraphAttentionLayer: def __init__(self, input_dim, output_dim, kg_loader, scope=None): self.kg_loader = kg_loader self.node_count = kg_loader.static_node_count self._rows = kg_loader._rows self._cols = kg_loader._cols with tf.variable_scope(scope or "GAT"): self.W = tf.get_variable( "W", [input_dim, output_dim], initializer=tf.initializers.variance_scaling( scale=0.1, mode='fan_avg', distribution='uniform') ) self.attn_kernel = tf.get_variable( "attn_kernel", [output_dim * 2, 1], initializer=tf.initializers.variance_scaling( scale=0.1, mode='fan_avg', distribution='uniform') ) self.bias = tf.get_variable( "bias", [output_dim], initializer=tf.zeros_initializer() ) def __call__(self, inputs): inputs = tf.clip_by_value(inputs, -5, 5) inputs = tf.check_numerics(inputs, "GAT输入包含NaN或Inf") # 特征变换 h = tf.matmul(inputs, self.W) h = tf.clip_by_value(h, -5, 5) h = tf.check_numerics(h, "特征变换后包含NaN或Inf") # 注意力机制 h_src = tf.gather(h, self._rows) h_dst = tf.gather(h, self._cols) h_concat = tf.concat([h_src, h_dst], axis=1) edge_logits = tf.squeeze(tf.matmul(h_concat, self.attn_kernel), axis=1) edge_logits = tf.clip_by_value(edge_logits, -10, 10) edge_attn = tf.nn.leaky_relu(edge_logits, alpha=0.2) # 创建稀疏注意力矩阵 edge_indices = tf.constant(np.column_stack((self._rows, self._cols)), dtype=tf.int64) sparse_attn = tf.SparseTensor( indices=edge_indices, values=edge_attn, dense_shape=[self.node_count, self.node_count] ) # 稀疏softmax和矩阵乘法 sparse_attn_weights = tf.sparse_softmax(sparse_attn) output = tf.sparse_tensor_dense_matmul(sparse_attn_weights, h) output = tf.clip_by_value(output, -5, 5) output += self.bias output = tf.nn.elu(output) output = tf.check_numerics(output, "最终GAT输出包含NaN或Inf") return output # ==================== 学生知识追踪模型 ==================== class StudentModel: def __init__(self, is_training, config): self.batch_size = config.batch_size self.num_skills = config.num_skills self.num_steps = config.num_steps self.current = tf.placeholder(tf.int32, [None, self.num_steps], name='current') self.next = tf.placeholder(tf.int32, [None, self.num_steps], name='next') self.target_id = tf.placeholder(tf.int32, [None], name='target_ids') self.target_correctness = tf.placeholder(tf.float32, [None], name='target_correctness') with tf.device('/gpu:0'), tf.variable_scope("KnowledgeGraph", reuse=tf.AUTO_REUSE): # 加载知识图谱 kg_loader = KnowledgeGraphLoader() kg_loader.load() kg_node_features = tf.constant(kg_loader.node_features, dtype=tf.float32) kg_node_features = tf.check_numerics(kg_node_features, "知识图谱节点特征包含NaN或Inf") # 精简GAT层 - 减少层数和维度 gat_output = kg_node_features for i in range(2): # 减少GAT层数为2 with tf.variable_scope(f"GAT_Layer_{i + 1}"): gat_layer = GraphAttentionLayer( input_dim=gat_output.shape[1] if i > 0 else kg_node_features.shape[1], output_dim=24 if i == 0 else 16, # 减少输出维度 kg_loader=kg_loader ) gat_output = gat_layer(gat_output) gat_output = tf.nn.elu(gat_output) self.skill_embeddings = gat_output with 
tf.variable_scope("FeatureProcessing"): batch_size = tf.shape(self.next)[0] # 动态获取批次大小 # 当前问题嵌入 current_indices = tf.minimum(self.current, kg_loader.static_node_count - 1) current_embed = tf.nn.embedding_lookup(self.skill_embeddings, current_indices) # 构建输入序列 - 移除下一问题嵌入(修复数据泄露) inputs = [] # 使用当前问题作为有效掩码(而不是下一个问题) valid_mask = tf.cast(tf.not_equal(self.current, 0), tf.float32) answers_float = tf.cast(self.next, tf.float32) # 历史表现特征 - 修复符号张量问题 zero_vector = tf.zeros([1, 1], dtype=tf.float32) history = tf.tile(zero_vector, [batch_size, 1]) elapsed_time = tf.tile(zero_vector, [batch_size, 1]) # 循环处理每个时间步 for t in range(self.num_steps): # 创建时间相关的特征 if t > 0: # 计算历史表现(只使用t-1及之前的信息) past_answers = answers_float[:, :t] # 只使用当前时间步之前的信息 past_valid_mask = valid_mask[:, :t] correct_count = tf.reduce_sum(past_answers * past_valid_mask, axis=1, keepdims=True) total_valid = tf.reduce_sum(past_valid_mask, axis=1, keepdims=True) history = correct_count / (total_valid + 1e-8) # 计算经过的时间 elapsed_time = tf.fill([batch_size, 1], tf.cast(t, tf.float32)) # 难度特征 - 使用知识图谱中的准确率特征 # 确保只使用当前问题的特征 difficulty_feature = tf.gather( kg_loader.node_features[:, 0], # 假设第一个特征是准确率 tf.minimum(self.current[:, t], kg_loader.static_node_count - 1) ) difficulty_feature = tf.cast(difficulty_feature, tf.float32) # 情感特征 - 使用知识图谱中的情感特征 affect_features = [] for i in range(1, 5): # 使用前4个情感特征 affect_feature = tf.gather( kg_loader.node_features[:, i], tf.minimum(self.current[:, t], kg_loader.static_node_count - 1) ) affect_feature = tf.cast(affect_feature, tf.float32) affect_features.append(tf.reshape(affect_feature, [-1, 1])) # 组合所有特征 - 移除了下一问题嵌入(修复数据泄露) combined = tf.concat([ current_embed[:, t, :], history, elapsed_time, tf.reshape(difficulty_feature, [-1, 1]), *affect_features ], axis=1) inputs.append(combined) # RNN模型 with tf.variable_scope("RNN"): cell = rnn.LSTMCell( FLAGS.hidden_size, initializer=tf.initializers.glorot_uniform(), forget_bias=1.0 ) if is_training and FLAGS.keep_prob < 1.0: cell = rnn.DropoutWrapper(cell, output_keep_prob=FLAGS.keep_prob) outputs, _ = tf.nn.dynamic_rnn( cell, tf.stack(inputs, axis=1), dtype=tf.float32 ) output = tf.reshape(outputs, [-1, FLAGS.hidden_size]) # 输出层 with tf.variable_scope("Output"): hidden = tf.layers.dense( output, units=32, activation=tf.nn.relu, kernel_initializer=tf.initializers.glorot_uniform(), name="hidden_layer" ) logits = tf.layers.dense( hidden, units=1, kernel_initializer=tf.initializers.glorot_uniform(), name="output_layer" ) # 损失计算 self._all_logits = tf.clip_by_value(logits, -20, 20) selected_logits = tf.gather(tf.reshape(self._all_logits, [-1]), self.target_id) self.pred = tf.clip_by_value(tf.sigmoid(selected_logits), 1e-8, 1 - 1e-8) # 焦点损失 labels = tf.clip_by_value(self.target_correctness, 0.05, 0.95) pos_weight = tf.reduce_sum(1.0 - labels) / (tf.reduce_sum(labels) + 1e-8) bce_loss = tf.nn.weighted_cross_entropy_with_logits( targets=labels, logits=selected_logits, pos_weight=pos_weight ) loss = tf.reduce_mean(bce_loss) # L2正则化 l2_loss = tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name ]) * FLAGS.l2_lambda self.cost = loss + l2_loss # ==================== 数据加载 ==================== def read_data_from_csv_file(path, kg_loader, is_training=False): """更鲁棒的数据加载函数""" students = [] student_ids = [] max_skill = 0 missing_problems = set() # 增强文件存在性检查 if not os.path.exists(path): print(f"❌ 严重错误: 数据文件不存在: {path}") print("请检查以下可能原因:") print("1. 文件路径是否正确") print("2. 文件名是否匹配") print("3. 
文件权限是否足够") # 尝试列出目录内容以便调试 dir_path = os.path.dirname(path) print(f"目录内容: {os.listdir(dir_path) if os.path.exists(dir_path) else '目录不存在'}") return [], [], [], 0, 0, 0 try: # 打印正在加载的文件路径 print(f"[数据] 加载数据文件: {path}") # 读取数据集 - 增强CSV读取兼容性 try: data_df = pd.read_csv(path) except Exception as e: print(f"CSV读取失败: {str(e)}") print("尝试使用备用方法读取...") # 尝试不同编码 encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252'] for encoding in encodings: try: data_df = pd.read_csv(path, encoding=encoding) print(f"成功使用 {encoding} 编码读取文件") break except Exception as e: print(f"编码 {encoding} 尝试失败: {str(e)}") continue if 'data_df' not in locals(): print("所有编码尝试失败,无法读取文件") return [], [], [], 0, 0, 0 print(f"[数据] 加载完成 | 记录数: {len(data_df)}") # 检查必要的列是否存在 - 支持多种列名变体 # 可能的列名变体 possible_columns = { 'user_id': ['user_id', 'userid', 'student_id', 'studentid'], 'problem_id': ['problem_id', 'problemid', 'skill_id', 'skillid'], 'correct': ['correct', 'correctness', 'answer', 'accuracy'], 'start_time': ['start_time', 'timestamp', 'time', 'date'] } # 查找实际列名 actual_columns = {} for col_type, possible_names in possible_columns.items(): found = False for name in possible_names: if name in data_df.columns: actual_columns[col_type] = name found = True break if not found: print(f"❌ 错误: 找不到 {col_type} 列") print(f"数据列: {list(data_df.columns)}") return [], [], [], 0, 0, 0 # 重命名列为标准名称以便后续处理 data_df = data_df.rename(columns={ actual_columns['user_id']: 'user_id', actual_columns['problem_id']: 'problem_id', actual_columns['correct']: 'correct', actual_columns['start_time']: 'start_time' }) print(f"[数据] 使用列: user_id, problem_id, correct, start_time") # 按学生分组 grouped = data_df.groupby('user_id') print(f"[数据] 分组完成 | 学生数: {len(grouped)}") for user_id, group in tqdm(grouped, total=len(grouped), desc="处理学生数据"): # 按时间排序 group = group.sort_values('start_time') problems = group['problem_id'].values answers = group['correct'].values.astype(int) # 筛选有效数据 - 添加详细日志 valid_data = [] invalid_count = 0 for i, (p, a) in enumerate(zip(problems, answers)): # 检查问题是否在知识图谱中 if p in kg_loader.problem_to_node and a in (0, 1): # 额外检查:确保问题特征不包含学生作答信息 node_idx = kg_loader.problem_to_node[p] if 'accuracy' in kg_loader.node_features[node_idx]: # 如果特征中包含准确率,警告可能的数据泄露 print(f"警告: 问题 {p} 的特征包含准确率信息,可能导致数据泄露") valid_data.append((p, a)) else: invalid_count += 1 if p != 0 and p not in missing_problems: print(f"警告: 问题ID {p} 不在知识图谱中 (学生: {user_id}, 位置: {i})") missing_problems.add(p) if len(valid_data) < 2: print(f"跳过数据不足的学生 {user_id} (有效交互: {len(valid_data)}, 无效: {invalid_count})") continue # 分割序列 problems, answers = zip(*valid_data) n_split = (len(problems) + FLAGS.problem_len - 1) // FLAGS.problem_len for k in range(n_split): start = k * FLAGS.problem_len end = (k + 1) * FLAGS.problem_len seg_problems = list(problems[start:end]) seg_answers = list(answers[start:end]) # 填充短序列 if len(seg_problems) < FLAGS.problem_len: pad_len = FLAGS.problem_len - len(seg_problems) seg_problems += [0] * pad_len seg_answers += [0] * pad_len # 训练数据增强 if is_training: valid_indices = [i for i, p in enumerate(seg_problems) if p != 0] if len(valid_indices) > 1 and random.random() > 0.5: random.shuffle(valid_indices) seg_problems = [seg_problems[i] for i in valid_indices] + seg_problems[len(valid_indices):] seg_answers = [seg_answers[i] for i in valid_indices] + seg_answers[len(valid_indices):] # 映射问题ID到知识图谱节点 mapped_problems = [] for p in seg_problems: if p == 0: mapped_problems.append(0) elif p in kg_loader.problem_to_node: mapped_problems.append(kg_loader.problem_to_node[p]) else: 
mapped_problems.append(0) students.append(([user_id, k], mapped_problems, seg_answers)) max_skill = max(max_skill, max(mapped_problems)) student_ids.append(user_id) except Exception as e: print(f"数据加载失败: {str(e)}") import traceback traceback.print_exc() return [], [], [], 0, 0, 0 avg_length = sum(len(s[1]) for s in students) / len(students) if students else 0 print(f"[数据统计] 学生数: {len(student_ids)} | 序列数: {len(students)}") print(f" 最大技能ID: {max_skill} | 平均序列长度: {avg_length:.1f}") print(f" 缺失问题数: {len(missing_problems)}") return students, [], student_ids, max_skill, 0, 0 # ==================== 训练流程 ==================== def run_epoch(session, model, data, run_type, eval_op, global_step=None): preds = [] labels = [] total_loss = 0.0 step = 0 processed_count = 0 total_batches = max(1, len(data) // model.batch_size) with tqdm(total=total_batches, desc=f"{run_type} Epoch") as pbar: index = 0 while index < len(data): # 准备批次数据 current_batch = [] next_batch = [] target_ids = [] target_correctness = [] for i in range(model.batch_size): if index >= len(data): break stu_id, problems, answers = data[index] valid_length = sum(1 for p in problems if p != 0) if valid_length < 1: index += 1 continue current_batch.append(problems) next_batch.append(answers) last_step = valid_length - 1 target_ids.append(i * model.num_steps + last_step) target_correctness.append(answers[last_step]) index += 1 if len(current_batch) == 0: pbar.update(1) step += 1 continue # 创建feed_dict feed = { model.current: np.array(current_batch, dtype=np.int32), model.next: np.array(next_batch, dtype=np.int32), model.target_id: np.array(target_ids, dtype=np.int32), model.target_correctness: np.array(target_correctness, dtype=np.float32) } # 运行计算 try: results = session.run( [model.pred, model.cost, eval_op], feed_dict=feed ) pred, loss = results[:2] preds.extend(pred.tolist()) labels.extend(target_correctness) total_loss += loss * len(current_batch) processed_count += len(current_batch) pbar.set_postfix( loss=f"{loss:.4f}", mem=f"{memory_usage():.1f}MB" ) pbar.update(1) step += 1 except Exception as e: print(f"\n训练错误: {str(e)}") import traceback traceback.print_exc() break # 计算指标 if not labels or not preds: print(f"{run_type}周期: 无有效样本!") return float('nan'), 0.5, 0.0, 0.0 labels = np.array(labels, dtype=np.float32) preds = np.array(preds, dtype=np.float32) mask = np.isfinite(labels) & np.isfinite(preds) if not mask.any(): print(f"{run_type}周期: 所有样本包含无效值!") return float('nan'), 0.5, 0.0, 0.0 labels = labels[mask] preds = preds[mask] try: rmse = np.sqrt(mean_squared_error(labels, preds)) fpr, tpr, _ = roc_curve(labels, preds) auc_score = auc(fpr, tpr) r2 = r2_score(labels, preds) avg_loss = total_loss / processed_count if processed_count > 0 else 0.0 print(f"\n{run_type}周期总结:") print(f" 样本数: {len(labels)} | 正样本比例: {np.mean(labels > 0.5):.3f}") print(f" Loss: {avg_loss:.4f} | RMSE: {rmse:.4f} | AUC: {auc_score:.4f} | R²: {r2:.4f}") # 添加预测值分布分析 print("\n预测值分布分析:") print(f" 最小值: {np.min(preds):.4f} | 最大值: {np.max(preds):.4f}") print(f" 均值: {np.mean(preds):.4f} | 中位数: {np.median(preds):.4f}") print(f" 标准差: {np.std(preds):.4f}") # 检查完美预测的情况 perfect_preds = np.sum((preds < 1e-5) | (preds > 1 - 1e-5)) if perfect_preds > 0: perfect_ratio = perfect_preds / len(preds) print(f" 警告: {perfect_preds}个样本({perfect_ratio*100:.2f}%)预测值为0或1") # 检查预测值是否全部相同 if np.all(preds == preds[0]): print(f" 严重警告: 所有预测值相同 ({preds[0]:.4f})") return rmse, auc_score, r2, avg_loss except Exception as e: print(f"指标计算错误: {str(e)}") return float('nan'), 0.5, 0.0, 0.0 # 
==================== 主函数 ==================== def main(_): print(f"[系统] 训练数据路径: {FLAGS.train_data_path}") print(f"[系统] 测试数据路径: {FLAGS.test_data_path}") # 检查文件是否存在 if not os.path.exists(FLAGS.train_data_path): print(f"❌ 训练文件不存在: {FLAGS.train_data_path}") if not os.path.exists(FLAGS.test_data_path): print(f"❌ 测试文件不存在: {FLAGS.test_data_path}") print(f"⚠️ 优化设置: batch_size={FLAGS.batch_size}, hidden_size={FLAGS.hidden_size}, lr={FLAGS.learning_rate}") session_conf = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False, operation_timeout_in_ms=60000 ) session_conf.gpu_options.allow_growth = True with tf.Session(config=session_conf) as sess: # 加载知识图谱 kg_loader = KnowledgeGraphLoader() kg_loader.load() # 加载数据 print("\n[系统] 加载训练数据...") train_data = read_data_from_csv_file(FLAGS.train_data_path, kg_loader, is_training=True) print("[系统] 加载测试数据...") test_data = read_data_from_csv_file(FLAGS.test_data_path, kg_loader) if not train_data[0] or not test_data[0]: print("❌ 错误: 训练或测试数据为空!") return # 模型配置 class ModelConfig: def __init__(self): self.batch_size = FLAGS.batch_size self.num_skills = kg_loader.static_node_count + 100 # 添加缓冲区 self.num_steps = FLAGS.problem_len self.keep_prob = FLAGS.keep_prob model_config = ModelConfig() print(f"[配置] 技能数量: {model_config.num_skills}") print(f"[配置] 序列长度: {model_config.num_steps}") # 构建模型 print("\n[系统] 构建模型...") with tf.variable_scope("Model"): train_model = StudentModel(is_training=True, config=model_config) tf.get_variable_scope().reuse_variables() test_model = StudentModel(is_training=False, config=model_config) # 优化器和训练操作 global_step = tf.Variable(0, trainable=False) learning_rate = tf.train.exponential_decay( FLAGS.learning_rate, global_step, DECAY_STEPS, DECAY_RATE, staircase=True ) optimizer = tf.train.AdamOptimizer( learning_rate=learning_rate, epsilon=FLAGS.epsilon ) grads_and_vars = optimizer.compute_gradients(train_model.cost) grads, variables = zip(*grads_and_vars) clipped_grads, _ = tf.clip_by_global_norm(grads, FLAGS.max_grad_norm) train_op = optimizer.apply_gradients(zip(clipped_grads, variables), global_step=global_step) # 初始化变量 sess.run(tf.global_variables_initializer()) print(f"[系统] 训练开始 | 批次: {FLAGS.batch_size} | 学习率: {FLAGS.learning_rate}") # 模型保存 checkpoint_dir = "checkpoints_assist2012" os.makedirs(checkpoint_dir, exist_ok=True) saver = tf.train.Saver(max_to_keep=3) best_auc = 0.0 # 训练循环 for epoch in range(FLAGS.epochs): print(f"\n==== Epoch {epoch + 1}/{FLAGS.epochs} ====") current_lr = sess.run(learning_rate) print(f"[学习率] 当前学习率: {current_lr:.7f}") # 训练 train_rmse, train_auc, train_r2, train_loss = run_epoch( sess, train_model, train_data[0], '训练', train_op ) # 评估 if (epoch + 1) % FLAGS.evaluation_interval == 0: test_rmse, test_auc, test_r2, test_loss = run_epoch( sess, test_model, test_data[0], '测试', tf.no_op() ) # 保存最佳模型 if test_auc > best_auc: best_auc = test_auc save_path = saver.save(sess, f"{checkpoint_dir}/best_model.ckpt") print(f"保存最佳模型: {save_path}, AUC={best_auc:.4f}") print("\n训练完成!") if __name__ == "__main__": tf.app.run() 训练代码的测试集的auc 20轮只达到了0.7658;哪里出了问题,如何提高auc
07-02
def preprocess_train(example_batch, static_data_batch, labels_batch): # 处理图像数据并将每张图片切分成多个patches pixel_values = split_and_concat(example_batch["image"].convert("RGB")) pixel_values = tf.convert_to_tensor(pixel_values, dtype=tf.float32) static_data_batch = tf.convert_to_tensor(static_data_batch, dtype=tf.float32) labels_batch = tf.convert_to_tensor(labels_batch, dtype=tf.float32) return pixel_values, static_data_batch, labels_batch def create_dataset(dataset, labels, batch_size): image_data, static_data = dataset # 创建一个生成器函数 def generator(): pixel_values_list = [] static_data_list = [] labels_list = [] for example_batch, static_data_batch, labels_batch in zip(image_data, static_data, labels): pixel_values, static_data_batch, labels_batch = preprocess_train(example_batch, static_data_batch, labels_batch) # yield (pixel_values, static_data_batch), labels_batch labels_batch = tf.squeeze(labels_batch, axis=-1) # 将返回的样本逐一添加到列表 pixel_values_list.append(pixel_values) static_data_list.append(static_data_batch) labels_list.append(labels_batch) # 如果已经累积到 batch_size,进行拼接并返回 if len(pixel_values_list) == batch_size: # 拼接为 batch pixel_values_batch = tf.stack(pixel_values_list, axis=0) # (batch_size, 64, 64, 102) static_data_batch = tf.stack(static_data_list, axis=0) # (batch_size, static_data.shape[1]) labels_batch = tf.stack(labels_list, axis=0) # (batch_size,) pixel_values_list = [] static_data_list = [] labels_list = [] yield (pixel_values_batch, static_data_batch), labels_batch dataset = tf.data.Dataset.from_generator( generator, # 传递生成器函数 output_signature=( ( tf.TensorSpec(shape=(None, 64, 64, 102), dtype=tf.float32), # 图像数据形状 tf.TensorSpec(shape=(None, static_data.shape[1]), dtype=tf.float32) # 静态数据形状 ), tf.TensorSpec(shape=(None,), dtype=tf.float32) # 标签的 shape ) ) dataset = dataset.prefetch(tf.data.AUTOTUNE) # 自动预取,提升性能 return dataset对于一个二分类的图像数据集,且数据集很大并且正负极不均衡,上面代码是否需要进行修改
03-08
请帮我检查并优化代码,尤其关注避免数据泄露,调整灵活权重等问题:import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.preprocessing import MinMaxScaler from tensorflow.keras.optimizers import Adam from sklearn.model_selection import TimeSeriesSplit import joblib import tensorflow as tf from tensorflow.keras.models import Model from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Layer from tensorflow.keras.optimizers import Adam import warnings import tensorflow as tf warnings.filterwarnings('ignore') # ===== 自定义注意力层 ===== class AttentionLayer(Layer): def __init__(self, **kwargs): super(AttentionLayer, self).__init__(**kwargs) def build(self, input_shape): self.time_steps = input_shape[1] self.feature_dim = input_shape[2] # 创建可训练权重 self.W = self.add_weight( name='att_weight', shape=(self.feature_dim, self.feature_dim), initializer='glorot_uniform', trainable=True ) self.b = self.add_weight( name='att_bias', shape=(self.feature_dim,), initializer='zeros', trainable=True ) self.V = self.add_weight( name='att_v', shape=(self.feature_dim, 1), initializer='glorot_uniform', trainable=True ) super(AttentionLayer, self).build(input_shape) # 修改call方法 def call(self, inputs): # 动态计算每个时间步的重要性 score = tf.matmul(tf.tanh(tf.matmul(inputs, self.W) + self.b), self.V) score = tf.squeeze(score, axis=-1) alpha = tf.nn.softmax(score, axis=1) # 修正为按时间步归一化 alpha = tf.expand_dims(alpha, axis=-1) context = tf.reduce_sum(alpha * inputs, axis=1) return context def compute_output_shape(self, input_shape): return (input_shape[0], input_shape[2]) # (batch_size, feature_dim) # ===== 模型构建函数 ===== # 改进的模型结构 def build_model(input_shape): inputs = Input(shape=input_shape) # 双向LSTM捕获时序特征 lstm_out = Bidirectional(LSTM(128, return_sequences=True))(inputs) # 注意力层 att_out = AttentionLayer()(lstm_out) # 红球分支(6个独立预测) red_outputs = [] for i in range(6): branch = Dense(64, activation='relu')(att_out) red_outputs.append(Dense(33, activation='softmax', name=f'red_{i}')(branch)) # 蓝球分支 blue_branch = Dense(32, activation='relu')(att_out) blue_output = Dense(16, activation='softmax', name='blue_output')(blue_branch) model = Model(inputs, red_outputs + [blue_output]) # 调整损失权重(红球重要性更高) loss_weights = {f'red_{i}': 0.15 for i in range(6)} loss_weights['blue_output'] = 0.1 model.compile(optimizer=Adam(0.001), loss='categorical_crossentropy', loss_weights=loss_weights) return model def build_attention_lstm_model(input_shape): time_steps, num_features = input_shape inputs = Input(shape=input_shape) # LSTM层 lstm_out = LSTM(128, return_sequences=True)(inputs) lstm_out = Dropout(0.3)(lstm_out) # 使用自定义注意力层 attention_out = AttentionLayer()(lstm_out) # 红球分支 red_branch = Dense(64, activation='relu')(attention_out) red_branch = Dropout(0.2)(red_branch) # 修改为6个输出节点(每个红球位置独立预测) red_outputs = [] for i in range(6): branch = Dense(32, activation='relu')(attention_out) red_outputs.append(Dense(33, activation='softmax', name=f'red_{i}')(branch)) # 蓝球分支 blue_branch = Dense(32, activation='relu')(attention_out) blue_branch = Dropout(0.2)(blue_branch) blue_output = Dense(16, activation='sigmoid', name='blue_output')(blue_branch) model = Model(inputs=inputs, outputs=[red_output, blue_output]) optimizer = Adam(learning_rate=0.001) model.compile( optimizer=optimizer, loss={'red_output': 'binary_crossentropy', 'blue_output': 'binary_crossentropy'}, metrics={'red_output': 'binary_accuracy', 'blue_output': 'binary_accuracy'}, # 关键修改:使用字典形式指定损失权重 loss_weights={'red_output': 0.7, 'blue_output': 0.3} ) model.summary() return model # ===== 数据预处理函数 ===== 
#窗口特征计算前必须进行时间序列分割 #建议将create_features中的目标构建移至prepare_data def step1_format_data(): """格式化原始数据""" print("===== 步骤1: 格式化原始数据 =====") df = pd.read_excel('01hand.xlsx', sheet_name='Sheet1', header=None) # 提取A列和C列数据 new_df = pd.DataFrame({ 'A': pd.to_numeric(df.iloc[:, 0], errors='coerce'), 'B': pd.to_numeric(df.iloc[:, 2], errors='coerce') }).dropna() # 保存新文件 new_df.to_excel('01hand2.xlsx', index=False, header=False) print(f"新表格 '01hand2.xlsx' 创建成功! 包含 {len(new_df)} 行数据") def step2_process_data(): """数据去重和排序""" print("\n===== 步骤2: 数据去重和排序 =====") input_file = "01hand2.xlsx" output_file1 = "02resultA.xlsx" # 降序输出 output_file2 = "02resultB.xlsx" # 升序输出 # 读取数据并转换为长格式 df = pd.read_excel(input_file, header=None) all_values = df.stack().dropna().astype(str).tolist() # 确保数据长度是8的倍数 valid_length = len(all_values) - (len(all_values) % 8) if len(all_values) != valid_length: print(f"警告: 数据总量 {len(all_values)} 不符合8的倍数, 截断至 {valid_length} 个元素") all_values = all_values[:valid_length] # 转换数据格式 new_data = [] for i in range(0, len(all_values), 8): group = all_values[i:i+8] try: # 转换日期和数字 date = int(group[0]) numbers = [int(float(num)) if '.' in num else int(num) for num in group[1:]] new_data.append([date] + numbers) except: continue # 创建DataFrame并去重 columns = ['日期', '数字1', '数字2', '数字3', '数字4', '数字5', '数字6', '数字7'] df = pd.DataFrame(new_data, columns=columns) df = df.drop_duplicates(subset='日期').dropna() # 保存降序文件 df.sort_values('日期', ascending=False).to_excel(output_file1, index=False) print(f"降序文件保存至: {output_file1}") # 保存升序文件 df.sort_values('日期', ascending=True).to_excel(output_file2, index=False) print(f"升序文件保存至: {output_file2}") print(f"最终数据维度: {df.shape}") return df # ===== 特征工程函数 ===== def create_features(df, save_features=True): """创建模型特征并保存特征处理器""" print("\n===== 步骤3: 特征工程 =====") features = df[['日期']].copy() red_cols = ['数字1', '数字2', '数字3', '数字4', '数字5', '数字6'] # 基础特征 features['红球和值'] = df[red_cols].sum(axis=1) features['蓝球值'] = df['数字7'] features['奇偶比'] = df[red_cols].applymap(lambda x: x % 2).sum(axis=1) features['大小比'] = df[red_cols].applymap(lambda x: 1 if x > 16 else 0).sum(axis=1) for num in range(1, 34): features[f'red_{num}_missing'] = features.index - features[f'red_{num}_last'] # 添加质数比 prime_nums = [2,3,5,7,11,13,17,19,23,29,31] features['prime_ratio'] = df[red_cols].applymap(lambda x: x in prime_nums).sum(axis=1) # 窗口特征 (窗口大小10) window_size = 10 for col in ['红球和值', '奇偶比', '大小比']: features[f'{col}_MA{window_size}'] = features[col].rolling(window=window_size).mean() features[f'{col}_STD{window_size}'] = features[col].rolling(window=window_size).std() # 滞后特征 (滞后1-9期) for lag in range(1, 10): for col in red_cols + ['数字7']: features[f'{col}_lag{lag}'] = df[col].shift(lag) # 目标变量 (下一期开奖结果) red_targets = [] blue_targets = [] for i in range(len(df) - 1): next_row = df.iloc[i + 1] # 红球目标 (33选6) red_target = [1 if num in next_row[red_cols].values else 0 for num in range(1, 34)] # 蓝球目标 (16选1) blue_target = [1 if i == next_row['数字7'] else 0 for i in range(1, 17)] red_targets.append(red_target) blue_targets.append(blue_target) # 转换为numpy数组 red_targets = np.array(red_targets) blue_targets = np.array(blue_targets) # 移除无效数据 (前window_size行和最后一行) features = features.iloc[window_size:-1].reset_index(drop=True) red_targets = red_targets[window_size-1:-1] # 对齐索引 blue_targets = blue_targets[window_size-1:-1] # 保存特征处理器 feature_columns = features.drop(columns=['日期']).columns.tolist() joblib.dump(feature_columns, 'feature_columns.pkl') print(f"特征列名已保存: {len(feature_columns)}个特征") if save_features: 
features.to_excel('04_features.xlsx', index=False) print(f"特征工程完成, 维度: {features.shape}") return features, red_targets, blue_targets # ===== 模型构建函数 ===== def prepare_data(features, red_targets, blue_targets): """准备训练数据并保存数据处理器""" print("\n===== 步骤4: 数据准备 =====") scaler_X = MinMaxScaler() X_scaled = scaler_X.fit_transform(features.drop(columns=['日期'])) # 保存特征处理器 joblib.dump(scaler_X, 'scaler_X.save') print("特征缩放器已保存") # 创建时间序列数据 time_steps = 10 X_seq, y_red_seq, y_blue_seq = [], [], [] for i in range(time_steps, len(X_scaled)): X_seq.append(X_scaled[i-time_steps:i, :]) y_red_seq.append(red_targets[i-1]) # 使用当前时间步的目标 y_blue_seq.append(blue_targets[i-1]) X_seq = np.array(X_seq) y_red_seq = np.array(y_red_seq) y_blue_seq = np.array(y_blue_seq) print(f"时间序列数据形状: X={X_seq.shape}, y_red={y_red_seq.shape}, y_blue={y_blue_seq.shape}") # 保存历史数据用于预测 joblib.dump(X_scaled[-10:], 'historical_data.pkl') print("历史数据已保存用于预测") return X_seq, y_red_seq, y_blue_seq, scaler_X # ===== 模型训练函数 ===== # 在train_model中添加 from tensorflow.keras.preprocessing.sequence import pad_sequences # 使用pad_sequences处理不等长序列 X_padded = pad_sequences(X, maxlen=10, padding='pre', dtype='float32') # 修改早停策略 callbacks.append(EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)) def train_model(X, y_red, y_blue): """训练模型""" print("\n===== 步骤5: 模型训练 =====") best_models = [] tscv = TimeSeriesSplit(n_splits=3) for fold, (train_index, val_index) in enumerate(tscv.split(X)): print(f"\n===== 训练 Fold {fold+1}/3 =====") X_train, X_val = X[train_index], X[val_index] y_red_train, y_red_val = y_red[train_index], y_red[val_index] y_blue_train, y_blue_val = y_blue[train_index], y_blue[val_index] model = build_attention_lstm_model((X_train.shape[1], X_train.shape[2])) callbacks = [ EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1), ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=7, min_lr=1e-6, verbose=1) ] history = model.fit( X_train, {'red_output': y_red_train, 'blue_output': y_blue_train}, epochs=100, batch_size=32, validation_data=(X_val, {'red_output': y_red_val, 'blue_output': y_blue_val}), callbacks=callbacks, verbose=1 ) model.save(f'best_model_fold{fold+1}.h5') best_models.append(model) # 保存训练历史图 plot_training_history(history, fold+1) return best_models def plot_training_history(history, fold): """绘制训练历史图表""" plt.figure(figsize=(15, 10)) # 损失曲线 plt.subplot(2, 2, 1) plt.plot(history.history['loss'], label='训练损失') plt.plot(history.history['val_loss'], label='验证损失') plt.title(f'Fold {fold} - 总损失曲线') plt.ylabel('损失') plt.xlabel('Epoch') plt.legend() # 红球准确率 plt.subplot(2, 2, 2) plt.plot(history.history['red_output_binary_accuracy'], label='红球训练准确率') plt.plot(history.history['val_red_output_binary_accuracy'], label='红球验证准确率') plt.title(f'Fold {fold} - 红球准确率') plt.ylabel('准确率') plt.xlabel('Epoch') plt.legend() # 蓝球准确率 plt.subplot(2, 2, 3) plt.plot(history.history['blue_output_binary_accuracy'], label='蓝球训练准确率') plt.plot(history.history['val_blue_output_binary_accuracy'], label='蓝球验证准确率') plt.title(f'Fold {fold} - 蓝球准确率') plt.ylabel('准确率') plt.xlabel('Epoch') plt.legend() # 学习率 plt.subplot(2, 2, 4) if 'lr' in history.history: plt.plot(history.history['lr'], label='学习率') plt.title(f'Fold {fold} - 学习率变化') plt.ylabel('学习率') plt.xlabel('Epoch') plt.legend() plt.tight_layout() plt.savefig(f'training_history_fold{fold}.png') plt.close() # ===== 预测准备函数 ===== def prepare_prediction_input(df, features, scaler_X): """准备预测输入,确保特征一致性""" print("\n===== 准备预测输入 =====") # 加载特征列名 feature_columns = 
# ===== Prediction input =====
def prepare_prediction_input(df, features, scaler_X):
    """Build a single prediction row that matches the training feature schema."""
    print("\n===== Preparing prediction input =====")
    # Load the saved feature column names
    feature_columns = joblib.load('feature_columns.pkl')
    print(f"Expected number of features: {len(feature_columns)}")

    # Empty frame with the training schema
    prediction_features = pd.DataFrame(columns=feature_columns)

    # Basic features from the latest draw
    red_cols = ['数字1', '数字2', '数字3', '数字4', '数字5', '数字6']
    current_row = df.iloc[-1]
    prediction_features.loc[0, '红球和值'] = current_row[red_cols].sum()
    prediction_features.loc[0, '蓝球值'] = current_row['数字7']
    prediction_features.loc[0, '奇偶比'] = current_row[red_cols].apply(lambda x: x % 2).sum()
    prediction_features.loc[0, '大小比'] = current_row[red_cols].apply(lambda x: 1 if x > 16 else 0).sum()

    # Window features from the last 10 rows
    window_size = 10
    for col in ['红球和值', '奇偶比', '大小比']:
        col_values = features[col].iloc[-window_size:]
        prediction_features.loc[0, f'{col}_MA{window_size}'] = col_values.mean()
        prediction_features.loc[0, f'{col}_STD{window_size}'] = col_values.std()

    # Lag features
    for lag in range(1, 10):
        lag_index = -lag - 1  # count back from the latest row
        for col in red_cols + ['数字7']:
            feature_name = f'{col}_lag{lag}'
            if feature_name in feature_columns:
                if len(df) > lag:
                    prediction_features.loc[0, feature_name] = df[col].iloc[lag_index]
                else:
                    # Not enough history: fall back to the column mean
                    prediction_features.loc[0, feature_name] = df[col].mean()

    # Fill any feature the schema expects but we could not compute
    missing_cols = set(feature_columns) - set(prediction_features.columns)
    for col in missing_cols:
        prediction_features[col] = 0  # default fill

    # Enforce column order, replace remaining NaNs, then scale
    prediction_features = prediction_features[feature_columns].fillna(0)
    X_pred = scaler_X.transform(prediction_features)
    print(f"Prediction input shape: {X_pred.shape}")
    return X_pred


# ===== Prediction =====
def predict_next_period(models):
    """Average the fold models' probabilities and return the top candidates."""
    # Load the most recent scaled window saved by prepare_data, shape (10, n_features)
    X_seq = joblib.load('historical_data.pkl')
    X_batch = np.expand_dims(X_seq, axis=0)

    # Equal-weight probability ensemble across the fold models
    # (weighting by validation loss would require keeping each fold's validation data)
    red_probs = np.zeros(33)
    blue_probs = np.zeros(16)
    for model in models:
        red_p, blue_p = model.predict(X_batch, verbose=0)
        red_probs += red_p[0]
        blue_probs += blue_p[0]
    red_probs /= len(models)
    blue_probs /= len(models)

    # Top-6 red and top-3 blue candidates
    red_indices = np.argsort(red_probs)[::-1][:6]
    blue_indices = np.argsort(blue_probs)[::-1][:3]

    return (
        [i + 1 for i in red_indices],
        [red_probs[i] for i in red_indices],
        [i + 1 for i in blue_indices],
        [blue_probs[i] for i in blue_indices]
    )


def coverage_rate(y_true, y_pred):
    """Average count of predicted numbers that also appear in the actual draw."""
    hits = y_true * K.round(y_pred)  # 1 only where a number is both predicted and drawn
    return K.mean(K.sum(hits, axis=-1))


# ===== Main =====
def main():
    # Data processing
    step1_format_data()
    df = step2_process_data()

    # Feature engineering
    features, red_targets, blue_targets = create_features(df)

    # Training data
    X, y_red, y_blue, scaler_X = prepare_data(features, red_targets, blue_targets)

    # Model training
    models = train_model(X, y_red, y_blue)

    # Prediction
    red_nums, red_probs, blue_nums, blue_probs = predict_next_period(models)

    # Report
    print("\n" + "=" * 50)
    print("Next-draw prediction")
    print("=" * 50)
    print("\nRed balls (top 6):")
    for num, prob in zip(red_nums, red_probs):
        print(f"number {num:2d} : probability {prob:.4f}")
    print("\nBlue balls (top 3):")
    for num, prob in zip(blue_nums, blue_probs):
        print(f"number {num:2d} : probability {prob:.4f}")

    # Save results (pd.Series pads the shorter blue columns with NaN)
    result_df = pd.DataFrame({
        '红球预测': pd.Series(red_nums),
        '红球概率': pd.Series(red_probs),
        '蓝球预测': pd.Series(blue_nums),
        '蓝球概率': pd.Series(blue_probs)
    })
    result_df.to_excel('prediction_results.xlsx', index=False)
    print("\nPrediction results saved to: prediction_results.xlsx")


if __name__ == "__main__":
    main()
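coverage_rate is defined but never wired into training. If you want it reported per epoch, it can be passed as a custom metric at compile time; a sketch, assuming the build_attention_lstm_model from main.py below (only the metrics argument is new here):

model = build_attention_lstm_model(input_shape=(10, 20))
model.compile(
    optimizer='adam',
    loss={'red_output': 'binary_crossentropy', 'blue_output': 'binary_crossentropy'},
    metrics={'red_output': [coverage_rate]}
)

Note that K.round thresholds at 0.5, so the "predicted set" this metric scores is every number whose probability exceeds 0.5, which is not necessarily exactly six numbers.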
main.py

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Layer
from tensorflow.keras.optimizers import Adam


# Custom attention layer defined inline (avoids import problems)
class AttentionLayer(Layer):
    """Additive attention over the LSTM's time dimension."""

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.time_steps = input_shape[1]
        self.feature_dim = input_shape[2]
        self.W = self.add_weight(
            name='att_weight',
            shape=(self.feature_dim, self.feature_dim),
            initializer='glorot_uniform',
            trainable=True
        )
        self.b = self.add_weight(
            name='att_bias',
            shape=(self.feature_dim,),
            initializer='zeros',
            trainable=True
        )
        self.V = self.add_weight(
            name='att_v',
            shape=(self.feature_dim, 1),
            initializer='glorot_uniform',
            trainable=True
        )
        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs):
        # Score each time step, softmax over time, then take the weighted sum
        score = tf.tanh(tf.matmul(inputs, self.W) + self.b)  # (batch, time, feat)
        score = tf.matmul(score, self.V)                     # (batch, time, 1)
        score = tf.squeeze(score, axis=-1)                   # (batch, time)
        alpha = tf.nn.softmax(score, axis=-1)
        alpha = tf.expand_dims(alpha, axis=-1)               # (batch, time, 1)
        context = tf.reduce_sum(alpha * inputs, axis=1)      # (batch, feat)
        return context

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[2])


# Model builder
def build_attention_lstm_model(input_shape):
    inputs = Input(shape=input_shape)

    # LSTM layer
    lstm_out = LSTM(128, return_sequences=True)(inputs)
    lstm_out = Dropout(0.3)(lstm_out)

    # Attention pooling over time
    attention_out = AttentionLayer()(lstm_out)

    # Output heads
    red_branch = Dense(64, activation='relu')(attention_out)
    red_output = Dense(33, activation='sigmoid', name='red_output')(red_branch)

    blue_branch = Dense(32, activation='relu')(attention_out)
    blue_output = Dense(16, activation='sigmoid', name='blue_output')(blue_branch)

    model = Model(inputs=inputs, outputs=[red_output, blue_output])
    model.compile(
        optimizer=Adam(0.001),
        loss={'red_output': 'binary_crossentropy', 'blue_output': 'binary_crossentropy'},
        loss_weights={'red_output': 0.7, 'blue_output': 0.3},
        metrics=['binary_accuracy']  # yields the *_binary_accuracy history keys the plots expect
    )
    return model


# Quick build test
if __name__ == "__main__":
    model = build_attention_lstm_model(input_shape=(10, 20))
    model.summary()
    print("Model built successfully!")
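A quick way to sanity-check the attention layer in isolation is to push a dummy batch through it and confirm that the time dimension is pooled away (a sketch using the class above; the shapes are arbitrary):

import numpy as np

layer = AttentionLayer()
dummy = tf.constant(np.random.rand(2, 10, 128), dtype=tf.float32)  # (batch, time, features)
context = layer(dummy)
print(context.shape)  # (2, 128): one context vector per sequence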