window.print() Stalls Page Execution

This article describes a page-print feature built by adding a button-state field to the database and pairing it with front-end code. After printing, the feature hides a specific button, ensuring the data can no longer be saved.

Today I built a small page-printing feature.

Requirement: after printing, the data button must be hidden so the data can no longer be saved.

Approach

1. Add a button-state field printState to the database.

2. In the Action, in the method that renders this page, expose the data with request.setAttribute("data", dataQueriedFromDb);

3. On the page, wrap the button in a check: <c:if test="${data.printState != 1}"> ... </c:if>, where printState is 1 after printing and 0 before.
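
Putting step 3 together, the page markup might look like the sketch below; only the <c:if> wrapper comes from the post, the button itself and its doPrint() handler name are my assumptions.

<%-- printState: 1 = already printed, 0 = not yet printed --%>
<c:if test="${data.printState != 1}">
    <input type="button" value="Print" onclick="doPrint()"/>  <%-- hypothetical button --%>
</c:if>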


Here is my part of the code:


var input_qymc = $('#qymc').val();
if (${dt.printState != 1}) {
    if (confirm("Once printed, the data can no longer be saved. Continue?")) {
        /* Check for empty values and compare against what is saved in the database */
        if (qymc == undefined || qymc == "" || qymc == null || qymc != input_qymc ||
            zs != input_zs || scztlxr != input_scztlxr || scztlxdh != input_scztlxdh ||
            qrr != input_qrr || qrrlxdh != input_qrrlxdh ||
            fr != input_fr || frlxdh != input_frlxdh) {
            alert("The company configuration has not been saved; printing is not possible");
        } else {
            var id = "${id}";
            window.open("dzdt!printPage.action?id=" + id); // navigates to the print page
            refesh(); // reload this page so the button state is re-evaluated
        }
    }
}



Print-page JS:

$(function() {
    setTimeout(function() {
        window.print();
    }, 500);
});


Result: the print button sometimes disappeared and sometimes did not. In other words, the refesh() call was only occasionally reached.


Finding

window.print() halts script execution on the page: it opens a modal print dialog, and nothing after it runs until that dialog is dismissed.
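
A minimal way to see this blocking behavior (my illustration, not code from the original post):

console.log("before print");
window.print();              // execution stops here until the print dialog is closed
console.log("after print");  // only logs once the dialog has been dismissed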

So we delay the print() call until after refesh() has executed.

Bumping the setTimeout delay to one second was enough:

Print-page JS:

$(function() {
    setTimeout(function() {
        window.print();
    }, 1000);
});
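
A fixed one-second delay works here but is timing-dependent. A delay-free variant (my sketch, not the author's fix) is to wait for the print page's load event, so the blocking dialog appears only after the page has finished loading:

$(window).on("load", function() {
    window.print(); // fires only once the print page has fully loaded
});

Whether this also leaves the opener's refesh() enough time to run depends on the browser, so the author's larger delay remains the pragmatic fix.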


