5、基于逻辑回归的客户流失预测与贷款违约预测

基于逻辑回归的客户流失预测与贷款违约预测

1. 逻辑回归基础

逻辑回归是一种用于预测事件发生概率的统计模型。以客户流失预测为例,其核心公式为 $p(y = 1 \mid x) = \frac{1}{1 + e^{-w^{\top} x}}$,其中 $p(y = 1 \mid x)$ 是在给定特征 $x$ 时事件(如客户流失)发生的概率,$w$ 是权重向量,$x$ 是特征向量,权重向量 $w$ 在模型训练过程中学习得到。

1.1 逻辑回归的直觉理解

逻辑回归模型由一组权重和偏差参数化。权重与数据点的特征相乘,偏差加到结果上,这个总和再通过一个 sigmoid 函数得到一个概率值。sigmoid 函数是一个非线性函数,它将模型的输出压缩到 0 到 1 之间,其输出可解释为事件发生的概率。

1.2 逻辑函数与相关概念

  • 逻辑函数 :数学表示为 $f(x) = \frac{1}{1 + e^{-x}}$,能将任何实数值输入映射到 0 到 1 之间,在二元分类问题中非常有用。
  • 赔率和对数赔率 :如果 $p$ 是特定事件的概率,那么该事件的赔率为 $\text{odds} = \frac{p}{1 - p}$,逻辑回归建模的是对数赔率 $\log(\text{odds}) = \ln\left(\frac{p}{1 - p}\right)$。
  • 逻辑回归方程 :$\ln\left(\frac{p}{1 - p}\right) = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \cdots + \beta_k X_k$,其中 (\beta_0, \beta_
<think>我们之前已经给出了一个比较完整的代码框架,但是用户反馈说没有生成完整的预测部分。现在,我们专注于预测部分的重写,特别是基于8月数据文件进行预测。下面的代码将重点放在预测流程上,并简化了特征工程,避免滞后特征问题。 我们将创建一个名为`SeepagePredictor`的类,它封装了数据加载、特征工程、模型训练(可选)和预测的功能。预测部分将专门针对8月数据文件。 下面是一个完整的、可执行的版本,其中预测部分被重构: ```python import pandas as pd import numpy as np from datetime import datetime import warnings import os import joblib warnings.filterwarnings('ignore') # 导入机器学习库 from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LinearRegression, Ridge from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import Pipeline # 导入高级模型 import xgboost as xgb import lightgbm as lgb np.random.seed(42) # 配置参数 AUGUST_DATA_PATH = r'D:\zhongyuan_DM_pycharm\python\渗透量处理\8月数据.xlsx' HISTORICAL_DATA_PATH = r'D:\zhongyuan_DM_pycharm\python\渗透量处理\大坝数据.xlsx' OUTPUT_PATH = r'2025年8月渗漏量预测结果.xlsx' MODEL_SAVE_PATH = 'trained_models.joblib' class SeepagePredictor: def __init__(self): self.models = {} self.feature_names = None self.scaler = StandardScaler() def load_data(self, path, is_historical=True): """加载数据并预处理""" print(f"加载数据: {path}") if not os.path.exists(path): raise FileNotFoundError(f"文件不存在: {path}") df = pd.read_excel(path, engine='openpyxl') # 列名规范化 column_map = { '间': 'datetime', '渗漏量': 'seepage', '降雨量': 'rainfall', '温度': 'temperature', '水位': 'water_level' } df.rename(columns=column_map, inplace=True) # 确保有datetime列 if 'datetime' not in df.columns: raise ValueError("数据必须包含'datetime'列") # 转换间列 df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce') df = df.sort_values('datetime').reset_index(drop=True) # 数值列处理 num_cols = ['rainfall', 'temperature', 'water_level'] if is_historical: num_cols.append('seepage') for col in num_cols: if col in df.columns: # 转换数据类型 df[col] = pd.to_numeric(df[col], errors='coerce') # 处理负值 if col in ['water_level', 'seepage']: df[col] = df[col].apply(lambda x: x if x > 0 else np.nan) else: df[col] = df[col].apply(lambda x: x if x >= 0 else np.nan) # 插值 df[col] = df[col].interpolate(method='linear', 
limit_direction='both') df[col] = df[col].fillna(method='ffill').fillna(method='bfill') else: if is_historical: raise ValueError(f"历史数据缺少列: {col}") else: # 预测数据中如果缺少,使用默认值 df[col] = 0.0 if col == 'water_level': df[col] = 100.0 # 默认水位 return df def create_features(self, df): """创建特征""" print("创建特征...") # 间特征 df['year'] = df['datetime'].dt.year df['month'] = df['datetime'].dt.month df['day'] = df['datetime'].dt.day df['hour'] = df['datetime'].dt.hour df['dayofyear'] = df['datetime'].dt.dayofyear df['weekofyear'] = df['datetime'].dt.isocalendar().week.astype(int) # 周期性特征 df['hour_sin'] = np.sin(df['hour'] * (2 * np.pi / 24)) df['hour_cos'] = np.cos(df['hour'] * (2 * np.pi / 24)) df['dayofyear_sin'] = np.sin(df['dayofyear'] * (2 * np.pi / 365)) df['dayofyear_cos'] = np.cos(df['dayofyear'] * (2 * np.pi / 365)) # 滞后特征(仅使用water_level的滞后,因为其他变量预测没有未来值) lags = [1, 2, 3, 6, 12, 24] for lag in lags: df[f'water_level_lag_{lag}'] = df['water_level'].shift(lag) # 滚动特征 windows = [3, 6, 12, 24] for window in windows: df[f'water_level_roll_mean_{window}'] = df['water_level'].rolling(window=window, min_periods=1).mean() df[f'water_level_roll_std_{window}'] = df['water_level'].rolling(window=window, min_periods=1).std() if 'rainfall' in df.columns: df[f'rainfall_roll_mean_{window}'] = df['rainfall'].rolling(window=window, min_periods=1).mean() if 'temperature' in df.columns: df[f'temperature_roll_mean_{window}'] = df['temperature'].rolling(window=window, min_periods=1).mean() # 交互特征 df['water_level_rainfall'] = df['water_level'] * df['rainfall'] df['water_level_temperature'] = df['water_level'] * df['temperature'] df['is_rainy_season'] = df['month'].isin([5, 6, 7, 8, 9]).astype(int) # 填充滞后特征产生的空值 for col in df.columns: if df[col].isnull().any(): df[col].fillna(df[col].mean(), inplace=True) # 记录特征名(训练) if 'seepage' in df.columns: self.feature_names = [col for col in df.columns if col not in ['datetime', 'seepage']] print(f"特征数量: {len(self.feature_names)}") return df def train(self, 
historical_path): """训练模型""" print("开始训练模型...") # 加载历史数据 df = self.load_data(historical_path, is_historical=True) df = self.create_features(df) # 划分训练集和测试集(最后一个月作为测试集) split_date = df['datetime'].max() - pd.Timedelta(days=30) train_df = df[df['datetime'] <= split_date] test_df = df[df['datetime'] > split_date] X_train = train_df[self.feature_names] y_train = train_df['seepage'] X_test = test_df[self.feature_names] y_test = test_df['seepage'] # 定义模型 models = { 'LinearRegression': LinearRegression(), 'Ridge': Ridge(alpha=0.5), 'RandomForest': RandomForestRegressor(n_estimators=100, max_depth=8, random_state=42), 'XGBoost': xgb.XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42), 'LightGBM': lgb.LGBMRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42) } # 训练模型 for name, model in models.items(): print(f"训练 {name}...") pipeline = Pipeline([ ('scaler', StandardScaler()), ('model', model) ]) pipeline.fit(X_train, y_train) self.models[name] = pipeline # 评估 y_pred = pipeline.predict(X_test) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) print(f"{name} 测试集 RMSE: {rmse:.4f}, MAE: {mae:.4f}") # 保存模型 joblib.dump(self.models, MODEL_SAVE_PATH) print(f"模型已保存到 {MODEL_SAVE_PATH}") def predict_for_august(self, august_data_path): """预测8月渗漏量""" # 如果模型未训练,尝试加载 if not self.models: if os.path.exists(MODEL_SAVE_PATH): self.models = joblib.load(MODEL_SAVE_PATH) print(f"加载已训练模型: {list(self.models.keys())}") else: raise FileNotFoundError("未找到训练好的模型,请先训练模型") # 加载8月数据 august_df = self.load_data(august_data_path, is_historical=False) # 为了构建滞后特征,需要连接历史数据 historical_df = self.load_data(HISTORICAL_DATA_PATH, is_historical=True) # 取历史数据的最后24小 last_24h = historical_df.tail(24) # 将最后24小8月数据连接 combined_df = pd.concat([last_24h, august_df], ignore_index=True) # 创建特征 combined_df = self.create_features(combined_df) # 提取8月部分 august_start = august_df['datetime'].min() august_end = august_df['datetime'].max() 
august_mask = (combined_df['datetime'] >= august_start) & (combined_df['datetime'] <= august_end) X_august = combined_df.loc[august_mask, self.feature_names] # 使用集成模型进行预测(加权平均) predictions = {} weights = { 'LinearRegression': 0.1, 'Ridge': 0.1, 'RandomForest': 0.25, 'XGBoost': 0.25, 'LightGBM': 0.3 } ensemble_pred = np.zeros(len(X_august)) for name, model in self.models.items(): if name in weights: pred = model.predict(X_august) ensemble_pred += weights[name] * pred predictions[name] = pred # 创建结果DataFrame result_df = pd.DataFrame({ 'datetime': combined_df.loc[august_mask, 'datetime'], 'predicted_seepage': ensemble_pred }) # 保存结果 result_df.to_excel(OUTPUT_PATH, index=False) print(f"预测结果已保存到 {OUTPUT_PATH}") # 打印统计信息 print("\n预测结果统计:") print(f"预测间范围: {result_df['datetime'].min()} 至 {result_df['datetime'].max()}") print(f"平均渗漏量: {result_df['predicted_seepage'].mean():.4f} L/s") print(f"最小渗漏量: {result_df['predicted_seepage'].min():.4f} L/s") print(f"最大渗漏量: {result_df['predicted_seepage'].max():.4f} L/s") return result_df if __name__ == "__main__": predictor = SeepagePredictor() # 如果模型已经训练过,可以跳过训练 if not os.path.exists(MODEL_SAVE_PATH): predictor.train(HISTORICAL_DATA_PATH) else: print("检测到已训练模型,跳过训练...") # 预测8月数据 predictor.predict_for_august(AUGUST_DATA_PATH) ``` ### 代码说明 1. **类`SeepagePredictor`**:封装了整个预测流程。 - `load_data`方法:加载数据,进行基本预处理(列名规范化、转换、数值处理)。 - `create_features`方法:创建间特征、滞后特征、滚动特征和交互特征。注意:滞后特征仅使用`water_level`,因为其他变量预测没有未来值。 - `train`方法:训练多个模型(线性回归、岭回归、随机森林、XGBoost、LightGBM),并保存模型。 - `predict_for_august`方法:加载8月数据,结合历史数据构建特征,然后使用加权平均集成模型进行预测。 2. **预测流程**: - 加载8月数据。 - 取历史数据的最后24小(用于构建滞后特征)。 - 将历史数据的最后24小8月数据连接。 - 在连接后的数据上创建特征(这样8月数据的前几个小就可以利用历史数据构建滞后特征)。 - 提取8月数据对应的特征部分进行预测。 3. **模型集成**:对各个模型的预测结果进行加权平均,权重分配为: - 线性模型(LinearRegression和Ridge)各0.1 - 随机森林0.25 - XGBoost 0.25 - LightGBM 0.3 4. **输出**:保存预测结果到Excel文件,并打印统计信息。 ### 注意事项 1. 在运行预测之前,需要确保历史数据文件(`大坝数据.xlsx`)和8月数据文件(`8月数据.xlsx`)的路径正确。 2. 如果模型已经训练过,会跳过训练直接加载模型(模型保存在`trained_models.joblib`文件中)。 3. 
如果8月数据文件中包含的历史数据不足24小,则在构建滞后特征可能会使用历史数据的最后部分来填充。 这个版本简化了特征工程,专注于预测部分,并确保了整个流程的连贯性。 §§相关问题§§: 1. 如何调整模型权重以获得更好的预测结果? 2. 如果8月数据文件中的列名历史数据文件不一致,应该如何处理? 3. 如何增加更多特征来提高预测的准确性? 4. 能否使用深度学习模型(如LSTM)来预测间序列的渗漏量? </think>我理解您需要的完整代码示例没有被完全生成。下面是完整的、重构后的大坝渗漏量预测模型代码,专注于基于8月数据文件的预测部分: ```python #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ 大坝渗漏量预测模型(完整预测版) 日期: 2025-10-01 主要优化: 1. 完整实现了基于8月数据的预测流程 2. 重构特征工程,避免滞后特征问题 3. 增强数据预处理和异常处理 """ import pandas as pd import numpy as np from datetime import datetime import warnings import os import joblib warnings.filterwarnings('ignore') # 导入机器学习库 from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LinearRegression, Ridge from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score from sklearn.pipeline import Pipeline # 导入高级模型 import xgboost as xgb import lightgbm as lgb np.random.seed(42) # ==================== 配置参数 ==================== # 文件路径配置 AUGUST_DATA_PATH = r'D:\zhongyuan_DM_pycharm\python\渗透量处理\8月数据.xlsx' # 8月数据文件路径 HISTORICAL_DATA_PATH = r'D:\zhongyuan_DM_pycharm\python\渗透量处理\大坝数据.xlsx' # 历史数据文件路径 OUTPUT_PATH = r'2025年8月渗漏量预测结果.xlsx' # 预测结果输出路径 MODEL_SAVE_PATH = 'trained_models.joblib' # 模型保存路径 # ================================================= class SeepagePredictor: """大坝渗漏量预测器""" def __init__(self): self.models = {} self.feature_names = [] self.scaler = StandardScaler() def load_and_preprocess(self, file_path, is_historical=True): """加载和预处理数据""" print(f"加载{'历史' if is_historical else '8月'}数据: {file_path}") # 检查文件是否存在 if not os.path.exists(file_path): raise FileNotFoundError(f"数据文件不存在: {file_path}") # 读取Excel文件 try: df = pd.read_excel(file_path, engine='openpyxl') print(f"原始数据形状: {df.shape}, 列名: {df.columns.tolist()}") except Exception as e: raise ValueError(f"读取文件失败: {str(e)}") # 规范化列名 rename_map = {} if '间' in df.columns: rename_map['间'] = 'datetime' if '渗漏量' in df.columns: rename_map['渗漏量'] = 'seepage' if '降雨量' in 
df.columns: rename_map['降雨量'] = 'rainfall' if '温度' in df.columns: rename_map['温度'] = 'temperature' if '水位' in df.columns: rename_map['水位'] = 'water_level' if rename_map: df = df.rename(columns=rename_map) # 确保所有必需列存在 required_cols = ['datetime', 'rainfall', 'temperature', 'water_level'] if is_historical: required_cols.append('seepage') missing_cols = [col for col in required_cols if col not in df.columns] if missing_cols: print(f"警告: 缺少列 {missing_cols},尝试补充默认值") for col in missing_cols: if col == 'seepage': df[col] = np.nan if is_historical else 0.0 else: df[col] = 100.0 if col == 'water_level' else 0.0 # 数据类型转换 df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce') # 数值处理 numeric_cols = ['rainfall', 'temperature', 'water_level'] if is_historical: numeric_cols.append('seepage') for col in numeric_cols: # 处理异常值和缺失值 df[col] = pd.to_numeric(df[col], errors='coerce') if col in ['water_level', 'seepage']: df.loc[df[col] <= 0, col] = np.nan else: df.loc[df[col] < 0, col] = np.nan # 插值填充 df[col] = df[col].interpolate(method='linear', limit_direction='both') df[col] = df[col].fillna(method='ffill').fillna(method='bfill') # 打印统计信息 stats = df[col].agg(['min', 'mean', 'max']) print(f"{col}: 最小={stats['min']:.2f}, 平均={stats['mean']:.2f}, 最大={stats['max']:.2f}") # 间范围验证 if not df['datetime'].is_monotonic_increasing: df = df.sort_values('datetime') print(f"数据加载完成: {len(df)}条记录 ({df['datetime'].min()} 至 {df['datetime'].max()})") return df def create_time_features(self, df): """创建间特征""" df['year'] = df['datetime'].dt.year df['month'] = df['datetime'].dt.month df['day'] = df['datetime'].dt.day df['hour'] = df['datetime'].dt.hour df['dayofyear'] = df['datetime'].dt.dayofyear df['weekofyear'] = df['datetime'].dt.isocalendar().week.astype(int) # 周期性特征 df['hour_sin'] = np.sin(df['hour'] * (2 * np.pi / 24)) df['hour_cos'] = np.cos(df['hour'] * (2 * np.pi / 24)) df['dayofyear_sin'] = np.sin(df['dayofyear'] * (2 * np.pi / 365)) df['dayofyear_cos'] = np.cos(df['dayofyear'] * (2 * 
np.pi / 365)) return df def create_lag_features(self, df, col, lags=[1, 2, 3, 6, 12, 24]): """创建滞后特征""" for lag in lags: df[f'{col}_lag{lag}'] = df[col].shift(lag) return df def create_rolling_features(self, df, col, windows=[3, 6, 12, 24]): """创建滚动特征""" for window in windows: df[f'{col}_roll_mean{window}'] = df[col].rolling(window, min_periods=1).mean() df[f'{col}_roll_std{window}'] = df[col].rolling(window, min_periods=1).std() return df def create_interaction_features(self, df): """创建交互特征""" df['water_rain_interact'] = df['water_level'] * df['rainfall'] df['water_temp_interact'] = df['water_level'] * df['temperature'] df['rain_temp_interact'] = df['rainfall'] * df['temperature'] df['is_rainy'] = df['month'].between(5, 9).astype(int) return df def build_features(self, df, is_train=True): """构建完整的特征集""" print(f"构建{'训练' if is_train else '预测'}特征...") # 基础特征 df = self.create_time_features(df) # 滞后特征 for col in ['water_level', 'rainfall', 'temperature']: df = self.create_lag_features(df, col) # 滚动特征 for col in ['water_level', 'rainfall', 'temperature']: df = self.create_rolling_features(df, col) # 交互特征 df = self.create_interaction_features(df) # 填充缺失值 numeric_cols = df.select_dtypes(include=np.number).columns for col in numeric_cols: if df[col].isnull().any(): df[col].fillna(df[col].mean(), inplace=True) # 保存特征名用于预测 if is_train: self.feature_names = [col for col in df.columns if col not in ['datetime', 'seepage']] print(f"特征数量: {len(self.feature_names)}") # 只保留需要的特征 if self.feature_names and not is_train: missing_cols = set(self.feature_names) - set(df.columns) if missing_cols: print(f"补充缺失特征: {missing_cols}") for col in missing_cols: df[col] = 0.0 df = df[['datetime'] + self.feature_names] return df def train_models(self, X_train, y_train): """训练多个模型""" print("\n训练模型...") models = { 'LinearRegression': LinearRegression(), 'Ridge': Ridge(alpha=0.5, random_state=42), 'RandomForest': RandomForestRegressor( n_estimators=150, max_depth=8, min_samples_split=15, 
random_state=42, n_jobs=-1 ), 'XGBoost': xgb.XGBRegressor( n_estimators=150, max_depth=6, learning_rate=0.08, subsample=0.8, colsample_bytree=0.8, random_state=42 ), 'LightGBM': lgb.LGBMRegressor( n_estimators=150, max_depth=6, learning_rate=0.07, subsample=0.8, colsample_bytree=0.8, random_state=42 ) } # 训练并保存模型 for name, model in models.items(): try: print(f"训练 {name}...") pipeline = Pipeline([ ('scaler', StandardScaler()), ('model', model) ]) pipeline.fit(X_train, y_train) self.models[name] = pipeline print(f"{name} 训练完成") except Exception as e: print(f"训练 {name} 失败: {str(e)[:100]}") if not self.models: raise RuntimeError("所有模型训练失败,请检查数据") # 保存模型 joblib.dump(self.models, MODEL_SAVE_PATH) print(f"模型已保存至 {MODEL_SAVE_PATH}") return self.models def evaluate_models(self, X_test, y_test): """评估模型性能""" print("\n评估模型性能...") results = [] for name, model in self.models.items(): try: y_pred = model.predict(X_test) y_pred = np.clip(y_pred, 0, y_test.max() * 1.5) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) r2 = r2_score(y_test, y_pred) results.append({ 'Model': name, 'RMSE': rmse, 'MAE': mae, 'R2': r2 }) print(f"{name}: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}") except Exception as e: print(f"评估 {name} 失败: {str(e)[:100]}") return pd.DataFrame(results) def predict(self, X): """使用集成方法进行预测""" if not self.models: raise RuntimeError("请先训练或加载模型") predictions = {} weights = {} total_score = 0 # 第一步:收集各模型的预测结果并计算权重 for name, model in self.models.items(): try: pred = model.predict(X) # 为每个模型分配初始权重(基于模型类型) if 'XGBoost' in name or 'LightGBM' in name: weights[name] = 0.35 # 给树模型较高权重 else: weights[name] = 0.15 predictions[name] = pred total_score += weights[name] except Exception as e: print(f"模型 {name} 预测失败: {str(e)[:50]}") # 归一化权重 ensemble_pred = np.zeros(X.shape[0]) for name, w in weights.items(): normalized_w = w / total_score ensemble_pred += normalized_w * predictions[name] # 后处理:确保预测值在合理范围内 min_val = min(X['water_level'].min(), 
0) max_val = max(X['water_level'].max() * 1.5, 100) ensemble_pred = np.clip(ensemble_pred, min_val, max_val) return ensemble_pred def predict_august(self, historical_df, august_df): """基于8月数据进行预测""" print("\n开始8月渗漏量预测...") # 1. 预处理8月数据 august_df = self.build_features(august_df, is_train=False) # 2. 合并历史数据用于特征构建 # 获取历史数据最后24小用于构建滞后特征 historical_last_24h = historical_df.sort_values('datetime').tail(24) combined_df = pd.concat([historical_last_24h, august_df], ignore_index=True) # 3. 重建特征(包含历史数据) combined_df = self.build_features(combined_df, is_train=False) # 4. 提取8月数据部分 august_start = august_df['datetime'].min() august_end = august_df['datetime'].max() august_mask = (combined_df['datetime'] >= august_start) & (combined_df['datetime'] <= august_end) X_august = combined_df.loc[august_mask, self.feature_names].copy() # 5. 执行预测 predictions = self.predict(X_august) # 6. 创建结果DataFrame result_df = pd.DataFrame({ 'datetime': combined_df.loc[august_mask, 'datetime'], 'predicted_seepage_L/s': predictions.round(4), 'water_level': X_august['water_level'], 'rainfall': X_august['rainfall'], 'temperature': X_august['temperature'] }) # 7. 保存结果 result_df.to_excel(OUTPUT_PATH, index=False) print(f"预测结果已保存至 {OUTPUT_PATH}") # 8. 打印统计信息 print("\n预测结果统计:") print(f"间段: {august_start} 至 {august_end}") print(f"预测点数: {len(result_df)}") print(f"平均渗漏量: {result_df['predicted_seepage_L/s'].mean():.2f} L/s") print(f"最小渗漏量: {result_df['predicted_seepage_L/s'].min():.2f} L/s") print(f"最大渗漏量: {result_df['predicted_seepage_L/s'].max():.2f} L/s") return result_df def run(self, train=True): """运行完整流程""" try: # 1. 
加载历史数据 if train or not os.path.exists(MODEL_SAVE_PATH): print("="*60) print("训练阶段") print("="*60) historical_df = self.load_and_preprocess(HISTORICAL_DATA_PATH, is_historical=True) historical_df = self.build_features(historical_df, is_train=True) # 划分训练集和测试集 split_index = int(len(historical_df) * 0.8) train_df = historical_df.iloc[:split_index] test_df = historical_df.iloc[split_index:] X_train = train_df[self.feature_names] y_train = train_df['seepage'] X_test = test_df[self.feature_names] y_test = test_df['seepage'] # 训练模型 self.train_models(X_train, y_train) # 评估模型 results_df = self.evaluate_models(X_test, y_test) results_df.to_excel('model_performance.xlsx', index=False) print("模型评估结果已保存") else: # 加载已训练模型 self.models = joblib.load(MODEL_SAVE_PATH) self.feature_names = list(self.models.values())[0].named_steps['scaler'].feature_names_in_ print(f"已加载预训练模型: {list(self.models.keys())}") # 2. 加载8月数据并进行预测 print("\n" + "="*60) print("预测阶段") print("="*60) august_df = self.load_and_preprocess(AUGUST_DATA_PATH, is_historical=False) # 如果未加载历史数据,加载用于特征构建 if 'historical_df' not in locals(): historical_df = self.load_and_preprocess(HISTORICAL_DATA_PATH, is_historical=True) # 执行预测 self.predict_august(historical_df, august_df) print("\n" + "="*60) print("预测流程完成!") except Exception as e: print(f"\n❌ 流程出错: {str(e)}") import traceback traceback.print_exc() if __name__ == "__main__": print("大坝渗漏量预测系统启动") print(f"历史数据路径: {HISTORICAL_DATA_PATH}") print(f"8月数据路径: {AUGUST_DATA_PATH}") print(f"预测结果将保存至: {OUTPUT
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符  | 博主筛选后可见
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值