高度偏斜特征处理:log(x)、sqrt(x)、box-cox、Yeo-Johnson

一、概念

高度偏斜的特征 : 数据分布不均匀、不对称的特征

处理之后:使其分布更接近正态分布或至少减少偏斜程度

二、处理方法

1、对数变换:log(x)

  • 适用于右偏数据
  • 压缩大值,拉伸小值

2、平方根变换:sqrt(x)

  • 对右偏数据的效果比对数变换温和

3、Box-Cox变换

  • 一种更通用的幂变换方法
  • 可以处理各种程度的偏斜

4、Yeo-Johnson变换

  • Box-Cox的扩展,可以处理负值和零

三、代码

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import PowerTransformer

# 设置随机种子以确保结果可重现
np.random.seed(42)

# 生成一个右偏(正偏)的数据集
data = np.random.lognormal(mean=0, sigma=1, size=10000)

# 计算关键统计量
min_val = np.min(data)
q1 = np.percentile(data, 25)
median = np.median(data)
q3 = np.percentile(data, 75)
max_val = np.max(data)

print(f"原始数据:")
print(f"最小值: {min_val:.2f}")
print(f"25%分位点: {q1:.2f}")
print(f"中位数: {median:.2f}")
print(f"75%分位点: {q3:.2f}")
print(f"最大值: {max_val:.2f}")
print(f"偏度: {stats.skew(data):.2f}")
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2025-08-12 16:30 # @Author : atom import numpy as np import pandas as pd import lightgbm as lgb from datetime import datetime from sklearn.preprocessing import MinMaxScaler, RobustScaler from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score from matplotlib import pyplot as plt from joblib import dump, load from sklearn.model_selection import TimeSeriesSplit from sklearn.feature_selection import mutual_info_regression from scipy.stats import boxcox, yeojohnson from matplotlib import rcParams import calendar import warnings from matplotlib import rcParams from sklearn.metrics import r2_score rcParams['font.family'] = 'SimHei' rcParams['axes.unicode_minus'] = False # 忽略警告 warnings.filterwarnings("ignore") def robust_data_cleaning(df): """健壮的数据清洗和处理""" # 确保目标列存在 if '实际功率(MW)' not in df.columns: raise ValueError("数据中缺少目标列 '实际功率(MW)'") # 替换无穷大值为NaN df.replace([np.inf, -np.inf], np.nan, inplace=True) # 填充缺失值 - 使用前后填充 df = df.fillna(method='ffill').fillna(method='bfill') # 对于风速列,确保最小值大于0 wind_cols = ['ws100m', 'ws10m'] for col in wind_cols: if col in df.columns: df[col] = df[col].clip(lower=0.1) # 确保风速至少为0.1 return df def advanced_feature_engineering(df): """ 高级特征工程 - 不使用滞后功率特征 使用Yeo-Johnson变换 """ df = robust_data_cleaning(df) ws100_colname = "ws100m" target_col = '实际功率(MW)' time_colname = 'date' # 确保日期格式正确 df[time_colname] = pd.to_datetime(df[time_colname]) # 1. 时间特征工程扩展 df['hour'] = df[time_colname].dt.hour df['dayofweek'] = df[time_colname].dt.dayofweek df['month'] = df[time_colname].dt.month df['dayofyear'] = df[time_colname].dt.dayofyear df['quarter'] = df[time_colname].dt.quarter # 周期性时间特征 df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24) df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24) df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12) df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12) # 2. 气象特征工程扩展 # 风向特征处理(角度转向量) if 'wd100m' in df.columns: df['wd100m_sin'] = np.sin(np.radians(df['wd100m'])) df['wd100m_cos'] = np.cos(np.radians(df['wd100m'])) # 风速多项式特征 - 使用clip确保值在合理范围内 df['ws100m**2'] = df[ws100_colname].clip(0, 30) ** 2 df['ws100m**3'] = df[ws100_colname].clip(0, 30) ** 3 df['ws100m_sqrt'] = np.sqrt(df[ws100_colname].clip(0, 30)) # 风速组合特征 - 添加剪裁和epsilon防止除零 epsilon = 1e-6 df['ws_diff'] = df['ws100m'].clip(0, 30) - df['ws10m'].clip(0, 30) # 高低层风速差 df['ws_ratio'] = df['ws100m'].clip(0, 30) / (df['ws10m'].clip(0.1, 30) + epsilon) # 高低层风速比 # 气象要素交互特征 df['wind_temp'] = df[ws100_colname].clip(0, 30) * df['t2m'] # 风速-温度交互 df['wind_humidity'] = df[ws100_colname].clip(0, 30) * df['rh'] # 风速-湿度交互 df['wind_pressure'] = df[ws100_colname].clip(0, 30) * df['sp'] # 风速-气压交互 # 3. 大气密度修正功率特征 # 使用国际标准公式:ρ = P/(R×T),其中R=287.05 J/(kg·K) df['air_density'] = df['sp'] * 100 / (287.05 * (df['t2m'] + 273.15)) # 确保风速在合理范围内 safe_ws = df[ws100_colname].clip(0.1, 30) # 风速不能为负或零 df['theoretical_power'] = 0.5 * df['air_density'] * (safe_ws ** 3) * 0.3 # 4. 目标变量处理(使用Yeo-Johnson变换替代Box-Cox) target_values = df[target_col].values # 检查目标值是否有负数或零 min_target = np.min(target_values) shift = 0 if min_target <= 0: shift = abs(min_target) + 1 target_values += shift # 使用Yeo-Johnson变换处理目标变量 transformed_target, lambda_val = yeojohnson(target_values) df['power_normalized'] = transformed_target # 移除目标和不需要的列 X = df.drop([target_col, time_colname, 'power_normalized'], axis=1) y = df['power_normalized'] # 保存变换参数用于后续逆变换 dump({'lambda': lambda_val, 'shift': shift}, 'yeojohnson_params.joblib') return X, y def feature_selection(X, y, n_features=30): """ 使用互信息进行特征选择 """ # 计算特征与目标的互信息 mi_scores = mutual_info_regression(X, y) mi_scores = pd.Series(mi_scores, index=X.columns) mi_scores = mi_scores.sort_values(ascending=False) # 选择最重要的特征(前n_features个) selected_features = mi_scores.index[:min(n_features, len(X.columns))].tolist() # 可视化特征重要性 plt.figure(figsize=(15, 10)) mi_scores.sort_values().plot.barh(title='特征互信息分数') plt.savefig('feature_importance.png', dpi=300) plt.close() print(f"选择了 {len(selected_features)} 个重要特征") return X[selected_features] def train_lightgbm(X_train, y_train): """ 使用LightGBM训练模型 - 修复.iloc错误 """ # 创建LightGBM数据集 train_data = lgb.Dataset(X_train, label=y_train) # LightGBM参数 - 优化超参数 params = { 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.03, # 降低学习率获得更好泛化 'num_leaves': 31, 'max_depth': 7, # 限制深度防止过拟合 'min_data_in_leaf': 50, # 增加防止过拟合 'feature_fraction': 0.7, # 每次迭代使用70%特征 'bagging_fraction': 0.7, # 每次迭代使用70%数据 'bagging_freq': 5, 'lambda_l1': 0.2, # L1正则化 'lambda_l2': 0.3, # L2正则化 'min_split_gain': 0.01, 'verbose': -1, 'seed': 42 } # 使用验证集进行早期停止 valid_size = int(0.1 * len(X_train)) X_valid = X_train[-valid_size:] y_valid = y_train[-valid_size:] X_train_train = X_train[:-valid_size] y_train_train = y_train[:-valid_size] train_set = lgb.Dataset(X_train_train, label=y_train_train) valid_set = lgb.Dataset(X_valid, label=y_valid, reference=train_set) # 训练模型 model = lgb.train( params, train_set, num_boost_round=1500, # 增加迭代轮数 valid_sets=[valid_set], callbacks=[ lgb.early_stopping(stopping_rounds=100, verbose=True), lgb.log_evaluation(period=100) ] ) return model def robust_scaling(X_train, X_valid): """使用健壮缩放器处理数据""" # 使用RobustScaler处理特征 scaler = RobustScaler(quantile_range=(5, 95)) # 忽略极端值 X_train_scaled = scaler.fit_transform(X_train) X_valid_scaled = scaler.transform(X_valid) # 保存缩放器 dump(scaler, 'robust_scaler.pkl') return X_train_scaled, X_valid_scaled def main(): # 参数设置 FILE_NAME = 'B_new2.csv' # 原始数据文件 MODEL_PATH = 'lightgbm_model.pkl' # 模型保存路径 PLOT_PATH = 'Prediction_advanced.png' # 可视化图像路径 # 读取数据 print("读取数据...") try: df = pd.read_csv(FILE_NAME) print(f"成功读取数据,共 {len(df)} 行") except Exception as e: print(f"读取数据失败: {e}") return # 特征工程 print("高级特征工程...") try: X, y = advanced_feature_engineering(df) except Exception as e: print(f"特征工程失败: {e}") return # 特征选择 print("特征选择...") try: X = feature_selection(X, y, n_features=30) except Exception as e: print(f"特征选择失败: {e}") return # 划分训练集和验证集 print("划分数据集...") VALID_START = int(0.9 * len(X)) X_train = X.iloc[:VALID_START] X_valid = X.iloc[VALID_START:] y_train = y.iloc[:VALID_START] y_valid = y.iloc[VALID_START:] # 将y_train和y_valid转换为NumPy数组 y_train = y_train.values y_valid = y_valid.values # 数据缩放 (使用健壮缩放器) print("数据缩放处理...") try: X_train_scaled, X_valid_scaled = robust_scaling(X_train, X_valid) except Exception as e: print(f"数据缩放失败: {e}") return # 训练模型 print("训练LightGBM模型...") try: model = train_lightgbm(X_train_scaled, y_train) except Exception as e: print(f"模型训练失败: {e}") return # 保存模型 dump(model, MODEL_PATH) print(f"模型已保存至:{MODEL_PATH}") # 模型预测 print("模型预测...") y_valid_pred_scaled = model.predict(X_valid_scaled) # 加载变换参数 transform_params = load('yeojohnson_params.joblib') lambda_val = transform_params['lambda'] shift = transform_params['shift'] # Yeo-Johnson逆变换函数 def inverse_yeojohnson(y, lmbda, shift=0): if lmbda == 0 and y >= 0: return np.exp(y) - shift elif lmbda != 0 and y >= 0: return (y * lmbda + 1) ** (1 / lmbda) - shift elif lmbda != 2 and y < 0: return 1 - (-(2 - lmbda) * y + 1) ** (1 / (2 - lmbda)) - shift else: # lmbda == 2 and y < 0 return -np.exp(-y) + 1 - shift # 向量化逆变换函数 inverse_yeojohnson_vec = np.vectorize(inverse_yeojohnson) # 逆变换预测值和真实值 y_valid_pred = inverse_yeojohnson_vec(y_valid_pred_scaled, lambda_val, shift) y_valid_true = inverse_yeojohnson_vec(y_valid, lambda_val, shift) # 计算评估指标 rmse = np.sqrt(mean_squared_error(y_valid_true, y_valid_pred)) mae = mean_absolute_error(y_valid_true, y_valid_pred) r2 = r2_score(y_valid_true, y_valid_pred) print(f"验证集评估结果: RMSE = {rmse:.4f}, MAE = {mae:.4f}, R² = {r2:.4f}") # 可视化结果 print("可视化结果...") plt.figure(figsize=(25, 6)) plt.plot(y_valid_pred, label='预测值', color='red', linewidth=1.5) plt.plot(y_valid_true, label='真实值', color='green', alpha=0.8) plt.title(f'预测 vs 实际功率 (RMSE={rmse:.4f}, R2 ={r2:.4f})', fontsize=16) plt.xlabel('时间点', fontsize=12) plt.ylabel('功率 (MW)', fontsize=12) plt.grid(linestyle='--', alpha=0.7) plt.legend(fontsize=12) plt.savefig(PLOT_PATH, dpi=300, bbox_inches='tight') plt.show() # 绘制散点图 plt.figure(figsize=(7, 6)) plt.scatter(y_valid_true, y_valid_pred, s=40, alpha=0.6, edgecolor='blue', facecolor='none') min_val = min(y_valid_true.min(), y_valid_pred.min()) max_val = max(y_valid_true.max(), y_valid_pred.max()) plt.plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2, label='理想线: y=x') plt.text(0.05, 0.95, f'R2 = {r2:.4f}\nRMSE = {rmse:.4f}', transform=plt.gca().transAxes, fontsize=12, verticalalignment='top', bbox=dict(boxstyle='round', alpha=0.2)) plt.grid(linestyle=":", alpha=0.7) plt.xlabel("实际功率 (MW)", fontsize=12) plt.ylabel("预测功率 (MW)", fontsize=12) plt.title("实测功率 vs 预测功率", fontsize=14) plt.legend() plt.tight_layout() plt.savefig('Scatter_plot.png', dpi=300) plt.show() print("程序执行完成") if __name__ == "__main__": main() #!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2025-08-11 10:38 # @Author : atom # 新数据检验 import numpy as np import pandas as pd import lightgbm as lgb from datetime import datetime, timedelta from matplotlib import pyplot as plt from joblib import load from matplotlib import rcParams import warnings # 忽略警告 warnings.filterwarnings("ignore") # 字体显示中文 rcParams['font.family'] = 'SimHei' rcParams['axes.unicode_minus'] = False # 导入文件 FILE_NAME2 = 'PR_data_B1-724.csv' dfall2 = pd.read_csv(FILE_NAME2, parse_dates=['date']) print(f"成功读取预测数据: {len(dfall2)}行") # 选择时间段 try: day_date = dfall2.set_index('date').loc['2025-07-22 00:00':'2025-08-01 23:45'].reset_index() print(f"时间段数据: {len(day_date)}行") except KeyError: day_date = dfall2[(dfall2['date'] >= '2025-07-22') & (dfall2['date'] <= '2025-08-01')] print(f"使用替代方法选择时间段数据: {len(day_date)}行") # 1. 应用相同的特征工程 def apply_feature_engineering(df): """ 应用与训练相同的特征工程 """ # 确保日期格式正确 df['date'] = pd.to_datetime(df['date']) # 时间特征 df['hour'] = df['date'].dt.hour df['dayofweek'] = df['date'].dt.dayofweek df['month'] = df['date'].dt.month df['dayofyear'] = df['date'].dt.dayofyear df['quarter'] = df['date'].dt.quarter # 周期性时间特征 df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24) df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24) df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12) df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12) # 风向特征 if 'wd100m' in df.columns: df['wd100m_sin'] = np.sin(np.radians(df['wd100m'])) df['wd100m_cos'] = np.cos(np.radians(df['wd100m'])) # 风速特征 if 'ws100m' in df.columns: df['ws100m**2'] = df['ws100m'].clip(0, 30) ** 2 df['ws100m**3'] = df['ws100m'].clip(0, 30) ** 3 df['ws100m_sqrt'] = np.sqrt(df['ws100m'].clip(0, 30)) # 风速组合特征 epsilon = 1e-6 if 'ws100m' in df.columns and 'ws10m' in df.columns: df['ws_diff'] = df['ws100m'].clip(0, 30) - df['ws10m'].clip(0, 30) df['ws_ratio'] = df['ws100m'].clip(0, 30) / (df['ws10m'].clip(0.1, 30) + epsilon) # 气象交互特征 if 'ws100m' in df.columns: if 't2m' in df.columns: df['wind_temp'] = df['ws100m'].clip(0, 30) * df['t2m'] if 'rh' in df.columns: df['wind_humidity'] = df['ws100m'].clip(0, 30) * df['rh'] if 'sp' in df.columns: df['wind_pressure'] = df['ws100m'].clip(0, 30) * df['sp'] # 大气密度特征 if 'sp' in df.columns and 't2m' in df.columns: df['air_density'] = df['sp'] * 100 / (287.05 * (df['t2m'] + 273.15)) if 'ws100m' in df.columns: safe_ws = df['ws100m'].clip(0.1, 30) df['theoretical_power'] = 0.5 * df['air_density'] * (safe_ws ** 3) * 0.3 return df print("应用特征工程...") df_engineered = apply_feature_engineering(day_date.copy()) # 2. 选择与训练相同的特征 try: # 尝试加载特征名称(需要在训练时保存) selected_features = load('selected_features.joblib') print(f"加载了 {len(selected_features)} 个特征") except: # 如果未保存特征名称,手动列出训练中使用的主要特征 selected_features = [ 'ws100m', 't2m', 'theoretical_power', 'air_density', 'hour_sin', 'hour_cos', 'ws100m**2', 'ws100m**3', 'wind_temp', 'sp', 'rh', 'month_sin', 'month_cos', 'dayofyear', 'hour', 'ws_diff', 'wind_humidity' ] print("使用预设特征列表") # 确保只保留存在的特征 available_features = [f for f in selected_features if f in df_engineered.columns] X_new = df_engineered[available_features] print(f"最终使用的特征: {len(available_features)}") # 3. 应用相同的特征缩放 print("加载和应用特征缩放器...") try: scaler = load('robust_scaler.pkl') X_new_scaled = scaler.transform(X_new) print("特征缩放完成") except Exception as e: print(f"特征缩放失败: {e}") # 如果没有缩放器,使用原始数据 X_new_scaled = X_new.values print("使用未缩放特征") # 4. 正确加载LightGBM模型并进行预测 print("加载模型和预测...") try: # 正确加载LightGBM模型 model = lgb.Booster(model_file='lightgbm_model.pkl') # 使用保存的文本格式模型 # 预测 - 注意:LightGBM可以直接使用numpy数组 y_pred_scaled = model.predict(X_new_scaled) print("预测完成") # 5. 应用逆变换 print("应用逆变换...") try: # 加载变换参数 transform_params = load('yeojohnson_params.joblib') lambda_val = transform_params['lambda'] shift = transform_params['shift'] # Yeo-Johnson逆变换函数 def inverse_yeojohnson(y, lmbda, shift=0): if lmbda == 0 and y >= 0: return np.exp(y) - shift elif lmbda != 0 and y >= 0: return (y * lmbda + 1) ** (1 / lmbda) - shift elif lmbda != 2 and y < 0: return 1 - (-(2 - lmbda) * y + 1) ** (1 / (2 - lmbda)) - shift else: return -np.exp(-y) + 1 - shift # 向量化逆变换 inverse_yeojohnson_vec = np.vectorize(inverse_yeojohnson) y_pred = inverse_yeojohnson_vec(y_pred_scaled, lambda_val, shift) print("逆变换完成") # 6. 保存结果 result_df = pd.DataFrame({ 'date': df_engineered['date'], 'predicted_power': y_pred }) result_file = "power_predictions_all1.csv" result_df.to_csv(result_file, index=False, encoding='utf-8-sig') print(f"结果已保存至: {result_file}") # 7. 可视化 plt.figure(figsize=(30, 4)) plt.plot(result_df['date'], result_df['predicted_power']) plt.title('风电功率预测结果') plt.xlabel('时间') plt.ylabel('功率 (MW)') plt.grid(True) plt.savefig('prediction_plot.png', dpi=300) plt.show() except Exception as e: print(f"逆变换失败: {e}") except Exception as e: print(f"模型加载或预测失败: {e}") # 备用方案:使用简单模型(如果需要) # ... 模型加载失败
最新发布
08-15
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值