#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2025-08-12 16:30
# @Author : atom
import numpy as np
import pandas as pd
import lightgbm as lgb
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from matplotlib import pyplot as plt
from joblib import dump, load
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import boxcox, yeojohnson
from matplotlib import rcParams
import calendar
import warnings
from matplotlib import rcParams
from sklearn.metrics import r2_score
rcParams['font.family'] = 'SimHei'
rcParams['axes.unicode_minus'] = False
# 忽略警告
warnings.filterwarnings("ignore")
def robust_data_cleaning(df):
"""健壮的数据清洗和处理"""
# 确保目标列存在
if '实际功率(MW)' not in df.columns:
raise ValueError("数据中缺少目标列 '实际功率(MW)'")
# 替换无穷大值为NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)
# 填充缺失值 - 使用前后填充
df = df.fillna(method='ffill').fillna(method='bfill')
# 对于风速列,确保最小值大于0
wind_cols = ['ws100m', 'ws10m']
for col in wind_cols:
if col in df.columns:
df[col] = df[col].clip(lower=0.1) # 确保风速至少为0.1
return df
def advanced_feature_engineering(df):
"""
高级特征工程 - 不使用滞后功率特征
使用Yeo-Johnson变换
"""
df = robust_data_cleaning(df)
ws100_colname = "ws100m"
target_col = '实际功率(MW)'
time_colname = 'date'
# 确保日期格式正确
df[time_colname] = pd.to_datetime(df[time_colname])
# 1. 时间特征工程扩展
df['hour'] = df[time_colname].dt.hour
df['dayofweek'] = df[time_colname].dt.dayofweek
df['month'] = df[time_colname].dt.month
df['dayofyear'] = df[time_colname].dt.dayofyear
df['quarter'] = df[time_colname].dt.quarter
# 周期性时间特征
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
# 2. 气象特征工程扩展
# 风向特征处理(角度转向量)
if 'wd100m' in df.columns:
df['wd100m_sin'] = np.sin(np.radians(df['wd100m']))
df['wd100m_cos'] = np.cos(np.radians(df['wd100m']))
# 风速多项式特征 - 使用clip确保值在合理范围内
df['ws100m**2'] = df[ws100_colname].clip(0, 30) ** 2
df['ws100m**3'] = df[ws100_colname].clip(0, 30) ** 3
df['ws100m_sqrt'] = np.sqrt(df[ws100_colname].clip(0, 30))
# 风速组合特征 - 添加剪裁和epsilon防止除零
epsilon = 1e-6
df['ws_diff'] = df['ws100m'].clip(0, 30) - df['ws10m'].clip(0, 30) # 高低层风速差
df['ws_ratio'] = df['ws100m'].clip(0, 30) / (df['ws10m'].clip(0.1, 30) + epsilon) # 高低层风速比
# 气象要素交互特征
df['wind_temp'] = df[ws100_colname].clip(0, 30) * df['t2m'] # 风速-温度交互
df['wind_humidity'] = df[ws100_colname].clip(0, 30) * df['rh'] # 风速-湿度交互
df['wind_pressure'] = df[ws100_colname].clip(0, 30) * df['sp'] # 风速-气压交互
# 3. 大气密度修正功率特征
# 使用国际标准公式:ρ = P/(R×T),其中R=287.05 J/(kg·K)
df['air_density'] = df['sp'] * 100 / (287.05 * (df['t2m'] + 273.15))
# 确保风速在合理范围内
safe_ws = df[ws100_colname].clip(0.1, 30) # 风速不能为负或零
df['theoretical_power'] = 0.5 * df['air_density'] * (safe_ws ** 3) * 0.3
# 4. 目标变量处理(使用Yeo-Johnson变换替代Box-Cox)
target_values = df[target_col].values
# 检查目标值是否有负数或零
min_target = np.min(target_values)
shift = 0
if min_target <= 0:
shift = abs(min_target) + 1
target_values += shift
# 使用Yeo-Johnson变换处理目标变量
transformed_target, lambda_val = yeojohnson(target_values)
df['power_normalized'] = transformed_target
# 移除目标和不需要的列
X = df.drop([target_col, time_colname, 'power_normalized'], axis=1)
y = df['power_normalized']
# 保存变换参数用于后续逆变换
dump({'lambda': lambda_val, 'shift': shift}, 'yeojohnson_params.joblib')
return X, y
def feature_selection(X, y, n_features=30):
"""
使用互信息进行特征选择
"""
# 计算特征与目标的互信息
mi_scores = mutual_info_regression(X, y)
mi_scores = pd.Series(mi_scores, index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)
# 选择最重要的特征(前n_features个)
selected_features = mi_scores.index[:min(n_features, len(X.columns))].tolist()
# 可视化特征重要性
plt.figure(figsize=(15, 10))
mi_scores.sort_values().plot.barh(title='特征互信息分数')
plt.savefig('feature_importance.png', dpi=300)
plt.close()
print(f"选择了 {len(selected_features)} 个重要特征")
return X[selected_features]
def train_lightgbm(X_train, y_train):
"""
使用LightGBM训练模型 - 修复.iloc错误
"""
# 创建LightGBM数据集
train_data = lgb.Dataset(X_train, label=y_train)
# LightGBM参数 - 优化超参数
params = {
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': 'rmse',
'learning_rate': 0.03, # 降低学习率获得更好泛化
'num_leaves': 31,
'max_depth': 7, # 限制深度防止过拟合
'min_data_in_leaf': 50, # 增加防止过拟合
'feature_fraction': 0.7, # 每次迭代使用70%特征
'bagging_fraction': 0.7, # 每次迭代使用70%数据
'bagging_freq': 5,
'lambda_l1': 0.2, # L1正则化
'lambda_l2': 0.3, # L2正则化
'min_split_gain': 0.01,
'verbose': -1,
'seed': 42
}
# 使用验证集进行早期停止
valid_size = int(0.1 * len(X_train))
X_valid = X_train[-valid_size:]
y_valid = y_train[-valid_size:]
X_train_train = X_train[:-valid_size]
y_train_train = y_train[:-valid_size]
train_set = lgb.Dataset(X_train_train, label=y_train_train)
valid_set = lgb.Dataset(X_valid, label=y_valid, reference=train_set)
# 训练模型
model = lgb.train(
params,
train_set,
num_boost_round=1500, # 增加迭代轮数
valid_sets=[valid_set],
callbacks=[
lgb.early_stopping(stopping_rounds=100, verbose=True),
lgb.log_evaluation(period=100)
]
)
return model
def robust_scaling(X_train, X_valid):
"""使用健壮缩放器处理数据"""
# 使用RobustScaler处理特征
scaler = RobustScaler(quantile_range=(5, 95)) # 忽略极端值
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
# 保存缩放器
dump(scaler, 'robust_scaler.pkl')
return X_train_scaled, X_valid_scaled
def main():
# 参数设置
FILE_NAME = 'B_new2.csv' # 原始数据文件
MODEL_PATH = 'lightgbm_model.pkl' # 模型保存路径
PLOT_PATH = 'Prediction_advanced.png' # 可视化图像路径
# 读取数据
print("读取数据...")
try:
df = pd.read_csv(FILE_NAME)
print(f"成功读取数据,共 {len(df)} 行")
except Exception as e:
print(f"读取数据失败: {e}")
return
# 特征工程
print("高级特征工程...")
try:
X, y = advanced_feature_engineering(df)
except Exception as e:
print(f"特征工程失败: {e}")
return
# 特征选择
print("特征选择...")
try:
X = feature_selection(X, y, n_features=30)
except Exception as e:
print(f"特征选择失败: {e}")
return
# 划分训练集和验证集
print("划分数据集...")
VALID_START = int(0.9 * len(X))
X_train = X.iloc[:VALID_START]
X_valid = X.iloc[VALID_START:]
y_train = y.iloc[:VALID_START]
y_valid = y.iloc[VALID_START:]
# 将y_train和y_valid转换为NumPy数组
y_train = y_train.values
y_valid = y_valid.values
# 数据缩放 (使用健壮缩放器)
print("数据缩放处理...")
try:
X_train_scaled, X_valid_scaled = robust_scaling(X_train, X_valid)
except Exception as e:
print(f"数据缩放失败: {e}")
return
# 训练模型
print("训练LightGBM模型...")
try:
model = train_lightgbm(X_train_scaled, y_train)
except Exception as e:
print(f"模型训练失败: {e}")
return
# 保存模型
dump(model, MODEL_PATH)
print(f"模型已保存至:{MODEL_PATH}")
# 模型预测
print("模型预测...")
y_valid_pred_scaled = model.predict(X_valid_scaled)
# 加载变换参数
transform_params = load('yeojohnson_params.joblib')
lambda_val = transform_params['lambda']
shift = transform_params['shift']
# Yeo-Johnson逆变换函数
def inverse_yeojohnson(y, lmbda, shift=0):
if lmbda == 0 and y >= 0:
return np.exp(y) - shift
elif lmbda != 0 and y >= 0:
return (y * lmbda + 1) ** (1 / lmbda) - shift
elif lmbda != 2 and y < 0:
return 1 - (-(2 - lmbda) * y + 1) ** (1 / (2 - lmbda)) - shift
else: # lmbda == 2 and y < 0
return -np.exp(-y) + 1 - shift
# 向量化逆变换函数
inverse_yeojohnson_vec = np.vectorize(inverse_yeojohnson)
# 逆变换预测值和真实值
y_valid_pred = inverse_yeojohnson_vec(y_valid_pred_scaled, lambda_val, shift)
y_valid_true = inverse_yeojohnson_vec(y_valid, lambda_val, shift)
# 计算评估指标
rmse = np.sqrt(mean_squared_error(y_valid_true, y_valid_pred))
mae = mean_absolute_error(y_valid_true, y_valid_pred)
r2 = r2_score(y_valid_true, y_valid_pred)
print(f"验证集评估结果: RMSE = {rmse:.4f}, MAE = {mae:.4f}, R² = {r2:.4f}")
# 可视化结果
print("可视化结果...")
plt.figure(figsize=(25, 6))
plt.plot(y_valid_pred, label='预测值', color='red', linewidth=1.5)
plt.plot(y_valid_true, label='真实值', color='green', alpha=0.8)
plt.title(f'预测 vs 实际功率 (RMSE={rmse:.4f}, R2 ={r2:.4f})', fontsize=16)
plt.xlabel('时间点', fontsize=12)
plt.ylabel('功率 (MW)', fontsize=12)
plt.grid(linestyle='--', alpha=0.7)
plt.legend(fontsize=12)
plt.savefig(PLOT_PATH, dpi=300, bbox_inches='tight')
plt.show()
# 绘制散点图
plt.figure(figsize=(7, 6))
plt.scatter(y_valid_true, y_valid_pred, s=40, alpha=0.6, edgecolor='blue', facecolor='none')
min_val = min(y_valid_true.min(), y_valid_pred.min())
max_val = max(y_valid_true.max(), y_valid_pred.max())
plt.plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2, label='理想线: y=x')
plt.text(0.05, 0.95, f'R2 = {r2:.4f}\nRMSE = {rmse:.4f}',
transform=plt.gca().transAxes, fontsize=12,
verticalalignment='top', bbox=dict(boxstyle='round', alpha=0.2))
plt.grid(linestyle=":", alpha=0.7)
plt.xlabel("实际功率 (MW)", fontsize=12)
plt.ylabel("预测功率 (MW)", fontsize=12)
plt.title("实测功率 vs 预测功率", fontsize=14)
plt.legend()
plt.tight_layout()
plt.savefig('Scatter_plot.png', dpi=300)
plt.show()
print("程序执行完成")
if __name__ == "__main__":
main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2025-08-11 10:38
# @Author : atom
# 新数据检验
import numpy as np
import pandas as pd
import lightgbm as lgb
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
from joblib import load
from matplotlib import rcParams
import warnings
# 忽略警告
warnings.filterwarnings("ignore")
# 字体显示中文
rcParams['font.family'] = 'SimHei'
rcParams['axes.unicode_minus'] = False
# 导入文件
FILE_NAME2 = 'PR_data_B1-724.csv'
dfall2 = pd.read_csv(FILE_NAME2, parse_dates=['date'])
print(f"成功读取预测数据: {len(dfall2)}行")
# 选择时间段
try:
day_date = dfall2.set_index('date').loc['2025-07-22 00:00':'2025-08-01 23:45'].reset_index()
print(f"时间段数据: {len(day_date)}行")
except KeyError:
day_date = dfall2[(dfall2['date'] >= '2025-07-22') & (dfall2['date'] <= '2025-08-01')]
print(f"使用替代方法选择时间段数据: {len(day_date)}行")
# 1. 应用相同的特征工程
def apply_feature_engineering(df):
"""
应用与训练相同的特征工程
"""
# 确保日期格式正确
df['date'] = pd.to_datetime(df['date'])
# 时间特征
df['hour'] = df['date'].dt.hour
df['dayofweek'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['dayofyear'] = df['date'].dt.dayofyear
df['quarter'] = df['date'].dt.quarter
# 周期性时间特征
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
# 风向特征
if 'wd100m' in df.columns:
df['wd100m_sin'] = np.sin(np.radians(df['wd100m']))
df['wd100m_cos'] = np.cos(np.radians(df['wd100m']))
# 风速特征
if 'ws100m' in df.columns:
df['ws100m**2'] = df['ws100m'].clip(0, 30) ** 2
df['ws100m**3'] = df['ws100m'].clip(0, 30) ** 3
df['ws100m_sqrt'] = np.sqrt(df['ws100m'].clip(0, 30))
# 风速组合特征
epsilon = 1e-6
if 'ws100m' in df.columns and 'ws10m' in df.columns:
df['ws_diff'] = df['ws100m'].clip(0, 30) - df['ws10m'].clip(0, 30)
df['ws_ratio'] = df['ws100m'].clip(0, 30) / (df['ws10m'].clip(0.1, 30) + epsilon)
# 气象交互特征
if 'ws100m' in df.columns:
if 't2m' in df.columns:
df['wind_temp'] = df['ws100m'].clip(0, 30) * df['t2m']
if 'rh' in df.columns:
df['wind_humidity'] = df['ws100m'].clip(0, 30) * df['rh']
if 'sp' in df.columns:
df['wind_pressure'] = df['ws100m'].clip(0, 30) * df['sp']
# 大气密度特征
if 'sp' in df.columns and 't2m' in df.columns:
df['air_density'] = df['sp'] * 100 / (287.05 * (df['t2m'] + 273.15))
if 'ws100m' in df.columns:
safe_ws = df['ws100m'].clip(0.1, 30)
df['theoretical_power'] = 0.5 * df['air_density'] * (safe_ws ** 3) * 0.3
return df
print("应用特征工程...")
df_engineered = apply_feature_engineering(day_date.copy())
# 2. 选择与训练相同的特征
try:
# 尝试加载特征名称(需要在训练时保存)
selected_features = load('selected_features.joblib')
print(f"加载了 {len(selected_features)} 个特征")
except:
# 如果未保存特征名称,手动列出训练中使用的主要特征
selected_features = [
'ws100m', 't2m', 'theoretical_power', 'air_density',
'hour_sin', 'hour_cos', 'ws100m**2', 'ws100m**3',
'wind_temp', 'sp', 'rh', 'month_sin', 'month_cos',
'dayofyear', 'hour', 'ws_diff', 'wind_humidity'
]
print("使用预设特征列表")
# 确保只保留存在的特征
available_features = [f for f in selected_features if f in df_engineered.columns]
X_new = df_engineered[available_features]
print(f"最终使用的特征数: {len(available_features)}")
# 3. 应用相同的特征缩放
print("加载和应用特征缩放器...")
try:
scaler = load('robust_scaler.pkl')
X_new_scaled = scaler.transform(X_new)
print("特征缩放完成")
except Exception as e:
print(f"特征缩放失败: {e}")
# 如果没有缩放器,使用原始数据
X_new_scaled = X_new.values
print("使用未缩放特征")
# 4. 正确加载LightGBM模型并进行预测
print("加载模型和预测...")
try:
# 正确加载LightGBM模型
model = lgb.Booster(model_file='lightgbm_model.pkl') # 使用保存的文本格式模型
# 预测 - 注意:LightGBM可以直接使用numpy数组
y_pred_scaled = model.predict(X_new_scaled)
print("预测完成")
# 5. 应用逆变换
print("应用逆变换...")
try:
# 加载变换参数
transform_params = load('yeojohnson_params.joblib')
lambda_val = transform_params['lambda']
shift = transform_params['shift']
# Yeo-Johnson逆变换函数
def inverse_yeojohnson(y, lmbda, shift=0):
if lmbda == 0 and y >= 0:
return np.exp(y) - shift
elif lmbda != 0 and y >= 0:
return (y * lmbda + 1) ** (1 / lmbda) - shift
elif lmbda != 2 and y < 0:
return 1 - (-(2 - lmbda) * y + 1) ** (1 / (2 - lmbda)) - shift
else:
return -np.exp(-y) + 1 - shift
# 向量化逆变换
inverse_yeojohnson_vec = np.vectorize(inverse_yeojohnson)
y_pred = inverse_yeojohnson_vec(y_pred_scaled, lambda_val, shift)
print("逆变换完成")
# 6. 保存结果
result_df = pd.DataFrame({
'date': df_engineered['date'],
'predicted_power': y_pred
})
result_file = "power_predictions_all1.csv"
result_df.to_csv(result_file, index=False, encoding='utf-8-sig')
print(f"结果已保存至: {result_file}")
# 7. 可视化
plt.figure(figsize=(30, 4))
plt.plot(result_df['date'], result_df['predicted_power'])
plt.title('风电功率预测结果')
plt.xlabel('时间')
plt.ylabel('功率 (MW)')
plt.grid(True)
plt.savefig('prediction_plot.png', dpi=300)
plt.show()
except Exception as e:
print(f"逆变换失败: {e}")
except Exception as e:
print(f"模型加载或预测失败: {e}")
# 备用方案:使用简单模型(如果需要)
# ...
模型加载失败
最新发布