import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import matplotlib as mpl
import warnings
import os
import traceback
# 设置中文字体支持
try:
# 尝试使用系统支持的中文字体
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'WenQuanYi Micro Hei']
plt.rcParams['axes.unicode_minus'] = False # 解决负号显示问题
except:
print("警告:无法设置中文字体,图表中的中文可能显示为方块")
# 文件路径配置
file_paths = {
"price": r"C:\Users\1\Desktop\电价_实时.xlsx",
"wind": r"C:\Users\1\Desktop\风能_预处理后数据.xlsx",
"pv": r"C:\Users\1\Desktop\光伏_预处理后数据.xlsx",
"load": r"C:\Users\1\Desktop\高压侧总负荷_预处理后_20251205_132714.xlsx",
"weather": r"C:\Users\1\Desktop\processed_气象数据.xlsx"
}
# 1. 增强型数据加载函数(添加时间范围筛选)
def load_and_process_data(file_paths):
"""加载所有数据并处理列名不一致问题,并筛选时间范围"""
# 定义各文件的实际列名映射
column_mapping = {
"price": {"time_col": "时间", "value_col": "电价"}, # 电价文件列名
"wind": {"time_col": "DATA_TIME", "value_col": "风电功率"},
"pv": {"time_col": "DATA_TIME", "value_col": "光伏功率"},
"load": {"time_col": "DATA_TIME", "value_col": "负荷值"},
"weather": {"time_col": "时间", "value_cols": ["温度", "湿度", "风速", "天气状况"]}
}
datasets = {}
start_date = pd.Timestamp('2025-01-27 00:00:00')
end_date = pd.Timestamp('2025-04-14 00:00:00')
# 加载电价数据
price_df = pd.read_excel(file_paths['price'])
# 检查列名并重命名
if column_mapping["price"]["time_col"] in price_df.columns:
price_df = price_df.rename(columns={
column_mapping["price"]["time_col"]: "DATA_TIME",
column_mapping["price"]["value_col"]: "price_value"
})
price_df['DATA_TIME'] = pd.to_datetime(price_df['DATA_TIME'])
datasets['price'] = price_df.set_index('DATA_TIME')
# 加载风电数据
wind_df = pd.read_excel(file_paths['wind'])
if column_mapping["wind"]["time_col"] in wind_df.columns:
wind_df = wind_df.rename(columns={
column_mapping["wind"]["time_col"]: "DATA_TIME",
column_mapping["wind"]["value_col"]: "power_generated"
})
wind_df['DATA_TIME'] = pd.to_datetime(wind_df['DATA_TIME'])
datasets['wind'] = wind_df.set_index('DATA_TIME')
# 加载光伏数据
pv_df = pd.read_excel(file_paths['pv'])
if column_mapping["pv"]["time_col"] in pv_df.columns:
pv_df = pv_df.rename(columns={
column_mapping["pv"]["time_col"]: "DATA_TIME",
column_mapping["pv"]["value_col"]: "power_generated"
})
pv_df['DATA_TIME'] = pd.to_datetime(pv_df['DATA_TIME'])
datasets['pv'] = pv_df.set_index('DATA_TIME')
# 加载负荷数据
load_df = pd.read_excel(file_paths['load'])
if column_mapping["load"]["time_col"] in load_df.columns:
load_df = load_df.rename(columns={
column_mapping["load"]["time_col"]: "DATA_TIME",
column_mapping["load"]["value_col"]: "load_value"
})
load_df['DATA_TIME'] = pd.to_datetime(load_df['DATA_TIME'])
datasets['load'] = load_df.set_index('DATA_TIME')
# 加载气象数据
weather_df = pd.read_excel(file_paths['weather'])
if column_mapping["weather"]["time_col"] in weather_df.columns:
rename_dict = {column_mapping["weather"]["time_col"]: "DATA_TIME"}
# 动态映射数值列
num_cols = []
for col in weather_df.columns:
if col != column_mapping["weather"]["time_col"] and pd.api.types.is_numeric_dtype(weather_df[col]):
num_cols.append(col)
# 重命名数值列
for i, col in enumerate(num_cols):
rename_dict[col] = f"num_{i}"
# 处理分类列
cat_cols = [col for col in weather_df.columns if col not in num_cols and col != column_mapping["weather"]["time_col"]]
for i, col in enumerate(cat_cols):
rename_dict[col] = f"cat_{i}"
weather_df = weather_df.rename(columns=rename_dict)
weather_df['DATA_TIME'] = pd.to_datetime(weather_df['DATA_TIME'])
datasets['weather'] = weather_df.set_index('DATA_TIME')
# 筛选指定时间范围:2025-01-27 00:00:00 到 2025-04-14 00:00:00
for name, df in datasets.items():
# 确保索引是DateTime类型
if not isinstance(df.index, pd.DatetimeIndex):
df.index = pd.to_datetime(df.index)
# 筛选日期范围 [start_date, end_date]
mask = (df.index >= start_date) & (df.index <= end_date)
datasets[name] = df.loc[mask]
# 检查数据是否为空
if len(datasets[name]) == 0:
print(f"警告: {name}数据集在指定时间范围内没有数据!")
else:
print(f"{name}数据集: {len(datasets[name])}条记录, 时间范围: {datasets[name].index.min()} 至 {datasets[name].index.max()}")
return datasets
# 2. 时间对齐到1小时频率
def align_to_hourly_frequency(datasets):
"""将所有数据统一到1小时频率"""
aligned_datasets = {}
print("=== 数据重采样到小时频率 ===")
# 确定统一的时间范围 (2025-01-27 00:00:00 到 2025-04-14 00:00:00)
start_date = pd.Timestamp('2025-01-27 00:00:00')
end_date = pd.Timestamp('2025-04-14 00:00:00')
full_range = pd.date_range(start=start_date, end=end_date, freq='h')
# 对每个数据集进行重采样
for name, df in datasets.items():
# 跳过空数据集
if len(df) == 0:
aligned_datasets[name] = pd.DataFrame(index=full_range)
print(f"{name}: 空数据集 → 对齐后 {len(full_range)}")
continue
# 数值型数据取平均值
if name in ['price', 'wind', 'pv', 'load']:
# 获取第一个数值列
value_col = df.columns[0]
resampled = df[[value_col]].resample('h').mean()
# 气象数据特殊处理
elif name == 'weather':
# 区分数值列和分类列
num_cols = [col for col in df.columns if col.startswith('num_')]
cat_cols = [col for col in df.columns if col.startswith('cat_')]
avg_df = df[num_cols].resample('h').mean() if num_cols else pd.DataFrame()
mode_df = pd.DataFrame()
if cat_cols:
mode_df = df[cat_cols].resample('h').apply(lambda x: x.mode()[0] if not x.empty else np.nan)
resampled = pd.concat([avg_df, mode_df], axis=1)
else:
resampled = df.resample('h').mean()
# 重新索引到完整时间范围
aligned = resampled.reindex(full_range)
aligned_datasets[name] = aligned
print(f"{name}: 原始点 {len(df)} → 对齐后 {len(aligned)}")
return aligned_datasets
# 3. 缺失值处理
def handle_missing_data(aligned_datasets):
"""处理各数据集的缺失值"""
print("\n=== 缺失值处理 ===")
# 创建缺失值报告函数
def report_missing(df, name):
if df.empty:
print(f"{name}数据集为空,跳过缺失值处理")
return 0
total = df.size
missing = df.isnull().sum().sum()
print(f"{name}缺失值: {missing}/{total} ({missing/total:.1%})")
return missing
# 处理每个数据集
for name, df in aligned_datasets.items():
original_missing = report_missing(df, name)
# 如果数据集为空,跳过处理
if df.empty:
continue
# 不同数据集使用不同填充策略
if name == 'price':
# 电价:时间序列插值
df = df.interpolate(method='time')
elif name == 'weather':
# 数值列:KNN填充
num_cols = [col for col in df.columns if col.startswith('num_')]
if num_cols:
imputer = KNNImputer(n_neighbors=6)
df[num_cols] = imputer.fit_transform(df[num_cols])
# 分类列用众数填充
cat_cols = [col for col in df.columns if col.startswith('cat_')]
for col in cat_cols:
if df[col].isnull().any():
mode_val = df[col].mode()[0] if not df[col].mode().empty else "未知"
df[col] = df[col].fillna(mode_val)
else: # 其他数据:前后填充
df = df.ffill().bfill()
# 最终检查
final_missing = df.isnull().sum().sum()
print(f"处理后缺失值: {final_missing}/{df.size} ({final_missing/df.size:.1%})")
aligned_datasets[name] = df
return aligned_datasets
# 4. 数据合并与特征工程
def merge_and_engineer(aligned_datasets):
"""合并所有数据集并创建特征"""
print("\n=== 数据合并与特征工程 ===")
# 创建基础数据框 (使用统一的时间范围)
start_date = pd.Timestamp('2025-01-27 00:00:00')
end_date = pd.Timestamp('2025-04-14 00:00:00')
full_index = pd.date_range(start=start_date, end=end_date, freq='h')
combined_df = pd.DataFrame(index=full_index)
# 添加各数据源(使用列名映射)
for name in ['price', 'wind', 'pv', 'load']:
if not aligned_datasets[name].empty:
combined_df[name if name != 'price' else 'price'] = aligned_datasets[name].iloc[:, 0]
# 添加气象数据
weather = aligned_datasets['weather']
if not weather.empty:
# 添加数值列
num_cols = [col for col in weather.columns if col.startswith('num_')]
for col in num_cols:
combined_df[col] = weather[col]
# 添加分类列
cat_cols = [col for col in weather.columns if col.startswith('cat_')]
for col in cat_cols:
combined_df[col] = weather[col]
# 特征工程
combined_df['hour'] = combined_df.index.hour
combined_df['day_of_week'] = combined_df.index.dayofweek
combined_df['is_weekend'] = combined_df['day_of_week'].isin([5, 6]).astype(int)
# 创建滞后特征
for lag in [1, 2, 3, 24]: # 1小时、2小时、3小时、24小时滞后
if 'price' in combined_df.columns:
combined_df[f'price_lag_{lag}'] = combined_df['price'].shift(lag)
# 添加新能源占比(修复分母为0的问题)
if 'wind_power' in combined_df.columns and 'pv_power' in combined_df.columns and 'load' in combined_df.columns:
# 确保负荷不为0
combined_df['load'] = combined_df['load'].replace(0, np.nan).ffill().bfill()
# 计算可再生能源比例
combined_df['renewable_ratio'] = (combined_df['wind_power'] + combined_df['pv_power']) / combined_df['load']
# 处理无穷大值
combined_df['renewable_ratio'] = combined_df['renewable_ratio'].replace([np.inf, -np.inf], np.nan).ffill().bfill()
# 最终缺失值处理
combined_df = combined_df.ffill().bfill()
print(f"合并后数据集形状: {combined_df.shape}")
print(f"时间范围: {combined_df.index.min()} 至 {combined_df.index.max()}")
return combined_df
# 5. 修复可视化错误
def analyze_and_visualize(combined_df):
"""执行基础分析并可视化"""
print("\n=== 数据分析与可视化 ===")
if combined_df.empty:
print("警告: 合并后的数据集为空,无法进行分析和可视化!")
return
# 基础统计
print("\n数据集统计信息:")
print(combined_df.describe())
# 相关性分析(排除非数值列)
numeric_cols = combined_df.select_dtypes(include=[np.number]).columns
if 'price' in numeric_cols:
corr_matrix = combined_df[numeric_cols].corr()
price_corr = corr_matrix['price'].sort_values(ascending=False)
print("\n与电价相关性最高的因素:")
print(price_corr.head(10))
# 可视化
plt.figure(figsize=(15, 12))
# 电价时间序列
if 'price' in combined_df.columns:
plt.subplot(3, 2, 1)
combined_df['price'].plot(title='Electricity Price Trend')
plt.ylabel('Price')
else:
plt.subplot(3, 2, 1)
plt.text(0.5, 0.5, '无电价数据', ha='center', va='center')
plt.title('Electricity Price Trend')
# 新能源功率时间序列
plt.subplot(3, 2, 2)
if 'wind_power' in combined_df.columns:
combined_df['wind_power'].plot(label='Wind Power', alpha=0.7)
if 'pv_power' in combined_df.columns:
combined_df['pv_power'].plot(label='PV Power', alpha=0.7)
plt.title('Renewable Energy Generation')
plt.ylabel('Power (MW)')
plt.legend()
# 负荷与电价关系
plt.subplot(3, 2, 3)
if 'load' in combined_df.columns and 'price' in combined_df.columns:
plt.scatter(combined_df['load'], combined_df['price'], alpha=0.5)
plt.title('Load vs Price')
plt.xlabel('Load')
plt.ylabel('Price')
else:
plt.text(0.5, 0.5, '缺少负荷或电价数据', ha='center', va='center')
plt.title('Load vs Price')
# 小时平均电价
plt.subplot(3, 2, 4)
if 'price' in combined_df.columns and 'hour' in combined_df.columns:
hour_avg = combined_df.groupby('hour')['price'].mean()
hour_avg.plot(kind='bar')
plt.title('Average Price by Hour')
plt.xlabel('Hour of Day')
plt.ylabel('Average Price')
else:
plt.text(0.5, 0.5, '缺少电价或小时数据', ha='center', va='center')
plt.title('Average Price by Hour')
# 可再生能源比例与电价
plt.subplot(3, 2, 5)
if 'renewable_ratio' in combined_df.columns and 'price' in combined_df.columns:
plt.scatter(combined_df['renewable_ratio'], combined_df['price'], alpha=0.5)
plt.title('Renewable Ratio vs Price')
plt.xlabel('(Wind + PV) / Load')
plt.ylabel('Price')
else:
plt.text(0.5, 0.5, '缺少可再生能源比例或电价数据', ha='center', va='center')
plt.title('Renewable Ratio vs Price')
# 温度与电价关系
plt.subplot(3, 2, 6)
if 'num_0' in combined_df.columns and 'price' in combined_df.columns: # 假设第一个数值列是温度
plt.scatter(combined_df['num_0'], combined_df['price'], alpha=0.5)
plt.title('Temperature vs Price')
plt.xlabel('Temperature (°C)')
plt.ylabel('Price')
elif 'price' in combined_df.columns and 'day_of_week' in combined_df.columns:
# 如果没有温度数据,绘制星期几对电价的影响
weekday_avg = combined_df.groupby('day_of_week')['price'].mean完整代码
最新发布