df

#include <iostream>

using namespace std;

#include <fstream>

#include <cstdlib>

int main()

{

char b=EOF;

int a[1]={'/0'};

int buffer[10]={1,2,3,4,5,6,7,8,9,0};

fstream file("bfile.dat",ios::out|ios::binary);

cout << "首先向文件种写数据。。。" << endl ;

file.write((char *)buffer,sizeof(buffer));

file.close();

cout << "写入数据成功" << endl;

file.open("bfile.dat",ios::in);

if (file.fail())

{

cout << "fail!!" << endl;

exit(0);

}

cout << "打开文件数据" << endl;

file.read((char *)buffer,sizeof(buffer));

for (int count=0;count <10;count++)

{

cout << buffer[count] << " ";

}

file.close();

return 0;

}

import pandas as pd import numpy as np from sklearn.impute import KNNImputer import matplotlib.pyplot as plt import matplotlib as mpl import warnings import os import traceback # 设置中文字体支持 try: # 尝试使用系统支持的中文字体 plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'WenQuanYi Micro Hei'] plt.rcParams['axes.unicode_minus'] = False # 解决负号显示问题 except: print("警告:无法设置中文字体,图表中的中文可能显示为方块") # 文件路径配置 file_paths = { "price": r"C:\Users\1\Desktop\电价_实时.xlsx", "wind": r"C:\Users\1\Desktop\风能_预处理后数据.xlsx", "pv": r"C:\Users\1\Desktop\光伏_预处理后数据.xlsx", "load": r"C:\Users\1\Desktop\高压侧总负荷_预处理后_20251205_132714.xlsx", "weather": r"C:\Users\1\Desktop\processed_气象数据.xlsx" } # 1. 增强型数据加载函数(添加时间范围筛选) def load_and_process_data(file_paths): """加载所有数据并处理列名不一致问题,并筛选时间范围""" # 定义各文件的实际列名映射 column_mapping = { "price": {"time_col": "时间", "value_col": "电价"}, # 电价文件列名 "wind": {"time_col": "DATA_TIME", "value_col": "风电功率"}, "pv": {"time_col": "DATA_TIME", "value_col": "光伏功率"}, "load": {"time_col": "DATA_TIME", "value_col": "负荷值"}, "weather": {"time_col": "时间", "value_cols": ["温度", "湿度", "风速", "天气状况"]} } datasets = {} start_date = pd.Timestamp('2025-01-27 00:00:00') end_date = pd.Timestamp('2025-04-14 00:00:00') # 加载电价数据 price_df = pd.read_excel(file_paths['price']) # 检查列名并重命名 if column_mapping["price"]["time_col"] in price_df.columns: price_df = price_df.rename(columns={ column_mapping["price"]["time_col"]: "DATA_TIME", column_mapping["price"]["value_col"]: "price_value" }) price_df['DATA_TIME'] = pd.to_datetime(price_df['DATA_TIME']) datasets['price'] = price_df.set_index('DATA_TIME') # 加载风电数据 wind_df = pd.read_excel(file_paths['wind']) if column_mapping["wind"]["time_col"] in wind_df.columns: wind_df = wind_df.rename(columns={ column_mapping["wind"]["time_col"]: "DATA_TIME", column_mapping["wind"]["value_col"]: "power_generated" }) wind_df['DATA_TIME'] = pd.to_datetime(wind_df['DATA_TIME']) datasets['wind'] = wind_df.set_index('DATA_TIME') # 加载光伏数据 pv_df = pd.read_excel(file_paths['pv']) if column_mapping["pv"]["time_col"] in pv_df.columns: pv_df = pv_df.rename(columns={ column_mapping["pv"]["time_col"]: "DATA_TIME", column_mapping["pv"]["value_col"]: "power_generated" }) pv_df['DATA_TIME'] = pd.to_datetime(pv_df['DATA_TIME']) datasets['pv'] = pv_df.set_index('DATA_TIME') # 加载负荷数据 load_df = pd.read_excel(file_paths['load']) if column_mapping["load"]["time_col"] in load_df.columns: load_df = load_df.rename(columns={ column_mapping["load"]["time_col"]: "DATA_TIME", column_mapping["load"]["value_col"]: "load_value" }) load_df['DATA_TIME'] = pd.to_datetime(load_df['DATA_TIME']) datasets['load'] = load_df.set_index('DATA_TIME') # 加载气象数据 weather_df = pd.read_excel(file_paths['weather']) if column_mapping["weather"]["time_col"] in weather_df.columns: rename_dict = {column_mapping["weather"]["time_col"]: "DATA_TIME"} # 动态映射数值列 num_cols = [] for col in weather_df.columns: if col != column_mapping["weather"]["time_col"] and pd.api.types.is_numeric_dtype(weather_df[col]): num_cols.append(col) # 重命名数值列 for i, col in enumerate(num_cols): rename_dict[col] = f"num_{i}" # 处理分类列 cat_cols = [col for col in weather_df.columns if col not in num_cols and col != column_mapping["weather"]["time_col"]] for i, col in enumerate(cat_cols): rename_dict[col] = f"cat_{i}" weather_df = weather_df.rename(columns=rename_dict) weather_df['DATA_TIME'] = pd.to_datetime(weather_df['DATA_TIME']) datasets['weather'] = weather_df.set_index('DATA_TIME') # 筛选指定时间范围:2025-01-27 00:00:00 到 2025-04-14 00:00:00 for name, df in datasets.items(): # 确保索引是DateTime类型 if not isinstance(df.index, pd.DatetimeIndex): df.index = pd.to_datetime(df.index) # 筛选日期范围 [start_date, end_date] mask = (df.index >= start_date) & (df.index <= end_date) datasets[name] = df.loc[mask] # 检查数据是否为空 if len(datasets[name]) == 0: print(f"警告: {name}数据集在指定时间范围内没有数据!") else: print(f"{name}数据集: {len(datasets[name])}条记录, 时间范围: {datasets[name].index.min()} 至 {datasets[name].index.max()}") return datasets # 2. 时间对齐到1小时频率 def align_to_hourly_frequency(datasets): """将所有数据统一到1小时频率""" aligned_datasets = {} print("=== 数据重采样到小时频率 ===") # 确定统一的时间范围 (2025-01-27 00:00:00 到 2025-04-14 00:00:00) start_date = pd.Timestamp('2025-01-27 00:00:00') end_date = pd.Timestamp('2025-04-14 00:00:00') full_range = pd.date_range(start=start_date, end=end_date, freq='h') # 对每个数据集进行重采样 for name, df in datasets.items(): # 跳过空数据集 if len(df) == 0: aligned_datasets[name] = pd.DataFrame(index=full_range) print(f"{name}: 空数据集 → 对齐后 {len(full_range)}") continue # 数值型数据取平均值 if name in ['price', 'wind', 'pv', 'load']: # 获取第一个数值列 value_col = df.columns[0] resampled = df[[value_col]].resample('h').mean() # 气象数据特殊处理 elif name == 'weather': # 区分数值列和分类列 num_cols = [col for col in df.columns if col.startswith('num_')] cat_cols = [col for col in df.columns if col.startswith('cat_')] avg_df = df[num_cols].resample('h').mean() if num_cols else pd.DataFrame() mode_df = pd.DataFrame() if cat_cols: mode_df = df[cat_cols].resample('h').apply(lambda x: x.mode()[0] if not x.empty else np.nan) resampled = pd.concat([avg_df, mode_df], axis=1) else: resampled = df.resample('h').mean() # 重新索引到完整时间范围 aligned = resampled.reindex(full_range) aligned_datasets[name] = aligned print(f"{name}: 原始点 {len(df)} → 对齐后 {len(aligned)}") return aligned_datasets # 3. 缺失值处理 def handle_missing_data(aligned_datasets): """处理各数据集的缺失值""" print("\n=== 缺失值处理 ===") # 创建缺失值报告函数 def report_missing(df, name): if df.empty: print(f"{name}数据集为空,跳过缺失值处理") return 0 total = df.size missing = df.isnull().sum().sum() print(f"{name}缺失值: {missing}/{total} ({missing/total:.1%})") return missing # 处理每个数据集 for name, df in aligned_datasets.items(): original_missing = report_missing(df, name) # 如果数据集为空,跳过处理 if df.empty: continue # 不同数据集使用不同填充策略 if name == 'price': # 电价:时间序列插值 df = df.interpolate(method='time') elif name == 'weather': # 数值列:KNN填充 num_cols = [col for col in df.columns if col.startswith('num_')] if num_cols: imputer = KNNImputer(n_neighbors=6) df[num_cols] = imputer.fit_transform(df[num_cols]) # 分类列用众数填充 cat_cols = [col for col in df.columns if col.startswith('cat_')] for col in cat_cols: if df[col].isnull().any(): mode_val = df[col].mode()[0] if not df[col].mode().empty else "未知" df[col] = df[col].fillna(mode_val) else: # 其他数据:前后填充 df = df.ffill().bfill() # 最终检查 final_missing = df.isnull().sum().sum() print(f"处理后缺失值: {final_missing}/{df.size} ({final_missing/df.size:.1%})") aligned_datasets[name] = df return aligned_datasets # 4. 数据合并与特征工程 def merge_and_engineer(aligned_datasets): """合并所有数据集并创建特征""" print("\n=== 数据合并与特征工程 ===") # 创建基础数据框 (使用统一的时间范围) start_date = pd.Timestamp('2025-01-27 00:00:00') end_date = pd.Timestamp('2025-04-14 00:00:00') full_index = pd.date_range(start=start_date, end=end_date, freq='h') combined_df = pd.DataFrame(index=full_index) # 添加各数据源(使用列名映射) for name in ['price', 'wind', 'pv', 'load']: if not aligned_datasets[name].empty: combined_df[name if name != 'price' else 'price'] = aligned_datasets[name].iloc[:, 0] # 添加气象数据 weather = aligned_datasets['weather'] if not weather.empty: # 添加数值列 num_cols = [col for col in weather.columns if col.startswith('num_')] for col in num_cols: combined_df[col] = weather[col] # 添加分类列 cat_cols = [col for col in weather.columns if col.startswith('cat_')] for col in cat_cols: combined_df[col] = weather[col] # 特征工程 combined_df['hour'] = combined_df.index.hour combined_df['day_of_week'] = combined_df.index.dayofweek combined_df['is_weekend'] = combined_df['day_of_week'].isin([5, 6]).astype(int) # 创建滞后特征 for lag in [1, 2, 3, 24]: # 1小时、2小时、3小时、24小时滞后 if 'price' in combined_df.columns: combined_df[f'price_lag_{lag}'] = combined_df['price'].shift(lag) # 添加新能源占比(修复分母为0的问题) if 'wind_power' in combined_df.columns and 'pv_power' in combined_df.columns and 'load' in combined_df.columns: # 确保负荷不为0 combined_df['load'] = combined_df['load'].replace(0, np.nan).ffill().bfill() # 计算可再生能源比例 combined_df['renewable_ratio'] = (combined_df['wind_power'] + combined_df['pv_power']) / combined_df['load'] # 处理无穷大值 combined_df['renewable_ratio'] = combined_df['renewable_ratio'].replace([np.inf, -np.inf], np.nan).ffill().bfill() # 最终缺失值处理 combined_df = combined_df.ffill().bfill() print(f"合并后数据集形状: {combined_df.shape}") print(f"时间范围: {combined_df.index.min()} 至 {combined_df.index.max()}") return combined_df # 5. 修复可视化错误 def analyze_and_visualize(combined_df): """执行基础分析并可视化""" print("\n=== 数据分析与可视化 ===") if combined_df.empty: print("警告: 合并后的数据集为空,无法进行分析和可视化!") return # 基础统计 print("\n数据集统计信息:") print(combined_df.describe()) # 相关性分析(排除非数值列) numeric_cols = combined_df.select_dtypes(include=[np.number]).columns if 'price' in numeric_cols: corr_matrix = combined_df[numeric_cols].corr() price_corr = corr_matrix['price'].sort_values(ascending=False) print("\n与电价相关性最高的因素:") print(price_corr.head(10)) # 可视化 plt.figure(figsize=(15, 12)) # 电价时间序列 if 'price' in combined_df.columns: plt.subplot(3, 2, 1) combined_df['price'].plot(title='Electricity Price Trend') plt.ylabel('Price') else: plt.subplot(3, 2, 1) plt.text(0.5, 0.5, '无电价数据', ha='center', va='center') plt.title('Electricity Price Trend') # 新能源功率时间序列 plt.subplot(3, 2, 2) if 'wind_power' in combined_df.columns: combined_df['wind_power'].plot(label='Wind Power', alpha=0.7) if 'pv_power' in combined_df.columns: combined_df['pv_power'].plot(label='PV Power', alpha=0.7) plt.title('Renewable Energy Generation') plt.ylabel('Power (MW)') plt.legend() # 负荷与电价关系 plt.subplot(3, 2, 3) if 'load' in combined_df.columns and 'price' in combined_df.columns: plt.scatter(combined_df['load'], combined_df['price'], alpha=0.5) plt.title('Load vs Price') plt.xlabel('Load') plt.ylabel('Price') else: plt.text(0.5, 0.5, '缺少负荷或电价数据', ha='center', va='center') plt.title('Load vs Price') # 小时平均电价 plt.subplot(3, 2, 4) if 'price' in combined_df.columns and 'hour' in combined_df.columns: hour_avg = combined_df.groupby('hour')['price'].mean() hour_avg.plot(kind='bar') plt.title('Average Price by Hour') plt.xlabel('Hour of Day') plt.ylabel('Average Price') else: plt.text(0.5, 0.5, '缺少电价或小时数据', ha='center', va='center') plt.title('Average Price by Hour') # 可再生能源比例与电价 plt.subplot(3, 2, 5) if 'renewable_ratio' in combined_df.columns and 'price' in combined_df.columns: plt.scatter(combined_df['renewable_ratio'], combined_df['price'], alpha=0.5) plt.title('Renewable Ratio vs Price') plt.xlabel('(Wind + PV) / Load') plt.ylabel('Price') else: plt.text(0.5, 0.5, '缺少可再生能源比例或电价数据', ha='center', va='center') plt.title('Renewable Ratio vs Price') # 温度与电价关系 plt.subplot(3, 2, 6) if 'num_0' in combined_df.columns and 'price' in combined_df.columns: # 假设第一个数值列是温度 plt.scatter(combined_df['num_0'], combined_df['price'], alpha=0.5) plt.title('Temperature vs Price') plt.xlabel('Temperature (°C)') plt.ylabel('Price') elif 'price' in combined_df.columns and 'day_of_week' in combined_df.columns: # 如果没有温度数据,绘制星期几对电价的影响 weekday_avg = combined_df.groupby('day_of_week')['price'].mean完整代码
最新发布
12-06
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值