First method: df.rolling(window, min_periods).method()

For example: df.rolling(window, min_periods).sum() / .mean() / .max()

The rolling method is very convenient to use.


The freq parameter was deprecated in version 0.18.
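
A minimal sketch of this usage (the column name and data below are made up purely for illustration):

import numpy as np
import pandas as pd

# Toy hourly series; the values are random and only serve as an illustration.
df = pd.DataFrame(
    {"A": np.random.randn(200).cumsum()},
    index=pd.date_range("2025-01-01", periods=200, freq="h"),
)

# window = number of samples per window; min_periods = how many observations
# must be present before a result is emitted (earlier positions return NaN).
roll = df["A"].rolling(window=15, min_periods=15)
print(roll.mean().head(20))  # NaN for the first 14 rows, then rolling means
print(roll.max().tail())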


Second method: pd.rolling_max(df['A'], 15)

Calling these module-level functions triggers a deprecation warning, e.g.: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with

Series.rolling(min_periods=24, window=24, center=False).sum()

In other words: use the first method.
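
The migration implied by that warning, as a minimal sketch (the 24-sample window mirrors the message; the data are placeholders):

import pandas as pd

s = pd.Series(range(100), dtype=float)  # placeholder data

# Old module-level API, removed from modern pandas:
# pd.rolling_sum(s, window=24, min_periods=24)

# Replacement suggested by the warning (center=False is the default):
result = s.rolling(window=24, min_periods=24, center=False).sum()
print(result.tail())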


""" 传感器“卡值”(stuck value) 轻量级检测脚本(稳定相位+模板Z过滤版,Python 3.8+) ------------------------------------------------ 特点: - 仅依赖 numpy / pandas / matplotlib - 同时支持: 1) 绝对卡值/重复值段(平坦段) 2) 低变化段(导数接近0) 3) 周期性数据的“相位差残差”卡值(去除周期基线后再检测) - 自动估计或手动指定主周期 - 新增:跨周期“天然稳定相位”与模板Z分数过滤,显著降低对天然平台的误报 - 返回区间级别的告警(起止索引/时间、类型、置信度) 用法示例见底部 __main__ 区域。 """ from dataclasses import dataclass from typing import List, Optional, Tuple, Dict import numpy as np import pandas as pd # ----------------------------- # 工具函数 # ----------------------------- def _rolling_std(x: np.ndarray, window: int) -> np.ndarray: if window <= 1: return np.zeros_like(x, dtype=float) s = pd.Series(x) return s.rolling(window, min_periods=window).std(ddof=0).to_numpy() def _autocorr_via_fft(x: np.ndarray) -> np.ndarray: """快速自相关(归一化),返回与 x 等长的自相关数组。""" x = np.asarray(x, dtype=float) x = x - np.nanmean(x) x[np.isnan(x)] = 0.0 n = int(1 << (len(x) * 2 - 1).bit_length()) fx = np.fft.rfft(x, n=n) acf = np.fft.irfft(fx * np.conjugate(fx), n=n)[: len(x)] acf /= np.maximum(acf[0], 1e-12) return acf def estimate_period( x: np.ndarray, min_period: int = 5, max_period: Optional[int] = None, ) -> Optional[int]: """粗略估计主周期(样本点单位)。返回最可能的周期长度,找不到返回 None。""" n = len(x) if n < 3 * min_period: return None if max_period is None: max_period = max(min(n // 3, 2000), min_period + 1) acf = _autocorr_via_fft(x) seg = acf[min_period : max_period] if len(seg) == 0: return None k = int(np.nanargmax(seg)) + min_period if acf[k] < 0.15: return None return k def _group_runs(mask: np.ndarray) -> List[Tuple[int, int]]: """将布尔序列中为 True 的连续段转为 [start, end](含 end)。""" runs: List[Tuple[int, int]] = [] i = 0 n = len(mask) while i < n: if mask[i]: j = i while j + 1 < n and mask[j + 1]: j += 1 runs.append((i, j)) i = j + 1 else: i += 1 return runs def _phase_template_sigma(x: np.ndarray, period: int): """ 计算“同相位跨周期”的模板(相位中位数)与 σ(由MAD近似)。 返回 (template, sigma),若周期样本不足(<2个周期)则返回 (None, None)。 """ m = len(x) // period if m < 2: return None, None X = x[: m * period].reshape(m, period) template = np.nanmedian(X, axis=0) mad = np.nanmedian(np.abs(X - template[None, :]), axis=0) sigma = 1.4826 * mad + 1e-12 return template, sigma # ----------------------------- # 结果数据结构 # ----------------------------- @dataclass class StuckInterval: start_idx: int end_idx: int kind: str # "flat", "low_var", "seasonal_flat" score: float # 0~1 大致置信度 value_summary: str # ----------------------------- # 主检测器 # ----------------------------- class StuckDetector: def __init__( self, min_flat_run: int = 5, value_tol: float = 0.0, deriv_window: int = 3, deriv_tol: float = 1e-6, lowvar_window: int = 15, lowvar_std_tol: float = 1e-4, seasonal_period: Optional[int] = None, seasonal_robust: bool = True, seasonal_min_run: int = 5, seasonal_value_tol: float = 0.0, # —— 稳定相位+Z过滤参数 —— stable_phase_q: float = 0.20, # σ 的分位阈值,以下视为“天然稳定相位” stable_overlap_thr: float = 0.60, # 区间与稳定相位重叠比例阈值 require_z_k: float = 3.0 # 在稳定相位里仍要报卡值的最小Z阈值 ) -> None: self.min_flat_run = min_flat_run self.value_tol = value_tol self.deriv_window = deriv_window self.deriv_tol = deriv_tol self.lowvar_window = lowvar_window self.lowvar_std_tol = lowvar_std_tol self.seasonal_period = seasonal_period self.seasonal_robust = seasonal_robust self.seasonal_min_run = seasonal_min_run self.seasonal_value_tol = seasonal_value_tol self.stable_phase_q = stable_phase_q self.stable_overlap_thr = stable_overlap_thr self.require_z_k = require_z_k # ---- 基础检测 ---- def _flat_runs(self, x: np.ndarray) -> List[StuckInterval]: eq = np.abs(np.diff(x, prepend=x[0])) <= self.value_tol 
runs = _group_runs(eq) out: List[StuckInterval] = [] for s, e in runs: if e - s + 1 >= self.min_flat_run: v = np.median(x[s : e + 1]) out.append(StuckInterval(s, e, "flat", score=0.7, value_summary="~{:.6g}".format(v))) return out def _low_variability(self, x: np.ndarray) -> List[StuckInterval]: sd = _rolling_std(x, self.lowvar_window) mask = sd <= self.lowvar_std_tol runs = _group_runs(mask) out: List[StuckInterval] = [] for s, e in runs: if e - s + 1 >= max(self.lowvar_window, self.min_flat_run): v = np.median(x[s : e + 1]) local_sd = float(np.nanmax(sd[s : e + 1])) if np.isfinite(sd[s : e + 1]).any() else 0.0 score = float(np.clip(1.0 - local_sd / (self.lowvar_std_tol + 1e-12), 0, 1)) out.append(StuckInterval(s, e, "low_var", score=score, value_summary="~{:.6g}".format(v))) return out def _derivative_flat(self, x: np.ndarray) -> List[StuckInterval]: dx = np.diff(x, prepend=x[0]) if self.deriv_window > 1: dx = pd.Series(dx).rolling(self.deriv_window, min_periods=1, center=True).mean().to_numpy() mask = np.abs(dx) <= self.deriv_tol runs = _group_runs(mask) out: List[StuckInterval] = [] for s, e in runs: if e - s + 1 >= self.min_flat_run: v = np.median(x[s : e + 1]) local_absmax = float(np.nanmax(np.abs(dx[s : e + 1]))) if np.isfinite(dx[s : e + 1]).any() else 0.0 score = float(np.clip(1.0 - local_absmax / (self.deriv_tol + 1e-12), 0, 1)) out.append(StuckInterval(s, e, "flat", score=score, value_summary="~{:.6g}".format(v))) return out # ---- 周期性处理 ---- def _seasonal_baseline(self, x: np.ndarray, period: int) -> np.ndarray: phase_vals = [x[i::period] for i in range(period)] if self.seasonal_robust: phase_stats = [np.nanmedian(v) for v in phase_vals] else: phase_stats = [np.nanmean(v) for v in phase_vals] baseline = np.empty_like(x, dtype=float) for i in range(period): baseline[i::period] = phase_stats[i] return baseline def _seasonal_flat_runs(self, x: np.ndarray, period: int) -> List[StuckInterval]: baseline = self._seasonal_baseline(x, period) resid = x - baseline template, sigma = _phase_template_sigma(x, period) use_z = template is not None and sigma is not None eps = 1e-12 eq = np.abs(np.diff(resid, prepend=resid[0])) <= self.seasonal_value_tol runs = _group_runs(eq) out: List[StuckInterval] = [] for s, e in runs: if e - s + 1 >= self.seasonal_min_run: v = np.median(x[s : e + 1]) rmad_series = pd.Series(resid).rolling(period, min_periods=period)\ .apply(lambda w: np.nanmedian(np.abs(w - np.nanmedian(w))), raw=False) # 取窗口末端的RMAD,防止 NaN rmad_val = rmad_series.iloc[e] if pd.isna(rmad_val): rmad_val = 0.0 rmad = float(rmad_val) flat_strength = 1.0 if self.seasonal_value_tol <= 0 else float( np.clip(1.0 - np.nanmax(np.abs(np.diff(resid[s : e + 1], prepend=resid[s]))) / (self.seasonal_value_tol + 1e-12), 0, 1) ) season_stability = float(np.clip(1.0 - rmad / (np.nanstd(resid) + 1e-12), 0, 1)) if np.isfinite(rmad) else 0.5 score = float(np.clip(0.5 * flat_strength + 0.5 * season_stability, 0, 1)) if use_z: idx = np.arange(s, e + 1) phase = idx % period z = np.abs(x[idx] - template[phase]) / (sigma[phase] + eps) z_med = float(np.nanmedian(z)) if np.isfinite(z).any() else 0.0 if z_med < max(2.5, self.require_z_k - 0.5): continue out.append(StuckInterval(s, e, "seasonal_flat", score=score, value_summary="~{:.6g}".format(v))) return out # ---- 稳定相位+Z过滤 ---- def _filter_by_stability_and_z( self, x: np.ndarray, intervals: List[StuckInterval], period: int, template: np.ndarray, sigma: np.ndarray, stable_mask: np.ndarray, ) -> List[StuckInterval]: keep: List[StuckInterval] = [] eps = 1e-12 n = 
len(x) for it in intervals: s, e = it.start_idx, it.end_idx s = max(0, int(s)); e = min(n - 1, int(e)) if s > e: continue overlap = float(np.mean(stable_mask[s:e+1])) if e >= s else 0.0 idx = np.arange(s, e + 1) phase = idx % period z = np.abs(x[idx] - template[phase]) / (sigma[phase] + eps) z_med = float(np.nanmedian(z)) if np.isfinite(z).any() else 0.0 if overlap >= self.stable_overlap_thr and z_med < self.require_z_k: continue keep.append(it) return keep # ---- 主入口 ---- def detect(self, series: pd.Series) -> Tuple[List[StuckInterval], Dict[str, Optional[int]]]: """ 输入:时间序列(pd.Series,索引可为时间戳或整数) 输出: - intervals: StuckInterval 列表(不重叠;若重叠会做简单合并) - meta: {"period": 周期估计} """ x = series.to_numpy(dtype=float) n = len(x) intervals: List[StuckInterval] = [] period = self.seasonal_period or estimate_period(x) template = sigma = None stable_phase = None stable_mask = np.zeros(n, dtype=bool) if period is not None and period >= 3 and period * 2 <= n: template, sigma = _phase_template_sigma(x, period) if template is not None: thr = np.nanquantile(sigma, self.stable_phase_q) stable_phase = sigma <= thr m = n // period for i in range(m): stable_mask[i * period : i * period + period] = stable_phase rem = n - m * period if rem > 0: stable_mask[m * period : m * period + rem] = stable_phase[:rem] intervals.extend(self._flat_runs(x)) intervals.extend(self._low_variability(x)) intervals.extend(self._derivative_flat(x)) if period is not None and period >= 3 and period * 2 <= n: intervals.extend(self._seasonal_flat_runs(x, period)) if (period is not None) and (template is not None) and (sigma is not None): intervals = self._filter_by_stability_and_z(x, intervals, period, template, sigma, stable_mask) intervals = self._merge_intervals(intervals) return intervals, {"period": period} @staticmethod def _merge_intervals(intervals: List[StuckInterval]) -> List[StuckInterval]: if not intervals: return [] intervals = sorted(intervals, key=lambda z: (z.start_idx, z.end_idx)) merged: List[StuckInterval] = [] cur = intervals[0] for nx in intervals[1:]: if nx.start_idx <= cur.end_idx + 1 and nx.kind == cur.kind: new_s = cur.start_idx new_e = max(cur.end_idx, nx.end_idx) score = max(cur.score, nx.score) cur = StuckInterval(new_s, new_e, cur.kind, score=score, value_summary=cur.value_summary) elif nx.start_idx <= cur.end_idx + 1 and nx.kind != cur.kind: if nx.score >= cur.score: cur = StuckInterval(cur.start_idx, max(cur.end_idx, nx.end_idx), nx.kind, nx.score, nx.value_summary) else: cur = StuckInterval(cur.start_idx, max(cur.end_idx, nx.end_idx), cur.kind, cur.score, cur.value_summary) else: merged.append(cur) cur = nx merged.append(cur) return merged # ----------------------------- # 便捷接口 # ----------------------------- def detect_stuck_segments( series: pd.Series, sampling_period: Optional[pd.Timedelta] = None, **kwargs, ) -> pd.DataFrame: """一次性运行并返回 DataFrame 结果。""" det = StuckDetector(**kwargs) intervals, meta = det.detect(series) rows: List[Dict[str, object]] = [] for it in intervals: start_time = end_time = None if isinstance(series.index, pd.DatetimeIndex): if sampling_period is None: start_time = series.index[it.start_idx] end_time = series.index[it.end_idx] else: start_time = series.index[0] + it.start_idx * sampling_period end_time = series.index[0] + it.end_idx * sampling_period rows.append( { "start_idx": it.start_idx, "end_idx": it.end_idx, "start_time": start_time, "end_time": end_time, "kind": it.kind, "score": it.score, "value_summary": it.value_summary, "length": it.end_idx - it.start_idx + 1, } ) df = 
pd.DataFrame(rows) if "period" in meta: df.attrs["estimated_period"] = meta["period"] return df # ----------------------------- # 进阶:基于“同周期模板”的卡值定位(周期内卡段) # ----------------------------- def detect_within_cycle_stuck( series: pd.Series, period: Optional[int] = None, # 支持自动估计 min_run: int = 5, value_tol: float = 0.0, run_std_tol: float = 1e-4, peer_z_k: float = 4.0, ) -> pd.DataFrame: x = series.to_numpy(dtype=float) n = len(x) # 自动估计周期 if period is None: period = estimate_period(x) if period is None or period < 3: return pd.DataFrame() m = n // period if m < 2: return pd.DataFrame() X = x[: m * period].reshape(m, period) template = np.nanmedian(X, axis=0) mad = np.nanmedian(np.abs(X - template[None, :]), axis=0) sigma = 1.4826 * mad + 1e-12 rows: List[Dict[str, object]] = [] for ci in range(m): cyc = X[ci] diff = np.abs(cyc - template) eq = np.abs(np.diff(cyc, prepend=cyc[0])) <= value_tol run_std = pd.Series(cyc).rolling(min_run, min_periods=min_run).std(ddof=0).to_numpy() std_mask = run_std <= run_std_tol mask = np.zeros(period, dtype=bool) for i in range(period): L = max(0, i - min_run + 1) R = i if R - L + 1 >= min_run: cond_std = (not np.isnan(std_mask[R])) and bool(std_mask[R]) if eq[L:R+1].all() and cond_std: mask[L:R+1] = True runs = _group_runs(mask) for s, e in runs: if float(np.nanmedian(sigma[s:e+1])) < run_std_tol: continue z = diff[s:e+1] / sigma[s:e+1] z_med = float(np.nanmedian(z)) if np.isfinite(z_med) and z_med >= peer_z_k: abs_s = ci * period + s abs_e = ci * period + e mean_diff = float(np.nanmean(np.sign(cyc[s:e+1] - template[s:e+1]) * diff[s:e+1])) score = float(np.tanh((z_med - peer_z_k) / 2 + 1)) rows.append({ "cycle_idx": ci, "start_phase": s, "end_phase": e, "abs_start_idx": abs_s, "abs_end_idx": abs_e, "mean_diff_to_template": mean_diff, "kind": "cycle_flat", "score": score, }) return pd.DataFrame(rows) # ----------------------------- # 合成温度型周期数据(非正弦,含天然稳定段 + 可选卡值段) # ----------------------------- def generate_temperature_like_series( start_time: str = "2025-01-01", period: int = 60, cycles: int = 20, noise_std: float = 0.03, stable_plateau_len: int = 8, stable_values: Tuple[float, float] = (31.0, 34.0), temp_range: Tuple[float, float] = (30.0, 35.0), stuck_cycle_idx: Optional[int] = 8, stuck_phase_range: Tuple[int, int] = (20, 35), stuck_value: Optional[float] = None, seed: Optional[int] = 42, ) -> pd.Series: """生成更接近温度传感器的周期信号(非正弦)。""" rng = np.random.RandomState(seed) # 为兼容旧版 NumPy a = max(6, (period // 3) - stable_plateau_len) b = max(6, (period - a - 2 * stable_plateau_len)) v_low, v_high = stable_values v_low = float(np.clip(v_low, *temp_range)) v_high = float(np.clip(v_high, *temp_range)) one_cycle_parts = [] if a > 0: ramp_up = np.linspace(v_low, v_high, a, endpoint=False) ramp_up = ramp_up + rng.normal(0, noise_std, size=a) one_cycle_parts.append(ramp_up) plateau1 = np.full(stable_plateau_len, v_high) one_cycle_parts.append(plateau1) if b > 0: mid = v_low + 0.5 * (v_high - v_low) ramp_var = np.linspace(v_high, mid, b, endpoint=False) ramp_var = ramp_var + rng.normal(0, noise_std, size=b) one_cycle_parts.append(ramp_var) plateau2 = np.full(stable_plateau_len, v_low) one_cycle_parts.append(plateau2) one_cycle = np.concatenate(one_cycle_parts) if len(one_cycle) < period: pad = np.full(period - len(one_cycle), v_low) one_cycle = np.concatenate([one_cycle, pad]) else: one_cycle = one_cycle[:period] sig = np.tile(one_cycle, cycles) plateau_mask = np.zeros(period, dtype=bool) plateau_mask[a:a + stable_plateau_len] = True plateau_mask[a + stable_plateau_len + b 
: a + stable_plateau_len + b + stable_plateau_len] = True plateau_mask_full = np.tile(plateau_mask, cycles) non_plateau_idx = np.where(~plateau_mask_full)[0] sig[non_plateau_idx] += rng.normal(0, noise_std, size=len(non_plateau_idx)) if stuck_cycle_idx is not None: s = stuck_cycle_idx * period + stuck_phase_range[0] e = stuck_cycle_idx * period + stuck_phase_range[1] s = int(np.clip(s, 0, len(sig) - 1)) e = int(np.clip(e, 0, len(sig) - 1)) if s <= e: if stuck_value is None: seg = sig[s:e + 1] sv = float(np.round(np.median(seg), 3)) else: sv = float(np.clip(stuck_value, *temp_range)) sig[s:e + 1] = sv sig = np.clip(sig, *temp_range) idx = pd.date_range(start_time, periods=len(sig), freq="S") return pd.Series(sig, index=idx) # ----------------------------- # 可视化:原始数据 + 卡值区间 + 稳定点 标注 # ----------------------------- import matplotlib.pyplot as plt def _mask_from_intervals(n: int, intervals_df: pd.DataFrame, start_col: str, end_col: str) -> np.ndarray: m = np.zeros(n, dtype=bool) if intervals_df is None or len(intervals_df) == 0: return m for s, e in intervals_df[[start_col, end_col]].to_numpy(): s = int(max(0, s)) e = int(min(n - 1, e)) if s <= e: m[s : e + 1] = True return m def detect_stable_points( series: pd.Series, period: Optional[int] = None, stuck_mask: Optional[np.ndarray] = None, method: str = "cycle_sigma", stable_sigma_q: float = 0.2, rolling_window: int = 20, rolling_std_tol: float = 1e-3, ) -> np.ndarray: """返回布尔数组,表示“本身为稳定的点”。 method="cycle_sigma" 时若未提供 period,会尝试自动估计;失败则退回滚动标准差法。 """ x = series.to_numpy(dtype=float) n = len(x) stable = np.zeros(n, dtype=bool) if stuck_mask is None: stuck_mask = np.zeros(n, dtype=bool) if method == "cycle_sigma": p = period if p is None: p = estimate_period(x) if p is not None and n // p >= 2: m = n // p X = x[: m * p].reshape(m, p) template = np.nanmedian(X, axis=0) mad = np.nanmedian(np.abs(X - template[None, :]), axis=0) sigma = 1.4826 * mad + 1e-12 thr = np.nanquantile(sigma, stable_sigma_q) stable_phase = sigma <= thr for i in range(m): idx0 = i * p stable[idx0 : idx0 + p] = stable_phase rem = n - m * p if rem > 0: stable[m * p : m * p + rem] = stable_phase[:rem] else: sd = _rolling_std(x, rolling_window) stable = sd <= rolling_std_tol else: sd = _rolling_std(x, rolling_window) stable = sd <= rolling_std_tol stable = np.logical_and(stable, ~stuck_mask) return stable def plot_stuck_overview( series: pd.Series, res_basic: Optional[pd.DataFrame] = None, res_within: Optional[pd.DataFrame] = None, period: Optional[int] = None, stable_method: str = "cycle_sigma", stable_sigma_q: float = 0.2, rolling_window: int = 20, rolling_std_tol: float = 1e-3, figsize: Tuple[int, int] = (12, 5), save_path: Optional[str] = None, ): """绘制原始数据,并叠加:卡值区间(红)+ 稳定点(绿)。""" x = series.to_numpy(dtype=float) n = len(x) mask_basic = _mask_from_intervals(n, res_basic, "start_idx", "end_idx") if res_basic is not None else np.zeros(n, dtype=bool) mask_within = _mask_from_intervals(n, res_within, "abs_start_idx", "abs_end_idx") if res_within is not None else np.zeros(n, dtype=bool) stuck_mask = np.logical_or(mask_basic, mask_within) stable_mask = detect_stable_points( series, period=period, stuck_mask=stuck_mask, method=stable_method, stable_sigma_q=stable_sigma_q, rolling_window=rolling_window, rolling_std_tol=rolling_std_tol, ) fig, ax = plt.subplots(1, 1, figsize=figsize) if isinstance(series.index, pd.DatetimeIndex): t = series.index else: t = np.arange(n) ax.plot(t, x, linewidth=1.2, label="signal") def _add_spans(mask, color="#ff4d4f", alpha=0.25, label="stuck"): runs = 
_group_runs(mask) for i, (s, e) in enumerate(runs): ax.axvspan(t[s], t[e], color=color, alpha=alpha, lw=0, label=(label if i == 0 else None)) _add_spans(stuck_mask, label="stuck") idx_stable = np.where(stable_mask)[0] if len(idx_stable) > 0: ax.scatter(t[idx_stable], x[idx_stable], s=12, color="#52c41a", label="stable points", zorder=3) ax.set_title("Signal with Stuck Segments and Stable Points") ax.set_xlabel("Time" if isinstance(series.index, pd.DatetimeIndex) else "Index") ax.set_ylabel("Value") ax.legend() ax.grid(True, linestyle=":", alpha=0.5) if save_path: fig.savefig(save_path, dpi=160, bbox_inches="tight") return fig, ax # ----------------------------- # 示例(可注释/删除) # ----------------------------- if __name__ == "__main__": s = generate_temperature_like_series( start_time="2025-01-01", period=60, cycles=20, noise_std=0.03, stable_plateau_len=8, stable_values=(31.0, 33.0), temp_range=(30.0, 35.0), stuck_cycle_idx=8, stuck_phase_range=(20, 35), stuck_value=None, seed=42 ) # 基础检测:seasonal_period 未给则自动估计;此处演示手动指定为 60 res1 = detect_stuck_segments( s, sampling_period=pd.Timedelta(seconds=1), min_flat_run=5, value_tol=5e-4, deriv_window=3, deriv_tol=5e-4, lowvar_window=10, lowvar_std_tol=1e-3, seasonal_period=60, # 可改为 None 让其自动估计 seasonal_min_run=5, seasonal_value_tol=5e-4, stable_phase_q=0.20, stable_overlap_thr=0.60, require_z_k=3.0 ) # 周期内卡段检测:period=None 将自动估计 res2 = detect_within_cycle_stuck( s, period=None, # ← 自动估计 min_run=6, value_tol=1e-4, run_std_tol=8e-4, peer_z_k=4.0, ) plot_stuck_overview( s, res_basic=res1, res_within=res2, period=None, # 绘图也会尝试自动估计周期,不行则退回滚动法 stable_method="cycle_sigma", stable_sigma_q=0.2, figsize=(12, 4.5), save_path=None, ) import matplotlib.pyplot as plt plt.show() 为什么生成的周期模板总是少一个周期希望生成所有周期的
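
Regarding the closing question above (why the generated cycle template always comes out one cycle short): if the cause is that _phase_template_sigma truncates with m = len(x) // period, any trailing partial cycle is silently dropped. Below is a minimal sketch of a NaN-padded variant that keeps every cycle; the helper name is my own and the assumed cause is an assumption, not part of the original script.

import numpy as np

def phase_template_all_cycles(x, period):
    # Hypothetical variant of _phase_template_sigma: instead of truncating to
    # m = len(x) // period full cycles, pad the tail with NaN so the template
    # and sigma are computed over every cycle, including a partial last one.
    x = np.asarray(x, dtype=float)
    n_cycles = int(np.ceil(len(x) / period))
    if n_cycles < 2:
        return None, None
    padded = np.full(n_cycles * period, np.nan)
    padded[: len(x)] = x
    X = padded.reshape(n_cycles, period)
    template = np.nanmedian(X, axis=0)                 # per-phase median across all cycles
    mad = np.nanmedian(np.abs(X - template[None, :]), axis=0)
    sigma = 1.4826 * mad + 1e-12                       # robust sigma from the MAD
    return template, sigma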
import pandas as pd import numpy as np from sklearn.impute import KNNImputer import matplotlib.pyplot as plt import matplotlib as mpl import warnings import os import traceback # 设置中文字体支持 plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'WenQuanYi Micro Hei'] plt.rcParams['axes.unicode_minus'] = False # 解决负号显示问题 # 文件路径配置 file_paths = { "price": r"C:\Users\1\Desktop\电价_实时.xlsx", "wind": r"C:\Users\1\Desktop\风能_预处理后数据.xlsx", "pv": r"C:\Users\1\Desktop\光伏_预处理后数据.xlsx", "load": r"C:\Users\1\Desktop\高压侧总负荷_预处理后_20251205_132714.xlsx", "weather": r"C:\Users\1\Desktop\processed_气象数据.xlsx" } # 1. 增强型数据加载函数(添加时间范围筛选) def load_and_process_data(file_paths): """加载所有数据并处理列名不一致问题,并筛选时间范围""" # 定义各文件的实际列名映射 column_mapping = { "price": {"time_col": "时间", "value_col": "电价"}, "wind": {"time_col": "DATA_TIME", "value_col": "风电功率"}, "pv": {"time_col": "DATA_TIME", "value_col": "光伏功率"}, "load": {"time_col": "DATA_TIME", "value_col": "负荷值"}, "weather": {"time_col": "时间", "value_cols": ["温度", "湿度", "风速", "天气状况"]} } datasets = {} start_date = pd.Timestamp('2025-01-27 00:00:00') end_date = pd.Timestamp('2025-04-14 00:00:00') # 加载电价数据 price_df = pd.read_excel(file_paths['price']) if column_mapping["price"]["time_col"] in price_df.columns: price_df = price_df.rename(columns={ column_mapping["price"]["time_col"]: "DATA_TIME", column_mapping["price"]["value_col"]: "price_value" }) price_df['DATA_TIME'] = pd.to_datetime(price_df['DATA_TIME']) datasets['price'] = price_df.set_index('DATA_TIME') # 加载风电数据 wind_df = pd.read_excel(file_paths['wind']) if column_mapping["wind"]["time_col"] in wind_df.columns: wind_df = wind_df.rename(columns={ column_mapping["wind"]["time_col"]: "DATA_TIME", column_mapping["wind"]["value_col"]: "power_generated" }) wind_df['DATA_TIME'] = pd.to_datetime(wind_df['DATA_TIME']) datasets['wind'] = wind_df.set_index('DATA_TIME') # 加载光伏数据 pv_df = pd.read_excel(file_paths['pv']) if column_mapping["pv"]["time_col"] in pv_df.columns: pv_df = pv_df.rename(columns={ column_mapping["pv"]["time_col"]: "DATA_TIME", column_mapping["pv"]["value_col"]: "power_generated" }) pv_df['DATA_TIME'] = pd.to_datetime(pv_df['DATA_TIME']) datasets['pv'] = pv_df.set_index('DATA_TIME') # 加载负荷数据 load_df = pd.read_excel(file_paths['load']) if column_mapping["load"]["time_col"] in load_df.columns: load_df = load_df.rename(columns={ column_mapping["load"]["time_col"]: "DATA_TIME", column_mapping["load"]["value_col"]: "load_value" }) load_df['DATA_TIME'] = pd.to_datetime(load_df['DATA_TIME']) datasets['load'] = load_df.set_index('DATA_TIME') # 加载气象数据 weather_df = pd.read_excel(file_paths['weather']) if column_mapping["weather"]["time_col"] in weather_df.columns: rename_dict = {column_mapping["weather"]["time_col"]: "DATA_TIME"} # 动态映射数值列 num_cols = [] for col in weather_df.columns: if col != column_mapping["weather"]["time_col"] and pd.api.types.is_numeric_dtype(weather_df[col]): num_cols.append(col) # 重命名数值列 for i, col in enumerate(num_cols): rename_dict[col] = f"num_{i}" # 处理分类列 cat_cols = [col for col in weather_df.columns if col not in num_cols and col != column_mapping["weather"]["time_col"]] for i, col in enumerate(cat_cols): rename_dict[col] = f"cat_{i}" weather_df = weather_df.rename(columns=rename_dict) weather_df['DATA_TIME'] = pd.to_datetime(weather_df['DATA_TIME']) datasets['weather'] = weather_df.set_index('DATA_TIME') # 筛选指定时间范围:2025-01-27 00:00:00 到 2025-04-14 00:00:00 for name, df in datasets.items(): # 确保索引是DateTime类型 if not isinstance(df.index, pd.DatetimeIndex): df.index = 
pd.to_datetime(df.index) # 筛选日期范围 [start_date, end_date] mask = (df.index >= start_date) & (df.index <= end_date) datasets[name] = df.loc[mask] # 检查数据是否为空 if len(datasets[name]) == 0: print(f"警告: {name}数据集在指定时间范围内没有数据!") else: print( f"{name}数据集: {len(datasets[name])}条记录, 时间范围: {datasets[name].index.min()} 至 {datasets[name].index.max()}") return datasets # 2. 时间对齐到1小时频率 def align_to_hourly_frequency(datasets): """将所有数据统一到1小时频率""" print("=== 数据重采样到小时频率 ===") # 确定统一的时间范围 (2025-01-27 00:00:00 到 2025-04-14 00:00:00) start_date = pd.Timestamp('2025-01-27 00:00:00') end_date = pd.Timestamp('2025-04-14 00:00:00') full_range = pd.date_range(start=start_date, end=end_date, freq='h') aligned_datasets = {} # 对每个数据集进行重采样 for name, df in datasets.items(): # 跳过空数据集 if len(df) == 0: aligned_datasets[name] = pd.DataFrame(index=full_range) print(f"{name}: 空数据集 → 对齐后 {len(full_range)}") continue # 数值型数据取平均值 if name in ['price', 'wind', 'pv', 'load']: # 获取第一个数值列 value_col = df.columns[0] resampled = df[[value_col]].resample('h').mean() # 气象数据特殊处理 elif name == 'weather': # 区分数值列和分类列 num_cols = [col for col in df.columns if col.startswith('num_')] cat_cols = [col for col in df.columns if col.startswith('cat_')] avg_df = df[num_cols].resample('h').mean() if num_cols else pd.DataFrame() mode_df = pd.DataFrame() if cat_cols: mode_df = df[cat_cols].resample('h').apply(lambda x: x.mode()[0] if not x.empty else np.nan) resampled = pd.concat([avg_df, mode_df], axis=1) else: resampled = df.resample('h').mean() # 重新索引到完整时间范围 aligned = resampled.reindex(full_range) aligned_datasets[name] = aligned print(f"{name}: 原始点 {len(df)} → 对齐后 {len(aligned)}") return aligned_datasets # 3. 缺失值处理 def handle_missing_data(aligned_datasets): """处理各数据集的缺失值""" print("\n=== 缺失值处理 ===") # 创建缺失值报告函数 def report_missing(df, name): if df.empty: print(f"{name}数据集为空,跳过缺失值处理") return 0 total = df.size missing = df.isnull().sum().sum() print(f"{name}缺失值: {missing}/{total} ({missing / total:.1%})") return missing # 处理每个数据集 for name, df in aligned_datasets.items(): original_missing = report_missing(df, name) # 如果数据集为空,跳过处理 if df.empty: continue # 不同数据集使用不同填充策略 if name == 'price': # 电价:时间序列插值 df = df.interpolate(method='time') elif name == 'weather': # 数值列:KNN填充 num_cols = [col for col in df.columns if col.startswith('num_')] if num_cols: imputer = KNNImputer(n_neighbors=6) df[num_cols] = imputer.fit_transform(df[num_cols]) # 分类列用众数填充 cat_cols = [col for col in df.columns if col.startswith('cat_')] for col in cat_cols: if df[col].isnull().any(): mode_val = df[col].mode()[0] if not df[col].mode().empty else "未知" df[col] = df[col].fillna(mode_val) else: # 其他数据:前后填充 df = df.ffill().bfill() # 最终检查 final_missing = df.isnull().sum().sum() print(f"处理后缺失值: {final_missing}/{df.size} ({final_missing / df.size:.1%})") aligned_datasets[name] = df return aligned_datasets # 4. 
数据合并与特征工程 def merge_and_engineer(aligned_datasets): """合并所有数据集并创建特征""" print("\n=== 数据合并与特征工程 ===") # 创建基础数据框 (使用统一的时间范围) start_date = pd.Timestamp('2025-01-27 00:00:00') end_date = pd.Timestamp('2025-04-14 00:00:00') full_index = pd.date_range(start=start_date, end=end_date, freq='h') combined_df = pd.DataFrame(index=full_index) # 添加各数据源 for name in ['price', 'wind', 'pv', 'load']: if not aligned_datasets[name].empty: # 根据数据集名称选择列名 col_name = name if name == 'price': col_name = 'price' elif name == 'wind': col_name = 'wind_power' elif name == 'pv': col_name = 'pv_power' elif name == 'load': col_name = 'load' combined_df[col_name] = aligned_datasets[name].iloc[:, 0] # 添加气象数据 weather = aligned_datasets['weather'] if not weather.empty: # 添加数值列 num_cols = [col for col in weather.columns if col.startswith('num_')] for i, col in enumerate(num_cols): combined_df[f'weather_num_{i}'] = weather[col] # 添加分类列 cat_cols = [col for col in weather.columns if col.startswith('cat_')] for i, col in enumerate(cat_cols): combined_df[f'weather_cat_{i}'] = weather[col] # 特征工程 combined_df['hour'] = combined_df.index.hour combined_df['day_of_week'] = combined_df.index.dayofweek combined_df['is_weekend'] = combined_df['day_of_week'].isin([5, 6]).astype(int) # 创建滞后特征 for lag in [1, 2, 3, 24]: # 1小时、2小时、3小时、24小时滞后 if 'price' in combined_df.columns: combined_df[f'price_lag_{lag}'] = combined_df['price'].shift(lag) # 添加新能源占比 if 'wind_power' in combined_df.columns and 'pv_power' in combined_df.columns and 'load' in combined_df.columns: # 确保负荷不为0 combined_df['load'] = combined_df['load'].replace(0, np.nan).ffill().bfill() # 计算可再生能源比例 combined_df['renewable_ratio'] = (combined_df['wind_power'] + combined_df['pv_power']) / combined_df['load'] # 处理无穷大值 combined_df['renewable_ratio'] = combined_df['renewable_ratio'].replace([np.inf, -np.inf], np.nan).ffill().bfill() # 最终缺失值处理 combined_df = combined_df.ffill().bfill() print(f"合并后数据集形状: {combined_df.shape}") print(f"时间范围: {combined_df.index.min()} 至 {combined_df.index.max()}") return combined_df # 5. 
数据分析与可视化 def analyze_and_visualize(combined_df): """执行基础分析并可视化""" print("\n=== 数据分析与可视化 ===") if combined_df.empty: print("警告: 合并后的数据集为空,无法进行分析和可视化!") return # 基础统计 print("\n数据集统计信息:") print(combined_df.describe()) # 相关性分析(排除非数值列) numeric_cols = combined_df.select_dtypes(include=[np.number]).columns if 'price' in numeric_cols: corr_matrix = combined_df[numeric_cols].corr() price_corr = corr_matrix['price'].sort_values(ascending=False) print("\n与电价相关性最高的因素:") print(price_corr.head(10)) # 可视化 plt.figure(figsize=(15, 12)) # 电价时间序列 plt.subplot(3, 2, 1) if 'price' in combined_df.columns: combined_df['price'].plot(title='电价变化趋势') plt.ylabel('电价') else: plt.text(0.5, 0.5, '无电价数据', ha='center', va='center') plt.title('电价变化趋势') # 新能源功率时间序列 plt.subplot(3, 2, 2) if 'wind_power' in combined_df.columns: combined_df['wind_power'].plot(label='风电功率', alpha=0.7) if 'pv_power' in combined_df.columns: combined_df['pv_power'].plot(label='光伏功率', alpha=0.7) plt.title('新能源发电功率') plt.ylabel('功率 (MW)') plt.legend() # 负荷与电价关系 plt.subplot(3, 2, 3) if 'load' in combined_df.columns and 'price' in combined_df.columns: plt.scatter(combined_df['load'], combined_df['price'], alpha=0.5) plt.title('负荷与电价关系') plt.xlabel('负荷') plt.ylabel('电价') else: plt.text(0.5, 0.5, '缺少负荷或电价数据', ha='center', va='center') plt.title('负荷与电价关系') # 小时平均电价 plt.subplot(3, 2, 4) if 'price' in combined_df.columns and 'hour' in combined_df.columns: hour_avg = combined_df.groupby('hour')['price'].mean() hour_avg.plot(kind='bar') plt.title('分时平均电价') plt.xlabel('小时') plt.ylabel('平均电价') else: plt.text(0.5, 0.5, '缺少电价或小时数据', ha='center', va='center') plt.title('分时平均电价') # 可再生能源比例与电价 plt.subplot(3, 2, 5) if 'renewable_ratio' in combined_df.columns and 'price' in combined_df.columns: plt.scatter(combined_df['renewable_ratio'], combined_df['price'], alpha=0.5) plt.title('可再生能源占比与电价关系') plt.xlabel('(风电+光伏)/总负荷') plt.ylabel('电价') else: plt.text(0.5, 0.5, '缺少新能源占比或电价数据', ha='center', va='center') plt.title('可再生能源占比与电价关系') # 温度与电价关系 if 'num_0' in combined_df.columns: # 假设第一个数值列是温度 plt.subplot(3, 2, 6) plt.scatter(combined_df['num_0'], combined_df['price'], alpha=0.5) plt.title('Temperature vs Price') plt.xlabel('Temperature (°C)') plt.ylabel('Price') else: # 如果没有温度数据,绘制星期几对电价的影响 plt.subplot(3, 2, 6) weekday_avg = combined_df.groupby('day_of_week')['price'].mean() weekday_avg.index = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] weekday_avg.plot(kind='bar') plt.title('Average Price by Weekday') plt.xlabel('Weekday') plt.ylabel('Average Price') plt.tight_layout() plt.savefig('energy_analysis_results.png') # 保存分析结果 combined_df.to_csv('combined_energy_data.csv') price_corr.to_csv('price_correlations.csv') print("分析完成! 结果已保存为CSV和PNG文件") # 主流程 if __name__ == "__main__": try: # 步骤1: 加载数据并筛选1月27日后数据 print("开始加载数据...") datasets = load_and_process_data(file_paths) # 步骤2: 统一到小时频率 print("统一时间频率...") aligned_data = align_to_hourly_frequency(datasets) # 步骤3: 缺失值处理 print("处理缺失值...") cleaned_data = handle_missing_data(aligned_data) # 步骤4: 数据合并与特征工程 print("合并数据并创建特征...") combined_data = merge_and_engineer(cleaned_data) # 步骤5: 分析与可视化 print("分析数据并生成可视化...") analyze_and_visualize(combined_data) print("程序执行成功!") except Exception as e: print(f"\n程序执行出错: {e}") print("建议检查:") print("1. 文件路径是否正确") print("2. Excel文件中的实际列名") print("3. 数据文件是否被其他程序占用") print("4. 日期格式是否正确") print("5. 数据值范围(如负荷为0的情况)") traceback.print_exc() 在当前代码中加入热力分析图等可视化分析,原有的图色彩更加丰富,更加精美化
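
For the closing request above (add a heat-map style analysis and richer colors), here is a minimal sketch of a correlation heatmap built with plain matplotlib, assuming the combined_df produced by merge_and_engineer() in the script; the function name and output filename are my own choices.

import matplotlib.pyplot as plt
import numpy as np

def plot_correlation_heatmap(combined_df, save_path="correlation_heatmap.png"):
    # Correlation heatmap over the numeric columns of the merged frame.
    numeric = combined_df.select_dtypes(include=[np.number])
    corr = numeric.corr()

    fig, ax = plt.subplots(figsize=(10, 8))
    im = ax.imshow(corr.values, cmap="coolwarm", vmin=-1, vmax=1)
    ax.set_xticks(range(len(corr.columns)))
    ax.set_yticks(range(len(corr.columns)))
    ax.set_xticklabels(corr.columns, rotation=90, fontsize=8)
    ax.set_yticklabels(corr.columns, fontsize=8)
    cbar = fig.colorbar(im, ax=ax)
    cbar.set_label("Pearson correlation")
    ax.set_title("Feature correlation heatmap")
    fig.tight_layout()
    fig.savefig(save_path, dpi=160)
    return fig, ax

The existing scatter plots can be made more colorful in the same spirit by passing c= and cmap= (e.g. cmap="viridis") to plt.scatter.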