Week 3, Day 2: PowerDesigner & SQL

This article covers the basic elements of conceptual structure design, including entities, attributes, domains, keys, and relationships, and explains one-to-one, one-to-many, and many-to-many relationships. It then walks through data modeling with PowerDesigner and generating the table structure of a SQL Server database. Finally, it covers the three normal forms of database design and basic SQL operations.

Conceptual Structure Design
Entity; Attribute; Domain; Key; Relationship

Based on how many entity instances can appear on each side of a relationship, three relationship types are distinguished:
One-to-one (1:1), e.g. employee and social-insurance account;
One-to-many (1:m), e.g. company and employees, publisher and books;
Many-to-many (m:n), e.g. students and courses, orders and products (see the table-level sketch after the ER diagram below).
ER model diagram
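At the table level, the three relationship types map onto tables in a standard way: 1:1 puts a unique foreign key on one side, 1:m puts a foreign key on the "many" side, and m:n needs a junction table. A minimal T-SQL sketch using the article's own examples (all table and column names here are hypothetical):

-- 1:1 — employee and social-insurance account: the account holds a UNIQUE foreign key
create table Employee (
    emp_id int primary key,
    name   nvarchar(50)
);
create table SocialInsuranceAccount (
    account_id int primary key,
    emp_id     int unique references Employee (emp_id)
);

-- m:n — student and course: a junction table whose composite primary key
-- combines the two foreign keys
create table Student (student_id int primary key, name  nvarchar(50));
create table Course  (course_id  int primary key, title nvarchar(50));
create table StudentCourse (
    student_id int references Student (student_id),
    course_id  int references Course  (course_id),
    primary key (student_id, course_id)
);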
Data Modeling with PowerDesigner
1. Create a conceptual data model (CDM).
2. Generate a physical data model (PDM) from the conceptual data model.
In the conceptual data model view, choose Tools → Generate Physical Data Model to produce the physical database model.
3. Configure the target DBMS for database generation.
SQL Server 2012 is used here.
Steps: in PowerDesigner, choose Database → Change Current DBMS and select SQL Server 2012.
4. Generate the database from the physical model.
Steps: in PowerDesigner, choose Database → Generate Database to produce the T-SQL script that creates the SQL Server table structure; paste the script into a new query window in the database and run it (a sketch of such a script follows).
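The generated script is plain T-SQL DDL. Its exact content depends on the model, but for a hypothetical publisher/book (1:m) model the output might look roughly like the sketch below (illustrative only, not actual PowerDesigner output):

create table Publisher (
    publisher_id   int            not null,
    publisher_name nvarchar(50)   not null,
    constraint PK_Publisher primary key (publisher_id)
)
go

create table Book (
    book_id      int             not null,
    title        nvarchar(100)   not null,
    publisher_id int             not null,
    constraint PK_Book primary key (book_id),
    constraint FK_Book_Publisher foreign key (publisher_id)
        references Publisher (publisher_id)
)
go

Pasting this into a new query window in SQL Server Management Studio and executing it creates the tables.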

The Three Normal Forms
First Normal Form (1NF)
Every column must be atomic; no column can be split into smaller parts.
Second Normal Form (2NF)
Every table has a primary key, and every non-key column depends on the whole primary key (no partial dependency on part of a composite key).
Third Normal Form (3NF)
Non-key columns must not depend on other non-key columns (no transitive dependencies); in practice, when one table references another, it references only that table's primary key column rather than duplicating its other columns.
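A minimal sketch of a design that satisfies 3NF, using hypothetical Customer and Orders tables: the order row carries only the customer's primary key, while the customer's other attributes live solely in the Customer table.

-- Customer details are stored once, in Customer
create table Customer (
    customer_id int          not null primary key,
    name        nvarchar(50) not null,
    city        nvarchar(50) null
);

-- Orders references Customer only by its primary key column
create table Orders (
    order_id    int  not null primary key,
    customer_id int  not null references Customer (customer_id),
    order_date  date not null
    -- no customer name/city columns here: they would depend on customer_id,
    -- a non-key column, which is exactly the transitive dependency 3NF forbids
);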

SQL CRUD
Insert: INSERT INTO table_name (column_list) VALUES (value_list)
Update: UPDATE table_name SET column1 = value1, column2 = value2, … WHERE condition
Delete: DELETE FROM table_name WHERE condition
Select: SELECT column_list FROM table_name WHERE condition
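Concrete statements against the hypothetical Customer table from the 3NF sketch above (all values are illustrative):

insert into Customer (customer_id, name, city) values (1, 'Alice', 'Hangzhou');

update Customer set name = 'Alice Wang', city = 'Shanghai' where customer_id = 1;

select customer_id, name, city from Customer where city = 'Shanghai';

delete from Customer where customer_id = 1;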
How to handle delete anomalies (deleting a parent row that child rows still reference), as sketched after this list:
1. Set the delete rule of the child table's foreign key constraint to Set Null.
2. Set the delete rule of the child table's foreign key constraint to Cascade (use with caution).
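A minimal sketch of the two delete rules on the hypothetical Orders/Customer pair above, assuming Orders.customer_id is nullable and has no foreign key constraint yet; apply only one of the two alternatives:

-- Alternative 1: SET NULL — deleting a customer sets customer_id to NULL in its orders
alter table Orders
    add constraint FK_Orders_Customer foreign key (customer_id)
        references Customer (customer_id)
        on delete set null;

-- Alternative 2: CASCADE — deleting a customer also deletes its orders (use with caution)
alter table Orders
    add constraint FK_Orders_Customer_Cascade foreign key (customer_id)
        references Customer (customer_id)
        on delete cascade;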
