Chapter 14: Application Building Blocks - configparser: Work with Configuration Files - Combining Values with Interpolation - Disabling Interpolation

This article describes how to disable interpolation in Python's ConfigParser: setting interpolation to None makes the parser safely ignore all interpolation syntax, preventing variable substitution in configuration files.


14.7.7.5 Disabling Interpolation
To disable interpolation, pass None instead of an Interpolation object.

from configparser import ConfigParser

# Passing interpolation=None turns off all value interpolation.
parser = ConfigParser(interpolation=None)
parser.read('interpolation.ini')

print('Without interpolation:', parser.get('bug_tracker', 'url'))
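
The script reads interpolation.ini, which was introduced earlier in this section. A minimal file consistent with those examples (the exact option values here are an assumption, not the original listing) would be:

[bug_tracker]
; The raw value contains BasicInterpolation-style placeholders.
url = http://%(hostname)s:%(port)s/bugs/
hostname = localhost
port = 8080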

With interpolation disabled, all of the syntax that an interpolation object would otherwise process is safely ignored, and values are returned exactly as they appear in the file.
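For a side-by-side comparison, the following self-contained sketch loads the same sample configuration with read_string() and prints the value both ways. BasicInterpolation is the default interpolation class for ConfigParser; the sample values are the assumed ones from above.

from configparser import ConfigParser, BasicInterpolation

SAMPLE = """\
[bug_tracker]
url = http://%(hostname)s:%(port)s/bugs/
hostname = localhost
port = 8080
"""

# Default behavior: %(hostname)s and %(port)s are expanded using
# other options from the same section.
with_interp = ConfigParser(interpolation=BasicInterpolation())
with_interp.read_string(SAMPLE)
print('With interpolation:   ', with_interp.get('bug_tracker', 'url'))
# -> http://localhost:8080/bugs/

# interpolation=None: the placeholder syntax is ignored and the
# value is returned exactly as written in the file.
without_interp = ConfigParser(interpolation=None)
without_interp.read_string(SAMPLE)
print('Without interpolation:', without_interp.get('bug_tracker', 'url'))
# -> http://%(hostname)s:%(port)s/bugs/

Interpolation can also be suppressed for a single lookup, without reconfiguring the parser, by passing raw=True to get(), e.g. parser.get('bug_tracker', 'url', raw=True).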
