import pandas as pd
import logging
from geopy.distance import geodesic
import time
from contextlib import contextmanager
import numpy as np
from sqlalchemy import create_engine, text, bindparam
from tenacity import retry, stop_after_attempt, wait_exponential
from datetime import datetime
# --------------------------
# Common configuration
# --------------------------
DB_CONFIG = {
"conn_str": "mysql+pymysql://root:root@127.0.0.1:3306/test?charset=utf8",
"pool_size": 10,
"pool_recycle": 300
}
member_list_path = r"D:\OneDrive\ERIC\维护\平台数据\代维信息\2025-07人员信息快照_20250708.xlsx"
member_list_columns = ['登陆账号', '所属组织', '所属地市', '在职状态']
TIME_RANGE = {
"start": '2025-07-01 00:00:00',
"end": '2025-07-08 23:59:59'
}
# Date-range formatter for file naming
def format_time_range(time_range):
    # Parse the start and end timestamps
start_date = datetime.strptime(time_range["start"], '%Y-%m-%d %H:%M:%S')
end_date = datetime.strptime(time_range["end"], '%Y-%m-%d %H:%M:%S')
    # Format both as YYYYMMDD and join with '~'
return f"{start_date.strftime('%Y%m%d')}~{end_date.strftime('%Y%m%d')}"
# Precompute the formatted time range
source_time = format_time_range(TIME_RANGE)
FILE_PATHS = {
"cran_info": r'D:\OneDrive\ERIC\维护\test\CRAN机房信息.csv',
"output": fr'D:\OneDrive\ERIC\维护\工单\现场执行工单\现场执行工单_{source_time}.xlsx'
}
TABLE_CONFIG = {
'工单_保障': {'columns': ["工单当前状态", "工单号", "工单分类", "维护分类"]},
'工单_巡检': {'columns': [
"工单当前状态", "工单号", "资源ID", "资源名称",
"资源经度", "资源纬度", "STATION_NAME",
"STATION_ID", "STATION_LEVEL", "LONGITUDE",
"LATITUDE", "工单分类", "维护分类"
]},
'工单_拆站': {'columns': [
"工单当前状态", "工单号", "资源cid AS 资源ID", "资源名称",
"资源经度", "资源纬度", "STATION_NAME",
"STATION_ID", "STATION_LEVEL", "LONGITUDE",
"LATITUDE", "工单分类", "维护分类"
]},
'工单_验收': {'columns': [
"工单当前状态", "工单号", "资源cid AS 资源ID", "资源名称",
"资源经度", "资源纬度", "STATION_NAME",
"STATION_ID", "STATION_LEVEL", "LONGITUDE",
"LATITUDE", "工单分类", "维护分类"
]},
'工单_发电': {'columns': [
"工单当前状态", "工单号", "站点ID AS 资源ID", "站点名称 AS 资源名称",
"站点经度 AS 资源经度", "站点纬度 AS 资源纬度", "STATION_NAME",
"STATION_ID", "STATION_LEVEL", "LONGITUDE",
"LATITUDE", "工单分类", "维护分类"
]},
'工单_通用': {'columns': [
"工单当前状态", "工单号", "资源名称", "资源经度", "资源纬度", "STATION_NAME", "STATION_ID",
"STATION_LEVEL", "LONGITUDE", "LATITUDE", "工单分类", "维护分类"
]},
'故障工单': {'columns': [
"工单状态 AS 工单当前状态", "工单编号 AS 工单号", "STATION_NAME AS 资源名称", "LONGITUDE AS 资源经度",
"LATITUDE AS 资源纬度", "STATION_NAME", "STATION_ID",
"STATION_LEVEL", "LONGITUDE", "LATITUDE", "工单分类", "维护分类", "故障处理时长_小时",
"是否延期有效", "是否及时处理", "高频故障", "是否超长退服", "网元分类"
], 'where_column': '工单编号'}
}
DIMENSIONS = [
{'name': '执行人', 'keys': ['执行人']},
{'name': '所属组织', 'keys': ['所属组织']},
{'name': '地市', 'keys': ['地市']},
{'name': '代维简称', 'keys': ['代维简称']},
{'name': '地市代维', 'keys': ['地市', '代维简称']}
]
# --------------------------
# Utility functions
# --------------------------
@contextmanager
def time_monitor(step_name):
"""耗时监控上下文管理器"""
start_time = time.time()
try:
yield
finally:
print(f"{step_name} 耗时: {time.time() - start_time:.4f}秒")
def setup_logging():
"""日志配置"""
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler("data_processor_v2.log"),
logging.StreamHandler()
]
)
return logging.getLogger(__name__)
logger = setup_logging()
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def safe_read_sql(sql, engine, params: dict = None) -> pd.DataFrame:
    """SQL read with retry (3 attempts, exponential backoff)."""
    logger.debug(f"Executing SQL: {sql} \nparams: {params}")
    return pd.read_sql(sql, engine, params=params)
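# tenacity retries safe_read_sql up to 3 attempts, waiting 4-10 s between
# tries. A minimal usage sketch, assuming the 现场执行工单 table exists:
#   count_df = safe_read_sql(text("SELECT COUNT(*) AS n FROM 现场执行工单"), engine)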
def init_db_engine():
"""初始化数据库引擎"""
return create_engine(
DB_CONFIG["conn_str"],
pool_size=DB_CONFIG["pool_size"],
pool_recycle=DB_CONFIG["pool_recycle"]
)
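# Note: pool_recycle=300 retires pooled connections after 5 minutes, keeping
# them well inside MySQL's wait_timeout so long runs avoid
# "MySQL server has gone away" errors.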
# --------------------------
# Data validation helpers
# --------------------------
def is_valid_coordinates(lat, lon):
"""经纬度有效性校验"""
if pd.isna(lat) or pd.isna(lon):
return False
if not (-90 <= lat <= 90 and -180 <= lon <= 180):
return False
if abs(lat) < 1e-6 and abs(lon) < 1e-6:
return False
return True
def calculate_distance(coord1, coord2):
"""带校验的距离计算"""
try:
lat1, lon1 = coord1
lat2, lon2 = coord2
except (TypeError, ValueError) as e:
logger.debug(f"坐标解析失败: {str(e)} | coord1={coord1} coord2={coord2}")
return 99999.0
if not (is_valid_coordinates(lat1, lon1) and is_valid_coordinates(lat2, lon2)):
return 99999.0
return geodesic(coord1, coord2).meters
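# Worked example (hypothetical points in Nanjing, roughly 1.5 km apart):
#   calculate_distance((32.06, 118.78), (32.07, 118.79))  # ~1455 m
#   calculate_distance((0.0, 0.0), (32.07, 118.79))       # 99999.0, (0, 0) is rejected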
# --------------------------
# Data loading
# --------------------------
def load_site_orders(engine):
"""加载现场执行工单数据"""
sql = text("""
SELECT
地市, 代维简称, 专业, 工单类型,
工单编号, 执行人, 所属组织,
出发时间, 出发经度, 出发纬度,
签到时间, 签到经度, 签到纬度,
完成时间, 完成经度, 完成纬度,
路途时长, 实际工作时长
FROM 现场执行工单
WHERE 出发时间 BETWEEN :start_time AND :end_time
""")
params = {
'start_time': TIME_RANGE["start"],
'end_time': TIME_RANGE["end"]
}
df = safe_read_sql(sql, engine, params)
    df = df[~df['所属组织'].str.contains('高铁', na=False)]
df['代维简称'] = df['代维简称'].replace('中移铁通', '铁通')
valid_companies = ['中贝', '中通服', '中邮建', '唐人', '宜通', '怡创', '浙通服',
'润建', '虹信', '超讯', '铁通', '长实']
df = df[df['代维简称'].isin(valid_companies)]
df['地市代维'] = df['地市'].str.split("-").str[0] + '-' + df['代维简称']
logger.info(f"加载现场工单记录数: {len(df)}")
return df
def batch_load_order_data(engine, order_numbers, batch_size=500):
"""批量加载工单相关数据"""
dfs = []
for table, config in TABLE_CONFIG.items():
try:
columns = ", ".join(config['columns'])
where_col = config.get('where_column', '工单号')
            # Build the query with an expanding bind parameter so the IN clause
            # accepts a Python list of order numbers
            sql_template = text(
                f"SELECT {columns} FROM `{table}` WHERE `{where_col}` IN :order_nums"
            ).bindparams(bindparam("order_nums", expanding=True))
table_dfs = []
for i in range(0, len(order_numbers), batch_size):
batch = order_numbers[i:i + batch_size]
df = safe_read_sql(sql_template, engine, params={'order_nums': batch})
table_dfs.append(df)
if table_dfs:
table_df = pd.concat(table_dfs, ignore_index=True)
dfs.append(table_df)
logger.info(f"表 {table} 加载完成,记录数: {len(table_df)}")
except Exception as e:
logger.error(f"加载表 {table} 失败: {str(e)}", exc_info=True)
return dfs
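# Batching the IN-lists at 500 entries keeps each statement small: the
# expanding bind parameter turns every batch into that many individual
# placeholders, so very large batches would bloat the SQL sent to MySQL.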
# --------------------------
# Data processing
# --------------------------
def merge_order_data(site_df, order_dfs):
"""合并工单数据(增强版)"""
if not order_dfs:
logger.warning("没有需要合并的工单数据")
return pd.DataFrame()
final_df = pd.concat(order_dfs, axis=0, ignore_index=True)
final_df = final_df.drop_duplicates(subset=['工单号'], keep='first')
merged_df = pd.merge(
left=site_df,
right=final_df,
left_on='工单编号',
right_on='工单号',
how='left',
)
merged_df.drop(columns=['工单号'], inplace=True)
merged_df['路途时长(分钟)'] = (merged_df['路途时长'] * 60).round(2)
merged_df['实际工作时长(分钟)'] = (merged_df['实际工作时长'] * 60).round(2)
    # Keep only actionable order statuses
condition = (
merged_df['工单当前状态'].notna() &
(merged_df['工单当前状态'] != '') &
~merged_df['工单当前状态'].str.contains('已驳回|已撤销|已关闭', na=False)
)
return merged_df[condition]
def get_custom_natural_week(date):
    """
    Compute the custom "natural week" number: a week runs from Wednesday
    through the following Tuesday, so each date is shifted back to its
    week's Wednesday before taking the ISO week number.
    """
    if pd.isnull(date):
        return np.nan
    weekday = date.weekday()
    days_since_last_wednesday = (weekday + 5) % 7
    date = date - pd.Timedelta(days=days_since_last_wednesday)
    return date.isocalendar()[1]
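# Example: 2025-07-01 falls on a Tuesday, so it is shifted back 6 days to
# Wednesday 2025-06-25 and takes that date's ISO week number (26).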
# --------------------------
# Core processing
# --------------------------
def generate_time_features(df):
"""增强时间特征生成"""
df['出发时间'] = pd.to_datetime(df['出发时间'], errors='coerce')
df['自然年'] = df['出发时间'].dt.year
df['自然月'] = df['出发时间'].dt.to_period('M').astype(str)
df['自然周'] = df['出发时间'].apply(get_custom_natural_week)
df['出发日期'] = df['出发时间'].dt.date
return df
def process_coordinates(df):
"""坐标处理"""
coord_columns = [
"资源纬度", "资源经度", "签到纬度", "签到经度",
"LATITUDE", "LONGITUDE", "完成纬度", "完成经度"
]
for col in coord_columns:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors='coerce')
else:
logger.warning(f"列 {col} 不存在")
fault_columns = ['是否超长退服', '高频故障', '是否延期有效', '是否及时处理']
for col in fault_columns:
if col not in df.columns:
df[col] = '否'
return df
def calculate_distances(df):
"""距离计算"""
df["签到距离"] = df.apply(
lambda row: calculate_distance(
(row["签到纬度"], row["签到经度"]),
(row["资源纬度"], row["资源经度"])
), axis=1
)
return df
def merge_cran_data(main_df, cran_path):
"""合并CRAN机房数据并取最近RU距离"""
cran_df = pd.read_csv(cran_path)
for col in ["ru纬度", "ru经度"]:
cran_df[col] = pd.to_numeric(cran_df[col], errors="coerce").fillna(0)
merged = pd.merge(
main_df[["工单编号", "STATION_NAME", "签到经度", "签到纬度"]],
cran_df,
left_on="STATION_NAME",
right_on="station_name",
how="left"
)
merged["签到距离_CRAN"] = merged.apply(
lambda row: calculate_distance(
(row["签到纬度"], row["签到经度"]),
(row["ru纬度"], row["ru经度"])
), axis=1
)
min_distances = merged.groupby("工单编号", as_index=False)["签到距离_CRAN"].min()
return pd.merge(
main_df,
min_distances,
on="工单编号",
how="left"
).fillna({"签到距离_CRAN": 99999.0})
# --------------------------
# Statistics
# --------------------------
def calculate_workload(df):
"""工作量评定"""
df['工作量评定'] = '有效'
df['原因'] = ''
no_completion = df['完成时间'].isna()
df.loc[no_completion, ['工作量评定', '原因']] = ['无效', '无完成时间']
short_work = df['实际工作时长(分钟)'] < 5
df.loc[short_work, ['工作量评定', '原因']] = ['无效', '工作时长过短']
if all(col in df.columns for col in ['签到距离', '签到距离_CRAN', '维护分类']):
invalid_dist = (
~df['维护分类'].isin(['发电', '保障']) &
(df['签到距离'] > 300) &
(df['签到距离_CRAN'] > 300)
)
df.loc[invalid_dist, ['工作量评定', '原因']] = ['无效', '签到距离过大']
return df
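# Note: the three rules above run in order, so a later rule overwrites the
# 原因 set by an earlier one; e.g. an order with no completion time and a
# sign-in distance over 300 m ends up labelled '签到距离过大'.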
def _calculate_base_stats(df, group_keys):
    """Compute the shared base metrics for a grouping."""
    categories = df['维护分类'].dropna().unique()
    agg_dict = {
        # Total orders (deduplicated)
        "总工单数": ('工单编号', 'nunique'),
        # Valid orders (deduplicated)
        "有效工单数": ('工单编号', lambda x: df.loc[x.index, '工单编号'][df['工作量评定'] == "有效"].nunique()),
        # Fault orders (deduplicated): unique order numbers with 工单分类 == "故障工单"
        "故障工单数": ('工单编号', lambda x: df.loc[x.index, '工单编号'][df['工单分类'] == "故障工单"].nunique()),
        # Faults handled on time (deduplicated): fault orders with 是否及时处理 == "是"
        "故障及时数": ('工单编号', lambda x: df.loc[x.index, '工单编号'][
            (df['工单分类'] == "故障工单") &
            (df['是否及时处理'] == "是")
        ].nunique()),
        # Faults handled late (deduplicated): fault orders with 是否及时处理 == "否"
        "故障超时数": ('工单编号', lambda x: df.loc[x.index, '工单编号'][
            (df['工单分类'] == "故障工单") &
            (df['是否及时处理'] == "否")
        ].nunique()),
        # Extended-outage orders (deduplicated): 是否超长退服 == "是"
        "超长退服工单数": ('工单编号', lambda x: df.loc[x.index, '工单编号'][df['是否超长退服'] == "是"].nunique()),
        # High-frequency fault orders (deduplicated): 高频故障 == "是"
        "超频故障工单数": ('工单编号', lambda x: df.loc[x.index, '工单编号'][df['高频故障'] == "是"].nunique()),
        # Deferred orders (deduplicated): 是否延期有效 == "是"
        "延期工单数": ('工单编号', lambda x: df.loc[x.index, '工单编号'][df['是否延期有效'] == "是"].nunique()),
        # Remaining metrics are not deduplicated
        "故障处理时长": ('故障处理时长_小时', lambda x: x[df['工单分类'] == "故障工单"].sum()),
        "路途总时长": ('路途时长(分钟)', 'sum'),
        "工作总时长": ('实际工作时长(分钟)', 'sum'),
        "出勤人数": ('执行人', 'nunique'),
        "出勤人天": ('工单编号', lambda x: df.loc[x.index][['执行人', '出发日期']].drop_duplicates().shape[0]),
    }
for cat in categories:
agg_dict[f"{cat}工单数"] = (
'维护分类',
lambda x, current_cat=cat: x.eq(current_cat).sum()
)
stats = df.groupby(group_keys).agg(**agg_dict).reset_index()
return stats
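# The deduplicating lambdas above lean on pandas index alignment:
# df.loc[x.index, '工单编号'] selects just the group's rows, and the full-frame
# boolean mask is reindexed onto that subset, so each nunique() is scoped to
# the current group.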
def _add_derived_metrics(stats, time_level):
"""通用衍生指标计算"""
stats['出勤总时长'] = stats['路途总时长'] + stats['工作总时长']
hour_cols = ['出勤总时长', '路途总时长', '工作总时长']
for col in hour_cols:
stats[f'{col}(h)'] = (stats[col] / 60).round(2)
stats['平均工单工作时长(h)'] = (
stats['工作总时长(h)'] / stats['总工单数']
).replace([np.inf, -np.inf], 0).fillna(0).round(2)
stats['平均工单路途时长(h)'] = (
stats['路途总时长(h)'] / stats['总工单数']
).replace([np.inf, -np.inf], 0).fillna(0).round(2)
if time_level == 'weekly':
stats['人均能效_周'] = (stats['有效工单数'] / stats['出勤人天']).round(2)
elif time_level == 'monthly':
stats['人均能效_月'] = (stats['有效工单数'] / stats['出勤人天']).round(2)
    stats['故障处理及时率'] = (stats['故障及时数'] / stats['故障工单数']).replace([np.inf, -np.inf], 0).fillna(0)
    stats['工作时长占比'] = (stats['工作总时长'] / stats['出勤总时长']).replace([np.inf, -np.inf], 0).fillna(0)
    stats['人均出勤天数'] = (stats['出勤人天'] / stats['出勤人数']).replace([np.inf, -np.inf], 0).fillna(0).round(2)
    stats['日均出勤人数'] = (stats['出勤人天'] / stats['出勤天数']).replace([np.inf, -np.inf], 0).fillna(0).round(2)
return stats
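# The ratio columns are deliberately left as 0-1 fractions: save_to_excel later
# applies a '0.00%' number format to any column whose name ends in 率 or 占比,
# and xlsxwriter multiplies fractions by 100 for display.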
def calculate_attendance_metrics(df, denominator):
"""计算出勤相关指标"""
df = df.copy()
df['出勤总时长(h)'] = (df['路途时长(分钟)'] + df['实际工作时长(分钟)']) / 60
df['工作总时长(h)'] = df['实际工作时长(分钟)'] / 60
    # Group by executor and departure date to get daily figures
daily_stats = df.groupby(['执行人', '出发日期']).agg(
station_count=('STATION_NAME', 'nunique'),
total_attendance_hours=('出勤总时长(h)', 'sum'),
total_work_hours=('工作总时长(h)', 'sum')
).reset_index()
    # Daily pass/fail against condition 1
daily_stats['达标'] = (
(daily_stats['station_count'] >= 4) &
(daily_stats['total_attendance_hours'] >= 7) &
(daily_stats['total_work_hours'] >= 2)
)
    # Aggregate totals per executor
executor_stats = daily_stats.groupby('执行人').agg(
meet_days=('达标', 'sum'),
total_stations=('station_count', 'sum'),
total_attendance_hours=('total_attendance_hours', 'sum'),
total_work_hours=('total_work_hours', 'sum')
).reset_index()
    # Derived per-day averages
executor_stats['平均每天到站址'] = executor_stats['total_stations'] / denominator
executor_stats['日均出勤出工时长'] = executor_stats['total_attendance_hours'] / denominator
executor_stats['日均有效出勤工时'] = executor_stats['total_work_hours'] / denominator
    # Condition-2 pass flag on the averages
executor_stats['条件2达标'] = (
(executor_stats['平均每天到站址'] >= 4) &
(executor_stats['日均出勤出工时长'] >= 7) &
(executor_stats['日均有效出勤工时'] >= 2)
)
return executor_stats.rename(columns={'meet_days': '出勤达标天数'})
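# Condition 1 is judged day by day (>=4 distinct stations, >=7h total attendance,
# >=2h work); condition 2 applies the same thresholds to per-day averages over
# the caller-supplied denominator (MIN(day of TIME_RANGE["end"], 22) for months).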
def generate_stats(df, dimension, time_level):
"""通用统计生成函数"""
time_key = {
'daily': '出发日期',
'weekly': '自然周',
'monthly': '自然月'
}[time_level]
group_keys = dimension['keys'] + [time_key]
# 获取维度的主键(用于合并的键名)
dimension_key = dimension['keys'][0] # 取维度的第一个键作为合并主键
stats = _calculate_base_stats(df, group_keys)
    stats['出勤天数'] = df.groupby(group_keys)['出发日期'].nunique().values
    # Monthly denominator: MIN(day-of-month of TIME_RANGE["end"], 22)
    denominator = None
    if time_level == 'daily':
        stats['出发日期'] = pd.to_datetime(stats['出发日期']).dt.strftime('%Y-%m-%d')
    if time_level == 'monthly':
        end_date = datetime.strptime(TIME_RANGE["end"], '%Y-%m-%d %H:%M:%S')
        denominator = min(end_date.day, 22)
stats = _add_derived_metrics(stats, time_level)
    # For the executor dimension at monthly granularity, merge in attendance metrics
    if dimension['name'] == '执行人' and time_level == 'monthly' and denominator is not None:
        attendance_metrics = calculate_attendance_metrics(df, denominator)
        # Merge on the dimension's actual key (e.g. '执行人'), not a fixed column name
        stats = pd.merge(
            stats,
            attendance_metrics[[
                '执行人', '出勤达标天数', '平均每天到站址',
                '日均出勤出工时长', '日均有效出勤工时'
            ]],
            on=dimension_key,
            how='left'
        )
return stats
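# Usage sketch: generate_stats(processed_data, DIMENSIONS[0], 'monthly') returns
# one row per (执行人, 自然月) with the base counts, the derived metrics and, for
# this executor/monthly case only, the merged attendance metrics.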
# --------------------------
# Maintainer statistics (enhanced)
# --------------------------
def enhance_maintainer_stats(filtered_data, member_list_df_filter):
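    """Build per-dimension maintainer statistics frames and append a totals row to each."""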
time_range_data = filtered_data.copy()
    # Build one frame per dimension with the expected headcount
GROUP_KEYS = ['所属组织', '代维简称', '地市', '地市代维']
dim_dfs = []
for key in GROUP_KEYS:
df = member_list_df_filter.groupby(key).agg(
应出勤人数=('登陆账号', 'count')
).reset_index()
dim_dfs.append(df)
if not time_range_data.empty:
        # ===================== Original statistics logic =====================
        # Executor-level statistics
executor_stats = time_range_data.groupby(['执行人', '所属组织', '代维简称', '地市', '地市代维']).agg(
总工单数=('工单编号', 'nunique'),
有效工单数=('工作量评定', lambda s: s.eq('有效').sum()),
总路途时长=('路途时长(分钟)', 'sum'),
总工作时长=('实际工作时长(分钟)', 'sum'),
总出勤天数=('出发日期', 'nunique'),
故障工单数=('工单分类', lambda s: s.eq('故障工单').sum()),
超长退服工单数=('是否超长退服', lambda s: s.eq('是').sum()),
高频故障工单数=('高频故障', lambda s: s.eq('是').sum()),
延期工单数=('是否延期有效', lambda s: s.eq('是').sum()),
故障及时数=('是否及时处理', lambda s: s.eq('是').sum())
).reset_index()
        # Executor-level derived metrics
metrics = [
('平均路途时长(小时)', (executor_stats['总路途时长'] / 60) / executor_stats['总工单数'].replace(0, np.nan)),
('工作时长占比',
executor_stats['总工作时长'] / (executor_stats['总路途时长'] + executor_stats['总工作时长']).replace(0,
np.nan)),
('人均每日工单数', executor_stats['总工单数'] / executor_stats['总出勤天数'].replace(0, np.nan)),
('人均每日有效工单数', executor_stats['有效工单数'] / executor_stats['总出勤天数'].replace(0, np.nan)),
('人均每日出勤时长(小时)',
(executor_stats['总路途时长'] + executor_stats['总工作时长']) / 60 / executor_stats['总出勤天数'].replace(
0, np.nan)),
('人均每日工作时长(小时)',
executor_stats['总工作时长'] / 60 / executor_stats['总出勤天数'].replace(0, np.nan)),
('每工单路途时长(小时)', executor_stats['总路途时长'] / 60 / executor_stats['总工单数'].replace(0, np.nan)),
('每工单工作时长(小时)', executor_stats['总工作时长'] / 60 / executor_stats['总工单数'].replace(0, np.nan)),
('超长退服工单占比', executor_stats['超长退服工单数'] / executor_stats['故障工单数'].replace(0, np.nan)),
('高频故障工单占比', executor_stats['高频故障工单数'] / executor_stats['故障工单数'].replace(0, np.nan)),
('延期工单占比', executor_stats['延期工单数'] / executor_stats['故障工单数'].replace(0, np.nan)),
]
for col, formula in metrics:
executor_stats[col] = formula
executor_stats = executor_stats.fillna(0).round(4)
        # Dimension-level aggregation
def calculate_dimension_stats(grouped_df, current_key):
return grouped_df.groupby(current_key).agg(
总工单数=('总工单数', 'sum'),
有效工单数=('有效工单数', 'sum'),
工单平均路途超2小时人数=('平均路途时长(小时)', lambda x: (x > 2).sum()),
平均工作时长占比低人数=('工作时长占比', lambda x: (x < 0.1).sum()),
人均每日工单数=('人均每日工单数', 'mean'),
人均每日有效工单数=('人均每日有效工单数', 'mean'),
人均每日出勤时长=('人均每日出勤时长(小时)', 'mean'),
人均每日工作时长=('人均每日工作时长(小时)', 'mean'),
每工单路途时长=('每工单路途时长(小时)', 'mean'),
每工单工作时长=('每工单工作时长(小时)', 'mean'),
超长退服工单占比=('超长退服工单占比', 'mean'),
高频故障工单占比=('高频故障工单占比', 'mean'),
延期工单占比=('延期工单占比', 'mean'),
).reset_index().round(4)
        # Merge dimension statistics into the headcount frames
updated_dims = []
for i, key in enumerate(GROUP_KEYS):
dim_stats = calculate_dimension_stats(executor_stats, key)
dim_stats['有效工单占比'] = (dim_stats['有效工单数'] / dim_stats['总工单数']).replace(np.inf, 0).fillna(
0).round(4)
merged = pd.merge(
dim_dfs[i],
dim_stats,
on=key,
how='left'
).fillna(0)
updated_dims.append(merged)
dim_dfs = updated_dims
        # ===================== Additional requirements =====================
        # Extract problem orders
sign_in_issue = time_range_data[time_range_data['原因'] == '签到距离过大']
short_duration_issue = time_range_data[time_range_data['原因'] == '工作时长过短']
top50_travel_orders = time_range_data.nlargest(50, '路途时长(分钟)')['工单编号'].unique()
top50_travel_data = time_range_data[time_range_data['工单编号'].isin(top50_travel_orders)]
        # Per-dimension problem-order counts
for i, key in enumerate(GROUP_KEYS):
            # Merge count of orders with excessive sign-in distance
sign_in_counts = sign_in_issue.groupby(key)['工单编号'].nunique().reset_index(name='签到距离过大条目数')
dim_dfs[i] = pd.merge(dim_dfs[i], sign_in_counts, on=key, how='left')
            # Merge count of orders with too-short work duration
short_duration_counts = short_duration_issue.groupby(key)['工单编号'].nunique().reset_index(
name='工作时长过短条目数')
dim_dfs[i] = pd.merge(dim_dfs[i], short_duration_counts, on=key, how='left')
            # Merge count of orders in the travel-time TOP50
top50_counts = top50_travel_data.groupby(key)['工单编号'].nunique().reset_index(name='路途时长TOP50条目数')
dim_dfs[i] = pd.merge(dim_dfs[i], top50_counts, on=key, how='left')
dim_dfs[i] = dim_dfs[i].fillna(0)
        # ===================== Attendance statistics =====================
        # Actual attendance headcount
def merge_attendance(source_df, target_df, grp_key):
att = source_df.groupby(grp_key)['执行人'].nunique().reset_index(name='实际出勤人数')
return pd.merge(target_df, att, on=grp_key, how='left').fillna(0)
for i, key in enumerate(GROUP_KEYS):
dim_dfs[i] = merge_attendance(time_range_data, dim_dfs[i], key)
dim_dfs[i]['出勤率'] = (dim_dfs[i]['实际出勤人数'] / dim_dfs[i]['应出勤人数']).replace(np.inf, 0).fillna(
0).round(4)
        # Attendance-day statistics
for i, key in enumerate(GROUP_KEYS):
att_cols = ['执行人', key, '出发日期']
att_days = time_range_data[att_cols].drop_duplicates(subset=['执行人', '出发日期'])
            # Attendance days per executor
att_days_per_executor = att_days.groupby([key, '执行人']).size().reset_index(name='出勤天数')
            # Total person-days (sum of every executor's attendance days)
total_attendance_person_days = att_days_per_executor.groupby(key)['出勤天数'].sum().reset_index(
name='总出勤人天')
dim_dfs[i] = pd.merge(dim_dfs[i], total_attendance_person_days, on=key, how='left').fillna(0)
            # Attendance days (count of distinct departure dates)
attendance_days = att_days.groupby(key)['出发日期'].nunique().reset_index(name='出勤天数_实际')
dim_dfs[i] = pd.merge(dim_dfs[i], attendance_days, on=key, how='left').fillna(0)
            # Average attendance days per person and average daily headcount
dim_dfs[i]['人均出勤天数'] = (dim_dfs[i]['总出勤人天'] / dim_dfs[i]['实际出勤人数']).replace(np.inf,
0).fillna(
0).round(2)
dim_dfs[i]['日均出勤人数'] = (dim_dfs[i]['总出勤人天'] / dim_dfs[i]['出勤天数_实际']).replace(np.inf,
0).fillna(
0).round(2)
            # Headcount with fewer than 3 attendance days
under3 = att_days_per_executor[att_days_per_executor['出勤天数'] < 3].groupby(key)[
'执行人'].count().reset_index(name='出勤不足3天人数')
dim_dfs[i] = pd.merge(dim_dfs[i], under3, on=key, how='left').fillna(0)
dim_dfs[i]['出勤不足3天人员占比'] = (dim_dfs[i]['出勤不足3天人数'] / dim_dfs[i]['实际出勤人数']).replace(
[np.inf, -np.inf], 0).fillna(0).round(4)
        # ===================== Fault-handling timeliness =====================
for i, key in enumerate(GROUP_KEYS):
fault_data = time_range_data[time_range_data['工单分类'] == '故障工单']
            # Ratio columns to compute
ratio_columns = ['故障处理及时率', '超长退服工单占比', '高频故障工单占比', '延期工单占比']
if not fault_data.empty:
agg_dict = {
'是否及时处理': lambda x: x.eq('是').any(),
'是否超长退服': lambda x: x.eq('是').any(),
'高频故障': lambda x: x.eq('是').any(),
'是否延期有效': lambda x: x.eq('是').any(),
'所属组织': 'first',
'代维简称': 'first',
'地市': 'first',
'地市代维': 'first'
}
if key == '地市代维':
agg_dict['地市'] = 'first'
agg_dict['代维简称'] = 'first'
unique_faults = fault_data.groupby('工单编号').agg(agg_dict).reset_index()
                # Group by the dimension key
fault_stats_current = unique_faults.groupby(key).agg(
总故障工单数=('工单编号', 'nunique'),
总故障及时数=('是否及时处理', 'sum'),
超长退服工单数=('是否超长退服', 'sum'),
高频故障工单数=('高频故障', 'sum'),
延期工单数=('是否延期有效', 'sum')
).reset_index()
                # Compute each ratio
fault_stats_current['故障处理及时率'] = (
fault_stats_current['总故障及时数'] /
fault_stats_current['总故障工单数']
).replace([np.inf, -np.inf], 0).fillna(0)
fault_stats_current['超长退服工单占比'] = (
fault_stats_current['超长退服工单数'] /
fault_stats_current['总故障工单数']
).replace([np.inf, -np.inf], 0).fillna(0)
fault_stats_current['高频故障工单占比'] = (
fault_stats_current['高频故障工单数'] /
fault_stats_current['总故障工单数']
).replace([np.inf, -np.inf], 0).fillna(0)
fault_stats_current['延期工单占比'] = (
fault_stats_current['延期工单数'] /
fault_stats_current['总故障工单数']
).replace([np.inf, -np.inf], 0).fillna(0)
else:
                # Create an empty frame and initialize every ratio column to 0
fault_stats_current = pd.DataFrame(columns=[key] + ratio_columns)
for col in ratio_columns:
fault_stats_current[col] = 0.0
            # Log aggregate counts, guarding the empty branch where the
            # count columns do not exist (DataFrame.get would return a plain
            # 0 there, and 0.sum() raises AttributeError)
            logger.info(f"Dimension [{key}] fault statistics:")
            for count_col in ['总故障工单数', '超长退服工单数', '高频故障工单数', '延期工单数']:
                count_sum = fault_stats_current[count_col].sum() if count_col in fault_stats_current.columns else 0
                logger.info(f"  {count_col}: {count_sum}")
            # Map the ratio columns onto the dimension frame
for col in ratio_columns:
mapping = dict(zip(fault_stats_current[key], fault_stats_current[col]))
dim_dfs[i][col] = dim_dfs[i][key].map(mapping).fillna(0)
            # Log the merged ratios
            logger.info(f"Dimension [{key}] merged ratios:")
            for col in ratio_columns:
                mean_value = dim_dfs[i][col].mean()
                logger.info(f"  {col}: {mean_value}")
        # ===================== Column ordering =====================
base_columns = [
'应出勤人数', '实际出勤人数', '出勤率', '出勤不足3天人数', '出勤不足3天人员占比', '总工单数', '有效工单数',
'有效工单占比', '签到距离过大条目数', '工作时长过短条目数', '路途时长TOP50条目数',
'人均出勤天数', '日均出勤人数',
'工单平均路途超2小时人数', '平均工作时长占比低人数',
'人均每日工单数', '人均每日有效工单数', '人均每日出勤时长', '人均每日工作时长',
'每工单路途时长', '每工单工作时长',
'超长退服工单占比', '高频故障工单占比', '延期工单占比', '故障处理及时率'
]
        # Keep only columns that actually exist in each frame
for i, key in enumerate(GROUP_KEYS):
            # Retain the columns present in this frame, in the canonical order
available_columns = [col for col in base_columns if col in dim_dfs[i].columns]
ordered_columns = [key] + available_columns
dim_dfs[i] = dim_dfs[i][ordered_columns]
        # ===================== Totals row =====================
for i in range(len(dim_dfs)):
df = dim_dfs[i]
group_key = GROUP_KEYS[i]
            # Build the totals-row dict
total_row = {group_key: '总计'}
            # Columns aggregated by sum
sum_cols = [
'应出勤人数', '实际出勤人数', '出勤不足3天人数', '总工单数', '有效工单数',
'签到距离过大条目数', '工作时长过短条目数', '路途时长TOP50条目数',
'工单平均路途超2小时人数', '平均工作时长占比低人数'
]
            # Include attendance columns when present
has_total_attendance = '总出勤人天' in df.columns
has_attendance_days = '出勤天数_实际' in df.columns
if has_total_attendance:
sum_cols.append('总出勤人天')
if has_attendance_days:
sum_cols.append('出勤天数_实际')
            # Sum into the totals row
for col in sum_cols:
if col in df.columns:
total_row[col] = df[df[group_key] != '总计'][col].sum()
            # Columns aggregated by (weighted) mean
mean_cols = [
'人均每日工单数', '人均每日有效工单数', '人均每日出勤时长', '人均每日工作时长', '人均出勤天数',
'日均出勤人数', '每工单路途时长', '每工单工作时长', '超长退服工单占比', '高频故障工单占比',
'延期工单占比', '故障处理及时率'
]
for col in mean_cols:
if col in df.columns:
                    # Weight the totals-row mean by actual attendance headcount
if '实际出勤人数' in df.columns:
total_row[col] = (df[df[group_key] != '总计'][col] * df[df[group_key] != '总计'][
'实际出勤人数']).sum() / df[df[group_key] != '总计']['实际出勤人数'].sum()
else:
total_row[col] = df[df[group_key] != '总计'][col].mean()
            # Recompute percentage/ratio fields for the totals row
            # Attendance rate
total_actual = total_row.get('实际出勤人数', 0)
total_expected = total_row.get('应出勤人数', 0)
total_row['出勤率'] = total_actual / total_expected if total_expected > 0 else 0
            # Share of staff attending fewer than 3 days
total_under3 = total_row.get('出勤不足3天人数', 0)
total_row['出勤不足3天人员占比'] = total_under3 / total_actual if total_actual > 0 else 0
            # Valid-order ratio
total_valid = total_row.get('有效工单数', 0)
total_orders = total_row.get('总工单数', 0)
total_row['有效工单占比'] = total_valid / total_orders if total_orders > 0 else 0
            # Append the totals row to the frame
total_df = pd.DataFrame([total_row])
dim_dfs[i] = pd.concat([df, total_df], ignore_index=True)
return tuple(dim_dfs)
# --------------------------
# Output
# --------------------------
def save_to_excel(dataframes, path):
"""增强版Excel输出(带百分比格式)"""
with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
workbook = writer.book
        # Formats (with vertical centering)
        header_format = workbook.add_format({
            'bold': True,
            'text_wrap': True,      # wrap long headers
            'border': 1,
            'bg_color': '#D9EAD3',  # light green background
            'align': 'center',      # horizontal centering
            'valign': 'vcenter'     # vertical centering
        })
        cell_format = workbook.add_format({
            'border': 1,
            'align': 'center',      # center content horizontally
            'valign': 'vcenter'     # center content vertically
        })
        percent_format = workbook.add_format({
            'num_format': '0.00%',  # render fractions as percentages
            'border': 1,
            'align': 'center',
            'valign': 'vcenter'
        })
for name, df in dataframes.items():
            # Write the data; headers are written manually below with formatting
df.to_excel(
writer,
sheet_name=name,
index=False,
                header=False,  # suppress the automatic header row
                startrow=1     # data starts on the second row
)
worksheet = writer.sheets[name]
            # Detect percentage columns by name suffix
percent_columns = [
col for col in df.columns
if col.endswith(('率', '占比'))
]
            # Apply formats to the header row and data columns
for col_num, col_name in enumerate(df.columns):
                col_width = max(12, len(col_name) * 1.5)  # dynamic column width
                # Write the header cell
worksheet.write(0, col_num, col_name, header_format)
                # Pick the column format
if col_name in percent_columns:
fmt = percent_format
else:
fmt = cell_format
                # Apply the format to the whole column
worksheet.set_column(
first_col=col_num,
last_col=col_num,
width=col_width,
cell_format=fmt
)
            # Freeze the header row
worksheet.freeze_panes(1, 0)
            # Autofilter (only for sheets under 10,000 rows)
if len(df) < 10000:
worksheet.autofilter(0, 0, len(df), len(df.columns) - 1)
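# In xlsxwriter, a cell-level format (the headers written via worksheet.write)
# takes precedence over the column format from set_column, so header cells keep
# header_format while data rows inherit the cell/percent column format.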
# --------------------------
# Main flow
# --------------------------
def main():
engine = init_db_engine()
try:
with time_monitor("数据加载"):
site_orders = load_site_orders(engine)
order_numbers = site_orders['工单编号'].unique().tolist()
order_dfs = batch_load_order_data(engine, order_numbers)
with time_monitor("数据合并与处理"):
merged_data = merge_order_data(site_orders, order_dfs)
if merged_data.empty:
logger.warning("无有效数据")
return
processed_data = (
merged_data
.pipe(process_coordinates)
.pipe(calculate_distances)
.pipe(merge_cran_data, cran_path=FILE_PATHS["cran_info"])
.pipe(generate_time_features)
.pipe(calculate_workload)
)
output_dfs = {'原始数据': processed_data}
with time_monitor("统计计算"):
for dim in DIMENSIONS:
for time_level in ['daily', 'weekly', 'monthly']:
stats_df = generate_stats(
processed_data,
dimension=dim,
time_level=time_level
)
level_name = {
'daily': '日统计',
'weekly': '周统计',
'monthly': '月统计'
}[time_level]
sheet_name = f"{level_name}_{dim['name']}"
output_dfs[sheet_name] = stats_df
with time_monitor("处理代维组织数据"):
member_list_df = pd.read_excel(member_list_path, usecols=member_list_columns, engine='calamine')
pattern = r'(?=.*无线2023)(?=.*维护组)(?!.*OLT)'
member_list_df = member_list_df[member_list_df['所属组织'].str.contains(pattern, regex=True, na=False)]
member_list_df['代维简称'] = member_list_df['所属组织'].str.split("-").str[0]
member_list_df['代维简称'] = member_list_df['代维简称'].replace('中移铁通', '铁通')
member_list_df = member_list_df.rename(columns={'所属地市': '地市'})
member_list_df['地市代维'] = member_list_df['地市'].str.split("-").str[0] + '-' + member_list_df['代维简称']
member_list_df = member_list_df.drop_duplicates(subset=['登陆账号'])
member_list_df['时间戳'] = source_time
member_list_df_filter = member_list_df[member_list_df['在职状态'] == '在职']
filtered_data = processed_data[processed_data['所属组织'].isin(member_list_df_filter['所属组织'])]
output_dfs['原始数据_副本'] = filtered_data.copy()
stats_group, stats_company, stats_city, stats_area = enhance_maintainer_stats(
filtered_data, member_list_df_filter
)
output_dfs.update({
'代维组维度': stats_group,
'代维公司维度': stats_company,
'地市维度': stats_city,
'地市代维维度': stats_area
})
with time_monitor("生成Top50统计"):
# 从原始数据_副本中提取Top50
top50_travel = filtered_data.nlargest(50, '路途时长(分钟)')
            # Keep only the needed columns
top50_travel = top50_travel[['工单编号', '执行人', '所属组织', '地市', '代维简称', '路途时长(分钟)']]
output_dfs['路途时长Top50'] = top50_travel
        # Prepend a timestamp column to the selected result sheets
target_sheets = ['代维组维度', '代维公司维度', '地市维度', '地市代维维度', '路途时长Top50']
for sheet in target_sheets:
df = output_dfs[sheet]
df.insert(0, '时间戳', source_time)
with time_monitor("保存结果"):
save_to_excel(output_dfs, FILE_PATHS["output"])
logger.info(f"结果已保存至:{FILE_PATHS['output']}")
    except Exception as e:
        logger.error(f"Processing failed: {str(e)}", exc_info=True)
        raise
    finally:
        engine.dispose()
if __name__ == "__main__":
main()