import os
import re
import json
import sys
import logging
import traceback
import pandas as pd
from sqlalchemy import create_engine, inspect
from dotenv import load_dotenv
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
# Load environment variables
load_dotenv()
# Output file configuration
OUTPUT_DIR = r"D:\AEB Management\output"
OUTPUT_FILE = "processed_data.csv"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# ------------------ Configuration loading ------------------
def load_config(config_path):
    """Load a JSON configuration file and return it as a dict."""
    try:
        with open(config_path, "r", encoding="utf-8") as file:
            config = json.load(file)
        logging.info(f"Loaded configuration file: {config_path}")
        return config
    except Exception as e:
        logging.error(f"Failed to load configuration file: {str(e)}")
        logging.error(traceback.format_exc())
        return {}  # return an empty dict so downstream code does not break
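# Illustrative sketch of the expected responsible_config.json layout, inferred from how
# assign_responsibles() and process_and_write_data() consume it below; key names other than
# "Default" and "bsw_staff_list" are project keys, and all values shown are placeholders.
# {
#     "bsw_staff_list": ["staff.one", "staff.two"],
#     "P09135": {"Responsible": "Some Owner", "Email": "owner@example.com"},
#     "Default": {"Responsible": "Unassigned", "Email": ""}
# }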
# ------------------ Data column processing ------------------
def add_columns(dataframe, bsw_staff_set, progress_callback=None):
"""根据业务规则动态添加新列"""
current_time = pd.Timestamp.now()
total_rows = len(dataframe)
    # Add base columns
dataframe['HorizontalExpansion PN'] = None
    # 1. KOCHI_PL project handling
kochi_condition_include = (
(dataframe['Project key'] == 'KOCHI_PL') &
(dataframe['Labels'].str.contains('CheckItOnOtherProject', na=False))
)
kochi_condition_exclude = (
(dataframe['Project key'] == 'KOCHI_PL') &
(~dataframe['Labels'].str.contains('CheckItOnOtherProject', na=False))
)
if kochi_condition_include.any():
extracted_projects = dataframe.loc[kochi_condition_include, 'Summary'].str.extract(r'(P\d{5})', expand=False)
dataframe.loc[kochi_condition_include, 'HorizontalExpansion PN'] = extracted_projects
valid_extraction = extracted_projects.notna()
dataframe.loc[kochi_condition_include & valid_extraction, 'Project key'] = extracted_projects[valid_extraction]
if kochi_condition_exclude.any():
dataframe.loc[kochi_condition_exclude, 'Project key'] = 'noneed'
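    # Example (illustrative): a KOCHI_PL row labeled 'CheckItOnOtherProject' whose Summary
    # contains "P09135" gets both 'HorizontalExpansion PN' and 'Project key' set to "P09135";
    # KOCHI_PL rows without that label get 'Project key' = 'noneed'.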
    # ================== Additional analysis columns ==================
# 1. ProjectNum
dataframe['ProjectNum'] = dataframe['Project key']
# 2. BSW Self Test
def is_bsw_self_test(row):
if row['Involvement of BSW'] != 'Yes':
return 0
found_by = row.get('Custom field (Found By)', '')
labels = row.get('Labels', '')
cond1 = found_by in ['Development Audit', 'Review', 'SW Unit Test', 'Development']
cond2 = (found_by == 'SW Integration Test') and ('BSW_IT' in labels)
return 1 if cond1 or cond2 else 0
dataframe['BSW Self Test'] = dataframe.apply(is_bsw_self_test, axis=1)
# 3. BSW Issue
def is_bsw_issue(row):
if not (row['Issue Type'] in ["KOCHI AE Issue", "Problem"] and
row['Involvement of BSW'] == 'Yes'):
return 0
custom_field = str(row.get('Custom field (Custom_2)', ''))
keywords = ['平台问题', '开发设计问题', '三方包问题',
'Development Design Issue', 'Platform Issue', 'Party Package Issue']
return 1 if any(keyword in custom_field for keyword in keywords) else 0
dataframe['BSW Issue'] = dataframe.apply(is_bsw_issue, axis=1)
# 4. Stack BSW Analyzed
def is_stack_analyzed(row):
if not (row['Issue Type'] in ["KOCHI AE Issue", "Problem"] and
row['Involvement of BSW'] == 'Yes'):
return 0
components = str(row.get('Component/s', ''))
custom_field = str(row.get('Custom field (Custom_2)', ''))
cond1 = 'Stack' in components
cond2 = '非BSW问题' in custom_field or 'Non-BSW' in custom_field
return 1 if cond1 or cond2 else 0
dataframe['Stack BSW Analyzed'] = dataframe.apply(is_stack_analyzed, axis=1)
# 5. Stack BSW Unanalyzed
def is_stack_unanalyzed(row):
if not (row['Issue Type'] in ["KOCHI AE Issue", "Problem"] and
row['Involvement of BSW'] == 'Yes'):
return 0
components = str(row.get('Component/s', ''))
custom_field = str(row.get('Custom field (Custom_2)', ''))
cond1 = 'Stack' in components
cond2 = '非BSW问题' in custom_field or 'Non-BSW' in custom_field
return 1 if not (cond1 or cond2) else 0
dataframe['Stack BSW Unanalyzed'] = dataframe.apply(is_stack_unanalyzed, axis=1)
# 6. Stack Total
dataframe['Stack Total'] = dataframe.apply(
lambda row: 1 if (row['Issue Type'] in ["KOCHI AE Issue", "Problem"] and
'Stack' in str(row.get('Component/s', ''))) else 0,
axis=1
)
# 7. BSW Involve
dataframe['BSW Involve'] = dataframe.apply(
lambda row: 1 if (row['Issue Type'] in ["KOCHI AE Issue", "Problem"] and
row['Involvement of BSW'] == 'Yes') else 0,
axis=1
)
# 8. BSW Involve Unclosed
dataframe['BSW Involve Unclosed'] = dataframe.apply(
lambda row: 1 if (row['Issue Type'] in ["KOCHI AE Issue", "Problem"] and
row['Involvement of BSW'] == 'Yes' and
row.get('Status', '') != 'Closed') else 0,
axis=1
)
    # ------------------ HorizontalExpansion analysis columns ------------------
    # Convert dates
if 'Created' in dataframe.columns:
dataframe['Created'] = pd.to_datetime(dataframe['Created'], errors='coerce')
days_since_creation = (current_time - dataframe['Created']).dt.days
else:
        days_since_creation = pd.Series([0] * len(dataframe), index=dataframe.index)
# 1. HorizontalExpansion Count
dataframe['HorizontalExpansion Count'] = dataframe['HorizontalExpansion PN'].notna().astype(int)
# 2. HorizontalExpansion Unfinished
dataframe['HorizontalExpansion Unfinished'] = (
dataframe['HorizontalExpansion PN'].notna() &
(dataframe['Status'] != 'Done')
).astype(int)
# 3. HorizontalExpansion Delay
dataframe['HorizontalExpansion Delay'] = (
dataframe['HorizontalExpansion PN'].notna() &
(dataframe['Status'] != 'Done') &
(days_since_creation > 30)
).astype(int)
    # ------------------ Original business-logic columns ------------------
# BSW Analysis Conclusions
def bsw_analysis_conclusions(row):
if row['Involvement of BSW'] == 'No':
return 'Non-BSW'
elif row['Involvement of BSW'] == 'Yes':
if pd.notna(row['Custom field (Custom_2)']):
custom_field = row['Custom field (Custom_2)']
if '非BSW问题' in custom_field or 'Non-BSW' in custom_field:
return 'Analyzed-Non-BSW'
elif '开发设计问题' in custom_field or 'Development Design Issue' in custom_field:
return 'Analyzed-BSW'
elif '需求问题' in custom_field or 'Requirement Issue' in custom_field:
return 'Analyzed-Non-BSW'
elif '平台问题' in custom_field or 'Platform Issue' in custom_field:
return 'Analyzed-BSW'
elif '三方包问题' in custom_field or 'Party Package Issue' in custom_field:
return 'Analyzed-三方包问题'
return 'Analyzing-Unclassified'
dataframe['BSW Analysis Conclusions'] = dataframe.apply(bsw_analysis_conclusions, axis=1)
# Stack Classification
def stack_classification(component):
if pd.isna(component):
return 'Unclassed'
stacks = ['Com_Stack', 'Cdd_Stack', 'CybSec_Stack', 'Diag_Stack',
'Fbl_Stack', 'FuSa_Stack', 'IoHwAb_Stack', 'Mem_Stack',
'ModeMgm_Stack', 'OS_Stack', 'Sys_Stack']
for stack in stacks:
if stack in component:
return stack
return 'Unclassed'
dataframe['Stack Classification'] = dataframe['Component/s'].apply(stack_classification)
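    # Example (illustrative): a Component/s value such as "Diag_Stack, OtherComponent" is
    # classified as "Diag_Stack"; values matching none of the listed stacks map to "Unclassed".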
# BSW Reason Classification
def calculate_bsw_reason_classification(row):
if row['Involvement of BSW'] == 'No':
return '底层未参与分析'
if pd.notna(row['Custom field (Custom_2)']):
custom_field = row['Custom field (Custom_2)']
if '非BSW问题' in custom_field or 'Non-BSW' in custom_field:
return '非BSW问题'
elif '开发设计问题' in custom_field or 'Development Design Issue' in custom_field:
return '开发设计问题'
elif '需求问题' in custom_field or 'Requirement Issue' in custom_field:
return '需求问题'
elif '平台问题' in custom_field or 'Platform Issue' in custom_field:
return '平台问题'
elif '三方包问题' in custom_field or 'Party Package Issue' in custom_field:
return '三方包问题'
return 'Analyzing-Unclassified'
dataframe['BSW Reason Classification'] = dataframe.apply(calculate_bsw_reason_classification, axis=1)
    # total column
    dataframe['total'] = dataframe.apply(
        lambda row: 1 if (row['Project key'] != 'KOCHI_PL' and
                          row['Issue Type'] in ["KOCHI AE Issue", "Problem"]) else 0,
        axis=1
    )
    # BSW Analyzed
    def calculate_bsw_analyzed(row):
        if not (row['Issue Type'] in ["KOCHI AE Issue", "Problem"] and
                row['Involvement of BSW'] == 'Yes'):
            return 0
        custom_field = str(row.get('Custom field (Custom_2)', ''))
        # Analysis-conclusion keywords (Chinese and English variants as they appear in the data)
        keywords = ['需求问题', '平台问题', '开发设计问题', '非BSW问题', '三方包问题',
                    'Non-BSW', 'Development Design Issue', 'Requirement Issue',
                    'Platform Issue', 'Party Package Issue']
        pattern = '|'.join(map(re.escape, keywords))
        return 1 if re.search(pattern, custom_field) else 0
    dataframe['BSW Analyzed'] = dataframe.apply(calculate_bsw_analyzed, axis=1)
    # BSW Unanalyzed
    def calculate_bsw_unanalyzed(row):
        if not (row['Issue Type'] in ["KOCHI AE Issue", "Problem"] and
                row['Involvement of BSW'] == 'Yes'):
            return 0
        custom_field = str(row.get('Custom field (Custom_2)', ''))
        keywords = ['需求问题', '平台问题', '开发设计问题', '非BSW问题', '三方包问题',
                    'Non-BSW', 'Development Design Issue', 'Requirement Issue',
                    'Platform Issue', 'Party Package Issue']
        # Unanalyzed = no analysis-conclusion keyword present (empty/NaN fields count as unanalyzed)
        pattern = '|'.join(map(re.escape, keywords))
        return 0 if re.search(pattern, custom_field) else 1
    dataframe['BSW Unanalyzed'] = dataframe.apply(calculate_bsw_unanalyzed, axis=1)
# BSW Staff List
dataframe['BSW Staff List'] = dataframe['Assignee'].apply(
lambda x: "Yes" if x in bsw_staff_set else "No"
)
# Found By Classification
def classify_found_by(row):
found_by = row['Custom field (Found By)']
labels = row['Labels'] if pd.notna(row['Labels']) else ""
if found_by == 'SW Integration Test' and 'BSW_IT' in labels:
return 'BSW_IT'
elif found_by == 'SW Integration Test':
return 'SWIT'
elif found_by == 'Cybersecurity Monitoring':
return 'Cybersecurity Monitoring'
elif found_by in ['HIL TEST', 'HILP TEST', 'System Test']:
return 'System Test'
elif found_by in ['OEM Test', 'OEM', 'End user']:
return 'OEM&End user'
elif found_by == 'Development Audit' and 'BSW_PT' in labels:
return 'BSW_PT'
elif found_by in ['Review', 'SW Unit Test', 'Development']:
return '开发自测'
return 'Others'
dataframe['Found By Classification'] = dataframe.apply(classify_found_by, axis=1)
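    # Example (illustrative): Found By 'SW Integration Test' with a 'BSW_IT' label maps to
    # 'BSW_IT', plain 'SW Integration Test' maps to 'SWIT', and 'HIL TEST' maps to 'System Test'.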
# High Priority Unanalyzed
def calculate_high_priority_unanalyzed(row):
if not (row['Issue Type'] in ["KOCHI AE Issue", "Problem"] and
row['Involvement of BSW'] == 'Yes' and
row['Priority'] == 'High'):
return 0
        created = row.get('Created')
        if pd.isna(created):
            return 0
        days_diff = (current_time - created).days
if days_diff <= 15:
return 0
custom_field = str(row.get('Custom field (Custom_2)', ''))
exclude_keywords = [
'需求问题', '平台问题', '开发设计问题', '非BSW问题', '三方包问题',
'Non-BSW', 'Development Design Issue', 'Requirement Issue',
'Platform Issue', 'Party Package Issue'
]
if any(keyword in custom_field for keyword in exclude_keywords):
return 0
return 1
dataframe['High Priority Unanalyzed'] = dataframe.apply(calculate_high_priority_unanalyzed, axis=1)
# Stack_Analyzed
dataframe['Stack_Analyzed'] = dataframe.apply(
lambda row: 1 if (row['Issue Type'] in ["KOCHI AE Issue", "Problem"] and
row['Involvement of BSW'] == 'Yes' and
('Stack' in str(row['Component/s']) or
'非BSW问题' in str(row.get('Custom field (Custom_2)', '')) or
'Non-BSW' in str(row.get('Custom field (Custom_2)', ''))))
else 0,
axis=1
)
# Stack_Unanalyzed
dataframe['Stack_Unanalyzed'] = dataframe.apply(
lambda row: 1 if (row['Issue Type'] in ["KOCHI AE Issue", "Problem"] and
row['Involvement of BSW'] == 'Yes' and
not ('Stack' in str(row['Component/s']) or
'非BSW问题' in str(row.get('Custom field (Custom_2)', '')) or
'Non-BSW' in str(row.get('Custom field (Custom_2)', ''))))
else 0,
axis=1
)
    # Date-related columns
if 'Updated' in dataframe.columns:
dataframe['Updated'] = pd.to_datetime(dataframe['Updated'], errors='coerce')
days_since_update = (current_time - dataframe['Updated']).dt.days
condition_base = (
(dataframe['Issue Type'].isin(["KOCHI AE Issue", "Problem"])) &
(dataframe['Involvement of BSW'] == 'Yes') &
(dataframe['BSW Staff List'] == 'Yes')
)
dataframe['update_over_15'] = (condition_base & (days_since_update >= 15)).astype(int)
dataframe['update_over_7'] = (condition_base & (days_since_update > 7) & (days_since_update < 15)).astype(int)
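    # Example (illustrative): a row meeting condition_base that was last updated 20 days ago
    # gets update_over_15 = 1, while one last updated 10 days ago gets update_over_7 = 1 instead.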
logging.info("成功添加所有分析列")
# 处理完成后调用进度回调
if progress_callback:
progress_callback(total_rows, total_rows)
return dataframe
def assign_responsibles(dataframe, config_dict):
"""根据配置字典为 DataFrame 增加 'Responsible' 和 'Email' 列"""
if not config_dict:
logging.warning("配置字典为空,无法分配责任人")
dataframe['Responsible'] = "Unassigned"
dataframe['Email'] = ""
return dataframe
def get_responsible_and_email(project_key):
        # Handle invalid project keys
        project_key = str(project_key)
        # First check the project-specific configuration
if project_key in config_dict:
project_config = config_dict[project_key]
return (
project_config.get("Responsible", "Unassigned"),
project_config.get("Email", "")
)
        # Fall back to the default configuration
default_config = config_dict.get("Default", {})
return (
default_config.get("Responsible", "Unassigned"),
default_config.get("Email", "")
)
dataframe['Responsible'], dataframe['Email'] = zip(*dataframe['Project key'].apply(get_responsible_and_email))
return dataframe
# ------------------ Database helpers ------------------
def get_database_engine(db_name="default"):
"""创建数据库连接引擎"""
prefix = "BSW_" if db_name == "bsw_jira" else ""
host = os.getenv(f"{prefix}DB_HOST")
user = os.getenv(f"{prefix}DB_USER")
password = os.getenv(f"{prefix}DB_PASSWORD")
database = os.getenv(f"{prefix}DB_NAME", db_name)
port = os.getenv(f"{prefix}DB_PORT", 3306)
if not all([host, user, password, database]):
raise ValueError(f"缺少{db_name}数据库配置")
connection_uri = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
return create_engine(connection_uri)
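# Illustrative sketch of the .env entries read by get_database_engine(); the values below are
# placeholders, and the BSW_-prefixed variants are used when db_name == "bsw_jira".
#   DB_HOST=...        BSW_DB_HOST=...
#   DB_USER=...        BSW_DB_USER=...
#   DB_PASSWORD=...    BSW_DB_PASSWORD=...
#   DB_NAME=...        BSW_DB_NAME=...
#   DB_PORT=3306       BSW_DB_PORT=3306   (optional; defaults to 3306)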
def write_to_database(dataframe, target_engine, target_table):
"""严格按目标表列名写入数据"""
inspector = inspect(target_engine)
if inspector.has_table(target_table):
target_columns = [col['name'] for col in inspector.get_columns(target_table)]
valid_columns = [col for col in dataframe.columns if col in target_columns]
filtered_df = dataframe[valid_columns]
skipped = set(dataframe.columns) - set(valid_columns)
if skipped:
logging.warning(f"跳过目标表不存在的列: {', '.join(skipped)}")
else:
        filtered_df = dataframe
        logging.info(f"Creating new table: {target_table}")
filtered_df.to_sql(
name=target_table,
con=target_engine,
index=False,
if_exists='append',
chunksize=500
)
return filtered_df
def process_and_write_data(source_engine, target_engine, config_path):
"""查询、处理并写入数据(单次写入版本)"""
try:
source_table = os.getenv("SOURCE_TABLE_NAME")
target_table = os.getenv("TARGET_TABLE_NAME")
if not source_table or not target_table:
raise ValueError("缺少环境变量 SOURCE_TABLE_NAME 或 TARGET_TABLE_NAME")
logging.info(f"从 {source_table} 读取数据,写入 {target_table}")
# 加载配置文件为字典
config_dict = load_config(config_path)
if not config_dict:
logging.error("配置加载失败,无法分配责任人")
# 继续处理但跳过责任分配
bsw_staff_set = set()
else:
bsw_staff_set = set(config_dict.get("bsw_staff_list", []))
project_keys = ["P09135", "P09192", "P08499", "KOCHI_PL"]
        # Build the query
query = f"""
SELECT *
FROM `{source_table}`
WHERE `Project key` IN ({', '.join([f"'{key}'" for key in project_keys])})
"""
        # Read all data in a single pass
df = pd.read_sql(query, con=source_engine)
total_rows = len(df)
if total_rows == 0:
logging.info("未读取到数据,退出处理")
return 0
logging.info(f"读取到 {total_rows} 条记录,开始处理...")
# 添加列名存在性检查 - 关键修复点
required_columns = [
'Issue Type', 'Involvement of BSW', 'Custom field (Custom_2)',
'Project key', 'Labels', 'Summary', 'Status', 'Component/s',
'Priority', 'Custom field (Found By)', 'Assignee'
]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
logging.warning(f"数据源缺少必要列: {', '.join(missing_columns)}")
# 添加默认值避免后续错误
for col in missing_columns:
df[col] = None
        # Initialize progress counters
        processed_count = 0
        progress_percent = 0
        # Progress callback
def progress_callback(current, total):
nonlocal processed_count, progress_percent
processed_count = current
current_percent = int(current / total * 100)
if current_percent > progress_percent:
progress_percent = current_percent
logging.info(f"处理进度: {current}/{total} ({progress_percent}%)")
# 处理数据(添加进度回调)
processed_df = add_columns(df, bsw_staff_set, progress_callback)
processed_df = assign_responsibles(processed_df, config_dict)
logging.info("数据处理完成,开始写入数据库...")
# 单次写入数据库
filtered_df = write_to_database(processed_df, target_engine, target_table)
actual_rows = len(filtered_df)
logging.info(f"成功写入 {actual_rows} 条记录到 {target_table}")
return actual_rows
except Exception as e:
logging.error(f"数据处理失败: {str(e)}")
logging.error(traceback.format_exc())
return 0
# ------------------ Main entry point ------------------
if __name__ == "__main__":
required_vars = ["SOURCE_TABLE_NAME", "TARGET_TABLE_NAME"]
missing_vars = [var for var in required_vars if not os.getenv(var)]
if missing_vars:
logging.error(f"缺少环境变量: {', '.join(missing_vars)}")
exit(1)
os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Create both database connections
try:
source_engine = get_database_engine("default")
target_engine = get_database_engine("bsw_jira")
except Exception as e:
logging.error(f"数据库连接失败: {str(e)}")
exit(1)
    # Process and write the data
try:
row_count = process_and_write_data(
source_engine=source_engine,
target_engine=target_engine,
config_path="responsible_config.json"
)
logging.info(f"数据处理完成,共写入 {row_count} 条记录")
exit(0)
except Exception as e:
logging.error(f"主流程执行失败: {str(e)}")
exit(1)