Python.SQLAlchemy: Common ORM Statements

Through a series of SQLAlchemy ORM examples, this article shows how to insert, query, update, and delete data, and how to filter, sort, use aggregate functions, and handle transactions.
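The snippets assume a mapped User model and an open session. Below is a minimal setup sketch; the table name, column definitions, and the in-memory SQLite URL are illustrative assumptions, not part of the original.

from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import declarative_base  # on SQLAlchemy < 1.4: from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class User(Base):
    __tablename__ = 'users'  # assumed table name
    id = Column(Integer, primary_key=True)
    name = Column(String(50))  # nullable, so the NULL examples below work

engine = create_engine('sqlite:///:memory:')  # any supported database URL works
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()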

Inserting data

user = User(name='a')
session.add(user)
user = User(name='b')
session.add(user)
user = User(name='a')
session.add(user)
user = User()
session.add(user)
session.commit()
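After flush() or commit(), SQLAlchemy populates the autoincrement primary key on the instance, so the new id can be read back directly (a small illustrative check):

print(user.id)  # the primary key assigned by the database, available after flush()/commit()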

Operations on the User table

from sqlalchemy import func, or_, not_, text, literal_column
import sqlalchemy as SA

print(session.query(User))            # prints the generated SQL statement
print(session.query(User).statement)  # same as above
for user in session.query(User):      # the query executes as you iterate
    print(user.name)
print(session.query(User).all())          # returns a list-like collection
print(session.query(User).first().name)   # first() returns None when no row matches
print(session.query(User).one().name)     # one() raises if zero or multiple rows match
print(session.query(User).filter(User.id == 2).first())
print(session.query(User).get(2))         # fetch by primary key; equivalent to the line above
print(session.query(User).filter(text('id = 2')).first().name)  # raw SQL strings need text() in modern SQLAlchemy

print(session.query(User.name).all())       # each row is a tuple when selecting columns
print(session.query(User).limit(1).all())   # returns at most 1 record
print(session.query(User).offset(1).all())  # starts from the 2nd record
print(session.query(User).order_by(User.name).all())
print(session.query(User).order_by(text('name')).all())
print(session.query(User).order_by(User.name.desc()).all())
print(session.query(User).order_by(text('name desc')).all())
print(session.query(User).order_by(User.name.desc(), User.id).all())  # several ways to sort
print(session.query(User).filter(User.id == 1).scalar())  # returns the first element of the first row, or None
print(session.query(literal_column('id')).select_from(User).filter(text('id = 1')).scalar())  # string column names via literal_column()

print(session.query(User).filter(User.id > 1, User.name != 'a').scalar())  # AND via multiple criteria
print(session.query(User).filter(
    SA.and_(User.id > 1, User.name != 'a')).scalar())  # AND
print(session.query(User).filter(or_(User.id == 1, User.id == 2)).all())  # OR
print(session.query(User).filter(User.id.in_((1, 2))).all())  # IN

print(session.query(User).filter(not_(User.name == None)).all())  # NOT
print(session.query(User).filter(User.name == None).scalar())     # IS NULL; User.name.is_(None) is the lint-friendly form
print(session.query(User).filter(text('name is null')).scalar())
print(session.query(User).filter(User.name != None).all())        # IS NOT NULL

print(session.query(func.count('*')).select_from(User).scalar())
print(session.query(func.count('1')).select_from(User).scalar())
print(session.query(func.count(User.id)).scalar())
print(session.query(func.count('*')).filter(User.id > 0).scalar())  # filter() references User, so select_from() is not needed
print(session.query(func.count('*')).filter(User.name == 'a').limit(1).scalar() == 1)  # the scalar count can be compared directly; limit() caps result rows, not the rows counted
print(session.query(func.sum(User.id)).scalar())
print(session.query(func.now()).scalar())  # func accepts any function name, as long as the database supports it
print(session.query(func.current_timestamp()).scalar())
print(session.query(func.md5(User.name)).filter(User.id == 1).scalar())  # md5() requires a database that provides it
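Aggregates pair naturally with group_by(); the following sketch (not in the original) counts rows per name:

for name, cnt in session.query(User.name, func.count(User.id)).group_by(User.name).all():
    print(name, cnt)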

session.query(User).filter(User.id == 1).update({User.name: 'c'})
user.name = 'd'   # modifies the last User added above
session.flush()   # writes to the database, but does not commit

print(session.query(User).get(1).name)
session.delete(user)
session.flush()
print(session.query(User).get(1))
session.rollback()  # roll back everything since the last commit

print(session.query(User).get(1).name)
session.query(User).filter(User.id == 1).delete()

session.commit()  # commit
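Building on the flush/rollback/commit behavior above, a common unit-of-work pattern (a sketch, not prescribed by the original) wraps the whole operation so any error rolls the transaction back:

try:
    session.add(User(name='e'))
    session.flush()     # emit SQL early so errors surface before commit
    session.commit()    # make the changes permanent
except Exception:
    session.rollback()  # undo everything since the last commit
    raise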