KeyError: "['problem'] not found in axis"

On 2021-02-07, code run after loading a CSV file failed with KeyError: "['problem'] not found in axis". Reloading the CSV file before running the code again resolved the error.

2021.02.07
Day1 KeyError: "['problem'] not found in axis"

# One-hot encode the comma-separated "problem" column, then replace it with the dummies
df = df.drop("problem", axis=1).join(df.problem.str.get_dummies(","))
df

After reading the .csv file, running this code fails with a KeyError:

KeyError: "['problem'] not found in axis"

Solution: reload the .csv file, then run the code again.

Why this works: the line above is not idempotent. Once it has run, df has been rebound to a frame that no longer contains a "problem" column, so executing it a second time tries to drop (and dummy-encode) a column that no longer exists, and pandas raises exactly this KeyError. Reloading the CSV restores the column, which is why re-loading fixes it.
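A more durable fix is to make the cell itself safe to re-run instead of reloading the file each time. Below is a minimal sketch; the sample DataFrame is hypothetical (the original CSV isn't shown), but the guard pattern applies to any column:

import pandas as pd

# Hypothetical stand-in for the loaded CSV
df = pd.DataFrame({"id": [1, 2, 3],
                   "problem": ["a,b", "b", "a,c"]})

# Guard on the column's presence so re-running the cell can't raise KeyError
if "problem" in df.columns:
    dummies = df["problem"].str.get_dummies(",")  # one 0/1 column per comma-separated value
    df = df.drop("problem", axis=1).join(dummies)

print(df)

A related option is df.drop("problem", axis=1, errors="ignore"), which silently skips the drop when the column is already gone; note, however, that the explicit guard is needed anyway to protect the get_dummies step, which would still fail on a missing column.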
