DataFrame取值df['col_name']与df[['col_name']]区别

被这个问题困扰了好久,直到一次报错才真正注意到两者的区别。

df['col_name']
# 得到的是这一列的数据,不带col_name,是series结构
type(df['col_name'])
>> <class 'pandas.core.series.Series'>

双中括号 df[['col_name']] 返回的是 DataFrame 结构,这种取法是 DataFrame 专有

df[['col_name']]
# 得到的是这一列的数据,带col_name,是DataFrame结构
type(df[['col_name']])
>> <class 'pandas.core.frame.DataFrame'>
import pandas as pd
from datetime import datetime
import re


def parse_dynamic_column(col_name):
    """Split an output column name of the form '<dimension>-<metric>-<airline>'.

    The dimension itself may contain '-', so the split is anchored on the
    LAST two separators.  Returns (dimension, metric, airline), or None when
    the name has fewer than three '-'-separated parts.
    """
    parts = col_name.split('-')
    if len(parts) < 3:
        return None
    return '-'.join(parts[:-2]), parts[-2], parts[-1]


def format_cell_value(value):
    """Render a looked-up cell for CSV output.

    NaN becomes '', integral floats (e.g. 130.0) become '130', everything
    else is stringified as-is.
    """
    if pd.isna(value):
        return ''
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def main():
    # 1. Load input and output tables.  engine='python' is required for
    # sep=None (automatic delimiter sniffing) — the C engine cannot do it
    # and would fall back with a ParserWarning.
    data_df = pd.read_csv('data.csv', encoding='utf_16_le', sep=None, engine='python')
    print(data_df.head())
    out_df = pd.read_csv('outdata.csv', encoding='utf_16_le', sep=None, engine='python')

    # 2. Today's date keys the appended row.
    today = datetime.now().strftime('%Y-%m-%d')

    # 3. Every column after the first ('日期') encodes a lookup request.
    dynamic_cols = out_df.columns.tolist()[1:]
    new_row = {'日期': today}

    # 4. Coerce all metric columns to numeric (unparsable values -> NaN);
    # the two dimension columns are left untouched.
    non_metric_cols = {'维度选择', '航司'}
    for col in data_df.columns:
        if col not in non_metric_cols:
            data_df[col] = pd.to_numeric(data_df[col], errors='coerce')

    # 5. Resolve each dynamic column against data_df; first matching row wins.
    for col_name in dynamic_cols:
        parsed = parse_dynamic_column(col_name)
        if parsed is None:
            new_row[col_name] = ''
            continue
        dimension, metric, airline = parsed
        mask = (data_df['维度选择'] == dimension) & (data_df['航司'] == airline)
        if mask.any():
            new_row[col_name] = format_cell_value(data_df.loc[mask, metric].iloc[0])
        else:
            new_row[col_name] = ''

    # 6. Append today's row and 7. write back (NaN -> empty string).
    out_df = pd.concat([out_df, pd.DataFrame([new_row])], ignore_index=True)
    out_df.to_csv('outdata.csv', index=False, na_rep='')
    print("✅ 处理完成!已成功追加今日数据行。")


if __name__ == '__main__':
    main()
最新发布
11-17
import numpy as np
import pandas as pd


def extract_report_metrics(sheets):
    """Pull the fixed set of KPI/risk figures out of the four report sheets.

    Parameters
    ----------
    sheets : mapping of sheet name -> pandas.DataFrame; must contain the
        keys "KPI", "Results", "ShortTerm" and "HighRisk".

    Returns a dict of extracted values.  Every rule degrades gracefully
    (None / [] / {}) instead of raising.  The original cell used bare
    ``except:`` clauses, which also swallow SystemExit/KeyboardInterrupt;
    they are narrowed to ``except Exception`` here.
    """
    kpi_df = sheets["KPI"]
    results_df = sheets["Results"]
    short_term_df = sheets["ShortTerm"]
    high_risk_df = sheets["HighRisk"]

    extracted = {}

    # Rule 1: KPI cell G10 (row 10, column 7 -> 0-based [9, 6]).
    try:
        extracted['kpi_g10'] = kpi_df.iloc[9, 6]
    except Exception:
        extracted['kpi_g10'] = None

    # Rules 2 & 3: in Results, the 5 lowest / highest percentage rows.
    # The exact AS column name is unknown, so scan for the first column
    # whose name contains '%'.
    as_col_candidates = [col for col in results_df.columns
                         if isinstance(col, str) and '%' in col]
    as_col_name = as_col_candidates[0] if as_col_candidates else None
    extracted['lowest_5_as'] = []
    extracted['highest_5_as'] = []
    if as_col_name:
        try:
            as_series = results_df[as_col_name].dropna()
            as_numeric = pd.to_numeric(
                as_series.astype(str).str.replace('%', '').str.strip(),
                errors='coerce').dropna()
            extracted['lowest_5_as'] = results_df.loc[
                as_numeric.nsmallest(5).index].to_dict(orient='records')
            extracted['highest_5_as'] = results_df.loc[
                as_numeric.nlargest(5).index].to_dict(orient='records')
        except Exception:
            extracted['lowest_5_as'] = []
            extracted['highest_5_as'] = []

    # Rule 4: ShortTerm column G (index 6) -> top 3 dead-stock amounts.
    try:
        g_series = pd.to_numeric(short_term_df[short_term_df.columns[6]],
                                 errors='coerce')
        extracted['top3_dead_stock'] = short_term_df.loc[
            g_series.nlargest(3).index].to_dict(orient='records')
    except Exception:
        extracted['top3_dead_stock'] = []

    # Rule 5: HighRisk column D holds day counts written as '7/9/8';
    # sum the parts and keep the 3 rows with the largest totals.
    def parse_days(val):
        try:
            return sum(int(x.strip()) for x in str(val).split('/')
                       if x.strip().isdigit())
        except Exception:
            return -1  # unparsable rows sort last

    try:
        d_series = high_risk_df[high_risk_df.columns[3]].dropna()
        d_days = d_series.apply(parse_days)
        extracted['top3_days'] = high_risk_df.loc[
            d_days.nlargest(3).index].to_dict(orient='records')
    except Exception:
        extracted['top3_days'] = []

    # Rule 6: HighRisk column N (index 13) -> row with the maximum value.
    try:
        n_series = pd.to_numeric(high_risk_df[high_risk_df.columns[13]],
                                 errors='coerce')
        extracted['max_n'] = high_risk_df.loc[n_series.idxmax()].to_dict()
    except Exception:
        extracted['max_n'] = {}

    # Rule 7: HighRisk cell Q2 (0-based [1, 16]).
    try:
        extracted['q2_value'] = high_risk_df.iloc[1, 16]
    except Exception:
        extracted['q2_value'] = None

    # Rule 8: HighRisk column AO (0-based index 40) -> top 3 values.
    try:
        ao_series = pd.to_numeric(high_risk_df[high_risk_df.columns[40]],
                                  errors='coerce')
        extracted['top3_ao'] = high_risk_df.loc[
            ao_series.nlargest(3).index].to_dict(orient='records')
    except Exception:
        extracted['top3_ao'] = []

    return extracted


# The original notebook cell ran against a global `google_sheets_new` dict;
# keep that behaviour when the name exists.
if 'google_sheets_new' in globals():
    extracted = extract_report_metrics(google_sheets_new)
07-22
def compare_and_write(self):
    """Collect rows from the user-selected sheets and write the matching
    data columns into the 'check' worksheet of the same workbook.

    Performance fixes over the original (which froze the UI on large files):
      * per-sheet frames are concatenated ONCE instead of `pd.concat`
        inside the loop (repeated concat is quadratic);
      * row matching uses a prebuilt ID -> row dict (first occurrence wins,
        same as the original `.iloc[0]`) instead of a full DataFrame scan
        for every check-sheet row — O(rows) instead of O(rows * n).
    """
    selected_sheets = [self.tree.item(item, "values")[1]
                       for item, state in self.check_states.items() if state]
    if not selected_sheets:
        messagebox.showwarning("提示", "请至少选择一个工作表作为对比表")
        return
    file_path = self.file_path.get()
    if not file_path:
        messagebox.showwarning("提示", "请先选择Excel文件")
        return
    try:
        wb = load_workbook(file_path)
        if 'check' not in wb.sheetnames:
            messagebox.showerror("错误", "找不到check表,请先创建")
            return
        ws_check = wb['check']

        # Union of data column names (everything after the first/ID column)
        # across all selected sheets.
        all_headers = set()
        sheet_data = {}
        for sheet in selected_sheets:
            if sheet in wb.sheetnames:
                temp_df = pd.read_excel(file_path, sheet_name=sheet)
                sheet_data[sheet] = temp_df.copy()
                all_headers.update(temp_df.columns[1:])

        # Rebuild each sheet with a common 'ID_COL' plus the aligned data
        # columns, then concatenate once at the end.
        header_list = list(all_headers)
        frames = []
        for sheet, df in sheet_data.items():
            temp_df = df.copy()
            temp_df.rename(columns={df.columns[0]: 'ID_COL'}, inplace=True)
            for col in header_list:
                if col not in temp_df:
                    temp_df[col] = None
            frames.append(temp_df[['ID_COL'] + header_list])
        if frames:
            df_contrast = pd.concat(frames, ignore_index=True)
        else:
            df_contrast = pd.DataFrame(columns=['ID_COL'] + header_list)
        if df_contrast.empty:
            messagebox.showwarning("警告", "对比表数据为空")
            return

        # Append any new header names after the existing ones (col 5 on).
        existing_headers = {ws_check.cell(1, col).value
                            for col in range(5, ws_check.max_column + 1)
                            if ws_check.cell(1, col).value}
        start_col = 5
        while (start_col <= ws_check.max_column
               and ws_check.cell(1, start_col).value is not None):
            start_col += 1
        for header in header_list:
            if header not in existing_headers:
                ws_check.cell(row=1, column=start_col, value=header)
                start_col += 1

        # One-shot lookup table: ID (as str) -> first matching row.
        df_contrast['ID_COL'] = df_contrast['ID_COL'].astype(str)
        data_cols = [col for col in df_contrast.columns if col != 'ID_COL']
        row_by_id = {}
        for _, contrast_row in df_contrast.iterrows():
            row_by_id.setdefault(contrast_row['ID_COL'], contrast_row)

        added_count = 0
        for row_idx in range(2, ws_check.max_row + 1):
            # Each check-sheet row offers up to three candidate IDs
            # (cells B, C, D); the first non-empty one that matches wins.
            matched_row = None
            for candidate_col in (2, 3, 4):
                value = ws_check.cell(row_idx, candidate_col).value
                key = str(value) if value is not None else ""
                if key and key in row_by_id:
                    matched_row = row_by_id[key]
                    break
            if matched_row is not None:
                # Write after the last already-filled cell of this row.
                write_col = 5
                while (write_col <= ws_check.max_column
                       and ws_check.cell(row_idx, write_col).value is not None):
                    write_col += 1
                for col_name in data_cols:
                    if write_col > ws_check.max_column:
                        break
                    ws_check.cell(row_idx, write_col, value=matched_row[col_name])
                    write_col += 1
                added_count += 1

        wb.save(file_path)
        messagebox.showinfo("完成", f"成功写入 {added_count} 行对比数据到check表")
    except Exception as e:
        messagebox.showerror("错误", f"对比写入失败: {str(e)}")
09-12
# -*- coding: utf-8 -*-
"""Restaurant-data preprocessing: load, clean, visualize, standardize, save.

Pipeline
--------
1.  Read ``sourceSet.csv`` from the current directory.
2.  Locate the five mandatory key features (review count, taste /
    environment / service rating, average price) under either their
    Chinese or English column names.
3.  Drop known-useless columns (ids, coordinates, city, ...).
4.  Missing values: drop non-key columns with >30% missing, median-fill
    key features, mean-fill remaining numeric columns.
5.  Save top-20 horizontal-bar ranking charts for the three rating
    features to the Desktop.
6-8. Keep only the key features, z-score standardize them, and save the
    processed CSV plus a summary-statistics CSV to the Desktop.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import os
import matplotlib as mpl
import sys

# Chinese font support for matplotlib titles/labels.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'WenQuanYi Micro Hei']
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
mpl.rcParams['font.size'] = 12

# All output files go to the user's Desktop.
desktop = os.path.join(os.path.expanduser('~'), 'Desktop')
print(f"文件将保存到桌面: {desktop}")

# 1. Load the raw data.
print("读取数据...")
try:
    df = pd.read_csv('sourceSet.csv')
    print(f"成功读取数据,共{len(df)}行,{len(df.columns)}列")
except FileNotFoundError:
    print("错误:找不到 sourceSet.csv 文件,请确保文件在当前目录")
    sys.exit(1)

# 2. Identify the five key features; each may appear under a Chinese or
# an English column name (first match wins).
print("\n识别关键特征...")
key_features = {
    '点评数': ['点评数', 'review_count'],
    '口味评分': ['口味', 'taste_rating'],
    '环境评分': ['环境', 'environment_rating'],
    '服务评分': ['服务', 'service_rating'],
    '人均消费': ['人均消费', 'avg_price'],
}
existing_key_features = {}
for feature_name, possible_names in key_features.items():
    for name in possible_names:
        if name in df.columns:
            existing_key_features[feature_name] = name
            print(f"已识别关键特征 '{feature_name}': 列名 '{name}'")
            break
    else:
        print(f"警告:未找到关键特征 '{feature_name}' 的对应列")

if len(existing_key_features) < 5:
    print(f"错误:只找到 {len(existing_key_features)} 个关键特征,需要5个")
    sys.exit(1)

# 3. Drop known-useless columns — only ones that exist and are not
# themselves key features.
print("\n去除无效特征值...")
columns_to_drop = ['lng', 'lat', '数据ID', '城市', 'ID', 'city', 'id', 'Unnamed: 0']
invalid_cols = [col for col in columns_to_drop
                if col in df.columns and col not in existing_key_features.values()]
df.drop(columns=invalid_cols, inplace=True)
print(f"已删除无效列: {invalid_cols}")
print(f"剩余列: {list(df.columns)}")

# 4. Missing-value handling.
print("\n处理缺失值...")
key_feature_cols = list(existing_key_features.values())
print(f"关键特征列: {key_feature_cols}")

# 4a. Non-key columns with >30% missing values are dropped outright.
non_key_cols = [col for col in df.columns if col not in key_feature_cols]
missing_percent = df[non_key_cols].isnull().mean()
cols_to_drop = missing_percent[missing_percent > 0.3].index.tolist()
df.drop(columns=cols_to_drop, inplace=True)
if cols_to_drop:
    print(f"删除缺失值>30%的非关键特征: {cols_to_drop}")

# 4b. Key features: fill with the column median.  Plain assignment replaces
# Series.fillna(inplace=True), which is chained assignment and deprecated
# (ineffective under copy-on-write) in pandas 2.x.
for feature_name, col_name in existing_key_features.items():
    if df[col_name].isnull().sum() > 0:
        median_val = df[col_name].median()
        df[col_name] = df[col_name].fillna(median_val)
        print(f"填充关键特征 '{feature_name}' ({col_name}) 的缺失值 (中位数: {median_val:.2f})")

# 4c. Remaining numeric columns: fill with the column mean.
other_cols = [col for col in df.columns
              if col not in key_feature_cols and df[col].dtype in ['int64', 'float64']]
for col in other_cols:
    if df[col].isnull().sum() > 0:
        mean_val = df[col].mean()
        df[col] = df[col].fillna(mean_val)
        print(f"填充非关键特征 '{col}' 的缺失值 (均值: {mean_val:.2f})")

# 5. Visualization: top-20 ranking charts for the rating features.
print("\n生成可视化图表...")


def plot_ranking(feature_name, col_name, title, color):
    """Save a horizontal top-20 bar chart of ``col_name`` to the Desktop.

    Returns the saved file path, or None when the column is missing.
    """
    if col_name not in df.columns:
        print(f"错误:特征 '{col_name}' 不存在,无法生成 {title} 排名")
        return None
    top20 = df.sort_values(col_name, ascending=False).head(20).reset_index(drop=True)
    plt.figure(figsize=(14, 10))
    # Prefer a shop-name column for the y labels; fall back to row numbers.
    if '商家名称' in df.columns:
        labels = top20['商家名称'].values
    elif 'shop_name' in df.columns:
        labels = top20['shop_name'].values
    else:
        labels = [f"商家 {i+1}" for i in top20.index]
    bars = plt.barh(labels, top20[col_name], color=color)
    plt.title(f'{title}排名 - 前20名', fontsize=16)
    plt.xlabel(title, fontsize=14)
    plt.ylabel('商家名称', fontsize=14)
    plt.gca().invert_yaxis()  # highest score on top
    # Annotate every bar with its numeric value.
    for bar in bars:
        width = bar.get_width()
        plt.text(width + 0.01, bar.get_y() + bar.get_height() / 2,
                 f'{width:.2f}', va='center', ha='left', fontsize=10)
    plt.tight_layout()
    save_path = os.path.join(desktop, f'{title}排名.png')
    plt.savefig(save_path, dpi=100, bbox_inches='tight')
    plt.close()
    print(f"已保存: {save_path}")
    return save_path


saved_files = []
ranking_colors = {
    '口味评分': '#55A868',  # green
    '环境评分': '#C44E52',  # red
    '服务评分': '#4C72B0',  # blue
}
for feature_name in ['服务评分', '口味评分', '环境评分']:
    if feature_name in existing_key_features:
        col_name = existing_key_features[feature_name]
        color = ranking_colors.get(feature_name, '#888888')
        saved_path = plot_ranking(feature_name, col_name, feature_name, color)
        if saved_path:
            saved_files.append(saved_path)
    else:
        print(f"警告:未找到 {feature_name} 列,跳过可视化")

# 6. Feature selection: keep exactly the five key features.
print("\n特征变量选择...")
selected_features = list(existing_key_features.values())
print(f"保留的关键特征: {selected_features}")
df_selected = df[selected_features].copy()

# 7. Z-score standardization.  Rebuilding the frame avoids assigning float
# values into possibly-integer columns via .loc (pandas dtype warning).
print("\n数据标准化处理...")
scaler = StandardScaler()
df_selected = pd.DataFrame(scaler.fit_transform(df_selected),
                           columns=selected_features, index=df_selected.index)

# 8. Save the preprocessed data.
output_path = os.path.join(desktop, 'processed_sourceSet.csv')
try:
    df_selected.to_csv(output_path, index=False)
    print(f"预处理后的数据已保存到: {output_path}")
    if os.path.exists(output_path):
        print(f"文件验证成功,大小: {os.path.getsize(output_path) / 1024:.2f} KB")
    else:
        print("警告:文件保存失败,请检查路径权限")
except Exception as e:
    print(f"文件保存错误: {str(e)}")

# 9. Summary statistics of the standardized features.
print("\n预处理结果统计:")
stats = pd.DataFrame({
    '特征': [existing_key_features[fn] for fn in existing_key_features],
    '原始名称': list(existing_key_features.keys()),
    '类型': df_selected.dtypes.values,
    '最小值': df_selected.min().values,
    '最大值': df_selected.max().values,
    '取值范围': df_selected.max().values - df_selected.min().values,
})
print(stats)
stats_path = os.path.join(desktop, '预处理统计.csv')
try:
    stats.to_csv(stats_path, index=False)
    print(f"预处理统计已保存到: {stats_path}")
except Exception as e:
    print(f"统计信息保存错误: {str(e)}")

print("\n所有处理完成!")
if saved_files:
    print(f"生成的可视化文件: {saved_files}")
else:
    print("警告:未生成任何可视化图表")
06-16
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值