Different read_csv index_col = None / 0 / False in pandas

This article explains how to read data from a CSV file with pandas and set the index column correctly, covering the different index_col arguments and the misunderstandings they can cause.

UPDATE

I think since version 0.16.1 pandas will raise an error if you try to pass a boolean for index_col, to avoid this ambiguity.
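
As a quick sanity check of the update (a sketch against a recent pandas install, not from the original answer — in the versions I can verify, only index_col=True is rejected, while index_col=False stays legal, as the examples below show):

import io
import pandas as pd

t = """index,a,b
0,hello,pandas"""

try:
    pd.read_csv(io.StringIO(t), index_col=True)
except ValueError as err:
    # recent pandas refuses the boolean True outright, with a message
    # along the lines of "The value of index_col couldn't be 'True'"
    print(err)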

ORIGINAL

A lot of people get confused by this. To specify a column as the index you should pass its ordinal position as an int, in this case 0. What confuses people is that when there is no index column they pass False, which is incorrect; they should pass None. False will evaluate to 0 in older versions, hence the result you observe.

In [3]:

import io
import pandas as pd
t="""index,a,b
0,hello,pandas"""
pd.read_csv(io.StringIO(t))
Out[3]:
   index      a       b
0      0  hello  pandas

The default value is index_col=None as shown above.

If we set index_col=0 we're explicitly telling pandas to treat the first column as the index:

In [4]:

pd.read_csv(io.StringIO(t), index_col=0)
Out[4]:
           a       b
index               
0      hello  pandas
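
Note that index_col also accepts a column label instead of an ordinal position, so the following should be equivalent here (my addition, not part of the original answer):

pd.read_csv(io.StringIO(t), index_col='index')
# expected to match the index_col=0 result above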

If we pass index_col=False we get the same result as the default here; False is where the ambiguity lies, since older versions could evaluate it as 0 and give the index_col=0 result instead:

In [5]:

pd.read_csv(io.StringIO(t), index_col=False)
Out[5]:
   index      a       b
0      0  hello  pandas
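
For completeness, the pandas docs describe one legitimate use of index_col=False: a malformed file with a delimiter at the end of each data line, where pandas would otherwise shift the first column into the index. A sketch along the lines of the docs example:

t2 = """a,b,c
4,apple,bat,
8,orange,cow,"""

pd.read_csv(io.StringIO(t2))
# four fields but only three header names, so 4 and 8 become the index

pd.read_csv(io.StringIO(t2), index_col=False)
# keeps 4 and 8 in column 'a' and ignores the trailing empty field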

If we pass index_col=None explicitly we get the same behaviour as when we didn't pass the param at all:

In [6]:

pd.read_csv(io.StringIO(t), index_col=None)
Out[6]:
   index      a       b
0      0  hello  pandas

EDIT

For the case where you have a blank index column, which is what you have here:

In [7]:

import io
import pandas as pd
t=""",a,b
0,hello,pandas"""
pd.read_csv(io.StringIO(t))
Out[7]:
   Unnamed: 0      a       b
0           0  hello  pandas

In [8]:

pd.read_csv(io.StringIO(t), index_col=0)
Out[8]:
       a       b
0  hello  pandas

In [9]:

pd.read_csv(io.StringIO(t), index_col=False)
Out[9]:
   Unnamed: 0      a       b
0           0  hello  pandas

In [10]:

pd.read_csv(io.StringIO(t), index_col=None)
Out[10]:
   Unnamed: 0      a       b
0           0  hello  pandas
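
That blank leading column is typically what DataFrame.to_csv() writes for the index by default, so index_col=0 is the natural way to round-trip such files. A minimal sketch (my addition, not from the original answer):

df = pd.DataFrame({'a': ['hello'], 'b': ['pandas']})
s = df.to_csv()  # header comes out as ",a,b" -- the index has no name
pd.read_csv(io.StringIO(s), index_col=0)  # recovers the original frame
# or sidestep the issue when writing: df.to_csv(index=False)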