步骤
- 获取A文件夹和B文件夹中所有csv文件的文件名。
- 遍历A文件夹中的每个csv文件,检查B文件夹中是否存在同名文件。
- 如果存在,比较两个文件的内容是否完全相同。
- 如果内容不同,则输出文件名;如果所有同名文件都一致,则输出一致的信息。
V1.0
import os
import pandas as pd
import filecmp
def compare_csv_folders(folder_a, folder_b):
    """Compare same-named CSV files in two folders and print those that differ.

    Each pair is first compared byte-for-byte. Only when the bytes differ are
    both files parsed with pandas and compared as DataFrames, so differences
    that do not change the parsed data are not reported.

    Args:
        folder_a: Path of the first folder.
        folder_b: Path of the second folder.

    Returns:
        None. The result is printed to standard output.
    """
    # Collect the CSV file names found in each folder.
    files_a = {f for f in os.listdir(folder_a) if f.endswith('.csv')}
    files_b = {f for f in os.listdir(folder_b) if f.endswith('.csv')}

    # Sort the shared names so the report order is stable between runs
    # (iteration order of a set of strings is not).
    common_files = sorted(files_a & files_b)
    different_files = []

    for file_name in common_files:
        path_a = os.path.join(folder_a, file_name)
        path_b = os.path.join(folder_b, file_name)

        # Byte-identical files need no parsing.
        if filecmp.cmp(path_a, path_b, shallow=False):
            continue

        try:
            dfa = pd.read_csv(path_a)
            dfb = pd.read_csv(path_b)
        except (pd.errors.EmptyDataError, pd.errors.ParserError,
                UnicodeDecodeError):
            # The bytes already differ and at least one side cannot be parsed
            # (e.g. one file is empty): report the pair as different instead
            # of aborting the whole comparison.
            different_files.append(file_name)
            continue

        # DataFrame.equals is already False for mismatched shapes, so no
        # separate shape check is needed.
        if not dfa.equals(dfb):
            different_files.append(file_name)

    # Report the outcome.
    if different_files:
        print("以下文件内容不一致:")
        for file_name in different_files:
            print(file_name)
    elif common_files:
        print("两个文件夹中所有同名CSV文件内容完全一致")
    else:
        print("未找到同名CSV文件")
# Example usage: compare the CSV files found in folder A and folder B.
folder_a, folder_b = 'A', 'B'  # paths of folder A and folder B
compare_csv_folders(folder_a, folder_b)
说明
- 找出两个文件夹中同名的CSV文件
- 先使用二进制方式逐字节比较文件内容(shallow=False,只比较内容,不依赖文件元数据);二进制不同时,再用pandas读取并比较解析后的数据
- 输出内容不一致的文件名
- 如果所有文件都一致,则显示成功消息
添加了进度条、英语注释和函数文档的 V1.1
import os
import filecmp
import pandas as pd
from tqdm import tqdm # For progress bar display
def compare_csv_folders(folder_a: str, folder_b: str) -> None:
    """
    Compare CSV files with the same name in two folders and report differences.

    Each pair is first compared byte-for-byte. Only when the bytes differ are
    both files parsed with pandas and compared as DataFrames. A pair whose
    bytes differ and that cannot be parsed is reported as different.

    Args:
        folder_a (str): Path to the first folder containing CSV files
        folder_b (str): Path to the second folder containing CSV files

    Returns:
        None: Results are printed to the console
    """
    # Get the CSV file names in both folders
    files_a = {f for f in os.listdir(folder_a) if f.endswith('.csv')}
    files_b = {f for f in os.listdir(folder_b) if f.endswith('.csv')}

    # Sort the common file names so processing and report order are stable
    # between runs (iteration order of a set of strings is not).
    common_files = sorted(files_a & files_b)
    different_files = []

    # Display information about found files
    print(f"Folder A contains {len(files_a)} CSV files")
    print(f"Folder B contains {len(files_b)} CSV files")
    print(f"Found {len(common_files)} common CSV files to compare")

    # Compare each common file with progress bar
    for file_name in tqdm(common_files, desc="Comparing files", unit="file"):
        path_a = os.path.join(folder_a, file_name)
        path_b = os.path.join(folder_b, file_name)

        # Byte-identical files need no parsing.
        if filecmp.cmp(path_a, path_b, shallow=False):
            continue

        try:
            dfa = pd.read_csv(path_a)
            dfb = pd.read_csv(path_b)
        except (pd.errors.EmptyDataError, pd.errors.ParserError,
                UnicodeDecodeError):
            # The bytes already differ and at least one side cannot be parsed
            # (e.g. one file is empty): report the pair as different instead
            # of aborting the whole comparison.
            different_files.append(file_name)
            continue

        # DataFrame.equals is already False for mismatched shapes, so no
        # separate shape check is needed.
        if not dfa.equals(dfb):
            different_files.append(file_name)

    # Output results
    print("\n" + "="*50)
    if different_files:
        print(f"⚠ Found {len(different_files)} files with different content:")
        for file_name in different_files:
            print(f" - {file_name}")
    elif common_files:
        print("✓ All common CSV files have identical content")
    else:
        print("⚠ No common CSV files found to compare")
    print("="*50)
if __name__ == "__main__":
    # Example usage: compare the CSV files found in folder A and folder B.
    folder_a, folder_b = 'A', 'B'  # paths to folder A and folder B
    compare_csv_folders(folder_a=folder_a, folder_b=folder_b)
使用说明
- 需要安装 tqdm 包来显示进度条:
pip install tqdm
- 显示每个文件夹中的CSV文件数量
- 显示找到的同名文件数量
- 使用进度条显示比较进度
- 输出比较结果
V 2.0
使用多进程可以显著加速CSV文件的读取和比较过程,特别是当文件数量较多或文件较大时。以下是改进后的代码,使用Python的multiprocessing模块实现并行处理:
import os
import filecmp
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
from functools import partial
def compare_single_file(file_name, folder_a, folder_b):
    """Compare one CSV file that exists under the same name in both folders.

    Args:
        file_name: Name of the CSV file to compare.
        folder_a: Path of the first folder.
        folder_b: Path of the second folder.

    Returns:
        A ``(file_name, is_same)`` tuple. ``is_same`` is True when the two
        files are byte-identical or parse to equal DataFrames, and False when
        they differ or when parsing/comparing them raises an exception.
    """
    left_path = os.path.join(folder_a, file_name)
    right_path = os.path.join(folder_b, file_name)

    # Byte-identical files are certainly equal, so parsing can be skipped.
    if filecmp.cmp(left_path, right_path, shallow=False):
        return file_name, True

    # The bytes differ: fall back to comparing the parsed content.
    try:
        left_frame = pd.read_csv(left_path)
        right_frame = pd.read_csv(right_path)
        is_same = (
            left_frame.shape == right_frame.shape
            and left_frame.equals(right_frame)
        )
    except Exception as e:
        # Best effort: a file that cannot be compared is reported as
        # different rather than stopping the caller.
        print(f"Error comparing {file_name}: {str(e)}")
        is_same = False
    return file_name, is_same
def compare_csv_folders(folder_a: str, folder_b: str) -> None:
    """
    Compare CSV files with the same name in two folders and report differences.

    The per-file comparison is delegated to ``compare_single_file`` and run in
    a pool of worker processes.

    Args:
        folder_a (str): Path to the first folder containing CSV files
        folder_b (str): Path to the second folder containing CSV files

    Returns:
        None: Results are printed to the console
    """
    # Get the CSV file names in both folders
    files_a = {f for f in os.listdir(folder_a) if f.endswith('.csv')}
    files_b = {f for f in os.listdir(folder_b) if f.endswith('.csv')}

    # Sort the common file names: imap yields results in input order, so a
    # sorted input gives a report order that is stable between runs.
    common_files = sorted(files_a & files_b)

    # Display information about found files
    print(f"Folder A contains {len(files_a)} CSV files")
    print(f"Folder B contains {len(files_b)} CSV files")
    print(f"Found {len(common_files)} common CSV files to compare")

    # Compare the files in parallel
    print("Comparing files with multiprocessing...")
    results = []
    if common_files:
        # Bind the two folder arguments so the pool only has to pass file names.
        compare_func = partial(compare_single_file, folder_a=folder_a, folder_b=folder_b)
        # Never start more workers than there are files to compare; with
        # nothing to compare, no pool is created at all.
        worker_count = min(mp.cpu_count(), len(common_files))
        with mp.Pool(processes=worker_count) as pool:
            # Consume the results as they arrive so tqdm can show progress.
            results = list(tqdm(pool.imap(compare_func, common_files),
                                total=len(common_files), desc="Comparing files", unit="file"))

    # Collect the files whose content differs
    different_files = [file_name for file_name, is_same in results if not is_same]

    # Output results
    print("\n" + "="*50)
    if different_files:
        print(f"⚠ Found {len(different_files)} files with different content:")
        for file_name in different_files:
            print(f" - {file_name}")
    elif common_files:
        print("✓ All common CSV files have identical content")
    else:
        print("⚠ No common CSV files found to compare")
    print("="*50)
if __name__ == "__main__":
    # Make sure multiprocessing works correctly on Windows; this has to run
    # before any worker process is created.
    mp.freeze_support()
    # Example usage: compare the CSV files found in folder A and folder B.
    folder_a, folder_b = 'A', 'B'  # paths to folder A and folder B
    compare_csv_folders(folder_a, folder_b)
主要改进点
- 多进程处理:使用multiprocessing.Pool创建进程池,并行处理文件比较任务
- 进程数优化:默认使用CPU核心数作为进程数(mp.cpu_count())
- 进度显示:保持使用tqdm显示进度条
- 错误处理:添加异常捕获,防止单个文件错误导致整个程序崩溃
- 二进制比较优化:先进行快速的二进制比较,只有在二进制不同的情况下才使用pandas进行内容比较
注意事项
- 在多进程环境下,每个进程都有独立的内存空间,这意味着内存使用量可能会增加
- 在Windows系统上,多进程代码的入口需要放在 if __name__ == "__main__": 保护之下,并使用mp.freeze_support()

被折叠的 条评论
为什么被折叠?



