Python 比较两个文件夹中同名CSV文件的内容

步骤

  1. 获取A文件夹和B文件夹中所有csv文件的文件名。

  2. 遍历A文件夹中的每个csv文件,检查B文件夹中是否存在同名文件。

  3. 如果存在,比较两个文件的内容是否完全相同。

  4. 如果内容不同,则输出文件名;如果所有同名文件都一致,则输出一致的信息。

V1.0

import os
import pandas as pd
import filecmp

def compare_csv_folders(folder_a, folder_b):
    """Compare same-named CSV files in two folders and print the result.

    Each pair of files is first compared byte for byte. When the bytes
    differ, both files are parsed with pandas and compared as tables, so
    files whose bytes differ but which parse to equal tables still count
    as identical.

    Args:
        folder_a: Path to the first folder.
        folder_b: Path to the second folder.

    Returns:
        None. The names of the differing files, or a summary message, are
        printed to standard output.
    """
    # Collect the CSV file names present in each folder.
    files_a = {f for f in os.listdir(folder_a) if f.endswith('.csv')}
    files_b = {f for f in os.listdir(folder_b) if f.endswith('.csv')}

    # Sort the shared names so the report order is the same on every run
    # (set iteration order of strings changes between interpreter runs).
    common_files = sorted(files_a & files_b)
    different_files = []

    for file_name in common_files:
        path_a = os.path.join(folder_a, file_name)
        path_b = os.path.join(folder_b, file_name)

        # Byte-identical files need no parsing.
        if filecmp.cmp(path_a, path_b, shallow=False):
            continue

        try:
            # DataFrame.equals already returns False for differing shapes.
            same = pd.read_csv(path_a).equals(pd.read_csv(path_b))
        except (pd.errors.EmptyDataError, pd.errors.ParserError,
                UnicodeDecodeError) as exc:
            # The bytes differ and at least one side cannot be parsed:
            # report the pair as different instead of aborting the run.
            print(f"无法解析文件 {file_name}: {exc}")
            same = False

        if not same:
            different_files.append(file_name)

    # Print the outcome.
    if not different_files:
        if common_files:
            print("两个文件夹中所有同名CSV文件内容完全一致")
        else:
            print("未找到同名CSV文件")
    else:
        print("以下文件内容不一致:")
        for file_name in different_files:
            print(file_name)

# Usage example
# NOTE: these statements are not under an `if __name__ == "__main__":` guard,
# so they also run when this module is imported.
folder_a = 'A'  # Path to folder A
folder_b = 'B'  # Path to folder B

compare_csv_folders(folder_a, folder_b)

说明

  • 找出两个文件夹中同名的CSV文件
  • 先用二进制方式逐字节比较文件内容(shallow=False,只比较内容,不依赖文件元数据);字节不同时,再用 pandas 解析后比较表格内容
  • 输出内容不一致的文件名
  • 如果所有文件都一致,则显示成功消息

添加了进度条、英语注释和函数文档的 V1.1

import os
import filecmp
import pandas as pd
from tqdm import tqdm  # For progress bar display

def compare_csv_folders(folder_a: str, folder_b: str) -> None:
    """
    Compare CSV files with the same name in two folders and report differences.

    Each pair is first compared byte for byte; when the bytes differ, both
    files are parsed with pandas and compared as tables. A pair whose bytes
    differ and that cannot be parsed is reported as different instead of
    aborting the whole run.

    Args:
        folder_a (str): Path to the first folder containing CSV files
        folder_b (str): Path to the second folder containing CSV files

    Returns:
        None: Results are printed to the console
    """
    # Get the CSV file names in both folders
    files_a = {f for f in os.listdir(folder_a) if f.endswith('.csv')}
    files_b = {f for f in os.listdir(folder_b) if f.endswith('.csv')}

    # Sort the shared names so the processing and report order is the same
    # on every run (set iteration order of strings is not stable across runs)
    common_files = sorted(files_a & files_b)
    different_files = []

    # Display information about found files
    print(f"Folder A contains {len(files_a)} CSV files")
    print(f"Folder B contains {len(files_b)} CSV files")
    print(f"Found {len(common_files)} common CSV files to compare")

    # Compare each common file with progress bar
    for file_name in tqdm(common_files, desc="Comparing files", unit="file"):
        path_a = os.path.join(folder_a, file_name)
        path_b = os.path.join(folder_b, file_name)

        # Byte-identical files need no parsing
        if filecmp.cmp(path_a, path_b, shallow=False):
            continue

        try:
            # DataFrame.equals already returns False for differing shapes
            same = pd.read_csv(path_a).equals(pd.read_csv(path_b))
        except (pd.errors.EmptyDataError, pd.errors.ParserError,
                UnicodeDecodeError) as exc:
            # tqdm.write prints the message without corrupting the bar
            tqdm.write(f"Error comparing {file_name}: {exc}")
            same = False

        if not same:
            different_files.append(file_name)

    # Output results
    print("\n" + "="*50)
    if not different_files:
        if common_files:
            print("✓ All common CSV files have identical content")
        else:
            print("⚠ No common CSV files found to compare")
    else:
        print(f"⚠ Found {len(different_files)} files with different content:")
        for file_name in different_files:
            print(f"  - {file_name}")
    print("="*50)


if __name__ == "__main__":
    # Example usage: runs only when the file is executed as a script.
    # 'A' and 'B' are relative paths, resolved against the current
    # working directory.
    folder_a = 'A'  # Path to folder A
    folder_b = 'B'  # Path to folder B
    
    compare_csv_folders(folder_a, folder_b)

使用说明

  • 需要安装 tqdm 包来显示进度条:
	   pip install tqdm
  • 显示每个文件夹中的CSV文件数量
  • 显示找到的同名文件数量
  • 使用进度条显示比较进度
  • 输出比较结果

V 2.0

使用多进程可以显著加速CSV文件的读取和比较过程,特别是当文件数量较多或文件较大时。以下是改进后的代码,使用Python的multiprocessing模块实现并行处理:

import os
import filecmp
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
from functools import partial

def compare_single_file(file_name, folder_a, folder_b):
    """Compare one CSV file that exists under the same name in both folders.

    Args:
        file_name: Name of the CSV file, relative to each folder.
        folder_a: Path to the first folder.
        folder_b: Path to the second folder.

    Returns:
        A ``(file_name, is_same)`` tuple. ``is_same`` is True when the two
        files are byte-identical or parse to equal DataFrames, and False
        when they differ or when parsing raises an exception.
    """
    left, right = (os.path.join(folder, file_name)
                   for folder in (folder_a, folder_b))

    # Cheap check first: byte-identical files are equal by definition.
    identical = filecmp.cmp(left, right, shallow=False)

    if not identical:
        # The bytes differ, so fall back to comparing the parsed tables.
        try:
            frame_left = pd.read_csv(left)
            frame_right = pd.read_csv(right)
            identical = bool(
                frame_left.shape == frame_right.shape
                and frame_left.equals(frame_right)
            )
        except Exception as e:
            # Report the failure and count the pair as different so one
            # unreadable file does not stop the caller.
            print(f"Error comparing {file_name}: {str(e)}")
            identical = False

    return file_name, identical

def compare_csv_folders(folder_a: str, folder_b: str) -> None:
    """
    Compare CSV files with the same name in two folders and report differences.

    The per-file comparison is delegated to ``compare_single_file`` and run
    in a pool of worker processes; a tqdm progress bar tracks completion.

    Args:
        folder_a (str): Path to the first folder containing CSV files
        folder_b (str): Path to the second folder containing CSV files

    Returns:
        None: Results are printed to the console
    """
    # Get the CSV file names in both folders
    files_a = {f for f in os.listdir(folder_a) if f.endswith('.csv')}
    files_b = {f for f in os.listdir(folder_b) if f.endswith('.csv')}

    # Sort the shared names: pool.imap yields results in input order, so a
    # sorted input gives a report order that is the same on every run
    common_files = sorted(files_a & files_b)

    # Display information about found files
    print(f"Folder A contains {len(files_a)} CSV files")
    print(f"Folder B contains {len(files_b)} CSV files")
    print(f"Found {len(common_files)} common CSV files to compare")

    results = []
    if common_files:
        print("Comparing files with multiprocessing...")

        # Bind the folder arguments so the pool only has to pass file names
        compare_func = partial(compare_single_file, folder_a=folder_a, folder_b=folder_b)

        # Never start more workers than there are files to compare
        worker_count = min(mp.cpu_count(), len(common_files))

        with mp.Pool(processes=worker_count) as pool:
            # Consume the results inside the `with` block: leaving it
            # terminates the pool
            for result in tqdm(pool.imap(compare_func, common_files),
                               total=len(common_files), desc="Comparing files", unit="file"):
                results.append(result)

    # Keep only the files whose content differs
    different_files = [file_name for file_name, is_same in results if not is_same]

    # Output results
    print("\n" + "="*50)
    if not different_files:
        if common_files:
            print("✓ All common CSV files have identical content")
        else:
            print("⚠ No common CSV files found to compare")
    else:
        print(f"⚠ Found {len(different_files)} files with different content:")
        for file_name in different_files:
            print(f"  - {file_name}")
    print("="*50)

if __name__ == "__main__":
    # Supports multiprocessing when the script is frozen into a Windows
    # executable; per the multiprocessing docs the call has no effect
    # otherwise.
    mp.freeze_support()
    
    # Example usage
    folder_a = 'A'  # Path to folder A
    folder_b = 'B'  # Path to folder B
    
    compare_csv_folders(folder_a, folder_b)

主要改进点

  1. 多进程处理:使用multiprocessing.Pool创建进程池,并行处理文件比较任务
  2. 进程数优化:默认使用CPU核心数作为进程数(mp.cpu_count())
  3. 进度显示:保持使用tqdm显示进度条
  4. 错误处理:添加异常捕获,防止单个文件错误导致整个程序崩溃
  5. 二进制比较优化:先进行快速的二进制比较,只有在二进制不同的情况下才使用pandas进行内容比较

注意事项

  • 在多进程环境下,每个进程都有独立的内存空间,这意味着内存使用量可能会增加

  • 在Windows系统上,多进程代码的入口必须放在 if __name__ == "__main__": 保护之下;mp.freeze_support() 仅在将脚本打包为可执行文件时才需要,普通运行时调用它没有副作用

评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值