Python 比较两个文件夹中同名CSV文件的内容_python比较两个csv文件-CSDN博客

步骤

获取A文件夹和B文件夹中所有csv文件的文件名。
遍历A文件夹中的每个csv文件，检查B文件夹中是否存在同名文件。
如果存在，比较两个文件的内容是否完全相同。
如果内容不同，则输出文件名；如果所有同名文件都一致，则输出一致的信息。

V1.0

import os
import pandas as pd
import filecmp

def compare_csv_folders(folder_a, folder_b):
    """Compare same-named CSV files in two folders and print the ones that differ.

    Files are first compared byte-for-byte. If the bytes differ, both files
    are parsed with pandas and compared as DataFrames, so files that differ
    only in formatting (for example line endings) still count as identical.

    Args:
        folder_a: Path to the first folder containing CSV files.
        folder_b: Path to the second folder containing CSV files.

    Returns:
        None. The result is printed to the console; differing files are
        listed in sorted order.
    """
    # Keep regular files only: a directory named "*.csv" cannot be parsed.
    files_a = {
        f for f in os.listdir(folder_a)
        if f.endswith('.csv') and os.path.isfile(os.path.join(folder_a, f))
    }
    files_b = {
        f for f in os.listdir(folder_b)
        if f.endswith('.csv') and os.path.isfile(os.path.join(folder_b, f))
    }

    # Sort the shared names so the report order is reproducible
    # (set iteration order is arbitrary).
    common_files = sorted(files_a & files_b)
    different_files = []

    # Errors that mean "this file cannot be read as a CSV table".
    parse_errors = (
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
        UnicodeDecodeError,
    )

    for file_name in common_files:
        path_a = os.path.join(folder_a, file_name)
        path_b = os.path.join(folder_b, file_name)

        # Byte-identical files hold identical data; no need to parse them.
        if filecmp.cmp(path_a, path_b, shallow=False):
            continue

        try:
            dfa = pd.read_csv(path_a)
            dfb = pd.read_csv(path_b)
        except parse_errors:
            # The bytes differ and at least one file is not a readable CSV,
            # so the pair cannot be shown to be equal: report it as different
            # instead of aborting the whole comparison.
            different_files.append(file_name)
            continue

        if dfa.shape != dfb.shape or not dfa.equals(dfb):
            different_files.append(file_name)

    # Print the result
    if not different_files:
        if common_files:
            print("两个文件夹中所有同名CSV文件内容完全一致")
        else:
            print("未找到同名CSV文件")
    else:
        print("以下文件内容不一致：")
        for file_name in different_files:
            print(file_name)

# Example usage. Guarded so that importing this module does not immediately
# run a comparison against folders that may not exist.
if __name__ == "__main__":
    folder_a = 'A'  # Path to folder A
    folder_b = 'B'  # Path to folder B

    compare_csv_folders(folder_a, folder_b)

说明

找出两个文件夹中同名的CSV文件
先使用二进制方式逐字节比较文件内容（shallow=False，不依赖文件元数据）；若二进制不同，再用 pandas 读取并比较表格数据
输出内容不一致的文件名
如果所有文件都一致，则显示成功消息

添加了进度条、英语注释和函数文档的 V1.1

import os
import filecmp
import pandas as pd
from tqdm import tqdm  # For progress bar display

def compare_csv_folders(folder_a: str, folder_b: str) -> None:
    """
    Compare CSV files with the same name in two folders and report differences.

    Files are first compared byte-for-byte. If the bytes differ, both files
    are parsed with pandas and compared as DataFrames, so files that differ
    only in formatting (for example line endings) still count as identical.

    Args:
        folder_a (str): Path to the first folder containing CSV files
        folder_b (str): Path to the second folder containing CSV files

    Returns:
        None: Results are printed to the console; differing files are
        listed in sorted order
    """
    # Keep regular files only: a directory named "*.csv" cannot be parsed.
    files_a = {
        f for f in os.listdir(folder_a)
        if f.endswith('.csv') and os.path.isfile(os.path.join(folder_a, f))
    }
    files_b = {
        f for f in os.listdir(folder_b)
        if f.endswith('.csv') and os.path.isfile(os.path.join(folder_b, f))
    }

    # Sort the shared names so the report order is reproducible
    # (set iteration order is arbitrary).
    common_files = sorted(files_a & files_b)
    different_files = []

    # Display information about found files
    print(f"Folder A contains {len(files_a)} CSV files")
    print(f"Folder B contains {len(files_b)} CSV files")
    print(f"Found {len(common_files)} common CSV files to compare")

    # Errors that mean "this file cannot be read as a CSV table".
    parse_errors = (
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
        UnicodeDecodeError,
    )

    # Compare each common file with progress bar
    for file_name in tqdm(common_files, desc="Comparing files", unit="file"):
        path_a = os.path.join(folder_a, file_name)
        path_b = os.path.join(folder_b, file_name)

        # Byte-identical files hold identical data; no need to parse them.
        if filecmp.cmp(path_a, path_b, shallow=False):
            continue

        try:
            dfa = pd.read_csv(path_a)
            dfb = pd.read_csv(path_b)
        except parse_errors:
            # The bytes differ and at least one file is not a readable CSV,
            # so the pair cannot be shown to be equal: report it as different
            # instead of aborting the whole comparison.
            different_files.append(file_name)
            continue

        if dfa.shape != dfb.shape or not dfa.equals(dfb):
            different_files.append(file_name)

    # Output results
    print("\n" + "="*50)
    if not different_files:
        if common_files:
            print("✓ All common CSV files have identical content")
        else:
            print("⚠ No common CSV files found to compare")
    else:
        print(f"⚠ Found {len(different_files)} files with different content:")
        for file_name in different_files:
            print(f"  - {file_name}")
    print("="*50)


if __name__ == "__main__":
    # Example usage: compare the CSV files found in folders 'A' and 'B'.
    folder_a, folder_b = 'A', 'B'

    compare_csv_folders(folder_a, folder_b)

使用说明

需要安装 tqdm 包来显示进度条：

	   pip install tqdm

显示每个文件夹中的CSV文件数量
显示找到的同名文件数量
使用进度条显示比较进度
输出比较结果

V 2.0

使用多进程可以显著加速CSV文件的读取和比较过程，特别是当文件数量较多或文件较大时。以下是改进后的代码，使用Python的multiprocessing模块实现并行处理：

import os
import filecmp
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
from functools import partial

def compare_single_file(file_name, folder_a, folder_b):
    """Compare one CSV file that exists under the same name in both folders.

    Args:
        file_name: Name of the CSV file, relative to each folder.
        folder_a: Path to the first folder.
        folder_b: Path to the second folder.

    Returns:
        A ``(file_name, is_same)`` tuple. ``is_same`` is True when the files
        are byte-identical or parse to equal DataFrames, and False when they
        differ or when reading/comparing them raises an exception (the error
        is printed).
    """
    left, right = (
        os.path.join(folder, file_name) for folder in (folder_a, folder_b)
    )

    # Cheap check first: byte-identical files necessarily hold the same data.
    identical = filecmp.cmp(left, right, shallow=False)

    if not identical:
        # The bytes differ, so fall back to comparing the parsed tables.
        try:
            frame_left = pd.read_csv(left)
            frame_right = pd.read_csv(right)
            identical = bool(
                frame_left.shape == frame_right.shape
                and frame_left.equals(frame_right)
            )
        except Exception as e:
            print(f"Error comparing {file_name}: {str(e)}")
            identical = False

    return file_name, identical

def compare_csv_folders(folder_a: str, folder_b: str) -> None:
    """
    Compare CSV files with the same name in two folders and report differences.

    The per-file comparison is delegated to ``compare_single_file`` and run
    in a pool of worker processes, one per CPU core.

    Args:
        folder_a (str): Path to the first folder containing CSV files
        folder_b (str): Path to the second folder containing CSV files

    Returns:
        None: Results are printed to the console; differing files are
        listed in sorted order
    """
    # Keep regular files only: a directory named "*.csv" cannot be parsed.
    files_a = {
        f for f in os.listdir(folder_a)
        if f.endswith('.csv') and os.path.isfile(os.path.join(folder_a, f))
    }
    files_b = {
        f for f in os.listdir(folder_b)
        if f.endswith('.csv') and os.path.isfile(os.path.join(folder_b, f))
    }

    # Sort the shared names so the work order, and therefore the report
    # order, is reproducible (set iteration order is arbitrary).
    common_files = sorted(files_a & files_b)

    # Display information about found files
    print(f"Folder A contains {len(files_a)} CSV files")
    print(f"Folder B contains {len(files_b)} CSV files")
    print(f"Found {len(common_files)} common CSV files to compare")

    # Compare the files in parallel with a process pool
    print("Comparing files with multiprocessing...")

    # Bind the folder arguments so each task only needs the file name.
    compare_func = partial(compare_single_file, folder_a=folder_a, folder_b=folder_b)

    with mp.Pool(processes=mp.cpu_count()) as pool:
        # imap yields results in input order, which lets tqdm advance as
        # tasks finish. The results must be consumed inside the `with`
        # block because leaving it terminates the pool.
        results = list(
            tqdm(
                pool.imap(compare_func, common_files),
                total=len(common_files),
                desc="Comparing files",
                unit="file",
            )
        )

    # Collect the files whose content differs
    different_files = [file_name for file_name, is_same in results if not is_same]

    # Output results
    print("\n" + "="*50)
    if not different_files:
        if common_files:
            print("✓ All common CSV files have identical content")
        else:
            print("⚠ No common CSV files found to compare")
    else:
        print(f"⚠ Found {len(different_files)} files with different content:")
        for file_name in different_files:
            print(f"  - {file_name}")
    print("="*50)

if __name__ == "__main__":
    # Make sure multiprocessing works properly on Windows.
    mp.freeze_support()

    # Example usage: compare the CSV files found in folders 'A' and 'B'.
    folder_a, folder_b = 'A', 'B'

    compare_csv_folders(folder_a, folder_b)