遍历文件夹批量提取Word文档

原创已于 2025-10-11 05:44:23 修改 · 602 阅读

6 ·

CC 4.0 BY-SA版权

文章标签：

#python #word #批量提取

于 2025-10-11 05:20:08 首次发布

python 同时被 2 个专栏收录

12 篇文章

订阅专栏

word

5 篇文章

订阅专栏

需求场景：

老板让我汇总全国100多个分公司的年度述职报告（Word 文档）。每个分公司传来了一个文件夹，里面包含该分公司各个人员的报告。我想把这些报告批量汇总到同一个文件夹里，免得一个文件夹一个文件夹地找。

实现代码：

import os
import shutil
from pathlib import Path


def collect_word_documents(source_dirs, target_dir):
    """Collect Word documents from several source folders into one folder.

    Each source folder is walked recursively and every ``.doc`` / ``.docx``
    file (extension compared case-insensitively) is copied, with metadata,
    into ``target_dir``. When a file name is already taken in the target, a
    numeric suffix (``name_1.docx``, ``name_2.docx``, ...) is appended so
    nothing is overwritten.

    Two kinds of files are deliberately left out:

    * files that already live inside ``target_dir`` -- otherwise a target
      nested in a source folder would have its own contents collected again
      on the same run or on every re-run;
    * Word owner/lock files (``~$name.docx``), which Word leaves next to an
      open document and which are not real documents.

    Args:
        source_dirs: Iterable of source folder paths. A single ``str`` or
            path-like object is also accepted and treated as one folder.
        target_dir: Destination folder; created (including parents) if it
            does not exist yet.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    # Make sure the destination exists before anything is copied.
    target_path = Path(target_dir)
    target_path.mkdir(parents=True, exist_ok=True)
    # Resolved once so source files can be tested for "inside the target".
    resolved_target = target_path.resolve()

    # A bare string would otherwise be iterated character by character.
    if isinstance(source_dirs, (str, os.PathLike)):
        source_dirs = [source_dirs]

    # Supported Word extensions (lower-case; suffixes are lowered on compare).
    word_extensions = {'.docx', '.doc'}

    copied_count = 0
    skipped_count = 0  # files whose copy failed

    print("开始收集Word文档...")

    for source_dir in source_dirs:
        source_path = Path(source_dir)

        if not source_path.exists():
            print(f"警告: 源文件夹不存在: {source_dir}")
            continue

        print(f"正在处理: {source_dir}")

        # Walk every file below the source folder, at any depth.
        for file_path in source_path.rglob('*'):
            if not _is_word_document(file_path, word_extensions):
                continue

            # Never re-collect something that is already in the target.
            if resolved_target in file_path.resolve().parents:
                continue

            target_file = _next_free_path(target_path, file_path)

            try:
                shutil.copy2(file_path, target_file)
            except OSError as e:
                # shutil.Error / SameFileError are OSError subclasses too.
                print(f"错误: 无法复制文件 {file_path}: {e}")
                skipped_count += 1
            else:
                print(f"已复制: {file_path} -> {target_file}")
                copied_count += 1

    print(f"\n操作完成! 成功复制: {copied_count} 个文件, 跳过: {skipped_count} 个文件")
    print(f"文件已保存到: {target_path.absolute()}")


def _is_word_document(file_path, word_extensions):
    """Return True if *file_path* is a regular Word file worth collecting."""
    if file_path.suffix.lower() not in word_extensions:
        return False
    # "~$..." files are Word's owner/lock files, not documents.
    if file_path.name.startswith("~$"):
        return False
    return file_path.is_file()


def _next_free_path(folder, file_path):
    """Return a not-yet-existing path in *folder* named after *file_path*."""
    candidate = folder / file_path.name
    stem, suffix = file_path.stem, file_path.suffix
    counter = 1
    while candidate.exists():
        candidate = folder / f"{stem}_{counter}{suffix}"
        counter += 1
    return candidate


if __name__ == "__main__":
    # Destination folder that receives every collected document
    # (change this to your own path).
    target_directory = r"C:\Users\2025\Desktop\年度述职报告汇总"

    # Folders to scan (change these to your own paths); add further
    # entries to the list, separated by commas.
    source_directories = [
        r"C:\Users\2025\Desktop\年度述职报告",
    ]

    # Run the collection.
    collect_word_documents(
        source_dirs=source_directories,
        target_dir=target_directory,
    )