批量将word中的表格汇总成excel表格

最新推荐文章于 2025-10-02 23:00:19 发布

原创最新推荐文章于 2025-10-02 23:00:19 发布 · 471 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#word #excel #python #pandas

python 同时被 3 个专栏收录

12 篇文章

订阅专栏

word

5 篇文章

订阅专栏

excel

1 篇文章

订阅专栏

单位给下级单位下发了某项活动的报名表，但报名表是 Word 文档，下级单位回传的也都是 Word 文档。现在领导要求汇总各单位回传的信息并做数据分析。如果把 Word 里的表格一个个手工复制到 Excel 中，非常耗时。这里介绍一种方法：通过 Python 代码，一键将文件夹里上百个 Word 文档（.docx 格式）中的表格提取出来，并自动合并到一个 Excel 文件中。

直接上代码（运行前需先安装依赖：pip install python-docx pandas openpyxl）：

import os
import re
import pandas as pd
from collections import defaultdict
from docx import Document
from pathlib import Path
from difflib import SequenceMatcher


def _cell_texts(row):
    """Return the stripped text of each cell in *row*, newlines flattened to spaces."""
    return [cell.text.strip().replace("\n", " ") for cell in row.cells]


def _compact_label(text, limit=5):
    """Drop non-word characters from *text* and keep at most *limit* characters."""
    return re.sub(r'\W+', '', text)[:limit]


def _split_header_and_rows(table, col_count):
    """Split *table* into ``(headers, data_rows)`` in a single pass.

    The first row that contains any text is taken as the header row; every
    later non-empty row is a data row, right-padded with "" up to *col_count*.
    """
    headers = []
    data_rows = []
    for row in table.rows:
        texts = _cell_texts(row)
        if not any(texts):
            continue
        if not headers:
            headers = texts
            continue
        texts.extend([""] * (col_count - len(texts)))
        data_rows.append(texts)
    return headers, data_rows


def _frame_for_group(group_tables):
    """Build the merged DataFrame for one style group.

    Returns ``(frame, common_headers)``; ``common_headers`` is None when the
    tables of the group do not all share the same header row, and ``frame`` is
    None when the group holds no rows at all.
    """
    first_headers = group_tables[0]["headers"]
    shared = all(entry["headers"] == first_headers for entry in group_tables)
    common_headers = first_headers if shared and first_headers else None

    if common_headers:
        rows = [row for entry in group_tables for row in entry["data"]]
        # Merged or irregular cells can make a row wider or narrower than the
        # header row. pandas raises ValueError on such a mismatch, so pad the
        # rows and name any surplus columns instead of aborting the export.
        width = max([len(common_headers)] + [len(row) for row in rows])
        columns = list(common_headers) + [
            f"未命名列{pos}" for pos in range(len(common_headers) + 1, width + 1)
        ]
        padded = [row + [""] * (width - len(row)) for row in rows]
        return pd.DataFrame(padded, columns=columns), common_headers

    # No shared header: keep each table's own header row as an ordinary row.
    rows = []
    for entry in group_tables:
        if entry["headers"]:
            rows.append(entry["headers"])
        rows.extend(entry["data"])
    if not rows:
        return None, None

    frame = pd.DataFrame(rows)
    col_count = group_tables[0]["col_count"]
    if frame.shape[1] > col_count:
        frame = frame.iloc[:, :col_count]
    return frame, None


def extract_and_merge_tables_by_style(input_folder, output_file):
    """
    Extract the tables of every Word document in a folder, group them by
    "style" (column count plus a compact form of the header row) and write
    each group to its own sheet of a single Excel workbook.

    Word owner/lock files (names starting with ``~$``) are ignored, documents
    are processed in sorted name order so the output is reproducible, and a
    document or table that fails to parse is reported and skipped rather than
    aborting the run.

    :param input_folder: path of the folder containing the .docx documents
    :param output_file: path of the Excel workbook to create
    :return: None; progress and results are printed to stdout
    """
    # Make sure the output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Collect the documents; "~$name.docx" files are lock files that Word
    # creates next to an open document and are never valid documents.
    docx_files = [
        path for path in sorted(Path(input_folder).glob("*.docx"))
        if not path.name.startswith("~$")
    ]
    if not docx_files:
        print(f"在 '{input_folder}' 中未找到任何Word文档")
        return

    print(f"找到 {len(docx_files)} 个Word文档")

    # Tables grouped by style identifier
    style_groups = defaultdict(list)
    processed_files = 0
    table_count = 0

    for docx_file in docx_files:
        try:
            tables = Document(docx_file).tables
            if not tables:
                continue

            processed_files += 1
            print(f"\n处理 '{docx_file.name}': 包含 {len(tables)} 个表格")

            for table_idx, table in enumerate(tables, 1):
                try:
                    col_count = len(table.columns)
                    headers, table_data = _split_header_and_rows(table, col_count)

                    # Style identifier: column count plus compact header text
                    style_id = f"columns={col_count}"
                    if headers:
                        header_id = "_".join(
                            [_compact_label(h) for h in headers if h]
                        )[:30]
                        style_id += f"_headers={header_id}"

                    if not table_data:
                        print(f"  表格 {table_idx}: 没有有效数据行")
                        continue

                    style_groups[style_id].append({
                        "file": docx_file.name,
                        "table_idx": table_idx,
                        "headers": headers,
                        "col_count": col_count,
                        "data": table_data,
                    })
                    table_count += 1
                    print(f"  表格 {table_idx}: 样式 [{style_id.split('_')[-1]}] ({len(table_data)}行)")

                except Exception as e:
                    # Best effort: one broken table must not stop the batch
                    print(f"  处理表格 {table_idx} 时出错: {str(e)}")

        except Exception as e:
            # Best effort: one unreadable document must not stop the batch
            print(f"处理 '{docx_file.name}' 时出错: {str(e)}")

    print(f"\n成功处理 {processed_files}/{len(docx_files)} 个文档")
    print(f"总表格数: {table_count}")
    print(f"发现的表格样式类型: {len(style_groups)} 种")

    if not style_groups:
        print("未找到任何表格数据")
        return

    # Build every sheet in memory first so the workbook is only created once
    # all frames have been assembled successfully.
    sheets = []
    for style_idx, group_tables in enumerate(style_groups.values(), 1):
        frame, common_headers = _frame_for_group(group_tables)
        if frame is None:
            continue

        if common_headers:
            # Name the sheet after the first two header cells
            header_name = "_".join(
                [_compact_label(h) for h in common_headers[:2] if h]
            )[:15]
            sheet_name = f"样式{style_idx}_{header_name}"
        else:
            sheet_name = f"样式{style_idx}_表格"

        # Excel limits sheet names to 31 characters
        sheets.append((sheet_name[:31], frame, len(group_tables)))

    print("\n按样式分组合并表格:")
    # The context manager saves and closes the workbook even if a write fails
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        for sheet_name, frame, merged_tables in sheets:
            frame.to_excel(writer, sheet_name=sheet_name, index=False)
            print(f"  {sheet_name} - 组合表格数: {merged_tables}, 合并行数: {len(frame)}")

    print(f"\n合并后的表格已保存到: {output_file} (包含 {len(style_groups)} 个Sheet)")


# ==================== Settings ====================
# Change the two paths below to match your environment.
INPUT_FOLDER = r"C:\Users\Administrator\Documents\WeChat Files\xingchenzxc\FileStorage\File\2025-06\报名表"  # folder that holds the Word documents
OUTPUT_FILE = r"C:\Users\Administrator\Documents\WeChat Files\xingchenzxc\FileStorage\File\2025-06\结果.xlsx"  # path of the merged Excel workbook
# ==================================================

if __name__ == "__main__":
    heavy_rule = "=" * 70

    # Opening banner followed by the effective settings
    banner_lines = (
        heavy_rule,
        "Word文档表格按样式分组与合并工具",
        heavy_rule,
        f"输入文件夹: {INPUT_FOLDER}",
        f"输出文件: {OUTPUT_FILE}",
        "-" * 70,
    )
    for banner_line in banner_lines:
        print(banner_line)

    extract_and_merge_tables_by_style(INPUT_FOLDER, OUTPUT_FILE)

    # Closing banner
    print("\n" + heavy_rule)
    print("处理完成！")
    print(heavy_rule)