python实现批量word关键字标注

最新推荐文章于 2025-03-27 15:39:00 发布

原创 · 最新推荐文章于 2025-03-27 15:39:00 发布 · 685 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#python

本文介绍如何使用Python编程语言高效地实现批量处理Word文档，并进行关键词标注。通过示例代码，展示如何读取Word文件，搜索特定关键词并进行高亮标注，提升文档处理的自动化水平。

import docx
import os
from docx.opc import exceptions
from docx import shared

def _is_docx_path(candidate):
    """Return True when *candidate* ends with ``.docx`` (case-insensitive)."""
    return str(candidate).lower().endswith('.docx')


def _highlight_run(paragraph, run, pattern, formats):
    """Split *run* around every *pattern* match and format the matches.

    Returns the number of keyword occurrences found in the run.
    """
    import copy  # local import: avoids depending on a new file-level import

    # With a single capture group, split() yields
    # plain, match, plain, ..., plain: odd indexes are keyword hits and
    # even indexes are the (possibly empty) text between them.
    pieces = pattern.split(run.text)
    hits = len(pieces) // 2
    if not hits:
        return 0
    if hits == 1 and not pieces[0] and not pieces[2]:
        # The run consists of exactly one keyword: format it in place.
        format_docx_runs(run, formats)
        return 1

    # Copy the run's XML element (python-docx internal ``_r``) before its
    # text is rewritten, so each new piece starts from the same run
    # properties as the run it was cut from.
    template = copy.deepcopy(run._r)
    run.text = pieces[0]
    anchor = run._r
    for index, piece in enumerate(pieces[1:], start=1):
        if not piece:
            continue
        element = copy.deepcopy(template)
        # Insert right after the previous piece; paragraph.add_run() would
        # append at the end of the paragraph and reorder the text.
        anchor.addnext(element)
        piece_run = type(run)(element, paragraph)
        piece_run.text = piece
        if index % 2:
            format_docx_runs(piece_run, formats)
        anchor = element
    return hits


def replace_docx_format(path, dicts, formats=None, output=None):
    """Mark every occurrence of the given keywords in a .docx document.

    Each run of each body paragraph is searched; a run containing a keyword
    is split in place so that only the keyword itself receives the format.
    When several keywords could match at the same position, the longest wins.

    Args:
        path: Path of the .docx file to read. Required.
        dicts: Iterable of keyword strings to look for. Empty strings are
            ignored.
        formats: Format flags passed to ``format_docx_runs``. Defaults to
            ``['bold', 'red']``.
        output: Path the modified document is saved to. Defaults to *path*,
            i.e. the source file is overwritten.

    Returns:
        True when at least one keyword was found and the document was saved;
        False when a path is not a .docx file, the source file cannot be
        opened, nothing was found (nothing is saved then), or saving failed.
    """
    import re  # local import: avoids depending on a new file-level import

    if not output:
        print('[提示] 已启用默认输出：新文件将会尝试覆盖源文件。')
        output = path
    # Check the extension itself; a substring test would accept names such
    # as "report.docx.bak".
    if not (_is_docx_path(path) and _is_docx_path(output)):
        print('[错误] 请使用docx文件。')
        return False
    if not formats:
        print('[提示] 已启用默认格式：为找到的文字变红加粗。')
        formats = ['bold', 'red']

    try:
        document = docx.Document(path)
    except exceptions.PackageNotFoundError:
        print('[错误] 在路径 ', path, ' 下未找到指定文件！')
        return False

    # De-duplicate, drop empty keys (they would match everywhere) and try
    # longer keys first so a short key cannot pre-empt a longer one.
    keywords = sorted(
        dict.fromkeys(key for key in dicts if key), key=len, reverse=True
    )
    count = 0
    if keywords:
        pattern = re.compile('(' + '|'.join(map(re.escape, keywords)) + ')')
        for paragraph in document.paragraphs:
            # Snapshot the runs: new runs are inserted while iterating.
            for run in list(paragraph.runs):
                count += _highlight_run(paragraph, run, pattern, formats)

    if count == 0:
        print('[警告] 在文档 ', path, ' 中未找到指定内容！未进行保存。')
        return False
    try:
        document.save(output)
    except IOError:
        print('[错误] 请先关闭output的目标文件。')
        return False
    print('[提示] 已找到 ', count, ' 处需要替换的内容，并保存至 ', output)
    return True


def format_docx_runs(runs, formats):
    """Apply the requested format flags to a run and return it.

    Recognised flags in *formats* are ``'underline'``, ``'bold'`` and
    ``'red'``; anything else is ignored.
    """
    # The two boolean flags map one-to-one onto attributes of the run.
    for attribute in ('underline', 'bold'):
        if attribute in formats:
            setattr(runs, attribute, True)
    if 'red' in formats:
        colour = docx.shared.RGBColor(250, 0, 0)
        runs.font.color.rgb = colour
    return runs

def list_folders_files(path):
    """Split the entries of a directory into sub-folders and files.

    :param path: directory whose entries are listed
    :return: tuple ``(list_folders, list_files)`` of entry names (not full
             paths); every entry that is not a directory counts as a file
    """
    folder_names, file_names = [], []
    for entry in os.listdir(path):
        is_folder = os.path.isdir(os.path.join(path, entry))
        bucket = folder_names if is_folder else file_names
        bucket.append(entry)
    return folder_names, file_names


if __name__ == '__main__':  # Script entry point (for testing)
    # Example: mark every '背' in each file of the folder below with the
    # default format; with no output path given, each result is saved over
    # its source file.
    path = "C:\\Users\\LENOVO\\Desktop\\文献\\汇总-各家条文\\word汇总\\汇总-各家条文-形\\38-背"
    (list_folders, list_files) = list_folders_files(path)
    # Format list for text that is not highlighted; 'black' is not one of
    # the flags format_docx_runs acts on, so it leaves such text as it is.
    format1 = ['black']
    print("Path: " + path)
    for fileName in list_files:
        # Let os.path pick the separator instead of hard-coding '\\'.
        filePath = os.path.join(path, fileName)
        replace_docx_format(filePath, ['背'])