docx转excel脚本，附算法说明

最新推荐文章于 2025-11-22 13:04:58 发布

原创最新推荐文章于 2025-11-22 13:04:58 发布 · 426 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#excel #算法

Tips 专栏收录该内容

8 篇文章

订阅专栏

需要分割的内容模板

其他情况可以根据状态标志位调整

算法思路

识别需要切分的标志行（各级标题），并用状态变量记录当前所处的位置
处于某个标志之下时，将其后的内容加入到对应的buffer中
当遇到需要在excel中另起一行的标志时，就将buffer中的内容写入到excel中

脚本

from docx import Document
from openpyxl import Workbook
import re

# ---------- Configuration ----------
# Path of the Word document to parse.
WORD_FILE = "/Users/asushiro/Downloads/规范化变电运维管理规定细则中的典型故障和异常处理.docx"
# Path the resulting workbook is saved to.
OUT_XLSX = "/Users/asushiro/Downloads/解析结果变电运维.xlsx"
# -----------------------------------

# Open the source document and create the output workbook; rows are written
# to the workbook's active sheet.
doc = Document(WORD_FILE)
wb = Workbook()
ws = wb.active
# Header row: faulty object, abnormal condition, symptoms, handling plan.
ws.append(["故障对象", "异常情况", "异常表现", "异常处理方案"])


# Heading detectors: decide whether a paragraph is a section-split marker.
def is_first_level_heading(line: str, para) -> bool:
    """Tell whether a paragraph opens a first-level section.

    Args:
        line: Paragraph text as supplied by the caller.
        para: Paragraph-like object exposing ``style.name``.

    Returns:
        True if the style name is one of the level-1 heading styles, or if
        ``line`` starts with digits, whitespace and then a non-space
        character (e.g. ``"1 xxx"``); False otherwise.
    """
    if para.style.name not in ("Heading 1", "标题 1", "标题1"):
        # No heading style applied: fall back to the "1 xxx" numbering shape.
        return re.match(r"^\d+\s+\S+", line) is not None
    return True


def is_second_level_heading(line: str, para) -> bool:
    """Tell whether a paragraph opens a second-level section.

    Args:
        line: Paragraph text as supplied by the caller.
        para: Paragraph-like object exposing ``style.name``.

    Returns:
        True if the style name is one of the level-2 heading styles, or if
        ``line`` starts with ``<digits>.<digits>``, whitespace and then a
        non-space character (e.g. ``"1.1 xxx"``); False otherwise.
    """
    level2_styles = ("Heading 2", "标题 2", "标题2")
    if para.style.name in level2_styles:
        return True
    # Fall back to the "1.1 xxx" numbering shape.
    numbered = re.match(r"^\d+\.\d+\s+\S+", line)
    return numbered is not None


def is_third_level_heading(line: str, para) -> bool:
    """Tell whether a paragraph opens a third-level section.

    Args:
        line: Paragraph text as supplied by the caller.
        para: Paragraph-like object exposing ``style.name``.

    Returns:
        True if the style name is one of the level-3 heading styles, or if
        ``line`` starts with ``<digits>.<digits>.<digits>`` (e.g.
        ``"1.1.1 现象"``; no separator after the number is required);
        False otherwise.
    """
    has_level3_style = para.style.name in ("Heading 3", "标题 3", "标题3")
    # The numbering pattern is only consulted when no heading style matched.
    return has_level3_style or re.match(r"^\d+\.\d+\.\d+\s*\S*", line) is not None



# Parser state shared by the paragraph loop and flush_to_rows().
faulty_object = ""   # text of the current first-level heading
condition = ""     	  # text of the current second-level heading; a heading is a single line, so no list is needed
phenomenon = []       # buffer of lines collected under a "现象" heading
principle = []        # buffer of lines collected under a "处理原则" heading
in_phenomenon = False            # True while content lines go to the phenomenon buffer
in_principle = False            # True while content lines go to the principle buffer
have_faulty_object_started = False  # set True by a second-level heading; gates flushing so the first-level heading text can be reused across several rows

# Write the buffered content out as one Excel row.
def flush_to_rows():
    """Append the record held in the module-level state to the worksheet.

    Does nothing while ``have_faulty_object_started`` is false. Otherwise
    the phenomenon and principle buffers are each joined with newlines,
    stripped of surrounding whitespace, and appended to ``ws`` together
    with ``faulty_object`` and ``condition`` as a four-cell row.
    """
    # Only emit once a record has actually been opened.
    if have_faulty_object_started:
        row = [
            faulty_object,
            condition,
            "\n".join(phenomenon).strip(),
            "\n".join(principle).strip(),
        ]
        ws.append(row)


# Parse the document paragraph by paragraph, driving the section state machine.
no: int = 0  # running item number inside the current 现象 / 处理原则 section
for para in doc.paragraphs:
    line = para.text.strip()
    if not line:
        continue

    if is_first_level_heading(line, para):
        # A new faulty object starts: emit the record collected so far.
        if have_faulty_object_started:
            flush_to_rows()

        # Reset every piece of per-record state. Both section flags are
        # cleared so that text sitting directly under the new first-level
        # heading is not buffered into a section left open by the previous
        # faulty object.
        faulty_object = line
        condition = ""
        phenomenon = []
        principle = []
        in_phenomenon = False
        in_principle = False
        have_faulty_object_started = False
        continue

    if is_second_level_heading(line, para):
        # A new abnormal condition starts under the same faulty object: emit
        # the previous condition's row, then open a fresh record. Only a
        # second-level heading marks a record as started.
        if have_faulty_object_started:
            flush_to_rows()

        condition = line
        phenomenon = []
        principle = []
        in_phenomenon = False
        in_principle = False
        have_faulty_object_started = True
        continue

    if is_third_level_heading(line, para):
        # These headings only switch the active buffer; the heading text
        # itself is not stored. Item numbering restarts in each section.
        if "现象" in line:
            in_phenomenon, in_principle = True, False
            no = 0
            continue
        if "处理原则" in line:
            in_phenomenon, in_principle = False, True
            no = 0
            continue
        # NOTE(review): any other third-level heading falls through and is
        # handled as an ordinary content line of the currently open section.

    # Ordinary content line: number it and append it to the active buffer.
    # Lines outside both sections are counted but otherwise ignored; the
    # counter is reset to 0 whenever a section is opened, so this is harmless.
    no += 1
    if in_phenomenon:
        phenomenon.append(f"{no}、{line}")
    elif in_principle:
        principle.append(f"{no}、{line}")

# Emit the last pending record once the document is exhausted.
if have_faulty_object_started:
    flush_to_rows()

# Save the workbook to disk.
wb.save(OUT_XLSX)
print("解析完成，结果保存在：", OUT_XLSX)