需要分割的内容模板


其他情况可以根据状态标志位调整
算法思路
先识别用于切分的标志行(各级标题),并用状态变量记录当前所处的位置;
处于某个标志之下的内容,依次加入对应的 buffer 中;
当遇到需要在 Excel 中换行(即开始新的一行记录)的标志时,就将 buffer 中的内容写入 Excel 中
脚本
from docx import Document
from openpyxl import Workbook
import re
# ---------- Configuration ----------
# Path of the Word document to parse.
WORD_FILE = "/Users/asushiro/Downloads/规范化变电运维管理规定细则中的典型故障和异常处理.docx"
# Path of the Excel workbook that receives the parsed rows.
OUT_XLSX = "/Users/asushiro/Downloads/解析结果变电运维.xlsx"
# -----------------------------------

# Load the source document and create a fresh workbook; all rows go to its
# active worksheet.
doc = Document(WORD_FILE)
wb = Workbook()
ws = wb.active

# Column headers: faulty object, abnormal condition, symptoms, handling plan.
_header_row = ["故障对象", "异常情况", "异常表现", "异常处理方案"]
ws.append(_header_row)
# Predicates that decide whether a paragraph is a heading used to split records.
def is_first_level_heading(line: str, para) -> bool:
    """Return True if the paragraph is a first-level heading.

    A paragraph qualifies when its style name is one of the known
    "Heading 1" style names, or when its text looks like ``1 xxx``:
    leading digits, then whitespace, then at least one non-space character.

    Args:
        line: The paragraph text (the caller passes it already stripped).
        para: The paragraph object; only ``para.style.name`` is read.
    """
    if para.style.name in ("Heading 1", "标题 1", "标题1"):
        return True
    # Fallback for documents that number headings without heading styles.
    return bool(re.match(r"^\d+\s+\S+", line))  # 1 xxx
def is_second_level_heading(line: str, para) -> bool:
    """Return True if the paragraph is a second-level heading.

    A paragraph qualifies when its style name is one of the known
    "Heading 2" style names, or when its text looks like ``1.1 xxx``:
    two dot-separated numbers, then whitespace, then a non-space character.

    Args:
        line: The paragraph text (the caller passes it already stripped).
        para: The paragraph object; only ``para.style.name`` is read.
    """
    if para.style.name in ("Heading 2", "标题 2", "标题2"):
        return True
    # Fallback for documents that number headings without heading styles.
    return bool(re.match(r"^\d+\.\d+\s+\S+", line))  # 1.1 xxx
def is_third_level_heading(line: str, para) -> bool:
    """Return True if the paragraph is a third-level heading.

    A paragraph qualifies when its style name is one of the known
    "Heading 3" style names, or when its text starts with three
    dot-separated numbers such as ``1.1.1 现象``. Only the numeric prefix
    is required; any text (or none) may follow it.

    Args:
        line: The paragraph text (the caller passes it already stripped).
        para: The paragraph object; only ``para.style.name`` is read.
    """
    if para.style.name in ("Heading 3", "标题 3", "标题3"):
        return True
    # The original pattern ended in ``\s*\S*``, which can match the empty
    # string and therefore never affected the result; it is omitted here.
    return bool(re.match(r"^\d+\.\d+\.\d+", line))  # 1.1.1 现象
# Parser state (module-level; read by flush_to_rows and updated by the loop).
# Heading texts are single lines, so plain strings are enough for them.
faulty_object = condition = ""
# Line buffers for the "现象" (symptoms) and "处理原则" (handling) sections.
phenomenon, principle = [], []
# Which section, if any, content lines are currently being collected into.
in_phenomenon = in_principle = False
# Gate for flush_to_rows: a row is written only while this flag is True.
have_faulty_object_started = False
# Write the buffered content to the worksheet as one row.
def flush_to_rows():
    """Append the currently buffered record to the worksheet.

    Reads the module-level state (``faulty_object``, ``condition``,
    ``phenomenon``, ``principle``) and appends a single four-column row to
    ``ws``. The buffered lines of each section are joined with newlines and
    the joined text is stripped. The buffers themselves are not cleared
    here; the caller resets them.

    Does nothing while ``have_faulty_object_started`` is false, i.e. when
    there is no record in progress to write.
    """
    if not have_faulty_object_started:
        return
    phenomenon_text = "\n".join(phenomenon).strip()
    principle_text = "\n".join(principle).strip()
    ws.append([faulty_object, condition, phenomenon_text, principle_text])
# Parse the document paragraph by paragraph.
# `no` is the running item number inside the current third-level section;
# it is reset to 0 whenever a "现象" or "处理原则" heading is seen.
no: int = 0
for para in doc.paragraphs:
    line = para.text.strip()
    if line == "":
        continue

    if is_first_level_heading(line, para):
        # Write out the record collected so far (if one is in progress).
        if have_faulty_object_started:
            flush_to_rows()
        # Start a new faulty object and clear all per-record state.
        faulty_object = line
        condition = ""
        phenomenon = []
        principle = []
        in_phenomenon = False
        # The original left in_principle untouched here, so a stale True
        # could survive into the next object; reset it like in_phenomenon.
        in_principle = False
        # No row is written until the next second-level heading starts one.
        have_faulty_object_started = False
        continue

    # A second-level heading closes the previous record and opens a new one.
    if is_second_level_heading(line, para):
        if have_faulty_object_started:
            flush_to_rows()
        condition = line
        phenomenon = []
        principle = []
        in_phenomenon = False
        in_principle = False
        have_faulty_object_started = True
        continue

    # A third-level heading selects which buffer following lines go into.
    if is_third_level_heading(line, para):
        if "现象" in line:
            in_phenomenon = True
            in_principle = False
            no = 0
            continue
        elif "处理原则" in line:
            in_phenomenon = False
            in_principle = True
            no = 0
            continue
        # Any other third-level heading falls through and is handled like
        # an ordinary content line of the section currently being collected.

    # Content line: number it and store it in the active section buffer.
    no += 1
    if in_phenomenon:
        phenomenon.append(str(no) + "、" + line)
        continue
    if in_principle:
        principle.append(str(no) + "、" + line)
        continue
    # Orphan line (not inside a "现象" or "处理原则" section): ignored.

# After the loop, write out the last record that is still buffered.
if have_faulty_object_started:
    flush_to_rows()

# Save the workbook.
wb.save(OUT_XLSX)
print("解析完成,结果保存在:", OUT_XLSX)
537

被折叠的 条评论
为什么被折叠?



