import os
import re
import shutil
import zipfile
import tempfile
from typing import List, Dict, Tuple
import tkinter as tk
from tkinter import filedialog, messagebox
from docx import Document
from docx.shared import Pt
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls, qn
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.shape import CT_Picture
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
class WordProcessor:
def __init__(self):
self.input_path = ""
self.output_dir = ""
self.full_titles = []
self.current_title_path = []
self.ignore_titles = ["目录", "目 录", "contents", "Contents"]
self.chapter_counter = 0
self.image_counter = 1
self.media_files = {} # 存储图片文件
self.base_filename = "" # 基础文件名
self.source_doc = None # 存储原始文档对象
def select_input_file(self):
"""选择输入文件"""
root = tk.Tk()
root.withdraw()
file_path = filedialog.askopenfilename(
title="选择要处理的Word文档",
filetypes=[("Word文档", "*.docx"), ("所有文件", "*.*")]
)
if file_path:
self.input_path = file_path
self.base_filename = os.path.splitext(os.path.basename(file_path))[0]
self._extract_media_files() # 提取文档中的图片文件
self.source_doc = Document(file_path) # 加载原始文档
return file_path
def select_output_dir(self):
"""选择输出目录"""
root = tk.Tk()
root.withdraw()
dir_path = filedialog.askdirectory(title="选择输出目录")
if dir_path:
self.output_dir = dir_path
return dir_path
def _extract_media_files(self):
"""从Word文档中提取图片文件"""
self.media_files = {}
with zipfile.ZipFile(self.input_path) as z:
for file in z.namelist():
if file.startswith('word/media/'):
self.media_files[file] = z.read(file)
def is_title(self, paragraph, level=1):
"""判断段落是否是标题"""
# 方法1:检查样式名称
if paragraph.style.name.startswith(f'Heading {level}'):
return True
# 方法2:检查格式特征
if level == 1 and paragraph.runs:
run = paragraph.runs[0]
if run.bold and run.font.size == Pt(16):
return True
# 方法3:检查文本模式
text = paragraph.text.strip()
if level == 1 and re.match(r'^第[一二三四五六七八九十]+章', text):
return True
return False
def should_ignore_paragraph(self, paragraph):
"""判断是否应该忽略此段落"""
text = paragraph.text.strip()
return (not text or
text in self.ignore_titles or
re.match(r'^\d+$', text))
def is_useless_image(self, paragraph):
"""判断是否是无用图片"""
return "logo" in paragraph.text.lower()
def is_useless_text(self, paragraph):
"""判断是否是无用文本"""
pattern = r'[A-Z]{1,5}(\s*[/-]\s*[A-Z]{1,5})*\s*\d+(-\d+)*'
return re.fullmatch(pattern, paragraph.text.strip())
def clean_document(self, doc):
"""清理文档中的无用内容"""
# 清理段落
for paragraph in list(doc.paragraphs):
if (self.should_ignore_paragraph(paragraph) or
self.is_useless_image(paragraph) or
self.is_useless_text(paragraph)):
self._remove_element(paragraph._element)
# 清理页眉页脚
for section in doc.sections:
for paragraph in section.header.paragraphs:
if self.is_useless_text(paragraph):
self._remove_element(paragraph._element)
for paragraph in section.footer.paragraphs:
if self.is_useless_text(paragraph):
self._remove_element(paragraph._element)
return doc
def _remove_element(self, element):
"""删除文档元素"""
if element is not None and element.getparent() is not None:
element.getparent().remove(element)
def process_tables(self, doc):
"""处理续表"""
tables = doc.tables
i = 0
while i < len(tables):
first_cell = tables[i].cell(0, 0).text.strip().lower()
if "续表" in first_cell or "continued" in first_cell:
if i > 0:
self._merge_tables(tables[i-1], tables[i])
self._remove_element(tables[i]._element)
i -= 1 # 因为删除了一个表格,索引需要调整
i += 1
return doc
def _merge_tables(self, main_table, continued_table):
"""合并两个表格"""
start_row = 1 if continued_table.rows[0].cells[0].text.strip().lower() in ["续表", "continued"] else 0
for row in continued_table.rows[start_row:]:
new_row = main_table.add_row()
for i, cell in enumerate(row.cells):
new_row.cells[i].text = cell.text
# 复制格式
if cell._element.tcPr is not None:
new_row.cells[i]._element.tcPr = parse_xml(cell._element.tcPr.xml)
def split_by_chapters(self):
"""按章节拆分文档"""
doc = self.source_doc
doc = self.clean_document(doc)
doc = self.process_tables(doc)
chapters = []
current_chapter = None
current_chapter_title = None
# 获取文档主体中的所有元素
body_elements = doc.element.body.xpath('*')
for element in body_elements:
if element.tag.endswith('p'): # 段落
paragraph = self._get_paragraph(doc, element)
if paragraph is None:
continue
if self.should_ignore_paragraph(paragraph):
continue
if self.is_title(paragraph, level=1):
if current_chapter is not None:
chapters.append((current_chapter_title, current_chapter))
current_chapter_title = self._format_chapter_title(paragraph.text)
current_chapter = Document()
# 复制文档的核心样式
self._copy_core_styles(doc, current_chapter)
current_chapter.add_heading(current_chapter_title, level=1)
self.current_title_path = [current_chapter_title]
self.chapter_counter += 1
continue
if current_chapter is not None:
self._copy_paragraph(current_chapter, paragraph)
elif element.tag.endswith('tbl'): # 表格
if current_chapter is not None:
self._copy_table(current_chapter, element)
elif element.tag.endswith('drawing'): # 图片
if current_chapter is not None:
self._copy_image(current_chapter, element)
if current_chapter is not None:
chapters.append((current_chapter_title, current_chapter))
return chapters
def _copy_core_styles(self, source_doc, target_doc):
"""复制核心样式到目标文档"""
# 复制默认段落样式
default_style = source_doc.styles['Normal']
target_style = target_doc.styles['Normal']
target_style.font.name = default_style.font.name
target_style.font.size = default_style.font.size
def _get_paragraph(self, doc, element):
"""获取段落对象"""
for p in doc.paragraphs:
if p._element == element:
return p
return None
def _format_chapter_title(self, title):
"""格式化章节标题"""
title = title.strip()
if not re.match(r'^第[一二三四五六七八九十]+章', title):
match = re.search(r'(第[一二三四五六七八九十]+章\s*.+)', title)
if match:
title = match.group(1)
return title
def _copy_paragraph(self, target_doc, source_paragraph):
"""复制段落及其内容"""
new_para = target_doc.add_paragraph(style=source_paragraph.style)
# 复制段落格式
new_para.paragraph_format.alignment = source_paragraph.paragraph_format.alignment
new_para.paragraph_format.left_indent = source_paragraph.paragraph_format.left_indent
new_para.paragraph_format.right_indent = source_paragraph.paragraph_format.right_indent
new_para.paragraph_format.first_line_indent = source_paragraph.paragraph_format.first_line_indent
new_para.paragraph_format.line_spacing = source_paragraph.paragraph_format.line_spacing
new_para.paragraph_format.space_before = source_paragraph.paragraph_format.space_before
new_para.paragraph_format.space_after = source_paragraph.paragraph_format.space_after
# 复制run和图片
for run in source_paragraph.runs:
new_run = new_para.add_run(run.text)
new_run.bold = run.bold
new_run.italic = run.italic
new_run.underline = run.underline
new_run.font.size = run.font.size
new_run.font.name = run.font.name
# 复制图片
if run._element.xpath('.//wp:inline'):
self._copy_run_image(new_run, run)
def _copy_run_image(self, new_run, source_run):
"""复制run中的图片"""
drawing = source_run._element.xpath('.//wp:inline')[0]
new_run._element.append(parse_xml(drawing.xml))
def _copy_table(self, target_doc, table_element):
"""复制表格"""
new_table = target_doc.add_table(rows=1, cols=1)
new_table._element = parse_xml(table_element.xml)
# 确保表格中的图片引用正确
for row in new_table.rows:
for cell in row.cells:
for paragraph in cell.paragraphs:
for run in paragraph.runs:
if run._element.xpath('.//wp:inline'):
self._copy_run_image(run, run)
def _copy_image(self, target_doc, image_element):
"""复制独立图片"""
para = target_doc.add_paragraph()
run = para.add_run()
run._element.append(parse_xml(image_element.xml))
def save_chapters(self, chapters):
"""保存章节并处理图片引用"""
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
saved_files = []
for idx, (title, chapter_doc) in enumerate(chapters):
# 生成安全的文件名
safe_title = re.sub(r'[\\/*?:"<>|]', "_", title)
# 使用基础文件名+章节标题作为文件名
filename = f"{self.base_filename}-{safe_title}.docx"
filepath = os.path.join(self.output_dir, filename)
# 临时保存以处理图片
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.docx')
chapter_doc.save(temp_file.name)
temp_file.close()
# 处理图片引用
self._repack_docx_with_images(temp_file.name, filepath)
os.unlink(temp_file.name)
saved_files.append(filepath)
return saved_files
def _repack_docx_with_images(self, src_path, dest_path):
"""重新打包docx文件包含图片"""
with zipfile.ZipFile(src_path, 'r') as zin:
with zipfile.ZipFile(dest_path, 'w') as zout:
# 复制所有文件
for item in zin.infolist():
if not item.filename.startswith('word/media/'): # 不复制原media文件
zout.writestr(item, zin.read(item.filename))
# 添加图片文件
for rel_path, data in self.media_files.items():
zout.writestr(rel_path, data)
def _extract_chapter_number(self, title):
"""从标题中提取章节编号"""
match = re.search(r'第([一二三四五六七八九十]+)章', title)
if match:
chinese_num = match.group(1)
num_map = {'一':'1','二':'2','三':'3','四':'4','五':'5',
'六':'6','七':'7','八':'8','九':'9','十':'10'}
return num_map.get(chinese_num, None)
return None
def process_document(self):
"""处理文档主流程"""
if not self.input_path or not os.path.exists(self.input_path):
raise FileNotFoundError("输入文件路径无效或文件不存在")
if not self.output_dir:
raise ValueError("输出目录未指定")
try:
chapters = self.split_by_chapters()
saved_files = self.save_chapters(chapters)
return saved_files
except Exception as e:
raise Exception(f"处理文档时出错: {str(e)}")
def main():
"""主界面"""
try:
processor = WordProcessor()
root = tk.Tk()
root.title("Word文档处理工具 v3.2")
root.geometry("650x450")
# 界面布局
tk.Label(root, text="Word文档高级处理工具", font=("Arial", 16)).pack(pady=10)
# 输入文件选择
input_frame = tk.Frame(root)
input_frame.pack(pady=5, fill=tk.X, padx=20)
tk.Label(input_frame, text="输入文件:").pack(side=tk.LEFT)
input_entry = tk.Entry(input_frame, width=45)
input_entry.pack(side=tk.LEFT, padx=5, expand=True, fill=tk.X)
tk.Button(input_frame, text="浏览...",
command=lambda: input_entry.insert(0, processor.select_input_file())).pack(side=tk.LEFT)
# 输出目录选择
output_frame = tk.Frame(root)
output_frame.pack(pady=5, fill=tk.X, padx=20)
tk.Label(output_frame, text="输出目录:").pack(side=tk.LEFT)
output_entry = tk.Entry(output_frame, width=45)
output_entry.pack(side=tk.LEFT, padx=5, expand=True, fill=tk.X)
tk.Button(output_frame, text="浏览...",
command=lambda: output_entry.insert(0, processor.select_output_dir())).pack(side=tk.LEFT)
# 处理按钮
def on_process():
processor.input_path = input_entry.get()
processor.output_dir = output_entry.get()
if not processor.input_path:
messagebox.showerror("错误", "请先选择输入文件")
return
if not processor.output_dir:
messagebox.showerror("错误", "请先选择输出目录")
return
try:
saved_files = processor.process_document()
messagebox.showinfo("成功",
f"处理完成! 共生成 {len(saved_files)} 个子文档。\n"
f"输出目录: {processor.output_dir}\n"
f"第一个文件: {os.path.basename(saved_files[0])}")
# 打开输出目录
if os.name == 'nt': # Windows
os.startfile(processor.output_dir)
elif os.name == 'posix': # macOS, Linux
os.system(f'open "{processor.output_dir}"')
except Exception as e:
messagebox.showerror("错误", f"处理失败: {str(e)}")
process_btn = tk.Button(root, text="开始处理", command=on_process,
height=2, width=20, bg="#4CAF50", fg="white")
process_btn.pack(pady=20)
# 说明文本
info_frame = tk.Frame(root, borderwidth=1, relief="solid", padx=10, pady=10)
info_frame.pack(pady=10, padx=20, fill=tk.BOTH, expand=True)
tk.Label(info_frame, text="功能说明:", font=("Arial", 10, "bold")).pack(anchor="w")
tk.Label(info_frame,
text="1. 按章节拆分文档,保留原格式\n"
"2. 完整保留所有表格和图片\n"
"3. 自动处理续表合并\n"
"4. 清理无用内容(logo、标准号等)\n"
"5. 生成文件名格式: 原文件名-章节标题",
justify=tk.LEFT, anchor="w").pack(fill=tk.X)
tk.Label(info_frame,
text="输出示例:\n"
"高速公路清障施救标准化手册0901终-第一章 总则.docx\n"
"高速公路清障施救标准化手册0901终-第二章 清障施救标准.docx",
justify=tk.LEFT, anchor="w", fg="blue").pack(fill=tk.X, pady=(5,0))
root.mainloop()
except Exception as e:
messagebox.showerror("系统错误", f"程序发生错误: {str(e)}")
if __name__ == "__main__":
main()
我在执行上面代码的时候,被拆分的文档中所有的图片和表格都没有了,是什原因导致的我应该怎么修改
最新发布