import os
import re
import shutil
import subprocess
import sys
import tempfile
import time

from docx import Document
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls, qn
from docx.text.paragraph import Paragraph
from PyQt5.QtCore import Qt
from PyQt5.QtWidgets import (
    QApplication,
    QFileDialog,
    QMainWindow,
    QProgressDialog,
    QPushButton,
    QStatusBar,
    QTextEdit,
    QVBoxLayout,
    QWidget,
)


class DocumentReaderApp(QMainWindow):
    """Main window that opens .docx/.doc/.wps files and shows their text.

    .doc and .wps files are first converted to .docx with LibreOffice.
    Automatic list labels are rebuilt from the document's numbering
    definitions (number format plus label template), so a list defined as
    "一、" or "(一)" is rendered that way instead of as "1.".

    Only numbering attached directly to a paragraph is handled; numbering
    inherited from a paragraph style is not resolved.
    """

    # Bullet glyph shown for every bullet level. The template text of a bullet
    # level is not used for the label.
    _BULLET = "•"

    # Digits and units for ordinary Chinese numerals (一, 十一, 一百零一, ...).
    _CJK_DIGITS = "零一二三四五六七八九"
    _CJK_UNITS = ("", "十", "百", "千")
    # Digits and units for the "legal" (financial) Chinese numerals.
    _CJK_LEGAL_DIGITS = "零壹贰叁肆伍陆柒捌玖"
    _CJK_LEGAL_UNITS = ("", "拾", "佰", "仟")

    # w:numFmt values rendered with ordinary Chinese numerals.
    _CJK_COUNTING_FORMATS = frozenset({
        'chineseCounting',
        'chineseCountingThousand',
        'japaneseCounting',
        'taiwaneseCounting',
        'taiwaneseCountingThousand',
    })

    # w:numFmt values that pick the n-th character of a fixed sequence.
    _SEQUENCE_TABLES = {
        'ideographTraditional': "甲乙丙丁戊己庚辛壬癸",
        'ideographZodiac': "子丑寅卯辰巳午未申酉戌亥",
    }

    # w:numFmt values mapped to a run of consecutive code points:
    # (code point for 1, how many values are covered).
    # NOTE(review): the covered ranges (10 vs. 20) follow commonly observed
    # Word behaviour -- confirm against real documents.
    _ENCLOSED_RANGES = {
        'decimalEnclosedCircle': (0x2460, 20),         # ① ...
        'decimalEnclosedCircleChinese': (0x2460, 10),  # ① ...
        'decimalEnclosedParen': (0x2474, 20),          # ⑴ ...
        'decimalEnclosedFullstop': (0x2488, 20),       # ⒈ ...
        'ideographEnclosedCircle': (0x3280, 10),       # ㊀ ...
    }

    # Per-digit translation tables for digit-wise formats.
    _FULLWIDTH_DIGITS = str.maketrans("0123456789", "0123456789")
    _IDEOGRAPH_DIGITS = str.maketrans("0123456789", "〇一二三四五六七八九")

    def __init__(self):
        """Build the window: open button, read-only text area and status bar."""
        super().__init__()
        self.setWindowTitle("文档阅读器")
        self.setGeometry(100, 100, 800, 600)

        # Main layout
        main_widget = QWidget()
        self.setCentralWidget(main_widget)
        layout = QVBoxLayout(main_widget)

        # "Choose file" button
        self.btn_open = QPushButton("选择文件")
        self.btn_open.clicked.connect(self.open_file)
        layout.addWidget(self.btn_open)

        # Text display area
        self.text_edit = QTextEdit()
        self.text_edit.setReadOnly(True)
        layout.addWidget(self.text_edit)

        # Status bar
        self.status_bar = QStatusBar()
        self.setStatusBar(self.status_bar)
        self.status_bar.showMessage("就绪")

        # LibreOffice is only needed for DOC/WPS conversion; warn when missing.
        self.libreoffice_path = self.find_libreoffice()
        if not self.libreoffice_path:
            self.status_bar.showMessage("警告: 未找到LibreOffice,DOC/WPS文件转换功能不可用")

    def find_libreoffice(self):
        """Return the path of the LibreOffice executable, or None if not found.

        Well-known install locations are checked first, then the PATH.
        """
        if sys.platform == "win32":
            candidates = [
                r"C:\Program Files\LibreOffice\program\soffice.exe",
                r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
            ]
        else:
            candidates = [
                "/usr/bin/libreoffice",
                "/usr/bin/soffice",
                "/snap/bin/libreoffice",
                "/Applications/LibreOffice.app/Contents/MacOS/soffice",
            ]
        for path in candidates:
            if os.path.exists(path):
                return path
        # shutil.which searches PATH portably and returns one path or None,
        # so no external "where"/"which" process is needed.
        return shutil.which("soffice") or shutil.which("libreoffice")

    def open_file(self):
        """Ask the user for a document and display its text or the error."""
        file_path, _ = QFileDialog.getOpenFileName(
            self, "选择文档", "",
            "文档文件 (*.docx *.doc *.wps);;所有文件 (*.*)"
        )
        if not file_path:
            return

        file_name = os.path.basename(file_path)
        self.status_bar.showMessage(f"正在处理: {file_name}...")
        QApplication.processEvents()  # let the status message repaint first

        try:
            text = self.read_document(file_path)
        except Exception as e:
            # UI boundary: report any failure instead of crashing the app.
            self.text_edit.setText(f"错误: {str(e)}")
            self.status_bar.showMessage(f"读取失败: {file_name}")
        else:
            self.text_edit.setText(text)
            self.status_bar.showMessage(f"成功读取: {file_name}")

    def read_document(self, file_path):
        """Return the text of ``file_path``, chosen by file extension.

        .docx files are read directly. .doc/.wps files are converted into a
        temporary directory that is removed once the text has been read.

        Raises:
            ValueError: the extension is not .docx, .doc or .wps.
            RuntimeError: LibreOffice is missing, conversion failed or the
                document could not be read.
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext == '.docx':
            return self.read_docx_with_numbering(file_path)
        if ext not in ('.doc', '.wps'):
            raise ValueError("不支持的格式")
        if not self.libreoffice_path:
            raise RuntimeError("未安装LibreOffice,无法转换DOC/WPS文件")

        # Busy indicator (range 0-0) without a cancel button.
        progress = QProgressDialog("正在转换文件...", "取消", 0, 0, self)
        progress.setWindowTitle("文档转换")
        progress.setWindowModality(Qt.WindowModal)
        progress.setCancelButton(None)
        progress.show()
        QApplication.processEvents()

        with tempfile.TemporaryDirectory() as temp_dir:
            try:
                converted_file = self.convert_to_docx(file_path, temp_dir)
            finally:
                # Close the modal dialog even when the conversion raises.
                progress.close()
            return self.read_docx_with_numbering(converted_file)

    def convert_to_docx(self, file_path, out_dir=None):
        """Convert ``file_path`` to DOCX with LibreOffice and return the new path.

        Args:
            file_path: document to convert.
            out_dir: directory for the converted file. When omitted a fresh
                temporary directory is created; the caller then owns it and
                must remove it. It is removed here only if conversion fails.

        Raises:
            RuntimeError: the conversion timed out, failed, or produced no file.
        """
        owns_dir = out_dir is None
        target_dir = tempfile.mkdtemp() if owns_dir else out_dir
        cmd = [
            self.libreoffice_path,
            "--headless",            # no user interface
            "--convert-to", "docx",  # target format
            "--outdir", target_dir,  # output directory
            file_path,               # input file
        ]
        try:
            try:
                result = subprocess.run(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    timeout=60,  # seconds
                )
            except subprocess.TimeoutExpired as e:
                raise RuntimeError("文件转换超时") from e
            except OSError as e:
                raise RuntimeError(f"转换过程中出错: {str(e)}") from e

            if result.returncode != 0:
                error_msg = result.stderr.decode('utf-8', errors='ignore')
                raise RuntimeError(f"文件转换失败: {error_msg}")

            # The output is looked up under the input's base name.
            base_name = os.path.splitext(os.path.basename(file_path))[0]
            converted_path = os.path.join(target_dir, f"{base_name}.docx")
            if not os.path.exists(converted_path):
                raise RuntimeError("转换后的文件未找到")
            return converted_path
        except Exception:
            # Do not leak a directory created here when nothing is returned.
            if owns_dir:
                shutil.rmtree(target_dir, ignore_errors=True)
            raise

    def read_docx_with_numbering(self, file_path):
        """Return the text of a .docx file with list labels prepended.

        The file itself is never modified or deleted.

        Raises:
            RuntimeError: the document could not be opened or parsed.
        """
        try:
            doc = Document(file_path)
            num_defs = self._load_numbering(doc)
            counters = {}  # numId -> {level index: current counter value}
            lines = []
            for para in doc.paragraphs:
                label, separator = self._next_label(para, num_defs, counters)
                if label:
                    lines.append(f"{label}{separator}{para.text}")
                else:
                    lines.append(para.text)
            return "\n".join(lines)
        except Exception as e:
            raise RuntimeError(f"读取DOCX文件失败: {str(e)}") from e

    @staticmethod
    def _to_int(value, default):
        """Return ``int(value)``, or ``default`` when that is not possible."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _child_val(element, tag, default=None):
        """Return the ``w:val`` attribute of the direct child ``tag``."""
        child = element.find(qn(tag))
        if child is None:
            return default
        return child.get(qn('w:val'), default)

    def _parse_level(self, lvl):
        """Turn one ``w:lvl`` element into a plain dict describing the level."""
        legal = lvl.find(qn('w:isLgl'))
        return {
            'fmt': self._child_val(lvl, 'w:numFmt', 'decimal'),
            # Label template such as "%1、" or "(%1)"; None when absent.
            'text': self._child_val(lvl, 'w:lvlText'),
            # Falls back to 1, the value the previous implementation always used.
            'start': self._to_int(self._child_val(lvl, 'w:start'), 1),
            'legal': legal is not None
            and legal.get(qn('w:val'), 'true') not in ('0', 'false', 'off'),
            'suffix': self._child_val(lvl, 'w:suff', 'tab'),
        }

    def _load_numbering(self, doc):
        """Return ``{numId: {level index: level dict}}`` for ``doc``.

        Level overrides of a ``w:num`` (replacement level definition and/or
        start value) are merged over the levels of its abstract definition.
        """
        try:
            root = doc.part.numbering_part.element
        except (KeyError, NotImplementedError):
            # python-docx tries to create a numbering part when the document
            # has none, and that creation is not implemented.
            return {}

        abstract_levels = {}
        for abstract_num in root.findall(qn('w:abstractNum')):
            levels = {}
            for lvl in abstract_num.findall(qn('w:lvl')):
                ilvl = self._to_int(lvl.get(qn('w:ilvl')), None)
                if ilvl is not None:
                    levels[ilvl] = self._parse_level(lvl)
            abstract_levels[abstract_num.get(qn('w:abstractNumId'))] = levels

        num_defs = {}
        for num in root.findall(qn('w:num')):
            abstract_id = self._child_val(num, 'w:abstractNumId')
            levels = dict(abstract_levels.get(abstract_id, {}))
            for override in num.findall(qn('w:lvlOverride')):
                ilvl = self._to_int(override.get(qn('w:ilvl')), None)
                if ilvl is None:
                    continue
                replacement = override.find(qn('w:lvl'))
                if replacement is not None:
                    levels[ilvl] = self._parse_level(replacement)
                start = self._to_int(
                    self._child_val(override, 'w:startOverride'), None)
                if start is not None and ilvl in levels:
                    levels[ilvl] = dict(levels[ilvl], start=start)
            num_defs[num.get(qn('w:numId'))] = levels
        return num_defs

    def _next_label(self, para, num_defs, counters):
        """Advance the list counters for ``para`` and return (label, separator).

        Returns two empty strings for paragraphs without numbering.
        ``counters`` is updated in place.
        """
        # Only direct children are inspected, so numbering recorded inside
        # nested property elements is not picked up.
        p_pr = para._p.find(qn('w:pPr'))
        num_pr = p_pr.find(qn('w:numPr')) if p_pr is not None else None
        if num_pr is None:
            return "", ""
        num_id = self._child_val(num_pr, 'w:numId')
        if num_id is None or num_id == '0':
            # numId 0 marks a paragraph whose numbering has been removed.
            return "", ""
        ilvl = self._to_int(self._child_val(num_pr, 'w:ilvl'), 0)

        levels = num_defs.get(num_id, {})
        level = levels.get(ilvl)

        state = counters.setdefault(num_id, {})
        if ilvl in state:
            state[ilvl] += 1
        else:
            state[ilvl] = level['start'] if level else 1
        # A new item restarts the numbering of every deeper level.
        for deeper in [k for k in state if k > ilvl]:
            del state[deeper]

        if level is None or level['text'] is None:
            # No template available: fall back to the simple "<number>." form.
            fmt = level['fmt'] if level else 'decimal'
            return self.get_number_prefix(fmt, state[ilvl]), " "

        separator = "" if level['suffix'] == 'nothing' else " "
        if level['fmt'] == 'bullet':
            return self._BULLET, separator

        def expand(match):
            """Replace one ``%k`` placeholder with the counter of level k-1."""
            ref = int(match.group(1)) - 1
            ref_level = levels.get(ref)
            if ref_level is None:
                return ""
            # Levels not reached yet are shown with their start value.
            value = state.get(ref, ref_level['start'])
            fmt = 'decimal' if level['legal'] else ref_level['fmt']
            return self._format_counter(fmt, value)

        return re.sub(r'%([1-9])', expand, level['text']), separator

    def _format_counter(self, num_fmt, n):
        """Return counter ``n`` rendered in ``num_fmt``, without punctuation.

        Unknown formats and values outside a format's supported range are
        rendered as plain decimal digits.
        """
        if num_fmt in ('none', 'bullet'):
            return ""
        if num_fmt in self._CJK_COUNTING_FORMATS:
            return self.number_to_chinese(n)
        if num_fmt == 'chineseLegalSimplified':
            return self.number_to_chinese(n, legal=True)

        text = ""
        if num_fmt == 'lowerLetter':
            text = self.number_to_letters(n, False)
        elif num_fmt == 'upperLetter':
            text = self.number_to_letters(n, True)
        elif num_fmt == 'lowerRoman':
            text = self.number_to_roman(n).lower()
        elif num_fmt == 'upperRoman':
            text = self.number_to_roman(n)
        elif num_fmt == 'decimalZero':
            text = f"{n:02d}"
        elif num_fmt == 'decimalFullWidth':
            text = str(n).translate(self._FULLWIDTH_DIGITS)
        elif num_fmt == 'ideographDigital':
            text = str(n).translate(self._IDEOGRAPH_DIGITS)
        elif num_fmt in self._SEQUENCE_TABLES:
            table = self._SEQUENCE_TABLES[num_fmt]
            if 1 <= n <= len(table):
                text = table[n - 1]
        elif num_fmt in self._ENCLOSED_RANGES:
            first, count = self._ENCLOSED_RANGES[num_fmt]
            if 1 <= n <= count:
                text = chr(first + n - 1)
        return text or str(n)

    def get_number_prefix(self, num_fmt, counter):
        """Return a simple label for ``counter`` when no template is known.

        Bullets give "•", Chinese numeral formats give "<number>、", the
        'none' format gives an empty string and everything else gives
        "<number>.".
        """
        if num_fmt == 'bullet':
            return self._BULLET
        number = self._format_counter(num_fmt, counter)
        if not number:
            return ""
        if (num_fmt in self._CJK_COUNTING_FORMATS
                or num_fmt == 'chineseLegalSimplified'):
            return f"{number}、"
        return f"{number}."

    def number_to_chinese(self, n, legal=False):
        """Return ``n`` as a Chinese numeral (一, 十, 十一, 一百零一, ...).

        With ``legal=True`` the financial characters (壹, 贰, ... 拾) are
        used. Values below 0 or above 9999 are returned as decimal digits.
        """
        digits = self._CJK_LEGAL_DIGITS if legal else self._CJK_DIGITS
        units = self._CJK_LEGAL_UNITS if legal else self._CJK_UNITS
        if n == 0:
            return digits[0]
        if n < 0 or n > 9999:
            return str(n)

        decimal = str(n)
        parts = []
        pending_zero = False
        for pos, ch in enumerate(decimal):
            digit = int(ch)
            if digit == 0:
                # A run of inner zeros is written as a single 零.
                pending_zero = True
                continue
            if pending_zero:
                parts.append(digits[0])
                pending_zero = False
            parts.append(digits[digit] + units[len(decimal) - pos - 1])
        result = "".join(parts)
        if not legal and 10 <= n <= 19:
            # 10-19 are written 十, 十一, ... rather than 一十, 一十一, ...
            result = result[1:]
        return result

    def number_to_letters(self, n, uppercase=True):
        """Convert a number to letters (A, B, C, ... AA, AB, ...)."""
        letters = ""
        while n > 0:
            n, remainder = divmod(n - 1, 26)
            letters = chr(ord("A") + remainder) + letters
        return letters if uppercase else letters.lower()

    def number_to_roman(self, n):
        """Convert a number to an upper-case Roman numeral ('' for n <= 0)."""
        numerals = (
            (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
            (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
            (10, "X"), (9, "IX"), (5, "V"), (4, "IV"),
            (1, "I"),
        )
        pieces = []
        for value, symbol in numerals:
            if n <= 0:
                break
            count, n = divmod(n, value)
            pieces.append(symbol * count)
        return "".join(pieces)
if __name__ == "__main__":
    # Script entry point: create the Qt application, show the reader window
    # and exit with the status returned by the event loop.
    qt_app = QApplication(sys.argv)
    reader_window = DocumentReaderApp()
    reader_window.show()
    exit_status = qt_app.exec_()
    sys.exit(exit_status)
这个程序可以读取 WPS、DOC、DOCX 文件,并能识别自动编号,但识别出的编号不正确:例如,它会把“一、”“(一)”这样的编号识别成“1.”,与原文内容不符。请帮我修复这段代码。
最新发布