import fitz # PyMuPDF
import os
import re
from tkinter import filedialog, messagebox, Tk, Toplevel, Label, IntVar
from tkinter.ttk import Progressbar
import traceback
from win32com import client
import pythoncom
def update_status(completed_pdfs):
    """Persist the running count of completed files to ``status.txt``.

    The file is written relative to the current working directory and is
    overwritten on every call.

    Args:
        completed_pdfs: Number of files processed so far; it is interpolated
            into the status line as-is.
    """
    # The status line contains Chinese text, so pin the encoding instead of
    # relying on the platform default, which may be unable to encode it.
    with open("status.txt", "w", encoding="utf-8") as status_file:
        status_file.write(f"累计完成文件: {completed_pdfs}")
def clean_filename(filename):
    """Return *filename* reduced to CJK characters, ASCII letters, digits and "-".

    Underscores are converted to "-"; every other character outside the
    allowed set is dropped.

    Args:
        filename: The name to sanitise (typically a file stem without
            extension).

    Returns:
        The sanitised name. It may be empty when no character of the input
        is allowed.
    """
    # Convert underscores *before* filtering: "_" is not in the allowed
    # character class, so filtering first would delete it and the
    # replacement would never have anything to act on.
    hyphenated = filename.replace("_", "-")
    return re.sub(r"[^\u4e00-\u9fa5a-zA-Z0-9\-]", "", hyphenated)
def save_pdf_page_as_image(pdf_document, pdf_name, page_num, output_folder, dpi_value=144):
    """Render one PDF page to ``<pdf_name>-Page-<page_num + 1>.png``.

    The image is written into *output_folder*, which is created on demand.
    The page is rasterised with a zoom factor of ``dpi_value / 72`` on both
    axes. Nothing is rendered when the target file already exists. Any error
    is printed together with its traceback and is not re-raised.
    """
    try:
        os.makedirs(output_folder, exist_ok=True)
        target_path = os.path.join(output_folder, f"{pdf_name}-Page-{page_num + 1}.png")
        # Only render pages that have not been exported yet.
        if not os.path.exists(target_path):
            current_page = pdf_document.load_page(page_num)
            scale = dpi_value / 72.0
            transform = fitz.Matrix(scale, scale)
            current_page.get_pixmap(matrix=transform).save(target_path)
    except Exception as e:
        print(f"Error saving page {page_num + 1} as image of {pdf_name}: {e}")
        traceback.print_exc()
def split_pdf_pages(pdf_document, pdf_name, total_pages, output_base_folder, progress_var, progress_window):
    """Export each page of a PDF as an image under ``<output_base_folder>/<pdf_name>``.

    Pages ``0 .. total_pages - 1`` are handed to ``save_pdf_page_as_image``
    one by one. After each page the progress variable is incremented by one
    and the progress window is refreshed.
    """
    # Every PDF gets its own sub-folder named after the document.
    target_dir = os.path.join(output_base_folder, pdf_name)
    os.makedirs(target_dir, exist_ok=True)

    for index in range(total_pages):
        save_pdf_page_as_image(pdf_document, pdf_name, index, target_dir)

        # Advance the progress bar by one page and repaint the window.
        pages_done = progress_var.get() + 1
        progress_var.set(pages_done)
        progress_window.update()
def process_pdf(pdf_path, output_folder, progress_var, progress_label, progress_window):
    """Export every page of one PDF file as a PNG image.

    The output sub-folder and image names are derived from the PDF's file
    stem after it has been passed through ``clean_filename``.

    Args:
        pdf_path: Path of the PDF file to process.
        output_folder: Base folder that receives the per-PDF sub-folder.
        progress_var: Progress counter, advanced once per exported page.
        progress_label: Label widget that shows the file being processed.
        progress_window: Window refreshed while the export is running.

    Returns:
        True when the file was processed, False when any error occurred
        (the error and its traceback are printed, not re-raised).
    """
    try:
        original_pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        pdf_name = clean_filename(original_pdf_name)

        # The context manager closes the document even when exporting a page
        # or updating the UI raises, so the file handle is never leaked.
        with fitz.open(pdf_path) as pdf_document:
            total_pages = len(pdf_document)

            # Show which file is currently being processed.
            progress_label.config(text=f"正在处理文件:{pdf_name} ({total_pages} 页)")
            progress_window.update()

            split_pdf_pages(pdf_document, pdf_name, total_pages, output_folder, progress_var, progress_window)
        return True
    except Exception as e:
        print(f"Failed to process PDF {pdf_path}: {e}")
        traceback.print_exc()
        return False
def convert_word_to_pdf(doc_path):
    """Convert a Word document to PDF through Word's COM automation.

    The PDF is written next to the source document, using the same file
    stem with a ``.pdf`` extension.

    Args:
        doc_path: Path of the ``.doc`` / ``.docx`` file to convert.

    Returns:
        The path of the generated PDF, or None when the conversion failed
        (the error and its traceback are printed, not re-raised).
    """
    try:
        pythoncom.CoInitialize()  # COM must be initialised before dispatching
        word = client.DispatchEx("Word.Application")
        try:
            doc = word.Documents.Open(doc_path)
            try:
                pdf_path = os.path.splitext(doc_path)[0] + ".pdf"
                # 17 is Word's wdFormatPDF save format.
                doc.SaveAs(pdf_path, FileFormat=17)
            finally:
                # Close the document even when saving failed.
                doc.Close()
        finally:
            # Always shut Word down; otherwise a failed conversion would
            # leave an orphaned Word process running in the background.
            word.Quit()
        return pdf_path
    except Exception as e:
        print(f"Failed to convert {doc_path} to PDF: {e}")
        traceback.print_exc()
        return None
def _count_pdf_pages(pdf_path):
    """Return the page count of *pdf_path*, or None when it cannot be read."""
    try:
        # Open only long enough to read the count so no handle is leaked.
        with fitz.open(pdf_path) as pdf_document:
            return pdf_document.page_count
    except Exception as e:
        print(f"Failed to read page count of {pdf_path}: {e}")
        traceback.print_exc()
        return None


def process_files_in_folder(input_folder, output_folder, progress_window, progress_label, progress_bar, progress_var):
    """Process every PDF and Word file found below *input_folder*.

    Word documents (``.doc`` / ``.docx``) are first converted to PDF next to
    the source file. Each PDF is then exported page by page into
    *output_folder*. When all files are done the progress window is
    destroyed and a completion dialog is shown.

    Args:
        input_folder: Folder that is walked recursively for input files.
        output_folder: Base folder that receives the exported images.
        progress_window: Window that is refreshed during processing and
            destroyed at the end.
        progress_label: Label widget that shows the file being processed.
        progress_bar: Progress bar whose maximum is set to the total number
            of pages.
        progress_var: Progress counter, advanced once per exported page.
    """
    # Collect the PDFs to process. Keys are case-normalised paths so that a
    # PDF produced by a conversion is not queued a second time when a file
    # with the same name already sits in the folder (e.g. from an earlier
    # run). Insertion order is preserved.
    pdf_paths = {}
    for root, _, files in os.walk(input_folder):
        for file in files:
            file_path = os.path.join(root, file)
            lowered = file.lower()
            if lowered.endswith('.pdf'):
                pdf_path = file_path
            elif lowered.endswith(('.doc', '.docx')):
                pdf_path = convert_word_to_pdf(file_path)
                if not pdf_path:
                    continue
            else:
                continue
            pdf_paths.setdefault(os.path.normcase(pdf_path), pdf_path)

    # Count pages only after all conversions are finished, so the totals
    # reflect the final content of every PDF. Unreadable files are skipped
    # instead of aborting the whole run.
    file_list = []
    total_pages = 0
    for pdf_path in pdf_paths.values():
        page_count = _count_pdf_pages(pdf_path)
        if page_count is None:
            continue
        file_list.append(pdf_path)
        total_pages += page_count

    progress_bar.config(maximum=total_pages)

    completed_pdfs = 0
    for file_path in file_list:
        if process_pdf(file_path, output_folder, progress_var, progress_label, progress_window):
            completed_pdfs += 1
            update_status(completed_pdfs)

    progress_window.destroy()  # close the progress window
    messagebox.showinfo("完成", f"共处理完成 {completed_pdfs} 个 PDF 文件")  # completion dialog
def main():
    """Ask for the input and output folders, then run the conversion.

    A hidden Tk root hosts the folder dialogs and the progress window. The
    function returns early, after printing a message, when either folder
    dialog is cancelled.
    """
    root = Tk()
    root.withdraw()  # hide the empty root window
    try:
        # Choose the input and output directories.
        input_folder = filedialog.askdirectory(title="选择包含PDF和Word文件的文件夹")
        if not input_folder:
            print("未选择输入文件夹,程序终止")
            return
        output_folder = filedialog.askdirectory(title="选择输出文件夹")
        if not output_folder:
            print("未选择输出文件夹,程序终止")
            return

        # Build the window that displays the progress bar.
        progress_window = Toplevel()
        progress_window.title("处理进度")
        progress_window.geometry("400x150")
        progress_label = Label(progress_window, text="正在初始化...", anchor='w')
        progress_label.pack(fill='x', padx=10, pady=5)
        progress_var = IntVar()
        progress_bar = Progressbar(progress_window, orient="horizontal", length=300, mode="determinate", variable=progress_var)
        progress_bar.pack(padx=10, pady=5)

        # Process the files; this call runs synchronously and refreshes the
        # progress window itself.
        process_files_in_folder(input_folder, output_folder, progress_window, progress_label, progress_bar, progress_var)
    finally:
        # All work is finished by the time we get here. Entering mainloop()
        # on the hidden root would block forever with no window left to
        # close, so tear the root down and let the program exit.
        root.destroy()


if __name__ == "__main__":
    main()
请修改上述代码,使其可以在 Linux 环境下运行,并保持运行效果不变。
最新发布