import os
import tkinter as tk
from tkinter import filedialog, ttk, messagebox, simpledialog
import fitz # PyMuPDF
import pandas as pd
from PIL import Image, ImageTk
import io
class PDFExtractorApp:
def __init__(self, root):
    """Configure the root window, initialise viewer state and build the UI.

    Args:
        root: The Tk root window that hosts the application.
    """
    self.root = root
    self.root.title("PDF文本提取器")
    self.root.geometry("1200x800")

    # Document and navigation state.
    self.pdf_document = None
    self.current_page = 0
    self.total_pages = 0
    self.thumbnails = []  # keeps PhotoImage references alive

    # Extraction results.
    self.extracted_data = []
    self.table_data = None  # table rows of the most recent extraction
    self.all_pages_table_data = None  # table rows gathered across all pages

    # Right-button drag (panning) state.
    self.is_dragging = False
    self.drag_start_x = 0
    self.drag_start_y = 0

    # Zoom and fit settings.
    self.zoom_factor = 1.0
    self.zoom_step = 0.1  # small step keeps zooming smooth
    self.min_zoom = 0.2
    self.max_zoom = 5.0
    self.auto_fit_mode = "page"  # one of "page", "width", "none"

    # Whether the Ctrl key is currently held down.
    self.ctrl_pressed = False

    # Rubber-band selection state.
    self.selection_start = None
    self.selection_end = None
    self.is_selecting = False
    self.selection_rect = None
    self.page_bbox = None  # bounding box of the rendered page on the canvas

    # On-canvas coordinate read-out and dashed guide lines.
    self.coord_display = None
    self.coord_text_id = None
    self.helper_lines = []

    # Table-detection tolerances: how far apart text may sit and still be
    # treated as belonging to the same column / row.
    self.column_tolerance = 10
    self.row_tolerance = 10

    self.setup_ui()
def setup_ui(self):
    """Build the window: thumbnail strip, PDF canvas, control panel, status bar.

    Every widget is created and packed here, the button, mouse and keyboard
    callbacks are wired up, and the widgets and Tk variables that other
    methods read are stored as attributes on the instance.
    """
    pad = {"padx": 5, "pady": 5}

    main_frame = ttk.Frame(self.root)
    main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

    # ----- Left pane: page-jump box above a scrollable thumbnail strip -----
    left_frame = ttk.LabelFrame(main_frame, text="页面预览")
    left_frame.pack(side=tk.LEFT, fill=tk.Y, **pad)

    jump_frame = ttk.Frame(left_frame)
    jump_frame.pack(fill=tk.X, **pad)
    ttk.Label(jump_frame, text="跳转至:").pack(side=tk.LEFT, padx=2)
    self.page_entry = ttk.Entry(jump_frame, width=6)
    self.page_entry.pack(side=tk.LEFT, padx=2)
    ttk.Button(jump_frame, text="跳转", command=self.jump_to_page).pack(side=tk.LEFT, padx=2)

    thumb_scrollbar = ttk.Scrollbar(left_frame)
    thumb_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    self.thumbnail_canvas = tk.Canvas(left_frame, width=150, yscrollcommand=thumb_scrollbar.set)
    self.thumbnail_canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
    thumb_scrollbar.config(command=self.thumbnail_canvas.yview)

    # All thumbnails live in one frame embedded in the canvas.
    self.thumbnails_frame = ttk.Frame(self.thumbnail_canvas)
    self.thumbnail_window = self.thumbnail_canvas.create_window(
        (0, 0), window=self.thumbnails_frame, anchor=tk.NW)

    # ----- Centre pane: the page canvas with both scrollbars -----
    center_frame = ttk.LabelFrame(main_frame, text="PDF预览")
    center_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, **pad)

    canvas_frame = ttk.Frame(center_frame)
    canvas_frame.pack(fill=tk.BOTH, expand=True)

    self.xscrollbar = ttk.Scrollbar(canvas_frame, orient=tk.HORIZONTAL)
    self.xscrollbar.pack(side=tk.BOTTOM, fill=tk.X)
    self.yscrollbar = ttk.Scrollbar(canvas_frame, orient=tk.VERTICAL)
    self.yscrollbar.pack(side=tk.RIGHT, fill=tk.Y)

    self.pdf_canvas = tk.Canvas(
        canvas_frame,
        bg="white",
        xscrollcommand=self.xscrollbar.set,
        yscrollcommand=self.yscrollbar.set,
    )
    self.pdf_canvas.pack(fill=tk.BOTH, expand=True)
    self.xscrollbar.config(command=self.pdf_canvas.xview)
    self.yscrollbar.config(command=self.pdf_canvas.yview)

    # Left button draws the selection; wheel and right button move the view.
    canvas_bindings = (
        ("<ButtonPress-1>", self.on_mouse_down),
        ("<B1-Motion>", self.on_mouse_drag),
        ("<ButtonRelease-1>", self.on_mouse_up),
        ("<MouseWheel>", self.on_mouse_wheel),
        ("<ButtonPress-3>", self.on_right_mouse_down),
        ("<B3-Motion>", self.on_right_mouse_drag),
        ("<ButtonRelease-3>", self.on_right_mouse_up),
    )
    for sequence, handler in canvas_bindings:
        self.pdf_canvas.bind(sequence, handler)

    # Ctrl key tracking is bound on the root window.
    self.root.bind("<Control-KeyPress>", self.on_ctrl_press)
    self.root.bind("<Control-KeyRelease>", self.on_ctrl_release)

    # ----- Right pane: controls -----
    right_frame = ttk.LabelFrame(main_frame, text="控制区")
    right_frame.pack(side=tk.RIGHT, fill=tk.Y, **pad)

    ttk.Button(right_frame, text="打开PDF", command=self.open_pdf).pack(fill=tk.X, **pad)

    # Previous / next page with the "current/total" label in between.
    page_nav_frame = ttk.Frame(right_frame)
    page_nav_frame.pack(fill=tk.X, **pad)
    ttk.Button(page_nav_frame, text="上一页", command=self.prev_page).pack(side=tk.LEFT, padx=2)
    self.page_label = ttk.Label(page_nav_frame, text="0/0")
    self.page_label.pack(side=tk.LEFT, padx=5)
    ttk.Button(page_nav_frame, text="下一页", command=self.next_page).pack(side=tk.LEFT, padx=2)

    # Zoom buttons.
    zoom_frame = ttk.Frame(right_frame)
    zoom_frame.pack(fill=tk.X, **pad)
    zoom_buttons = (
        ("放大", self.zoom_in),
        ("缩小", self.zoom_out),
        ("重置视图", self.reset_view),
    )
    for caption, handler in zoom_buttons:
        ttk.Button(zoom_frame, text=caption, command=handler).pack(side=tk.LEFT, padx=2)

    # Auto-fit mode selector.
    fit_frame = ttk.Frame(right_frame)
    fit_frame.pack(fill=tk.X, **pad)
    ttk.Label(fit_frame, text="自适应:").pack(side=tk.LEFT, padx=2)
    self.fit_mode = tk.StringVar(value="page")
    for caption, mode in (("整页", "page"), ("宽度", "width"), ("实际大小", "none")):
        ttk.Radiobutton(
            fit_frame,
            text=caption,
            variable=self.fit_mode,
            value=mode,
            command=self.update_fit_mode,
        ).pack(side=tk.LEFT, padx=2)

    # Extraction scope selector.
    ttk.Label(right_frame, text="提取选项").pack(fill=tk.X, **pad)
    self.extract_type = tk.StringVar(value="selection")
    for caption, scope in (("选区提取", "selection"), ("整页提取", "full_page")):
        ttk.Radiobutton(
            right_frame, text=caption, variable=self.extract_type, value=scope
        ).pack(anchor=tk.W, padx=20)

    # Table-detection tolerances.
    table_frame = ttk.LabelFrame(right_frame, text="表格识别选项")
    table_frame.pack(fill=tk.X, **pad)
    ttk.Label(table_frame, text="列容差:").grid(row=0, column=0, padx=5, pady=2, sticky=tk.W)
    self.column_tolerance_var = tk.StringVar(value=str(self.column_tolerance))
    ttk.Entry(table_frame, textvariable=self.column_tolerance_var, width=5).grid(
        row=0, column=1, padx=2, pady=2)
    ttk.Label(table_frame, text="行容差:").grid(row=1, column=0, padx=5, pady=2, sticky=tk.W)
    self.row_tolerance_var = tk.StringVar(value=str(self.row_tolerance))
    ttk.Entry(table_frame, textvariable=self.row_tolerance_var, width=5).grid(
        row=1, column=1, padx=2, pady=2)
    ttk.Button(table_frame, text="应用设置", command=self.apply_table_settings).grid(
        row=0, column=2, rowspan=2, padx=5, pady=2)

    # Read-only selection coordinates: two corners plus the size.
    selection_frame = ttk.LabelFrame(right_frame, text="选区坐标")
    selection_frame.pack(fill=tk.X, **pad)

    def add_pair_row(row, caption, separator):
        """Add one 'caption: [a] sep [b]' row and return its two variables."""
        ttk.Label(selection_frame, text=caption).grid(
            row=row, column=0, padx=5, pady=2, sticky=tk.W)
        first_var = tk.StringVar(value="0")
        second_var = tk.StringVar(value="0")
        ttk.Entry(selection_frame, textvariable=first_var, width=8, state="readonly").grid(
            row=row, column=1, padx=2, pady=2)
        ttk.Label(selection_frame, text=separator).grid(row=row, column=2, padx=0, pady=2)
        ttk.Entry(selection_frame, textvariable=second_var, width=8, state="readonly").grid(
            row=row, column=3, padx=2, pady=2)
        return first_var, second_var

    self.selection_x1_var, self.selection_y1_var = add_pair_row(0, "左上角:", ",")
    self.selection_x2_var, self.selection_y2_var = add_pair_row(1, "右下角:", ",")
    self.selection_width_var, self.selection_height_var = add_pair_row(2, "尺寸:", "×")

    # Selection reset, extraction and export actions.
    action_buttons = (
        ("重置选区", self.reset_selection),
        ("提取文本", self.extract_text),
        ("提取所有页面相同区域", self.extract_all_pages_same_region),
        ("导出到Excel", self.export_to_excel),
    )
    for caption, handler in action_buttons:
        ttk.Button(right_frame, text=caption, command=handler).pack(fill=tk.X, **pad)

    # Extraction result preview.
    result_frame = ttk.LabelFrame(right_frame, text="提取结果")
    result_frame.pack(fill=tk.BOTH, expand=True, **pad)
    self.result_text = tk.Text(result_frame, height=10)
    self.result_text.pack(fill=tk.BOTH, expand=True, **pad)

    # Status bar along the bottom of the window.
    self.status_var = tk.StringVar(value="就绪")
    self.status_bar = ttk.Label(
        self.root, textvariable=self.status_var, relief=tk.SUNKEN, anchor=tk.W)
    self.status_bar.pack(side=tk.BOTTOM, fill=tk.X)

    # Pressing Return in the page box triggers the jump.
    self.page_entry.bind("<Return>", lambda event: self.jump_to_page())
    # Keep the thumbnail scroll region in step with the frame's size.
    self.thumbnails_frame.bind("<Configure>", self.on_thumbnails_configure)
def apply_table_settings(self):
"""应用表格识别设置"""
try:
self.column_tolerance = float(self.column_tolerance_var.get())
self.row_tolerance = float(self.row_tolerance_var.get())
self.status_var.set(f"已应用表格识别设置: 列容差={self.column_tolerance}, 行容差={self.row_tolerance}")
except ValueError:
messagebox.showerror("输入错误", "请输入有效的数值")
def on_thumbnails_configure(self, event):
    """Resize the thumbnail canvas scroll region to enclose all of its items.

    Args:
        event: The Tk ``<Configure>`` event (unused).
    """
    canvas = self.thumbnail_canvas
    everything = canvas.bbox("all")
    canvas.configure(scrollregion=everything)
def open_pdf(self):
    """Ask the user for a PDF, load it and show its first page.

    Per-document state (page index, extraction results, result text) is
    reset, thumbnails are generated and the first page is rendered. The
    previously opened document, if any, is closed once the new one has been
    opened, so its file handle is not leaked. Any failure is reported in an
    error dialog rather than raised.
    """
    file_path = filedialog.askopenfilename(filetypes=[("PDF文件", "*.pdf")])
    if not file_path:
        return

    try:
        document = fitz.open(file_path)

        # Swap in the new document first, then release the old one.
        previous_document = self.pdf_document
        self.pdf_document = document
        if previous_document is not None and not previous_document.is_closed:
            previous_document.close()

        self.total_pages = len(self.pdf_document)
        self.current_page = 0
        self.extracted_data = []
        self.table_data = None
        self.all_pages_table_data = None
        self.result_text.delete(1.0, tk.END)

        # Keep the page label and the jump box in step with the new document.
        self.page_label.config(text=f"{self.current_page + 1}/{self.total_pages}")
        self.page_entry.delete(0, tk.END)
        self.page_entry.insert(0, str(self.current_page + 1))

        self.generate_thumbnails()
        self.display_current_page()
        self.status_var.set(f"已打开文件: {os.path.basename(file_path)}")
    except Exception as e:
        # UI boundary: surface every failure to the user instead of crashing.
        messagebox.showerror("错误", f"无法打开PDF文件: {e}")
def generate_thumbnails(self):
    """Rebuild the thumbnail strip with one clickable preview per page.

    Existing thumbnail widgets are destroyed first. Each page is rendered
    at 20% scale into a button that jumps to that page, followed by a page
    number label. The row of the current page gets the "ActiveThumbnail"
    styles; every other row switches to the "HoverThumbnail" styles while
    the pointer is over it.
    """
    # Discard the old image references and widgets.
    self.thumbnails = []
    for stale_widget in self.thumbnails_frame.winfo_children():
        stale_widget.destroy()

    preview_matrix = fitz.Matrix(0.2, 0.2)
    for page_num in range(self.total_pages):
        row = ttk.Frame(self.thumbnails_frame)
        row.pack(fill=tk.X, padx=5, pady=2)

        # Render the page and wrap it as a Tk image.
        pixmap = self.pdf_document[page_num].get_pixmap(matrix=preview_matrix)
        photo = ImageTk.PhotoImage(Image.open(io.BytesIO(pixmap.tobytes("ppm"))))
        # Tk does not keep the image alive on its own; hold a reference.
        self.thumbnails.append(photo)

        button = tk.Button(
            row,
            image=photo,
            command=lambda p=page_num: self.go_to_page(p),
            relief=tk.FLAT,
        )
        button.pack(side=tk.LEFT, padx=5, pady=5)

        caption = ttk.Label(row, text=f"页面 {page_num + 1}")
        caption.pack(side=tk.LEFT, padx=5)

        if page_num == self.current_page:
            # The current page is highlighted and gets no hover effect.
            row.config(style="ActiveThumbnail.TFrame")
            caption.config(style="ActiveThumbnail.TLabel")
            continue

        # Hover effect for every other row.
        row.bind("<Enter>", lambda e, f=row: f.config(style="HoverThumbnail.TFrame"))
        row.bind("<Leave>", lambda e, f=row: f.config(style=""))
        caption.bind("<Enter>", lambda e, l=caption: l.config(style="HoverThumbnail.TLabel"))
        caption.bind("<Leave>", lambda e, l=caption: l.config(style=""))
def display_current_page(self):
if not self.pdf_document:
return
self.pdf_canvas.delete("all")
self.selection_start = None
self.selection_end = None
self.is_selecting = False
# 重置选区坐标显示
self.update_selection_coordinates(0, 0, 0, 0)
page = self.pdf_document[self.current_page]
# 根据自适应模式计算缩放比例
if self.auto_fit_mode == "page":
# 自适应整页
self.adjust_zoom_to_fit_page()
elif self.auto_fit_mode == "width":
# 自适应宽度
self.adjust_zoom_to_fit_width()
else:
# 实际大小
pass # 使用当前zoom_factor
# 使用计算的缩放比例渲染页面
matrix = fitz.Matrix(self.zoom_factor, self.zoom_factor)
pix = page.get_pixmap(matrix=matrix)
img = Image.open(io.BytesIO(pix.tobytes("ppm")))
photo = ImageTk.PhotoImage(img)
# 存储引用以防止垃圾回收
self.current_image = photo
# 显示图像
self.image_id = self.pdf_canvas.create_image(0, 0, anchor=tk.NW, image=photo)
# 获取页面边界框(用于限制选区)
self.page_bbox = self.pdf_canvas.bbox(self.image_id)
# 设置画布滚动区域
self.pdf_canvas.config(scrollregion=self.pdf_canvas.bbox(tk.ALL))
# 居中显示
self.center_view()
# 状态栏显示页面信息
self.status_var.set(f"页面 {self.current_page + 1}/{self.total_pages}, 缩放比例: {self.zoom_factor:.1f}x")
# 更新缩略图高亮
self.generate_thumbnails()
def adjust_zoom_to_fit_page(self):
"""调整缩放比例以适应整个页面"""
if not self.pdf_document:
return
# 获取当前页面和画布尺寸
page = self.pdf_document[self.current_page]
canvas_width = self.pdf_canvas.winfo_width()
canvas_height = self.pdf_canvas.winfo_height()
# 考虑滚动条宽度
scrollbar_width = 15
canvas_width -= scrollbar_width
canvas_height -= scrollbar_width
# 计算页面宽高
page_width = page.rect.width
page_height = page.rect.height
# 计算适应画布的缩放比例
width_factor = canvas_width / page_width
height_factor = canvas_height / page_height
# 取较小的缩放比例以确保整个页面可见
self.zoom_factor = min(width_factor, height_factor)
def adjust_zoom_to_fit_width(self):
"""调整缩放比例以适应页面宽度"""
if not self.pdf_document:
return
# 获取当前页面和画布宽度
page = self.pdf_document[self.current_page]
canvas_width = self.pdf_canvas.winfo_width()
# 考虑滚动条宽度
scrollbar_width = 15
canvas_width -= scrollbar_width
# 计算页面宽度
page_width = page.rect.width
# 计算适应画布宽度的缩放比例
self.zoom_factor = canvas_width / page_width
def center_view(self):
"""居中显示PDF页面"""
if not self.pdf_document:
return
# 获取画布和页面尺寸
canvas_width = self.pdf_canvas.winfo_width()
canvas_height = self.pdf_canvas.winfo_height()
page_width = self.pdf_canvas.bbox(tk.ALL)[2]
page_height = self.pdf_canvas.bbox(tk.ALL)[3]
# 计算居中位置
if page_width > canvas_width:
# 页面宽度大于画布,使用滚动条
xview = 0
else:
# 页面宽度小于画布,居中显示
xview = (canvas_width - page_width) / 2 / canvas_width
if page_height > canvas_height:
# 页面高度大于画布,使用滚动条
yview = 0
else:
# 页面高度小于画布,居中显示
yview = (canvas_height - page_height) / 2 / canvas_height
# 设置视图位置
self.pdf_canvas.xview_moveto(xview)
self.pdf_canvas.yview_moveto(yview)
def go_to_page(self, page_num):
"""跳转到指定页面"""
if 0 <= page_num < self.total_pages:
self.current_page = page_num
self.page_label.config(text=f"{self.current_page + 1}/{self.total_pages}")
self.display_current_page()
self.page_entry.delete(0, tk.END)
self.page_entry.insert(0, str(page_num + 1))
def jump_to_page(self):
"""根据输入的页码跳转到指定页面"""
if not self.pdf_document:
return
try:
page_num = int(self.page_entry.get()) - 1
if 0 <= page_num < self.total_pages:
self.go_to_page(page_num)
else:
messagebox.showwarning("页码错误", f"请输入1到{self.total_pages}之间的页码")
self.page_entry.delete(0, tk.END)
self.page_entry.insert(0, str(self.current_page + 1))
except ValueError:
messagebox.showwarning("输入错误", "请输入有效的页码")
self.page_entry.delete(0, tk.END)
self.page_entry.insert(0, str(self.current_page + 1))
def prev_page(self):
if self.current_page > 0:
self.current_page -= 1
self.page_label.config(text=f"{self.current_page + 1}/{self.total_pages}")
self.display_current_page()
self.page_entry.delete(0, tk.END)
self.page_entry.insert(0, str(self.current_page + 1))
def next_page(self):
if self.current_page < self.total_pages - 1:
self.current_page += 1
self.page_label.config(text=f"{self.current_page + 1}/{self.total_pages}")
self.display_current_page()
self.page_entry.delete(0, tk.END)
self.page_entry.insert(0, str(self.current_page + 1))
def on_mouse_down(self, event):
# 检查是否点击在页面内
if self.page_bbox and self.is_point_in_page(event.x, event.y):
self.is_selecting = True
self.selection_start = (event.x, event.y)
# 限制起点在页面内
self.selection_start = self.clamp_point_to_page(self.selection_start[0], self.selection_start[1])
# 创建选区矩形
self.selection_rect = self.pdf_canvas.create_rectangle(
self.selection_start[0], self.selection_start[1],
self.selection_start[0], self.selection_start[1],
outline="red", width=2, stipple="gray25", fill="#FF000033")
# 更新选区坐标显示
self.update_selection_coordinates(
int(self.selection_start[0]),
int(self.selection_start[1]),
int(self.selection_start[0]),
int(self.selection_start[1])
)
# 状态栏显示选区信息
self.status_var.set(f"选区起点: ({int(self.selection_start[0])}, {int(self.selection_start[1])})")
# 清除之前的辅助线条
for line in self.helper_lines:
self.pdf_canvas.delete(line)
self.helper_lines = []
def on_mouse_drag(self, event):
if self.is_selecting and hasattr(self, 'selection_rect'):
# 限制终点在页面内
end_x, end_y = self.clamp_point_to_page(event.x, event.y)
# 更新选区矩形
self.pdf_canvas.coords(self.selection_rect,
self.selection_start[0], self.selection_start[1],
end_x, end_y)
# 计算选区坐标
x1, y1 = self.selection_start
x2, y2 = end_x, end_y
# 确保坐标按左上右下排序
if x1 > x2:
x1, x2 = x2, x1
if y1 > y2:
y1, y2 = y2, y1
# 更新选区坐标显示
self.update_selection_coordinates(int(x1), int(y1), int(x2), int(y2))
# 计算选区尺寸
width = x2 - x1
height = y2 - y1
# 显示选区坐标和尺寸
coord_text = f"选区: ({int(x1)}, {int(y1)}) - ({int(x2)}, {int(y2)}) | 尺寸: {int(width)}×{int(height)} 像素"
self.status_var.set(coord_text)
# 更新辅助线条
for line in self.helper_lines:
self.pdf_canvas.delete(line)
self.helper_lines = []
# 绘制水平辅助线
h_line1 = self.pdf_canvas.create_line(x1, y1, x2, y1, fill="blue", dash=(4, 4))
h_line2 = self.pdf_canvas.create_line(x1, y2, x2, y2, fill="blue", dash=(4, 4))
# 绘制垂直辅助线
v_line1 = self.pdf_canvas.create_line(x1, y1, x1, y2, fill="blue", dash=(4, 4))
v_line2 = self.pdf_canvas.create_line(x2, y1, x2, y2, fill="blue", dash=(4, 4))
self.helper_lines.extend([h_line1, h_line2, v_line1, v_line2])
def on_mouse_up(self, event):
if self.is_selecting:
self.is_selecting = False
# 限制终点在页面内
end_x, end_y = self.clamp_point_to_page(event.x, event.y)
self.selection_end = (end_x, end_y)
# 检查选区是否太小
width = abs(self.selection_end[0] - self.selection_start[0])
height = abs(self.selection_end[1] - self.selection_start[1])
if width < 10 or height < 10:
# 选区太小,删除选区
self.pdf_canvas.delete(self.selection_rect)
self.selection_start = None
self.selection_end = None
# 重置选区坐标显示
self.update_selection_coordinates(0, 0, 0, 0)
self.status_var.set("选区太小,已重置")
else:
# 更新选区矩形
self.pdf_canvas.coords(self.selection_rect,
self.selection_start[0], self.selection_start[1],
self.selection_end[0], self.selection_end[1])
# 计算最终选区坐标
x1, y1 = self.selection_start
x2, y2 = self.selection_end
# 确保坐标按左上右下排序
if x1 > x2:
x1, x2 = x2, x1
if y1 > y2:
y1, y2 = y2, y1
# 更新选区坐标显示
self.update_selection_coordinates(int(x1), int(y1), int(x2), int(y2))
# 计算选区尺寸
width = x2 - x1
height = y2 - y1
# 显示最终选区坐标和尺寸
coord_text = f"选区已完成: ({int(x1)}, {int(y1)}) - ({int(x2)}, {int(y2)}) | 尺寸: {int(width)}×{int(height)} 像素"
self.status_var.set(coord_text)
# 清除辅助线条
for line in self.helper_lines:
self.pdf_canvas.delete(line)
self.helper_lines = []
def update_selection_coordinates(self, x1, y1, x2, y2):
"""更新选区坐标显示"""
# 确保坐标按左上右下排序
if x1 > x2:
x1, x2 = x2, x1
if y1 > y2:
y1, y2 = y2, y1
# 更新坐标显示
self.selection_x1_var.set(str(x1))
self.selection_y1_var.set(str(y1))
self.selection_x2_var.set(str(x2))
self.selection_y2_var.set(str(y2))
# 计算并更新尺寸
width = x2 - x1
height = y2 - y1
self.selection_width_var.set(str(width))
self.selection_height_var.set(str(height))
def is_point_in_page(self, x, y):
"""检查点是否在页面内"""
if not self.page_bbox:
return False
x1, y1, x2, y2 = self.page_bbox
return x1 <= x <= x2 and y1 <= y <= y2
def clamp_point_to_page(self, x, y):
"""将点限制在页面范围内"""
if not self.page_bbox:
return (x, y)
x1, y1, x2, y2 = self.page_bbox
clamped_x = max(x1, min(x, x2))
clamped_y = max(y1, min(y, y2))
return (clamped_x, clamped_y)
def extract_text(self):
if not self.pdf_document:
messagebox.showinfo("提示", "请先打开PDF文件")
return
if self.extract_type.get() == "selection":
if not self.selection_start or not self.selection_end:
messagebox.showinfo("提示", "请先选择文本区域")
return
# 确保选区坐标正确排序
x0, y0 = min(self.selection_start[0], self.selection_end[0]), min(self.selection_start[1], self.selection_end[1])
x1, y1 = max(self.selection_start[0], self.selection_end[0]), max(self.selection_start[1], self.selection_end[1])
# 计算选区在原始PDF中的比例
page = self.pdf_document[self.current_page]
# 调整选区坐标为PDF坐标
pdf_x0 = x0 / self.zoom_factor
pdf_y0 = y0 / self.zoom_factor
pdf_x1 = x1 / self.zoom_factor
pdf_y1 = y1 / self.zoom_factor
# 创建选区矩形
rect = fitz.Rect(pdf_x0, pdf_y0, pdf_x1, pdf_y1)
# 使用dict模式提取文本,获取更详细的布局信息
text_data = page.get_text("dict", clip=rect)
else:
# 整页提取
page = self.pdf_document[self.current_page]
text_data = page.get_text("dict")
if text_data and 'blocks' in text_data:
# 分析文本块,识别表格结构
table_data = self.analyze_table_structure(text_data)
# 添加页码到表格数据
if table_data:
self.table_data = [[f"页面 {self.current_page + 1}"] + row for row in table_data]
else:
self.table_data = None
# 更新结果显示
self.result_text.delete(1.0, tk.END)
if self.table_data and len(self.table_data) > 0:
# 显示表格预览
self.result_text.insert(tk.END, "已识别表格结构:\n\n")
for row in self.table_data:
row_text = " | ".join([cell if cell else " " for cell in row])
self.result_text.insert(tk.END, f"{row_text}\n")
self.status_var.set(f