Export the selection of a Word document as an image file

本文介绍了一种在Word文档中处理图片的方法,通过VBA宏实现图片导出及重新导入为带有圆角矩形边框的效果。适用于Word 2002和2003版本,能在不同操作系统上运行,包括Windows 2000、XP、2003和Vista。该方法支持直接从Word文档中导出图片,并保持原有位置不变。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

原文地址: http://www.spotlight-wissen.de/archiv/message/1665077.html
Option Explicit

' (c) Désirée und Wolfram, 3/2005
' Modified: 11/2007 - give pictures rounded corners
' Export pictures from Winword in their original format.
' Only for WD2002 and WD2003 under Win2000/XP/2003/Vista.
'
' Change 3.11.2007: rounded-corner shape locks its aspect ratio
' Change 4.11.2007: "Table Pictures" context menu added
' Change 4.11.2007: inline shape position is preserved, Section Delete
' Change 5.11.2007: pictures in headers/footers are supported

' --- Win32 clipboard functions (user32), used by GetImage ---
Private Declare Function EmptyClipboard Lib "user32" () As Long
Private Declare Function OpenClipboard Lib "user32" _
(ByVal hwnd As Long) As Long
Private Declare Function CloseClipboard Lib "user32" () As Long
Private Declare Function GetClipboardData Lib "user32" _
(ByVal wFormat As Long) As Long
' Copies the bits of an enhanced metafile into a byte buffer (gdi32).
' GetImage calls it once with a zero size to obtain the required length
' and a second time to fill emf().
Private Declare Function GetEnhMetaFileBits Lib "gdi32" _
(ByVal hEMF As Long, ByVal cbBuffer As Long, lpbBuffer As Byte) As Long
' Raw memory copy (kernel32 RtlMoveMemory); used to read fixed-size
' fields out of the emf() byte array and to copy image bytes.
Private Declare Sub CopyMemory Lib "kernel32" Alias "RtlMoveMemory" _
(pDest As Any, pSource As Any, ByVal cbLength As Long)

' Clipboard format id passed to GetClipboardData to request an
' enhanced metafile handle.
Private Const CF_ENHMETAFILE = 14
' emf():     the EMF byte stream obtained by GetImage.
' imgData(): the image bytes extracted by ExportEMFPlusImageData and
'            written to disk by SaveRawImageData.
Private emf() As Byte, imgData() As Byte

' First 8 bytes of every EMF record as read by ExportEMFPlusImageData:
' record id followed by the record length in bytes.
Private Type EmfRecord ' private emf type
id As Long
len As Long
End Type

' The 12 bytes that follow the 8-byte record header of a comment record
' (record id 70), as read by ExportEMFPlusImageData.
Private Type GDI_Comment ' private GDI type
len As Long
Type As Long
data As Long
End Type

Function ExportSelectedPicture(Filename As String) As String
    ' Exports the picture in the current Selection to a file in its
    ' original format.
    '
    ' Filename: target path WITHOUT extension; the extension is chosen
    '           from the first two bytes (signature) of the image data.
    ' Returns:  the full file name that was written, or an empty string
    '           (after showing a message box) if no data was obtained.
    '
    ' Errors are suppressed (On Error Resume Next), so failures inside
    ' the helpers surface only as a missing or empty result.
    Dim pBMI As Long, pDIB As Long, ext As String, picType As Integer, s As String

    On Error Resume Next
    Erase imgData: Erase emf
    GetImage Selection

    If ExportEMFPlusImageData(pBMI, pDIB) Then
        ' The first two bytes, read as a little-endian 16-bit value,
        ' identify the file format.
        CopyMemory picType, imgData(0), 2
        Select Case picType
            Case &HD8FF: ext = "jpg"
            Case &H4947: ext = "gif"
            Case &H5089: ext = "png"
            Case &H1: ext = "emf"
            Case &HCDD7: ext = "wmf"
            Case &H4D42: ext = "bmp"
            ' "II" = little-endian TIFF, "MM" = big-endian TIFF. The
            ' big-endian marker was previously missing, so such files
            ' were written with a .bmp extension.
            Case &H4949, &H4D4D: ext = "tif"
            Case &H50A: ext = "pcx"
            Case &H100: ext = "tga"
            Case &HD0C5: ext = "eps"
            Case &H2100: ext = "cgm"
            Case Else: ext = "bmp"
        End Select

        s = Filename & "." & ext
        ' Remove a stale file of the same name before writing.
        If Len(Dir(s)) Then Kill s
        SaveRawImageData s
        ExportSelectedPicture = s
    Else
        MsgBox "Fehler beim Export des selektierten Bildes"
    End If
End Function

Function GetImage(ByVal r)
    ' Fills the module-level byte array emf() with the enhanced metafile
    ' representation of r.
    '
    ' r: an object that offers EnhMetaFileBits (Word 11 and later) or
    '    CopyAsPicture (earlier versions); ExportSelectedPicture passes
    '    the current Selection.
    '
    ' No value is returned; on failure emf() is simply left unallocated.
    Dim hEMF As Long, n As Long

    If Val(Application.Version) >= 11 Then
        ' In Office 11 EnhMetaFileBits delivers the raw EMF stream.
        ' Bug workaround: the clipboard has to be emptied beforehand.
        If OpenClipboard(0&) Then
            EmptyClipboard
            CloseClipboard
        End If
        emf = CallByName(r, "EnhMetaFileBits", VbGet): DoEvents
    Else
        ' For Office <= 10 go through the clipboard instead. Caution: in
        ' Office 11 CopyAsPicture only yields an EMF copy, not the raw
        ' stream.
        r.CopyAsPicture
        If OpenClipboard(0&) Then
            hEMF = GetClipboardData(CF_ENHMETAFILE)
            ' The handle belongs to the clipboard and must not be used
            ' after CloseClipboard, so the bits are copied while the
            ' clipboard is still open.
            If hEMF Then
                n = GetEnhMetaFileBits(hEMF, 0, ByVal 0&) ' query size
                If n Then
                    ReDim emf(n - 1)
                    GetEnhMetaFileBits hEMF, n, emf(0)
                End If
            End If
            CloseClipboard
        End If
    End If
End Function

Function ExportEMFPlusImageData(pBMI As Long, pDIB As Long) As Boolean
    ' Extracts the GDI+ (EMF+) image data from the EMF stream held in the
    ' module-level array emf() and stores it in imgData().
    '
    ' The stream is walked record by record with a small state machine:
    '   state 0 - the first record must be the EMF header (id 1)
    '   state 1 - wait for the comment record (id 70) tagged "GDIC" with
    '             data = 2 (begin group)
    '   state 2 - inspect "EMF+" comment records for an image command and
    '             copy its payload; also note the first StretchDIBits
    '             record (id 81)
    ' If no image data was found, the complete EMF stream is copied to
    ' imgData() instead.
    '
    ' pBMI, pDIB (out): offsets into emf() of the bitmap info header and
    '                   the bitmap bits of the first StretchDIBits record;
    '                   only set when image data was found.
    ' Returns: False if emf() is unallocated, shorter than 8 bytes or does
    '          not start with a valid header record; True otherwise.
    Dim pEMF As Long, n As Long, state As Long, pNext As Long
    Dim recEMF As EmfRecord, recEMFplus As GDI_Comment
    Dim nextblock As Boolean, pCmd As Long, imgtype As Long, toff As Long
    Dim WMFhdr As Long, WMFhsz As Integer, misalign As Boolean, big As Boolean
    Dim dib As Boolean, dibits As Long, bmi As Long, imgend As Boolean

    On Error Resume Next
    n = UBound(emf)
    If n < 7 Or Err <> 0 Then Exit Function
    Do
        ' Read id and length of the record at the current offset.
        CopyMemory recEMF, emf(pEMF), 8
        'Debug.Print Hex$(pEMF), Hex$(recEMF.id), Hex$(recEMF.len)
        Select Case state
            Case 0 ' header
                If recEMF.id <> 1 Or recEMF.len = 0 Then Exit Function ' wrong header
                state = 1

            Case 1 ' wait for GDI_COMMENT begin group
                If recEMF.id = 70 And recEMF.len > 23 Then
                    CopyMemory recEMFplus, emf(pEMF + 8), 12
                    If recEMFplus.Type = &H43494447 And recEMFplus.data = 2 Then ' "GDIC"
                        state = 2
                    End If
                End If

            Case 2 ' wait for GDI_COMMENT EMF+ (GDI+) records
                If recEMF.id = 70 And recEMF.len >= 20 Then
                    CopyMemory recEMFplus, emf(pEMF + 8), 12
                    'Debug.Print "+", Hex$(recEMFplus.type), Hex$(recEMFplus.data)
                    If (recEMFplus.Type = &H2B464D45) And (Not imgend) Then ' "EMF+" record
                        pNext = pEMF + 16
                        pCmd = recEMFplus.data
                        ' Step through the commands inside this comment
                        ' until the image command (&H4008) is found or the
                        ' end of the record is reached.
                        Do While (pCmd And &HFFFF&) <> &H4008
                            CopyMemory n, emf(pNext + 4), 4 ' length of command
                            pNext = pNext + n
                            If pNext >= pEMF + recEMF.len Then Exit Do
                            CopyMemory pCmd, emf(pNext), 4 ' next command
                        Loop
                        If (pCmd And &HFFFFFFF) = &H5004008 Then ' image command + flags
                            ' The top bit marks image data that is split
                            ' over several records.
                            big = (pCmd And &H80000000) = &H80000000
                            toff = IIf(big, pNext + 20, pNext + 16)
                            If Not (big And nextblock) Then
                                ' First (or only) block: the image type
                                ' decides where the payload starts.
                                CopyMemory imgtype, emf(toff), 4
                                If imgtype = 1 Then ' bitmap
                                    ReDim imgData(recEMF.len - toff - 24 + pEMF - 1)
                                    CopyMemory imgData(0), emf(toff + 24), recEMF.len - toff - 24 + pEMF
                                ElseIf imgtype = 2 Then ' metafile
                                    ReDim imgData(recEMF.len - toff - 12 + pEMF - 1): misalign = False
                                    CopyMemory WMFhdr, emf(toff + 12), 4
                                    CopyMemory WMFhsz, emf(toff + 12 + 22 + 2), 2
                                    If WMFhdr = &H9AC6CDD7 Then ' WMF APM header?
                                        misalign = WMFhsz <> 9 ' check standard WMF header misalignment
                                    End If
                                    If misalign Then ' correct the GDI+ misalignment bug
                                        ' Copy the 22-byte APM header, skip
                                        ' two bytes, then copy the rest.
                                        CopyMemory imgData(0), emf(toff + 12), 22
                                        CopyMemory imgData(22), emf(toff + 12 + 22 + 2), recEMF.len - toff - 12 + pEMF - 22 - 2
                                        ReDim Preserve imgData(UBound(imgData) - 2)
                                    Else
                                        CopyMemory imgData(0), emf(toff + 12), recEMF.len - toff - 12 + pEMF
                                    End If
                                Else
                                    Exit Do ' unknown type
                                End If ' imgtype
                                If big Then nextblock = True Else imgend = True
                            Else
                                ' Continuation block: append its payload.
                                n = UBound(imgData)
                                ReDim Preserve imgData(n + recEMF.len - &H20)
                                CopyMemory imgData(n + 1), emf(pEMF + &H20), recEMF.len - &H20
                            End If ' not (big and nextblock)
                        End If ' image command
                    ElseIf recEMFplus.Type = &H43494447 And recEMFplus.data = 3 Then ' "GDIC" end
                        Exit Do ' EMF+ group end
                    End If
                ElseIf recEMF.id = 81 And recEMF.len >= 88 And (Not dib) Then ' EMR_StretchDIBits
                    dib = True
                    CopyMemory n, emf(pEMF + 48), 4 ' offset of bitmap info header
                    bmi = pEMF + n
                    CopyMemory n, emf(pEMF + 56), 4 ' offset of bitmap bits
                    dibits = pEMF + n
                End If
        End Select

        ' A record shorter than its own 8-byte header is malformed and
        ' would keep pEMF from advancing (endless loop); stop parsing.
        If recEMF.len < 8 Then Exit Do
        pEMF = pEMF + recEMF.len
        ' Stop as soon as fewer than 8 bytes remain, so the header read at
        ' the top of the loop never reaches past the end of emf().
    Loop Until pEMF > UBound(emf) - 7

    ' UBound raises an error for an unallocated imgData(); because errors
    ' are suppressed, n then keeps the value 0.
    n = 0: n = UBound(imgData)
    If n = 0 Then ' image not found: copy the enhanced metafile bits
        ReDim imgData(UBound(emf)): CopyMemory imgData(0), emf(0), UBound(emf) + 1
    Else
        pDIB = dibits: pBMI = bmi
    End If
    ExportEMFPlusImageData = True
End Function

Function SaveRawImageData(ByVal Filename As String)
    ' Writes the module-level byte array imgData() to Filename as a
    ' binary file, starting at file position 1.
    '
    ' Filename: full path of the file to write.
    '
    ' If writing fails, the file handle is closed before the error is
    ' passed on to the caller. Without this the handle stayed open,
    ' because the caller runs under On Error Resume Next.
    Dim f As Long
    Dim errNum As Long, errSrc As String, errDesc As String

    f = FreeFile
    Open Filename For Binary Access Write As #f

    On Error GoTo CloseAndRaise
    Put #f, 1, imgData
    Close #f
    Exit Function

CloseAndRaise:
    ' Remember the error first: later statements may reset Err.
    errNum = Err.Number: errSrc = Err.Source: errDesc = Err.Description
    Close #f
    Err.Raise errNum, errSrc, errDesc
End Function

Sub GrafikMitRundenEcken()
' Replaces the selected picture (inline shape or floating shape) by a
' rounded-rectangle AutoShape that is filled with the same picture.
' The picture is first exported to a temporary file via
' ExportSelectedPicture, the original is deleted, and a new shape with
' the same size (and, for floating shapes, the same layout settings) is
' created. Pictures in headers and footers are handled through the
' matching HeaderFooter object. Selections of any other type are ignored.
Dim ils As InlineShape, fBaseName As String, fName As String
Dim w As Single, h As Single, sh As Shape, sh1 As Shape
Dim sr As Single, sa As Range, sl As Single, st As Single
Dim sla As Long, srh As Long, srv As Long, szp As Long
Dim swo As Long, sdb As Single, sdl As Single, sdr As Single, hf As HeaderFooter
Dim sdt As Single, ssi As Long, swt As Long, n As Long, r As Range, s As Long

' Base name (without extension) of the temporary export file.
fBaseName = Options.DefaultFilePath(wdTempFilePath) & "\~temppic"

' Section that contains the end of the selection; used to pick the
' header/footer object when the selection is in a header/footer story.
s = Selection.Information(wdActiveEndSectionNumber)
Select Case Selection.StoryType ' HeaderFooter Shapes
Case wdEvenPagesHeaderStory ' 6
Set hf = ActiveDocument.Sections(s).Headers(wdHeaderFooterEvenPages)
Case wdPrimaryHeaderStory ' 7
Set hf = ActiveDocument.Sections(s).Headers(wdHeaderFooterPrimary)
Case wdEvenPagesFooterStory ' 8
Set hf = ActiveDocument.Sections(s).Footers(wdHeaderFooterEvenPages)
Case wdPrimaryFooterStory '9
Set hf = ActiveDocument.Sections(s).Footers(wdHeaderFooterPrimary)
Case wdFirstPageHeaderStory ' 10
Set hf = ActiveDocument.Sections(s).Headers(wdHeaderFooterFirstPage)
Case wdFirstPageFooterStory ' 11
Set hf = ActiveDocument.Sections(s).Footers(wdHeaderFooterFirstPage)
End Select

Select Case Selection.type
Case wdSelectionInlineShape
' --- Inline picture: remember its size, export it, then replace it ---
Set ils = Selection.InlineShapes(1)
w = ils.Width
h = ils.Height
fName = ExportSelectedPicture(fBaseName)
If Len(fName) Then
'n = Selection.Start - Selection.Paragraphs(1).Range.Start
Selection.Delete
' Story types 6..11 are the header/footer stories handled above.
If Selection.StoryType >= 6 And Selection.StoryType <= 11 Then
Set sh = hf.Shapes.AddShape(msoShapeRoundedRectangle, 0, 0, w, h, Selection.Range)
Set r = hf.Range
' n = number of characters between the paragraph start and the
' position where the picture was.
r.SetRange Selection.Paragraphs(1).Range.Start, Selection.Start
n = r.Characters.Count
Else
Set sh = ActiveDocument.Shapes.AddShape(msoShapeRoundedRectangle, 0, 0, w, h, Selection.Range)
n = ActiveDocument.Range(Selection.Paragraphs(1).Range.Start, Selection.Start).Characters.Count
End If
' Fill the rounded rectangle with the exported picture, hide its
' outline and lock the aspect ratio.
sh.Fill.UserPicture fName
sh.Line.Visible = msoFalse
sh.LockAspectRatio = msoTrue
sh.Select
CommandBars.FindControl(id:=5934).Execute' substitute for ConvertToInlineShape (per the original author's note)
If n Then' the inline shape was not at the beginning of the paragraph
Selection.Cut
Selection.MoveRight wdCharacter, n ' move to the previous position
Selection.Paste
End If
End If

Case wdSelectionShape
' --- Floating picture: record all layout settings before deleting ---
Set sh1 = Selection.ShapeRange(1)
w = sh1.Width
h = sh1.Height
sr = sh1.Rotation
Set sa = sh1.Anchor
sl = sh1.Left
st = sh1.Top
sla = sh1.LockAnchor
srh = sh1.RelativeHorizontalPosition
srv = sh1.RelativeVerticalPosition
' NOTE(review): szp is read here but never applied to the new shape.
szp = sh1.ZOrderPosition
swo = sh1.WrapFormat.AllowOverlap
sdb = sh1.WrapFormat.DistanceBottom
sdl = sh1.WrapFormat.DistanceLeft
sdr = sh1.WrapFormat.DistanceRight
sdt = sh1.WrapFormat.DistanceTop
ssi = sh1.WrapFormat.Side
swt = sh1.WrapFormat.type

fName = ExportSelectedPicture(fBaseName)
If Len(fName) Then
sh1.Delete
' Create the replacement at the same position and anchor.
If Selection.StoryType >= 6 And Selection.StoryType <= 11 Then
Set sh = hf.Shapes.AddShape(msoShapeRoundedRectangle, sl, st, w, h, sa)
Else
Set sh = ActiveDocument.Shapes.AddShape(msoShapeRoundedRectangle, sl, st, w, h, sa)
End If
' Fill with the exported picture and restore the recorded settings.
sh.Fill.UserPicture fName
sh.Line.Visible = msoFalse
sh.LockAspectRatio = msoTrue
sh.Rotation = sr
sh.LockAnchor = sla
sh.RelativeHorizontalPosition = srh
sh.RelativeVerticalPosition = srv
sh.WrapFormat.AllowOverlap = swo
sh.WrapFormat.DistanceBottom = sdb
sh.WrapFormat.DistanceLeft = sdl
sh.WrapFormat.DistanceRight = sdr
sh.WrapFormat.DistanceTop = sdt
sh.WrapFormat.Side = ssi
sh.WrapFormat.type = swt
End If
End Select
End Sub

Sub AddContextMenu1()
    ' Adds the entry "Grafik mit runden Ecken" to the three picture
    ' context menus and links it to the macro GrafikMitRundenEcken.
    ' An entry left over from an earlier run (recognised by its Tag) is
    ' removed first. The customization is stored in this document.
    Const myId = "RundeEckenGrafik"
    CustomizationContext = ThisDocument
    Dim menuName As Variant, ctl As CommandBarControl, btn As CommandBarControl

    For Each menuName In Array("Inline Picture", "Floating Picture", "Table Pictures")
        With Application.CommandBars(menuName)
            ' Drop the first control that carries our tag, if there is one.
            For Each ctl In .Controls
                If ctl.Tag = myId Then
                    ctl.Delete
                    Exit For
                End If
            Next ctl

            ' Insert the new button before the control at position 4.
            Set btn = .Controls.Add(msoControlButton, , , 4)
        End With

        btn.Tag = myId
        btn.Caption = "Grafik mit runden Ecken"
        btn.OnAction = "GrafikMitRundenEcken"
    Next menuName
End Sub



Grüße
Wolfram
import os import tkinter as tk from tkinter import filedialog, ttk, messagebox, simpledialog import fitz # PyMuPDF import pandas as pd from PIL import Image, ImageTk import io class PDFExtractorApp: def __init__(self, root): self.root = root self.root.title("PDF文本提取器") self.root.geometry("1200x800") self.pdf_document = None self.current_page = 0 self.total_pages = 0 self.thumbnails = [] self.selection_start = None self.selection_end = None self.extracted_data = [] self.table_data = None # 用于存储表格数据 self.all_pages_table_data = None # 用于存储所有页面的表格数据 # 滚动和拖拽相关变量 self.is_dragging = False self.drag_start_x = 0 self.drag_start_y = 0 # 缩放和显示相关变量 self.zoom_factor = 1.0 self.zoom_step = 0.1 # 减小缩放步长,使缩放更平滑 self.min_zoom = 0.2 # 最小缩放比例 self.max_zoom = 5.0 # 最大缩放比例 self.auto_fit_mode = "page" # "page", "width", "none" # Ctrl键状态跟踪 self.ctrl_pressed = False # 选区相关变量 self.is_selecting = False self.selection_rect = None self.page_bbox = None # 页面边界框 # 选区坐标显示变量 self.coord_display = None self.coord_text_id = None # 选区辅助线条 self.helper_lines = [] # 表格识别参数 self.column_tolerance = 10 # 列容差,用于确定文本是否在同一列 self.row_tolerance = 10 # 行容差,用于确定文本是否在同一行 self.setup_ui() def setup_ui(self): # 创建主框架 main_frame = ttk.Frame(self.root) main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 左侧缩略图区域 left_frame = ttk.LabelFrame(main_frame, text="页面预览") left_frame.pack(side=tk.LEFT, fill=tk.Y, padx=5, pady=5) # 页码跳转区域 nav_frame = ttk.Frame(left_frame) nav_frame.pack(fill=tk.X, padx=5, pady=5) ttk.Label(nav_frame, text="跳转至:").pack(side=tk.LEFT, padx=2) self.page_entry = ttk.Entry(nav_frame, width=6) self.page_entry.pack(side=tk.LEFT, padx=2) ttk.Button(nav_frame, text="跳转", command=self.jump_to_page).pack(side=tk.LEFT, padx=2) # 缩略图滚动区域 scrollbar = ttk.Scrollbar(left_frame) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) self.thumbnail_canvas = tk.Canvas(left_frame, width=150, yscrollcommand=scrollbar.set) self.thumbnail_canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) 
scrollbar.config(command=self.thumbnail_canvas.yview) # 创建缩略图框架,用于容纳所有缩略图 self.thumbnails_frame = ttk.Frame(self.thumbnail_canvas) self.thumbnail_window = self.thumbnail_canvas.create_window((0, 0), window=self.thumbnails_frame, anchor=tk.NW) # 中间PDF预览区域 center_frame = ttk.LabelFrame(main_frame, text="PDF预览") center_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=5, pady=5) # 创建一个框架用于放置滚动条 canvas_frame = ttk.Frame(center_frame) canvas_frame.pack(fill=tk.BOTH, expand=True) # 创建水平和垂直滚动条 self.xscrollbar = ttk.Scrollbar(canvas_frame, orient=tk.HORIZONTAL) self.xscrollbar.pack(side=tk.BOTTOM, fill=tk.X) self.yscrollbar = ttk.Scrollbar(canvas_frame, orient=tk.VERTICAL) self.yscrollbar.pack(side=tk.RIGHT, fill=tk.Y) # 创建PDF画布 self.pdf_canvas = tk.Canvas(canvas_frame, bg="white", xscrollcommand=self.xscrollbar.set, yscrollcommand=self.yscrollbar.set) self.pdf_canvas.pack(fill=tk.BOTH, expand=True) # 配置滚动条 self.xscrollbar.config(command=self.pdf_canvas.xview) self.yscrollbar.config(command=self.pdf_canvas.yview) # 绑定鼠标事件用于选区 self.pdf_canvas.bind("<ButtonPress-1>", self.on_mouse_down) self.pdf_canvas.bind("<B1-Motion>", self.on_mouse_drag) self.pdf_canvas.bind("<ButtonRelease-1>", self.on_mouse_up) # 绑定鼠标滚轮和右键拖拽事件 self.pdf_canvas.bind("<MouseWheel>", self.on_mouse_wheel) self.pdf_canvas.bind("<ButtonPress-3>", self.on_right_mouse_down) self.pdf_canvas.bind("<B3-Motion>", self.on_right_mouse_drag) self.pdf_canvas.bind("<ButtonRelease-3>", self.on_right_mouse_up) # Ctrl键状态监听 self.root.bind("<Control-KeyPress>", self.on_ctrl_press) self.root.bind("<Control-KeyRelease>", self.on_ctrl_release) # 右侧控制区域 right_frame = ttk.LabelFrame(main_frame, text="控制区") right_frame.pack(side=tk.RIGHT, fill=tk.Y, padx=5, pady=5) # 文件选择 ttk.Button(right_frame, text="打开PDF", command=self.open_pdf).pack(fill=tk.X, padx=5, pady=5) # 页面导航 nav_frame = ttk.Frame(right_frame) nav_frame.pack(fill=tk.X, padx=5, pady=5) ttk.Button(nav_frame, text="上一页", command=self.prev_page).pack(side=tk.LEFT, 
padx=2) self.page_label = ttk.Label(nav_frame, text="0/0") self.page_label.pack(side=tk.LEFT, padx=5) ttk.Button(nav_frame, text="下一页", command=self.next_page).pack(side=tk.LEFT, padx=2) # 缩放控制 zoom_frame = ttk.Frame(right_frame) zoom_frame.pack(fill=tk.X, padx=5, pady=5) ttk.Button(zoom_frame, text="放大", command=self.zoom_in).pack(side=tk.LEFT, padx=2) ttk.Button(zoom_frame, text="缩小", command=self.zoom_out).pack(side=tk.LEFT, padx=2) ttk.Button(zoom_frame, text="重置视图", command=self.reset_view).pack(side=tk.LEFT, padx=2) # 自适应模式选择 fit_frame = ttk.Frame(right_frame) fit_frame.pack(fill=tk.X, padx=5, pady=5) ttk.Label(fit_frame, text="自适应:").pack(side=tk.LEFT, padx=2) self.fit_mode = tk.StringVar(value="page") ttk.Radiobutton(fit_frame, text="整页", variable=self.fit_mode, value="page", command=self.update_fit_mode).pack(side=tk.LEFT, padx=2) ttk.Radiobutton(fit_frame, text="宽度", variable=self.fit_mode, value="width", command=self.update_fit_mode).pack(side=tk.LEFT, padx=2) ttk.Radiobutton(fit_frame, text="实际大小", variable=self.fit_mode, value="none", command=self.update_fit_mode).pack(side=tk.LEFT, padx=2) # 提取选项 ttk.Label(right_frame, text="提取选项").pack(fill=tk.X, padx=5, pady=5) self.extract_type = tk.StringVar(value="selection") ttk.Radiobutton(right_frame, text="选区提取", variable=self.extract_type, value="selection").pack(anchor=tk.W, padx=20) ttk.Radiobutton(right_frame, text="整页提取", variable=self.extract_type, value="full_page").pack(anchor=tk.W, padx=20) # 表格识别选项 table_frame = ttk.LabelFrame(right_frame, text="表格识别选项") table_frame.pack(fill=tk.X, padx=5, pady=5) ttk.Label(table_frame, text="列容差:").grid(row=0, column=0, padx=5, pady=2, sticky=tk.W) self.column_tolerance_var = tk.StringVar(value=str(self.column_tolerance)) ttk.Entry(table_frame, textvariable=self.column_tolerance_var, width=5).grid(row=0, column=1, padx=2, pady=2) ttk.Label(table_frame, text="行容差:").grid(row=1, column=0, padx=5, pady=2, sticky=tk.W) self.row_tolerance_var = 
tk.StringVar(value=str(self.row_tolerance)) ttk.Entry(table_frame, textvariable=self.row_tolerance_var, width=5).grid(row=1, column=1, padx=2, pady=2) ttk.Button(table_frame, text="应用设置", command=self.apply_table_settings).grid(row=0, column=2, rowspan=2, padx=5, pady=2) # 选区坐标显示 selection_frame = ttk.LabelFrame(right_frame, text="选区坐标") selection_frame.pack(fill=tk.X, padx=5, pady=5) # 左上角坐标 ttk.Label(selection_frame, text="左上角:").grid(row=0, column=0, padx=5, pady=2, sticky=tk.W) self.selection_x1_var = tk.StringVar(value="0") self.selection_y1_var = tk.StringVar(value="0") ttk.Entry(selection_frame, textvariable=self.selection_x1_var, width=8, state="readonly").grid(row=0, column=1, padx=2, pady=2) ttk.Label(selection_frame, text=",").grid(row=0, column=2, padx=0, pady=2) ttk.Entry(selection_frame, textvariable=self.selection_y1_var, width=8, state="readonly").grid(row=0, column=3, padx=2, pady=2) # 右下角坐标 ttk.Label(selection_frame, text="右下角:").grid(row=1, column=0, padx=5, pady=2, sticky=tk.W) self.selection_x2_var = tk.StringVar(value="0") self.selection_y2_var = tk.StringVar(value="0") ttk.Entry(selection_frame, textvariable=self.selection_x2_var, width=8, state="readonly").grid(row=1, column=1, padx=2, pady=2) ttk.Label(selection_frame, text=",").grid(row=1, column=2, padx=0, pady=2) ttk.Entry(selection_frame, textvariable=self.selection_y2_var, width=8, state="readonly").grid(row=1, column=3, padx=2, pady=2) # 选区尺寸 ttk.Label(selection_frame, text="尺寸:").grid(row=2, column=0, padx=5, pady=2, sticky=tk.W) self.selection_width_var = tk.StringVar(value="0") self.selection_height_var = tk.StringVar(value="0") ttk.Entry(selection_frame, textvariable=self.selection_width_var, width=8, state="readonly").grid(row=2, column=1, padx=2, pady=2) ttk.Label(selection_frame, text="×").grid(row=2, column=2, padx=0, pady=2) ttk.Entry(selection_frame, textvariable=self.selection_height_var, width=8, state="readonly").grid(row=2, column=3, padx=2, pady=2) # 选区重置按钮 
ttk.Button(right_frame, text="重置选区", command=self.reset_selection).pack(fill=tk.X, padx=5, pady=5) # 提取按钮 ttk.Button(right_frame, text="提取文本", command=self.extract_text).pack(fill=tk.X, padx=5, pady=5) # 提取所有页面相同区域按钮 ttk.Button(right_frame, text="提取所有页面相同区域", command=self.extract_all_pages_same_region).pack(fill=tk.X, padx=5, pady=5) # 导出按钮 ttk.Button(right_frame, text="导出到Excel", command=self.export_to_excel).pack(fill=tk.X, padx=5, pady=5) # 提取结果显示 result_frame = ttk.LabelFrame(right_frame, text="提取结果") result_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) self.result_text = tk.Text(result_frame, height=10) self.result_text.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) # 状态栏 self.status_var = tk.StringVar(value="就绪") self.status_bar = ttk.Label(self.root, textvariable=self.status_var, relief=tk.SUNKEN, anchor=tk.W) self.status_bar.pack(side=tk.BOTTOM, fill=tk.X) # 绑定回车键到页码跳转 self.page_entry.bind("<Return>", lambda event: self.jump_to_page()) # 绑定缩略图画布配置事件 self.thumbnails_frame.bind("<Configure>", self.on_thumbnails_configure) def apply_table_settings(self): """应用表格识别设置""" try: self.column_tolerance = float(self.column_tolerance_var.get()) self.row_tolerance = float(self.row_tolerance_var.get()) self.status_var.set(f"已应用表格识别设置: 列容差={self.column_tolerance}, 行容差={self.row_tolerance}") except ValueError: messagebox.showerror("输入错误", "请输入有效的数值") def on_thumbnails_configure(self, event): """更新缩略图画布的滚动区域""" self.thumbnail_canvas.configure(scrollregion=self.thumbnail_canvas.bbox("all")) def open_pdf(self): file_path = filedialog.askopenfilename(filetypes=[("PDF文件", "*.pdf")]) if file_path: try: self.pdf_document = fitz.open(file_path) self.total_pages = len(self.pdf_document) self.current_page = 0 self.extracted_data = [] self.table_data = None self.all_pages_table_data = None self.result_text.delete(1.0, tk.END) # 更新页面标签 self.page_label.config(text=f"{self.current_page + 1}/{self.total_pages}") # 生成缩略图 self.generate_thumbnails() # 显示当前页面 
self.display_current_page() self.status_var.set(f"已打开文件: {os.path.basename(file_path)}") except Exception as e: messagebox.showerror("错误", f"无法打开PDF文件: {str(e)}") def generate_thumbnails(self): """生成PDF页面缩略图""" # 清除现有缩略图 self.thumbnails = [] for widget in self.thumbnails_frame.winfo_children(): widget.destroy() y_offset = 5 for page_num in range(self.total_pages): # 创建缩略图容器 thumbnail_frame = ttk.Frame(self.thumbnails_frame) thumbnail_frame.pack(fill=tk.X, padx=5, pady=2) # 生成缩略图 page = self.pdf_document[page_num] pix = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2)) img = Image.open(io.BytesIO(pix.tobytes("ppm"))) photo = ImageTk.PhotoImage(img) # 存储引用以防止垃圾回收 self.thumbnails.append(photo) # 创建缩略图按钮 thumbnail_btn = tk.Button(thumbnail_frame, image=photo, command=lambda p=page_num: self.go_to_page(p), relief=tk.FLAT) thumbnail_btn.pack(side=tk.LEFT, padx=5, pady=5) # 添加页码标签 page_label = ttk.Label(thumbnail_frame, text=f"页面 {page_num + 1}") page_label.pack(side=tk.LEFT, padx=5) # 高亮显示当前页面 if page_num == self.current_page: thumbnail_frame.config(style="ActiveThumbnail.TFrame") page_label.config(style="ActiveThumbnail.TLabel") else: # 鼠标悬停效果 thumbnail_frame.bind("<Enter>", lambda e, f=thumbnail_frame: f.config(style="HoverThumbnail.TFrame")) thumbnail_frame.bind("<Leave>", lambda e, f=thumbnail_frame: f.config(style="")) page_label.bind("<Enter>", lambda e, l=page_label: l.config(style="HoverThumbnail.TLabel")) page_label.bind("<Leave>", lambda e, l=page_label: l.config(style="")) def display_current_page(self): if not self.pdf_document: return self.pdf_canvas.delete("all") self.selection_start = None self.selection_end = None self.is_selecting = False # 重置选区坐标显示 self.update_selection_coordinates(0, 0, 0, 0) page = self.pdf_document[self.current_page] # 根据自适应模式计算缩放比例 if self.auto_fit_mode == "page": # 自适应整页 self.adjust_zoom_to_fit_page() elif self.auto_fit_mode == "width": # 自适应宽度 self.adjust_zoom_to_fit_width() else: # 实际大小 pass # 使用当前zoom_factor # 使用计算的缩放比例渲染页面 matrix = 
fitz.Matrix(self.zoom_factor, self.zoom_factor) pix = page.get_pixmap(matrix=matrix) img = Image.open(io.BytesIO(pix.tobytes("ppm"))) photo = ImageTk.PhotoImage(img) # 存储引用以防止垃圾回收 self.current_image = photo # 显示图像 self.image_id = self.pdf_canvas.create_image(0, 0, anchor=tk.NW, image=photo) # 获取页面边界框(用于限制选区) self.page_bbox = self.pdf_canvas.bbox(self.image_id) # 设置画布滚动区域 self.pdf_canvas.config(scrollregion=self.pdf_canvas.bbox(tk.ALL)) # 居中显示 self.center_view() # 状态栏显示页面信息 self.status_var.set(f"页面 {self.current_page + 1}/{self.total_pages}, 缩放比例: {self.zoom_factor:.1f}x") # 更新缩略图高亮 self.generate_thumbnails() def adjust_zoom_to_fit_page(self): """调整缩放比例以适应整个页面""" if not self.pdf_document: return # 获取当前页面和画布尺寸 page = self.pdf_document[self.current_page] canvas_width = self.pdf_canvas.winfo_width() canvas_height = self.pdf_canvas.winfo_height() # 考虑滚动条宽度 scrollbar_width = 15 canvas_width -= scrollbar_width canvas_height -= scrollbar_width # 计算页面宽高 page_width = page.rect.width page_height = page.rect.height # 计算适应画布的缩放比例 width_factor = canvas_width / page_width height_factor = canvas_height / page_height # 取较小的缩放比例以确保整个页面可见 self.zoom_factor = min(width_factor, height_factor) def adjust_zoom_to_fit_width(self): """调整缩放比例以适应页面宽度""" if not self.pdf_document: return # 获取当前页面和画布宽度 page = self.pdf_document[self.current_page] canvas_width = self.pdf_canvas.winfo_width() # 考虑滚动条宽度 scrollbar_width = 15 canvas_width -= scrollbar_width # 计算页面宽度 page_width = page.rect.width # 计算适应画布宽度的缩放比例 self.zoom_factor = canvas_width / page_width def center_view(self): """居中显示PDF页面""" if not self.pdf_document: return # 获取画布和页面尺寸 canvas_width = self.pdf_canvas.winfo_width() canvas_height = self.pdf_canvas.winfo_height() page_width = self.pdf_canvas.bbox(tk.ALL)[2] page_height = self.pdf_canvas.bbox(tk.ALL)[3] # 计算居中位置 if page_width > canvas_width: # 页面宽度大于画布,使用滚动条 xview = 0 else: # 页面宽度小于画布,居中显示 xview = (canvas_width - page_width) / 2 / canvas_width if page_height > canvas_height: # 
页面高度大于画布,使用滚动条 yview = 0 else: # 页面高度小于画布,居中显示 yview = (canvas_height - page_height) / 2 / canvas_height # 设置视图位置 self.pdf_canvas.xview_moveto(xview) self.pdf_canvas.yview_moveto(yview) def go_to_page(self, page_num): """跳转到指定页面""" if 0 <= page_num < self.total_pages: self.current_page = page_num self.page_label.config(text=f"{self.current_page + 1}/{self.total_pages}") self.display_current_page() self.page_entry.delete(0, tk.END) self.page_entry.insert(0, str(page_num + 1)) def jump_to_page(self): """根据输入的页码跳转到指定页面""" if not self.pdf_document: return try: page_num = int(self.page_entry.get()) - 1 if 0 <= page_num < self.total_pages: self.go_to_page(page_num) else: messagebox.showwarning("页码错误", f"请输入1到{self.total_pages}之间的页码") self.page_entry.delete(0, tk.END) self.page_entry.insert(0, str(self.current_page + 1)) except ValueError: messagebox.showwarning("输入错误", "请输入有效的页码") self.page_entry.delete(0, tk.END) self.page_entry.insert(0, str(self.current_page + 1)) def prev_page(self): if self.current_page > 0: self.current_page -= 1 self.page_label.config(text=f"{self.current_page + 1}/{self.total_pages}") self.display_current_page() self.page_entry.delete(0, tk.END) self.page_entry.insert(0, str(self.current_page + 1)) def next_page(self): if self.current_page < self.total_pages - 1: self.current_page += 1 self.page_label.config(text=f"{self.current_page + 1}/{self.total_pages}") self.display_current_page() self.page_entry.delete(0, tk.END) self.page_entry.insert(0, str(self.current_page + 1)) def on_mouse_down(self, event): # 检查是否点击在页面内 if self.page_bbox and self.is_point_in_page(event.x, event.y): self.is_selecting = True self.selection_start = (event.x, event.y) # 限制起点在页面内 self.selection_start = self.clamp_point_to_page(self.selection_start[0], self.selection_start[1]) # 创建选区矩形 self.selection_rect = self.pdf_canvas.create_rectangle( self.selection_start[0], self.selection_start[1], self.selection_start[0], self.selection_start[1], outline="red", width=2, 
stipple="gray25", fill="#FF000033") # 更新选区坐标显示 self.update_selection_coordinates( int(self.selection_start[0]), int(self.selection_start[1]), int(self.selection_start[0]), int(self.selection_start[1]) ) # 状态栏显示选区信息 self.status_var.set(f"选区起点: ({int(self.selection_start[0])}, {int(self.selection_start[1])})") # 清除之前的辅助线条 for line in self.helper_lines: self.pdf_canvas.delete(line) self.helper_lines = [] def on_mouse_drag(self, event): if self.is_selecting and hasattr(self, 'selection_rect'): # 限制终点在页面内 end_x, end_y = self.clamp_point_to_page(event.x, event.y) # 更新选区矩形 self.pdf_canvas.coords(self.selection_rect, self.selection_start[0], self.selection_start[1], end_x, end_y) # 计算选区坐标 x1, y1 = self.selection_start x2, y2 = end_x, end_y # 确保坐标按左上右下排序 if x1 > x2: x1, x2 = x2, x1 if y1 > y2: y1, y2 = y2, y1 # 更新选区坐标显示 self.update_selection_coordinates(int(x1), int(y1), int(x2), int(y2)) # 计算选区尺寸 width = x2 - x1 height = y2 - y1 # 显示选区坐标和尺寸 coord_text = f"选区: ({int(x1)}, {int(y1)}) - ({int(x2)}, {int(y2)}) | 尺寸: {int(width)}×{int(height)} 像素" self.status_var.set(coord_text) # 更新辅助线条 for line in self.helper_lines: self.pdf_canvas.delete(line) self.helper_lines = [] # 绘制水平辅助线 h_line1 = self.pdf_canvas.create_line(x1, y1, x2, y1, fill="blue", dash=(4, 4)) h_line2 = self.pdf_canvas.create_line(x1, y2, x2, y2, fill="blue", dash=(4, 4)) # 绘制垂直辅助线 v_line1 = self.pdf_canvas.create_line(x1, y1, x1, y2, fill="blue", dash=(4, 4)) v_line2 = self.pdf_canvas.create_line(x2, y1, x2, y2, fill="blue", dash=(4, 4)) self.helper_lines.extend([h_line1, h_line2, v_line1, v_line2]) def on_mouse_up(self, event): if self.is_selecting: self.is_selecting = False # 限制终点在页面内 end_x, end_y = self.clamp_point_to_page(event.x, event.y) self.selection_end = (end_x, end_y) # 检查选区是否太小 width = abs(self.selection_end[0] - self.selection_start[0]) height = abs(self.selection_end[1] - self.selection_start[1]) if width < 10 or height < 10: # 选区太小,删除选区 self.pdf_canvas.delete(self.selection_rect) 
self.selection_start = None self.selection_end = None # 重置选区坐标显示 self.update_selection_coordinates(0, 0, 0, 0) self.status_var.set("选区太小,已重置") else: # 更新选区矩形 self.pdf_canvas.coords(self.selection_rect, self.selection_start[0], self.selection_start[1], self.selection_end[0], self.selection_end[1]) # 计算最终选区坐标 x1, y1 = self.selection_start x2, y2 = self.selection_end # 确保坐标按左上右下排序 if x1 > x2: x1, x2 = x2, x1 if y1 > y2: y1, y2 = y2, y1 # 更新选区坐标显示 self.update_selection_coordinates(int(x1), int(y1), int(x2), int(y2)) # 计算选区尺寸 width = x2 - x1 height = y2 - y1 # 显示最终选区坐标和尺寸 coord_text = f"选区已完成: ({int(x1)}, {int(y1)}) - ({int(x2)}, {int(y2)}) | 尺寸: {int(width)}×{int(height)} 像素" self.status_var.set(coord_text) # 清除辅助线条 for line in self.helper_lines: self.pdf_canvas.delete(line) self.helper_lines = [] def update_selection_coordinates(self, x1, y1, x2, y2): """更新选区坐标显示""" # 确保坐标按左上右下排序 if x1 > x2: x1, x2 = x2, x1 if y1 > y2: y1, y2 = y2, y1 # 更新坐标显示 self.selection_x1_var.set(str(x1)) self.selection_y1_var.set(str(y1)) self.selection_x2_var.set(str(x2)) self.selection_y2_var.set(str(y2)) # 计算并更新尺寸 width = x2 - x1 height = y2 - y1 self.selection_width_var.set(str(width)) self.selection_height_var.set(str(height)) def is_point_in_page(self, x, y): """检查点是否在页面内""" if not self.page_bbox: return False x1, y1, x2, y2 = self.page_bbox return x1 <= x <= x2 and y1 <= y <= y2 def clamp_point_to_page(self, x, y): """将点限制在页面范围内""" if not self.page_bbox: return (x, y) x1, y1, x2, y2 = self.page_bbox clamped_x = max(x1, min(x, x2)) clamped_y = max(y1, min(y, y2)) return (clamped_x, clamped_y) def extract_text(self): if not self.pdf_document: messagebox.showinfo("提示", "请先打开PDF文件") return if self.extract_type.get() == "selection": if not self.selection_start or not self.selection_end: messagebox.showinfo("提示", "请先选择文本区域") return # 确保选区坐标正确排序 x0, y0 = min(self.selection_start[0], self.selection_end[0]), min(self.selection_start[1], self.selection_end[1]) x1, y1 = max(self.selection_start[0], 
self.selection_end[0]), max(self.selection_start[1], self.selection_end[1]) # 计算选区在原始PDF中的比例 page = self.pdf_document[self.current_page] # 调整选区坐标为PDF坐标 pdf_x0 = x0 / self.zoom_factor pdf_y0 = y0 / self.zoom_factor pdf_x1 = x1 / self.zoom_factor pdf_y1 = y1 / self.zoom_factor # 创建选区矩形 rect = fitz.Rect(pdf_x0, pdf_y0, pdf_x1, pdf_y1) # 使用dict模式提取文本,获取更详细的布局信息 text_data = page.get_text("dict", clip=rect) else: # 整页提取 page = self.pdf_document[self.current_page] text_data = page.get_text("dict") if text_data and 'blocks' in text_data: # 分析文本块,识别表格结构 table_data = self.analyze_table_structure(text_data) # 添加页码到表格数据 if table_data: self.table_data = [[f"页面 {self.current_page + 1}"] + row for row in table_data] else: self.table_data = None # 更新结果显示 self.result_text.delete(1.0, tk.END) if self.table_data and len(self.table_data) > 0: # 显示表格预览 self.result_text.insert(tk.END, "已识别表格结构:\n\n") for row in self.table_data: row_text = " | ".join([cell if cell else " " for cell in row]) self.result_text.insert(tk.END, f"{row_text}\n") self.status_var.set(f
最新发布
06-10
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值