Python实现PDF图像识别与内容比对-CSDN博客

本文链接：https://blog.csdn.net/weixin_47427038/article/details/131739170

提取文字并进行比对

1.获取本地txt文件的路径，进行切割处理，提取第一部分和第二部分路径
2.将切割好的绝对路径下的图片，进行图像识别
3.提取图片中公章处的文字
4.将第一步切割出的第一部分（公章名称）与从图片中提取的文字进行比对
5.比对结果不一致时，打印出具体的错误名称信息

引用的txt文本
XXX有限公司电子回单专用章>>C:\upload\11130020210401091821680463\3646bea6-485c-4bad-9237-ba7593e670cb.pdf
XXX电子回单专用章>>C:\upload\11130020210401091821680463\24b92bd6-de2e-4dce-a356-296c12d2778b.pdf

pdf格式的部分提取展示
在这里插入图片描述

import fitz
import os
import datetime,sys
import pytesseract
from PIL import Image
import cv2
import numpy as np

def test_readfile(readfile_path):
    global imagespdf
    global name
    with open(readfile_path,mode='r',encoding='GBK',newline='') as f:
       lines= f.readlines()
       print(type(lines))
       print(lines)
    for i in range(0,len(lines)):
        line= lines[i]
        print('line=',line)
        # line=line[0:]
        names= line[0:].split('>')
        print('names=',names)
        name=names[0]
        print('name=',name)
        images=names[2]
        image=images.split('\\')
        image=image[-1]
        print('images=',images)
        print('image=',image)
        imagespdf=[names[2]]
        print('imagespdf=',imagespdf)

def test_convertpdf(file_path,image_path,zoom_x=2,zoom_y=2,core=0):
    '''
    # 将PDF转化为图片
    file_path: pdf文件的路径
    image_path: 图像要保存的文件夹
    zoom_x: x方向的缩放系数
    zoom_y: y方向的缩放系数
    core: 旋转角度
    '''
    #开始时间
    starttime=datetime.datetime.now()
    global dir_path
    global filename
    #打开PDF
    pdf =fitz.open(file_path)
    #逐行读取PDF
    for pg in range(0,pdf.pageCount):
             page=pdf[pg]
             sname=os.path.basename(pdf.name)
             file_name = sname.split('.')