使用 PaddleOCR(飞桨)识别 PDF 表格以及文本

部署运行你感兴趣的模型镜像

1. 识别pdf上的表格

import io
import json
import time
import cv2
import fitz
import numpy as np
from PIL import Image
import os
# 设置下载模型的环境变量
os.environ["PADDLE_PDX_CACHE_HOME"] = r"D:\paddlex_models"
from paddleocr import PaddleOCR, PPStructureV3

def pdf_to_image_bytes(page, page_index=0, dpi=200):
    """Render one PDF page and return it as PNG-encoded bytes.

    Args:
        page: Page object exposing ``get_pixmap`` (for example the value
            returned by ``fitz.Document.load_page``).
        page_index: Not used by the function body; kept so the signature
            stays unchanged.
        dpi: Requested rendering resolution; the scale factor applied to
            the page is ``dpi / 72``.

    Returns:
        bytes: The rendered page encoded as PNG.
    """
    # 72 is the standard PDF DPI, so dpi / 72 is the zoom for the target DPI.
    scale = dpi / 72
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace="rgb")

    # Wrap the raw RGB samples in a PIL image so they can be PNG-encoded.
    rendered = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)

    buffer = io.BytesIO()
    rendered.save(buffer, format="PNG")  # another format (e.g. JPEG) could be used here
    return buffer.getvalue()



def PPStructureV3_table(image_path):
    """Run PP-StructureV3 table recognition on a PDF, an image file or an array.

    For a ``.pdf`` path every page is rendered, saved as
    ``<pdf dir>/<pdf name>/images/out<N>.jpg`` and recognised; each result is
    written to ``<pdf dir>/<pdf name>/out<N>.json`` and handed to
    ``PPStructureV3_table_data`` together with the page number.
    For any other input a single prediction is run and each result is handed
    to ``PPStructureV3_table_data``.

    Args:
        image_path: Path to a ``.pdf`` file, path to an image file, or an
            already-loaded image that is passed to ``pipeline.predict`` as is.

    Returns:
        None.
    """
    # Imported locally: the import block of this script does not provide gc.
    import gc

    # Build the recogniser with only the table-related stages enabled.
    pipeline = PPStructureV3(
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
        use_formula_recognition=False,  # whether to use formula recognition
        use_region_detection=False,  # whether to use region detection
        use_table_recognition=True,  # whether to use table recognition
    )

    if isinstance(image_path, str) and image_path.lower().endswith('.pdf'):
        # Output folders are derived from the PDF name, so they are created
        # only in this branch: doing it for image paths would try to create a
        # directory underneath the image file, and for arrays os.path would
        # raise TypeError.
        pdf_stem = os.path.splitext(os.path.basename(image_path))[0]
        json_dir = os.path.join(os.path.dirname(image_path), pdf_stem)
        images_dir = os.path.join(json_dir, 'images')
        os.makedirs(images_dir, exist_ok=True)  # also creates json_dir

        # The context manager closes the document even if a page fails.
        with fitz.open(image_path) as doc:
            for page_num in range(len(doc)):
                print(f'页码数:{page_num}')
                page = doc.load_page(page_num)
                png_bytes = pdf_to_image_bytes(page)
                print('cv_image:', type(png_bytes))

                # Keep a copy of the rendered page next to the JSON output.
                page_image = Image.open(io.BytesIO(png_bytes))
                page_image.save(os.path.join(images_dir, f'out{page_num+1}.jpg'))

                image_array = cv2.imdecode(
                    np.frombuffer(png_bytes, dtype=np.uint8), cv2.IMREAD_COLOR
                )
                print(image_array.shape)

                json_path = os.path.join(json_dir, f'out{page_num+1}.json')
                for res in pipeline.predict(input=image_array):
                    res.save_to_json(json_path)
                    PPStructureV3_table_data(res.json, image_path, page_num)
        return

    if isinstance(image_path, str):
        # Load the image file into a NumPy array.
        # NOTE(review): PIL yields its own channel order (RGB for RGB files)
        # while the PDF branch feeds cv2-decoded arrays — confirm which order
        # the pipeline expects.
        with Image.open(image_path) as pil_image:
            cv_image = np.array(pil_image)
    else:
        cv_image = image_path

    result = pipeline.predict(input=cv_image)
    # Drop the pipeline and force a collection before parsing the results.
    del pipeline
    gc.collect()
    for res in result:
        PPStructureV3_table_data(res.json, image_path)



2. 识别pdf上的文本

import os
# 设置环境变量
os.environ["PADDLE_PDX_CACHE_HOME"] = r"D:\paddlex_models"
from paddleocr import PaddleOCR, PPChatOCRv4Doc, PPStructureV3
from PIL import Image
import numpy as np
import fitz
import io
import cv2

def pdf_to_image_bytes(page, page_index=0, dpi=200):
    """Render one PDF page at a fixed 1.8x zoom and return PNG-encoded bytes.

    Args:
        page: Page object exposing ``get_pixmap`` (for example the value
            returned by ``fitz.Document.load_page``).
        page_index: Not used by the function body; kept so the signature
            stays unchanged.
        dpi: Not used by the function body; the zoom is fixed at 1.8
            regardless of this value.

    Returns:
        bytes: The rendered page encoded as PNG.
    """
    # Fixed zoom on both axes; the dpi argument does not influence it.
    fixed_zoom = fitz.Matrix(1.8, 1.8)
    pixmap = page.get_pixmap(matrix=fixed_zoom, colorspace="rgb")

    # Wrap the raw RGB samples in a PIL image so they can be PNG-encoded.
    rendered = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)

    buffer = io.BytesIO()
    rendered.save(buffer, format="PNG")  # another format (e.g. JPEG) could be used here
    return buffer.getvalue()


def PPStructureV3_table(image_path):
    """Run PaddleOCR text recognition on a PDF, an image file or an array.

    For a ``.pdf`` path every page is rendered, saved as
    ``<pdf dir>/<pdf name>/images/out<N>.jpg`` and recognised; each result is
    written to ``<pdf dir>/<pdf name>/out<N>.json``.
    For any other input a single prediction is run and the JSON form of each
    result is printed.

    Args:
        image_path: Path to a ``.pdf`` file, path to an image file, or an
            already-loaded image that is passed to ``pipeline.predict`` as is.

    Returns:
        None.
    """
    # Imported locally: the import block of this script does not provide gc.
    import gc

    # NOTE(review): use_angle_cls / det_db_thresh look like older PaddleOCR
    # argument names — confirm the installed version still accepts them.
    pipeline = PaddleOCR(
        use_angle_cls=True, lang="ch", device='cpu', cpu_threads=1,
        det_db_thresh=0.6
    )

    if isinstance(image_path, str) and image_path.lower().endswith('.pdf'):
        # Output folders are derived from the PDF name, so they are created
        # only in this branch: doing it for image paths would try to create a
        # directory underneath the image file, and for arrays os.path would
        # raise TypeError.
        pdf_stem = os.path.splitext(os.path.basename(image_path))[0]
        json_dir = os.path.join(os.path.dirname(image_path), pdf_stem)
        images_dir = os.path.join(json_dir, 'images')
        os.makedirs(images_dir, exist_ok=True)  # also creates json_dir

        # The context manager closes the document even if a page fails.
        with fitz.open(image_path) as doc:
            for page_num in range(len(doc)):
                print(f'页码数:{page_num}')

                page = doc.load_page(page_num)
                png_bytes = pdf_to_image_bytes(page)
                print('cv_image:', type(png_bytes))

                # Keep a copy of the rendered page next to the JSON output.
                page_image = Image.open(io.BytesIO(png_bytes))
                page_image.save(os.path.join(images_dir, f'out{page_num+1}.jpg'))

                image_array = cv2.imdecode(
                    np.frombuffer(png_bytes, dtype=np.uint8), cv2.IMREAD_COLOR
                )
                print(image_array.shape)

                json_path = os.path.join(json_dir, f'out{page_num+1}.json')
                for res in pipeline.predict(input=image_array):
                    res.save_to_json(json_path)
        return

    if isinstance(image_path, str):
        # Load the image file into a NumPy array.
        # NOTE(review): PIL yields its own channel order (RGB for RGB files)
        # while the PDF branch feeds cv2-decoded arrays — confirm which order
        # the pipeline expects.
        with Image.open(image_path) as pil_image:
            cv_image = np.array(pil_image)
    else:
        cv_image = image_path

    result = pipeline.predict(input=cv_image)
    # Drop the pipeline and force a collection before printing the results.
    del pipeline
    gc.collect()
    for res in result:
        print(res.json)


3. 写成接口

import gc
import threading
import time
import asyncio
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional, Union, List, Dict
from fastapi import FastAPI, File, UploadFile, Form
import uvicorn
from typing import List
from fastapi import FastAPI, File, UploadFile
import os
from json import loads as jsonLoads, dumps as jsonDumps
from sys import platform as sysPlatform

import io
import json
import time

import cv2
import fitz
import numpy as np
from PIL import Image
import os
# 设置环境变量
os.environ["PADDLE_PDX_CACHE_HOME"] = r"D:\paddlex_models"
from paddleocr import PPStructureV3

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# All three documentation-related URLs are set to None.
# NOTE(review): presumably this hides the interactive docs and the OpenAPI
# schema routes — confirm against the FastAPI documentation.
app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None)

app.add_middleware(
    CORSMiddleware,
    # An explicit `allow_origins=origins` alternative is left disabled here.
    allow_origins=["*"],  # allow any origin
    # NOTE(review): a wildcard origin combined with allow_credentials=True is
    # very permissive — confirm this is intended for the deployment.
    allow_credentials=True,
    allow_methods=["POST"],
    allow_headers=["*"],
)


def receive_bytes_file(imageBytes):
    """Run PP-StructureV3 on an encoded image and return the first result.

    Args:
        imageBytes: Encoded image content (the raw bytes of an uploaded
            file).

    Returns:
        The ``json`` attribute of the first prediction result, or an empty
        dict when the input is empty, cannot be decoded as an image, or the
        pipeline yields no result.
    """
    # Validate the input before any model is loaded: cv2.imdecode raises on
    # an empty buffer and returns None for data it cannot decode.
    if not imageBytes:
        return {}
    image_array = cv2.imdecode(np.frombuffer(imageBytes, dtype=np.uint8), cv2.IMREAD_COLOR)
    if image_array is None:
        return {}
    print('shape:', image_array.shape)

    pipeline = PPStructureV3(
        use_table_recognition=True,  # whether to use table recognition
        cpu_threads=1,
        text_recognition_batch_size=4,  # reduced batch size

        use_doc_orientation_classify=False,  # document orientation classification
        use_doc_unwarping=False,  # document unwarping
        use_textline_orientation=False,  # text-line orientation recognition
        use_seal_recognition=False,  # seal recognition
        use_formula_recognition=False,  # formula recognition
        use_chart_recognition=False,  # chart recognition disabled
        use_region_detection=False,  # region detection disabled
    )

    result = pipeline.predict(input=image_array)
    # Drop the pipeline and force a collection before reading the result.
    del pipeline
    gc.collect()

    # Only the first result is returned to the caller.
    for res in result:
        return res.json
    return {}


@app.post("/send_one_image")
async def update_item(file: UploadFile = File(...), label: str = Form(None)):
    """Recognise a single uploaded image.

    Args:
        file: The uploaded image file.
        label: Optional form field; not used by the handler body.

    Returns:
        dict: ``{"res": <value returned by receive_bytes_file>}``.
    """
    payload = await file.read()
    recognised = receive_bytes_file(payload)
    return {"res": recognised}


@app.post("/send_images")
async def update_item(files: List[UploadFile] = File(...), label: str = Form(None)):
    """Store and recognise several uploaded images.

    Each upload is written to ``./images`` and then passed to
    ``receive_bytes_file``.

    Args:
        files: The uploaded image files.
        label: Optional form field; not used by the handler body.

    Returns:
        dict: ``{"res": [...]}`` with one recognition result per upload, in
        upload order.
    """
    os.makedirs('./images', exist_ok=True)
    text_list = []
    for index, file in enumerate(files):
        imageBytes = await file.read()

        # The filename is client-controlled: keep only its last path
        # component (for both "/" and "\\" separators) so that values such
        # as "../../x" cannot be written outside ./images.
        raw_name = (file.filename or '').replace('\\', '/')
        safe_name = os.path.basename(raw_name)
        if safe_name in ('', '.', '..'):
            # No usable name was supplied; fall back to a positional one.
            safe_name = f'upload_{index}'

        with open(f'./images/{safe_name}', 'wb') as f:
            f.write(imageBytes)

        text_list.append(receive_bytes_file(imageBytes))
    return {"res": text_list}


# Start the API server when the file is executed as a script; it listens on
# all interfaces (0.0.0.0) on port 19315.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=19315)

您可能感兴趣的与本文相关的镜像

PaddlePaddle-v3.3

PaddlePaddle-v3.3

PaddlePaddle

PaddlePaddle是由百度自主研发的深度学习平台,自 2016 年开源以来已广泛应用于工业界。作为一个全面的深度学习生态系统,它提供了核心框架、模型库、开发工具包等完整解决方案。目前已服务超过 2185 万开发者,67 万企业,产生了 110 万个模型

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值