使用 PaddleOCR（飞桨）识别 PDF 表格以及文本

最新推荐文章于 2025-11-01 00:41:59 发布

原创 · 最新推荐文章于 2025-11-01 00:41:59 发布 · 507 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#pdf #swift #开发语言

部署运行你感兴趣的模型镜像

1. 识别pdf上的表格

import io
import json
import time
import cv2
import fitz
import numpy as np
from PIL import Image
import os
# 设置下载模型的环境变量
os.environ["PADDLE_PDX_CACHE_HOME"] = r"D:\paddlex_models"
from paddleocr import PaddleOCR, PPStructureV3

def pdf_to_image_bytes(page, page_index=0, dpi=200):
    """Render a single PDF page and return it as PNG-encoded bytes.

    Args:
        page: Page object exposing ``get_pixmap`` (callers in this file pass
            the result of ``doc.load_page(...)``).
        page_index: Not used by the body; kept so existing calls stay valid.
        dpi: Rendering resolution. The zoom factor is ``dpi / 72`` because
            72 is the base resolution of a PDF page.

    Returns:
        bytes: The rendered page encoded as PNG.
    """
    # Scale the page uniformly on both axes to reach the requested DPI.
    scale = dpi / 72
    transform = fitz.Matrix(scale, scale)

    # Rasterise in RGB and wrap the raw samples in a PIL image.
    pixmap = page.get_pixmap(matrix=transform, colorspace="rgb")
    rendered = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)

    # Encode into an in-memory buffer (PNG; another format could be used).
    buffer = io.BytesIO()
    rendered.save(buffer, format="PNG")
    return buffer.getvalue()



def PPStructureV3_table(image_path):
    """Run PP-StructureV3 table recognition on a PDF, an image file or an image array.

    * PDF path: every page is rendered with ``pdf_to_image_bytes``, saved as
      ``<pdf dir>/<pdf name>/images/out<N>.jpg``, recognised, written to
      ``<pdf dir>/<pdf name>/out<N>.json`` and handed to
      ``PPStructureV3_table_data(res.json, image_path, page_num)``.
    * Any other string: opened with PIL and converted to a NumPy array.
    * Anything else: passed to the pipeline unchanged.

    For the two non-PDF cases each result is handed to
    ``PPStructureV3_table_data(res.json, image_path)``; nothing is written to
    disk. ``PPStructureV3_table_data`` is not defined in the code shown here
    and must be provided by the surrounding project.

    Args:
        image_path: Path to a ``.pdf`` file, path to an image file, or an
            already-loaded image object accepted by ``pipeline.predict``.

    Returns:
        None.
    """
    # Local import: this script's import section does not bring ``gc`` in.
    import gc

    # Table-recognition pipeline with the optional sub-models switched off.
    pipeline = PPStructureV3(
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
        use_formula_recognition=False,  # formula recognition off
        use_region_detection=False,  # region detection off
        use_table_recognition=True,  # table recognition on
    )

    if isinstance(image_path, str) and image_path.lower().endswith('.pdf'):
        # Output folders are derived from the PDF name, so they are created
        # only here. Creating them up front broke the other two input kinds:
        # array input has no path, and an image path would yield
        # "<image file>/images", which cannot be created.
        pdf_stem = os.path.splitext(os.path.basename(image_path))[0]
        json_path_temp = os.path.join(os.path.dirname(image_path), pdf_stem)
        images_path = os.path.join(json_path_temp, 'images')
        os.makedirs(images_path, exist_ok=True)  # also creates json_path_temp

        # Context manager guarantees the document is closed, even on error.
        with fitz.open(image_path) as doc:
            for page_num in range(len(doc)):
                print(f'页码数:{page_num}')
                page = doc.load_page(page_num)
                cv_image = pdf_to_image_bytes(page)
                print('cv_image:', type(cv_image))

                # Keep a copy of the rendered page next to the JSON output.
                image = Image.open(io.BytesIO(cv_image))
                image.save(os.path.join(images_path, f'out{page_num+1}.jpg'))

                # Decode the PNG bytes into an OpenCV array for the pipeline.
                image_array = cv2.imdecode(np.frombuffer(cv_image, dtype=np.uint8), cv2.IMREAD_COLOR)
                print(image_array.shape)
                result = pipeline.predict(input=image_array)
                json_path = os.path.join(json_path_temp, f'out{page_num+1}.json')
                for res in result:
                    res.save_to_json(json_path)
                    PPStructureV3_table_data(res.json, image_path, page_num)
        return

    if isinstance(image_path, str):
        # NOTE(review): np.array() on a PIL image keeps PIL's channel order,
        # while the PDF branch feeds a cv2-decoded array -- confirm which
        # order the pipeline expects.
        with Image.open(image_path) as pil_image:
            cv_image = np.array(pil_image)
    else:
        cv_image = image_path

    # Recognise the single image.
    result = pipeline.predict(input=cv_image)
    # Drop the pipeline and force a collection pass before post-processing.
    del pipeline
    gc.collect()
    for res in result:
        PPStructureV3_table_data(res.json, image_path)

2. 识别pdf上的文本

import os
# 设置环境变量
os.environ["PADDLE_PDX_CACHE_HOME"] = r"D:\paddlex_models"
from paddleocr import PaddleOCR, PPChatOCRv4Doc, PPStructureV3
from PIL import Image
import numpy as np
import fitz
import io
import cv2

def pdf_to_image_bytes(page, page_index=0, dpi=200):
    """Render a single PDF page at a fixed 1.8x zoom and return PNG bytes.

    Args:
        page: Page object exposing ``get_pixmap`` (callers in this file pass
            the result of ``doc.load_page(...)``).
        page_index: Not used by the body; kept so existing calls stay valid.
        dpi: Not used by the body. The zoom is hard-coded to 1.8 on both
            axes, so changing ``dpi`` has no effect on the output.

    Returns:
        bytes: The rendered page encoded as PNG.
    """
    # Fixed zoom factor; the DPI-based scale is intentionally not applied.
    transform = fitz.Matrix(1.8, 1.8)

    # Rasterise in RGB and wrap the raw samples in a PIL image.
    pixmap = page.get_pixmap(matrix=transform, colorspace="rgb")
    rendered = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)

    # Encode into an in-memory buffer (PNG; another format could be used).
    buffer = io.BytesIO()
    rendered.save(buffer, format="PNG")
    return buffer.getvalue()


def PPStructureV3_table(image_path):
    """Run PaddleOCR text recognition on a PDF, an image file or an image array.

    Despite its name, this variant builds a ``PaddleOCR`` pipeline (text
    recognition), not a table pipeline.

    * PDF path: every page is rendered with ``pdf_to_image_bytes``, saved as
      ``<pdf dir>/<pdf name>/images/out<N>.jpg``, recognised, and the result
      is written to ``<pdf dir>/<pdf name>/out<N>.json``.
    * Any other string: opened with PIL and converted to a NumPy array.
    * Anything else: passed to the pipeline unchanged.

    For the two non-PDF cases each result's ``json`` is printed; nothing is
    written to disk.

    Args:
        image_path: Path to a ``.pdf`` file, path to an image file, or an
            already-loaded image object accepted by ``pipeline.predict``.

    Returns:
        None.
    """
    # Local import: this script's import section does not bring ``gc`` in.
    import gc

    # NOTE(review): ``use_angle_cls`` / ``det_db_thresh`` look like legacy
    # parameter names -- confirm they are still honoured by the installed
    # PaddleOCR version.
    pipeline = PaddleOCR(
        use_angle_cls=True, lang="ch", device='cpu', cpu_threads=1,
        det_db_thresh=0.6
    )

    if isinstance(image_path, str) and image_path.lower().endswith('.pdf'):
        # Output folders are derived from the PDF name, so they are created
        # only here. Creating them up front broke the other two input kinds:
        # array input has no path, and an image path would yield
        # "<image file>/images", which cannot be created.
        pdf_stem = os.path.splitext(os.path.basename(image_path))[0]
        json_path_temp = os.path.join(os.path.dirname(image_path), pdf_stem)
        images_path = os.path.join(json_path_temp, 'images')
        os.makedirs(images_path, exist_ok=True)  # also creates json_path_temp

        # Context manager guarantees the document is closed, even on error.
        with fitz.open(image_path) as doc:
            for page_num in range(len(doc)):
                print(f'页码数:{page_num}')

                page = doc.load_page(page_num)
                cv_image = pdf_to_image_bytes(page)
                print('cv_image:', type(cv_image))

                # Keep a copy of the rendered page next to the JSON output.
                image = Image.open(io.BytesIO(cv_image))
                image.save(os.path.join(images_path, f'out{page_num+1}.jpg'))

                # Decode the PNG bytes into an OpenCV array for the pipeline.
                image_array = cv2.imdecode(np.frombuffer(cv_image, dtype=np.uint8), cv2.IMREAD_COLOR)
                print(image_array.shape)

                result = pipeline.predict(input=image_array)

                json_path = os.path.join(json_path_temp, f'out{page_num+1}.json')
                for res in result:
                    res.save_to_json(json_path)
        return

    if isinstance(image_path, str):
        # NOTE(review): np.array() on a PIL image keeps PIL's channel order,
        # while the PDF branch feeds a cv2-decoded array -- confirm which
        # order the pipeline expects.
        with Image.open(image_path) as pil_image:
            cv_image = np.array(pil_image)
    else:
        cv_image = image_path

    result = pipeline.predict(input=cv_image)

    # Drop the pipeline and force a collection pass before printing results.
    del pipeline
    gc.collect()
    for res in result:
        print(res.json)

3. 写成接口

import gc
import threading
import time
import asyncio
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional, Union, List, Dict
from fastapi import FastAPI, File, UploadFile, Form
import uvicorn
from typing import List
from fastapi import FastAPI, File, UploadFile
import os
from json import loads as jsonLoads, dumps as jsonDumps
from sys import platform as sysPlatform

import io
import json
import time

import cv2
import fitz
import numpy as np
from PIL import Image
import os
# 设置环境变量
os.environ["PADDLE_PDX_CACHE_HOME"] = r"D:\paddlex_models"
from paddleocr import PPStructureV3

# Set the PYTORCH_CUDA_ALLOC_CONF environment variable for this process.
# NOTE(review): no torch import is visible in this script -- presumably a
# dependency reads it; confirm it is still needed.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# FastAPI application; docs_url, redoc_url and openapi_url are all None, so
# no interactive docs / schema routes are configured.
app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None)

# CORS: accept POST requests from any origin with any header.
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive -- confirm this is intended outside of local testing.
app.add_middleware(
    CORSMiddleware,
    # allow_origins=origins, # restrict to an explicit origin list instead
    allow_origins=["*"],  # allow any origin
    allow_credentials=True,
    allow_methods=["POST"],
    allow_headers=["*"],
)


def receive_bytes_file(imageBytes):
    """Run PP-StructureV3 on one encoded image and return the first result's JSON.

    The payload is validated and decoded before the pipeline is built, so a
    bad upload fails fast without loading any models.

    Args:
        imageBytes: Raw bytes of an encoded image (e.g. the body of an
            uploaded file).

    Returns:
        The ``json`` attribute of the first result produced by
        ``pipeline.predict``, or ``{}`` when the pipeline yields no result.

    Raises:
        ValueError: If ``imageBytes`` is empty or cannot be decoded as an
            image by OpenCV.
    """
    if not imageBytes:
        raise ValueError("empty image payload")

    image_array = cv2.imdecode(np.frombuffer(imageBytes, dtype=np.uint8), cv2.IMREAD_COLOR)
    if image_array is None:
        # imdecode signals an undecodable buffer by returning None.
        raise ValueError("payload could not be decoded as an image")
    print('shape:', image_array.shape)

    # A fresh pipeline is built per call and released right after prediction.
    pipeline = PPStructureV3(
        use_table_recognition=True,  # table recognition on
        cpu_threads=1,
        # text_det_limit_side_len=768,  # lower the detection side-length limit
        # text_det_box_thresh=0.4,      # lower the box threshold
        text_recognition_batch_size=4,  # smaller recognition batch size

        use_doc_orientation_classify=False,  # document orientation classification off
        use_doc_unwarping=False,  # document unwarping off
        use_textline_orientation=False,  # text-line orientation off
        use_seal_recognition=False,  # seal recognition off
        use_formula_recognition=False,  # formula recognition off
        use_chart_recognition=False,  # chart recognition off
        use_region_detection=False,  # region detection off
    )

    result = pipeline.predict(input=image_array)
    # Drop the pipeline and force a collection pass before reading results.
    del pipeline
    gc.collect()

    # Only the first result is returned; an empty result set maps to {}.
    first = next(iter(result), None)
    if first is None:
        return {}
    return first.json


@app.post("/send_one_image")
async def update_item(file: UploadFile = File(...), label: str = Form(None)):
    """Recognise one uploaded image.

    Args:
        file: The uploaded image file.
        label: Optional form field; not used by the handler.

    Returns:
        dict: ``{"res": <value returned by receive_bytes_file>}``.
    """
    payload = await file.read()
    recognised = receive_bytes_file(payload)
    return {"res": recognised}


@app.post("/send_images")
async def update_item(files: List[UploadFile] = File(...), label: str = Form(None)):
    """Recognise several uploaded images, keeping a copy of each under ./images.

    Args:
        files: The uploaded image files.
        label: Optional form field; not used by the handler.

    Returns:
        dict: ``{"res": [...]}`` with one ``receive_bytes_file`` result per
        uploaded file, in upload order.
    """
    os.makedirs('./images', exist_ok=True)
    text_list = []
    for index, file in enumerate(files):
        imageBytes = await file.read()

        # The filename comes from the client. Keep only its last path
        # component so names like "../../x" cannot escape ./images, and fall
        # back to a generated name when nothing usable remains.
        safe_name = os.path.basename(file.filename or '')
        if safe_name in ('', '.', '..'):
            safe_name = f'upload_{index}'

        with open(os.path.join('./images', safe_name), 'wb') as f:
            f.write(imageBytes)

        text_list.append(receive_bytes_file(imageBytes))
    return {"res": text_list}


# Script entry point: serve ``app`` with uvicorn on all interfaces, port 19315.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=19315)

您可能感兴趣的与本文相关的镜像