OCR中训练时加入PAD效果图展示。

最新推荐文章于 2025-03-11 20:19:04 发布

原创最新推荐文章于 2025-03-11 20:19:04 发布 · 350 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#python #图像识别 #cv

本文介绍在训练文字识别模型时，如何处理不同尺寸的图片输入问题。通过使用PAD技术，确保所有图片统一大小，避免了图片在reshape过程中可能产生的变形，保持了模型输入的一致性。

对图片加入PAD

在训练文字识别模型时，真实场景下训练集图片的长宽不固定，字数也不固定（从1到n不等，一般为25左右）。但图片在送入训练模型之前会被统一reshape到固定尺寸，长宽比不同的图片因此可能产生变形。此时需要先对宽度小于100的图片进行pad，再进行reshape，即可解决上述问题。实现代码如下：

"coding = utf-8"
import os
import sys
import torch
import cv2
from PIL import Image
import numpy as np
import math
import torchvision.transforms as transforms
from torchvision.transforms import ToPILImage


class ResizeNormalize(object):
    """Resize an image to a fixed size and return it as a normalized tensor.

    The image is resized with ``img.resize(size, interpolation)``, converted
    with ``transforms.ToTensor()``, and then every value ``x`` is mapped to
    ``(x - 0.5) / 0.5``.
    """

    def __init__(self, size, interpolation=Image.BICUBIC):
        # size is handed unchanged to ``img.resize``; for PIL images that is
        # a (width, height) pair.
        self.toTensor = transforms.ToTensor()
        self.interpolation = interpolation
        self.size = size

    def __call__(self, img):
        """Return the resized, normalized tensor for ``img``."""
        resized = img.resize(self.size, self.interpolation)
        tensor = self.toTensor(resized)
        # In-place shift and scale: x -> (x - 0.5) / 0.5.
        tensor.sub_(0.5)
        tensor.div_(0.5)
        return tensor



# Images narrower than the target width are padded by repeating their right-most pixel column.
class NormalizePAD(object):
    """Normalize an image tensor and right-pad it to a fixed ``(C, H, W)`` size.

    The image is converted with ``transforms.ToTensor()``, every value ``x``
    is mapped to ``(x - 0.5) / 0.5``, and the result is copied into the left
    part of a ``max_size`` canvas. Any remaining columns on the right are
    filled by repeating the image's last pixel column (border padding), so
    narrow images are not stretched.

    Args:
        max_size: ``(channels, height, width)`` of the returned tensor.
        PAD_type: stored on the instance but not consulted; only right
            padding is implemented.
    """

    def __init__(self, max_size, PAD_type='right'):
        self.toTensor = transforms.ToTensor()
        self.max_size = max_size
        # Kept for backward compatibility; not used by this class itself.
        self.max_width_half = math.floor(max_size[2] / 2)
        self.PAD_type = PAD_type

    def __call__(self, img):
        """Return ``img`` as a normalized float tensor of shape ``max_size``.

        The input must not be wider than ``max_size[2]``; a wider image makes
        the slice assignment below fail with a shape mismatch.
        """
        img = self.toTensor(img)
        img.sub_(0.5).div_(0.5)
        c, h, w = img.size()
        max_w = self.max_size[2]

        Pad_img = torch.zeros(*self.max_size, dtype=torch.float32)
        Pad_img[:, :, :w] = img  # right pad: image occupies the left columns
        if max_w != w:
            # Border pad: replicate the last image column across the gap.
            last_column = img[:, :, w - 1].unsqueeze(2)
            Pad_img[:, :, w:] = last_column.expand(c, h, max_w - w)
        return Pad_img


# Demo script: resize every image in input_path to height imgH while keeping
# its aspect ratio, right-pad it to width imgW with NormalizePAD, and write
# the padded result into save_path so the effect can be inspected.
imgH = 32
imgW = 100
input_path = "/home/zhou/PAD_img/input_images"
save_path = "/home/zhou/PAD_img/pad_images"
save_path_nopad = "/home/zhou/PAD_img/no_pad_images"  # not used by the padding branch below

keep_ratio_with_pad = True
if keep_ratio_with_pad:  # same concept with 'Rosetta' paper
    resized_max_w = imgW
    transform = NormalizePAD((3, imgH, resized_max_w))

    # cv2.imwrite reports a missing directory only through its return value,
    # so make sure the output directory exists up front.
    os.makedirs(save_path, exist_ok=True)

    filelist = os.listdir(input_path)
    for item in filelist:
        img_path = os.path.join(input_path, item)
        if not os.path.isfile(img_path):
            continue  # skip sub-directories

        # Force 3 channels so the image always fits the (3, imgH, imgW)
        # canvas, and close the file handle as soon as the pixels are loaded.
        with Image.open(img_path) as opened:
            image = opened.convert('RGB')

        w, h = image.size
        ratio = w / float(h)
        # Keep the aspect ratio at height imgH, but never exceed imgW: images
        # with a width/height ratio above imgW/imgH are squeezed to imgW.
        resized_w = min(math.ceil(imgH * ratio), imgW)
        resized_image = image.resize((resized_w, imgH), Image.BICUBIC)
        resized_images = transform(resized_image)

        img = resized_images.cpu()
        # The transform maps x to (x - 0.5) / 0.5; undo that and scale to
        # 0..255 before casting, otherwise the uint8 cast destroys the image.
        img = img.mul(0.5).add(0.5).clamp(0, 1).mul(255).round()
        npimg = img.permute(1, 2, 0).numpy().astype('uint8')
        # PIL delivers RGB while cv2.imwrite expects BGR channel order.
        npimg = np.ascontiguousarray(npimg[:, :, ::-1])

        img_name = os.path.join(save_path, item)
        if not cv2.imwrite(img_name, npimg):
            print('failed to write', img_name)