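# PVT image binary-classification inference v1.0
#
# Original article; last modified 2024-08-23 11:16:28 · 213 views
# License: CC 4.0 BY-SA
# Article tags: (none)
# First published 2024-08-09 19:00:52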
from PIL import Image
import cv2
import numpy as np
import onnxruntime as ort
import torchvision.transforms as transforms


# Default per-channel mean and standard deviation.
# These values were computed over the RGB images of the ImageNet dataset and
# are widely used with models that were pretrained on ImageNet.
IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406]
IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225]

def build_transform(is_train, args):
    """Build the torchvision preprocessing pipeline for one image.

    Args:
        is_train: When true, return the randomized training pipeline;
            otherwise return the deterministic evaluation pipeline.
        args: Object exposing an ``input_size`` attribute, the size the
            pipeline crops to.

    Returns:
        A ``transforms.Compose`` whose last two steps are ``ToTensor`` and
        ``Normalize`` with the ImageNet mean/std constants.
    """
    input_size = args.input_size
    # Inputs of 32 or less are never resized: training falls back to a padded
    # random crop and evaluation skips the resize/center-crop pair entirely.
    uses_resize = input_size > 32

    if is_train:
        if uses_resize:
            crop = transforms.RandomResizedCrop(input_size)
        else:
            crop = transforms.RandomCrop(input_size, padding=4)
        return transforms.Compose([
            crop,
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
        ])

    steps = []
    if uses_resize:
        # Resize to 256/224 of the target first so the center crop keeps the
        # same ratio as the usual 256 -> 224 evaluation setup.
        resized_edge = int((256 / 224) * input_size)
        steps += [
            transforms.Resize(
                resized_edge,
                interpolation=transforms.InterpolationMode.BICUBIC,
            ),
            # Crop down to input_size.
            transforms.CenterCrop(input_size),
        ]
    steps += [
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
    ]
    return transforms.Compose(steps)


def preprocess_image(image_path, target_size=(224, 224), is_train=False, args=None):
    """Load an image from disk and turn it into a batched NumPy array.

    The image is read with OpenCV, converted from BGR to RGB, wrapped in a
    PIL image, run through ``build_transform`` and given a leading batch axis.

    Args:
        image_path: Path of the image file to read.
        target_size: ``(height, width)`` consulted only when ``args`` is
            None. It must be square because ``build_transform`` takes a
            single ``input_size`` value.
        is_train: Forwarded to ``build_transform``.
        args: Object exposing ``input_size``. When None, a minimal one is
            created with ``input_size`` taken from ``target_size``.

    Returns:
        NumPy array of the transformed image with a batch axis of length 1
        prepended (presumably ``(1, 3, H, W)`` float data, since the
        pipeline ends in ``ToTensor``/``Normalize`` -- TODO confirm).

    Raises:
        ValueError: If ``args`` is None and ``target_size`` is not square.
        FileNotFoundError: If OpenCV could not read ``image_path``.
    """
    # Resolve the configuration before touching the disk so bad arguments
    # fail fast.
    if args is None:
        from types import SimpleNamespace

        height, width = target_size
        if height != width:
            raise ValueError(
                f"target_size must be square when args is None, got {target_size!r}"
            )
        args = SimpleNamespace(input_size=height)

    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(
            f"Could not read image (missing file or unsupported format): {image_path}"
        )

    # OpenCV loads BGR; the normalization constants are for RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The transforms are applied to a PIL image, not to a raw NumPy array.
    image = Image.fromarray(image)

    transform = build_transform(is_train, args)
    image = transform(image)

    # Hand back a NumPy array with a batch axis, ready to feed to ONNX Runtime.
    # (A PyTorch consumer would keep the tensor and call ``unsqueeze(0)``.)
    image = np.array(image)
    image = np.expand_dims(image, axis=0)

    return image


def run_inference(onnx_model_path, image_path, is_train=False, args=None):
    """Classify one image with an ONNX model.

    Args:
        onnx_model_path: Path of the ONNX model to load.
        image_path: Path of the image to classify.
        is_train: Forwarded to ``preprocess_image``.
        args: Forwarded to ``preprocess_image``.

    Returns:
        The ``np.argmax`` of the model's first output.
    """
    # The session is created before the image is read, so a bad model path
    # is reported first.
    ort_session = ort.InferenceSession(onnx_model_path)
    first_input = ort_session.get_inputs()[0]

    batch = preprocess_image(image_path, is_train=is_train, args=args)

    # Passing None as the output list requests every model output.
    results = ort_session.run(None, {first_input.name: batch})
    scores = results[0]
    return np.argmax(scores)

if __name__ == "__main__":
    # Locations of the exported ONNX checkpoint and of the image to classify.
    onnx_model_path = 'checkpoints/pvt_v2_b5/checkpoint_0.onnx'
    image_path = '/home/nvidia/aigc/classify/PVT/classification/AIGC/val/aigc_0/0002.png'

    # Anonymous empty object used as a stand-in for parsed arguments.
    # Only ``input_size`` is read by build_transform in this file; the other
    # fields are not consulted on the inference path here.
    args = type('', (), {})()
    vars(args).update(
        input_size=224,                # input size of the model
        color_jitter=0.4,              # color jitter factor
        aa='rand',                     # auto-augmentation policy
        train_interpolation='bicubic', # interpolation method
        # NOTE(review): the original notes describe the next three fields as
        # mixup/cutmix settings -- TODO confirm against the training script.
        reprob=0.25,
        remode='pixel',
        recount=1,
        is_train=False,                # whether we are training or not
    )

    # Classify the image and report the result.
    prediction = run_inference(onnx_model_path, image_path, is_train=False, args=args)
    print(f"The predicted class is: {prediction}")


"""
[BUG1]
  File "infer.py", line 101, in <module>
    prediction = run_inference(onnx_model_path, image_path, is_train=False, args=args)
  File "infer.py", line 74, in run_inference
    preprocessed_image = preprocess_image(image_path, is_train=is_train, args=args)
  File "infer.py", line 52, in preprocess_image
    image = transform(image)
Error:    raise TypeError(f"img should be PIL Image. Got {type(img)}")
TypeError: img should be PIL Image. Got <class 'numpy.ndarray'>

[debug]
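The problem is a data-type mismatch inside preprocess_image.
The transforms in torchvision.transforms expect a PIL Image object as input, whereas the image loaded with cv2.imread in this code is a NumPy array.

To fix this, convert the NumPy array to a PIL Image object before applying the transforms; PIL.Image.fromarray performs this conversion.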

"""