I have recently started converting PyTorch (.pt) models to ONNX and then to TensorRT. Everything I could find online targets old TensorRT versions, so this post records the workflow with the new API.
The code is recorded first; explanations will follow later.
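Step zero, recorded here for completeness, is the pt-to-ONNX export. A minimal sketch of how this step can look (not the original code; the checkpoint layout, opset, and tensor names are assumptions to adapt to your model):

import torch

model = torch.load("best.pt", map_location="cpu")["model"].float().eval()  # hypothetical YOLOv7-style checkpoint that stores the model under "model"
dummy = torch.zeros(1, 3, 640, 640)
torch.onnx.export(
    model, dummy, "best640.onnx",
    opset_version=12,  # assumption: pick an opset your TensorRT version supports
    input_names=["input"],  # must match input_tensor_name in the script below
    output_names=["output"],
    # dynamic_axes={"input": {0: "batch"}},  # uncomment to export with a dynamic batch axis
)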
1. ONNX to TensorRT
from collections import OrderedDict # keep the order of the tensors implicitly
from pathlib import Path
import numpy as np
import tensorrt as trt
from cuda import cudart
import torch
import cupy as cp
import onnxruntime as ort
# yapf:disable
trt_file = Path("./best640.trt")
input_tensor_name = "input"
data = np.arange(3 * 640 * 640, dtype=np.float32).reshape(1, 3, 640, 640) # Inference input data
# data = np.load('input_data_coat.npy')
def run():
    logger = trt.Logger(trt.Logger.ERROR)  # Create Logger, available levels: VERBOSE, INFO, WARNING, ERROR, INTERNAL_ERROR
    if trt_file.exists():  # Load the engine from file and skip the build if it already exists
        with open(trt_file, "rb") as f:
            engine_bytes = f.read()
        if engine_bytes is None:
            print("Failed to get serialized engine")
            return
        print("Succeeded getting serialized engine")
    else:  # Build a serialized network from scratch
        builder = trt.Builder(logger)  # Create Builder
        config = builder.create_builder_config()  # Create BuilderConfig to set attributes of the network
        network = builder.create_network()  # Create Network (on TensorRT 8.x, pass 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()  # Create OptimizationProfile, needed for Dynamic-Shape mode
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # Set workspace for the build (all GPU memory is used by default)
        # Parse the ONNX model into the network
        onnx_file = Path("./best640.onnx")
        onnx_parser = trt.OnnxParser(network, logger)
        with open(onnx_file, "rb") as f:
            onnx_model = f.read()
        if not onnx_parser.parse(onnx_model):
            print("ERROR: Failed to parse ONNX model")
            for error in range(onnx_parser.num_errors):
                print(onnx_parser.get_error(error))
            return
        # For a true dynamic-shape engine, export the ONNX model with dynamic axes and give
        # set_shape three different min / opt / max shapes; here all three are identical (static)
        profile.set_shape(input_tensor_name, [1, 3, 640, 640], [1, 3, 640, 640], [1, 3, 640, 640])
        config.add_optimization_profile(profile)  # Add the OptimizationProfile to the BuilderConfig
        engine_bytes = builder.build_serialized_network(network, config)  # Build a serialized engine from the network
        if engine_bytes is None:
            print("Failed to build engine")
            return
        print("Succeeded building engine")
        with open(trt_file, "wb") as f:  # Save the serialized engine as a binary file
            f.write(engine_bytes)
        print(f"Succeeded saving engine ({trt_file})")
    engine = trt.Runtime(logger).deserialize_cuda_engine(engine_bytes)  # Create the inference engine
    if engine is None:
        print("Failed to get engine for inference")
        return
    print("Succeeded getting engine for inference")
    context = engine.create_execution_context()  # Create Execution Context from the engine (analogous to a CPU process, but on the GPU)
    tensor_name_list = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    context.set_input_shape(input_tensor_name, data.shape)  # Set the runtime shape of the input tensor (needed in Dynamic-Shape mode)
    for name in tensor_name_list:  # Print information of input / output tensors
        mode = engine.get_tensor_mode(name)
        data_type = engine.get_tensor_dtype(name)
        buildtime_shape = engine.get_tensor_shape(name)
        runtime_shape = context.get_tensor_shape(name)
        print(f"{'Input ' if mode == trt.TensorIOMode.INPUT else 'Output'}->{data_type}, {buildtime_shape}, {runtime_shape}, {name}")
    buffer = OrderedDict()  # Prepare the memory buffers on host and device
    for name in tensor_name_list:
        data_type = engine.get_tensor_dtype(name)
        runtime_shape = context.get_tensor_shape(name)
        n_byte = trt.volume(runtime_shape) * np.dtype(trt.nptype(data_type)).itemsize
        host_buffer = np.empty(runtime_shape, dtype=trt.nptype(data_type))
        device_buffer = cudart.cudaMalloc(n_byte)[1]
        buffer[name] = [host_buffer, device_buffer, n_byte]
    buffer[input_tensor_name][0] = np.ascontiguousarray(data)  # Set runtime data; it MUST be contiguous (np.ascontiguousarray), a lesson learned the hard way
    for name in tensor_name_list:
        context.set_tensor_address(name, buffer[name][1])  # Bind the device buffer address to the context
    for name in tensor_name_list:  # Copy input data from host to device
        if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
            cudart.cudaMemcpy(buffer[name][1], buffer[name][0].ctypes.data, buffer[name][2], cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_async_v3(0)  # Do inference (0 = default CUDA stream)
    for name in tensor_name_list:  # Copy output data from device to host
        if engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
            cudart.cudaMemcpy(buffer[name][0].ctypes.data, buffer[name][1], buffer[name][2], cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    # t1 = torch.tensor(cp.asnumpy(buffer['output'][0]), device='cuda')  # e.g. move an output to a torch CUDA tensor via cupy
    for name in tensor_name_list[0:2]:  # Print the first two I/O tensors
        print(name)
        print(buffer[name][0])
    for _, device_buffer, _ in buffer.values():  # Free the GPU memory buffers after all work
        cudart.cudaFree(device_buffer)

if __name__ == "__main__":
    # os.system("rm -rf *.trt")
    run()  # First call: build a TensorRT engine and do inference
    run()  # Second call: load the saved engine and do inference
    print("Finish")
2. Using the TensorRT engine
The following uses YOLOv7 as the example. One problem came up in practice: with the original letterbox arguments (auto=True, minimum-rectangle padding that keeps the aspect ratio), the image was resized to 448*640. I first tried to convert the TensorRT engine to a dynamic-shape version to handle this, without success. Setting auto=False instead pads the remainder with borders so every image comes out 640*640; with that, the static-shape TensorRT version works, and the box positions are almost unaffected (see the shape example at the end of this post). Note that scaleFill=True would instead stretch the image to 640*640 without keeping the aspect ratio.
For YOLOv7 detection, inference time dropped from 0.21 s with the .pt model to 0.018 s with ONNX and 0.01 s with TensorRT.
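For reference, numbers like these come from a plain wall-clock loop. A minimal sketch of such a measurement (the warmup and iteration counts are my own choices, not from the original):

import time

def benchmark(infer_fn, inp, warmup=10, iters=100):
    for _ in range(warmup):  # warm-up: the first calls include lazy initialization
        infer_fn(inp)
    t0 = time.perf_counter()
    for _ in range(iters):
        infer_fn(inp)
    print(f"mean latency: {(time.perf_counter() - t0) / iters * 1000:.2f} ms")

# e.g. benchmark(detector.infer, img_numpy.astype(np.float16)); Detection.infer below ends
# with a synchronous device-to-host copy, so wall-clock timing is valid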
# Depends on the imports above plus cv2; non_max_suppression and scale_coords come from
# YOLOv7's utils.general, and enlarge_bbox is a custom helper.
class Detection:

    def __init__(self):
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        trt_file = './best640.trt'
        logger = trt.Logger(trt.Logger.ERROR)
        with open(trt_file, "rb") as f:  # Load the serialized engine built in part 1
            engine_bytes = f.read()
        self.engine = trt.Runtime(logger).deserialize_cuda_engine(engine_bytes)  # Create the inference engine
        self.context = self.engine.create_execution_context()  # Create Execution Context from the engine
        self.tensor_name_list = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
        self.context.set_input_shape("input", (1, 3, 640, 640))  # Set the runtime shape of the input tensor
        # (the I/O tensor info printing from part 1 is omitted here)
        self.buffer = OrderedDict()  # Prepare the memory buffers on host and device
        for name in self.tensor_name_list:
            data_type = self.engine.get_tensor_dtype(name)
            runtime_shape = self.context.get_tensor_shape(name)
            n_byte = trt.volume(runtime_shape) * np.dtype(trt.nptype(data_type)).itemsize
            host_buffer = np.empty(runtime_shape, dtype=trt.nptype(data_type))
            device_buffer = cudart.cudaMalloc(n_byte)[1]
            self.buffer[name] = [host_buffer, device_buffer, n_byte]
    def infer(self, data):
        self.buffer["input"][0] = np.ascontiguousarray(data)  # Set runtime data; it MUST be contiguous, a lesson learned the hard way
        for name in self.tensor_name_list:
            self.context.set_tensor_address(name, self.buffer[name][1])  # Bind the device buffer address to the context
        for name in self.tensor_name_list:  # Copy input data from host to device
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                cudart.cudaMemcpy(self.buffer[name][1], self.buffer[name][0].ctypes.data, self.buffer[name][2], cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
        self.context.execute_async_v3(0)  # Do inference (0 = default CUDA stream)
        for name in self.tensor_name_list:  # Copy output data from device to host
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
                cudart.cudaMemcpy(self.buffer[name][0].ctypes.data, self.buffer[name][1], self.buffer[name][2], cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
        return self.buffer['output'][0]
    def get_detection_result(self, img):
        img_resize_init = self.letterbox(img, (640, 640), stride=32)[0]
        img_resize = img_resize_init[:, :, ::-1].transpose(2, 0, 1)  # BGR -> RGB, HWC -> CHW
        img_resize = np.ascontiguousarray(img_resize)
        img_cuda = torch.from_numpy(img_resize).to(self.device)
        img_cuda = img_cuda.half()  # the engine here was built from an FP16 ONNX model
        img_cuda /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img_cuda.ndimension() == 3:
            img_cuda = img_cuda.unsqueeze(0)
        img_numpy = img_cuda.cpu().numpy()
        pred = self.infer(img_numpy.astype(np.float16))
        # pred = self.session.run(None, {'input': img_numpy.astype(np.float16)})  # the equivalent onnxruntime call
        pred = torch.tensor(pred).to(self.device)
        pred = non_max_suppression(pred, 0.25, 0.45, classes=None, agnostic=None)  # from YOLOv7 utils.general
        img_raw, bbox_raw = None, []
        for i, det in enumerate(pred):
            if len(det):
                # Rescale boxes from img_size to the original image size
                det[:, :4] = scale_coords(img_numpy.shape[2:], det[:, :4], img.shape).round()
                x1, y1, x2, y2 = map(int, det[-1][:4])
                bbox_raw = [x1, y1, x2, y2]
                bbox_raw = enlarge_bbox(bbox_raw)  # custom helper (not shown)
                x1, y1, x2, y2 = bbox_raw
                img_raw = img[y1:y2, x1:x2]
        return img_raw, bbox_raw
    def letterbox(self, img, new_shape=(640, 640), color=(114, 114, 114), auto=False, scaleFill=False, scaleup=True, stride=32):
        # Resize and pad image while meeting stride-multiple constraints
        shape = img.shape[:2]  # current shape [height, width]
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)
        # Scale ratio (new / old)
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        if not scaleup:  # only scale down, do not scale up (for better test mAP)
            r = min(r, 1.0)
        # Compute padding
        ratio = r, r  # width, height ratios
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
        if auto:  # minimum rectangle
            dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
        elif scaleFill:  # stretch
            dw, dh = 0.0, 0.0
            new_unpad = (new_shape[1], new_shape[0])
            ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios
        dw /= 2  # divide padding into 2 sides
        dh /= 2
        if shape[::-1] != new_unpad:  # resize
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
        top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
        left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
        return img, ratio, (dw, dh)
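A minimal usage sketch of the class, plus the letterbox shape example promised above (the image path is a placeholder, and the 1080x1920 frame size is my own illustrative choice):

if __name__ == "__main__":
    detector = Detection()
    frame = cv2.imread("test.jpg")  # placeholder image path
    crop, bbox = detector.get_detection_result(frame)
    print(bbox)

    # The effect of the auto flag on the letterboxed shape, for a 1080x1920 input:
    dummy = np.zeros((1080, 1920, 3), dtype=np.uint8)
    print(detector.letterbox(dummy, (640, 640), auto=True)[0].shape)   # (384, 640, 3): minimum-rectangle padding
    print(detector.letterbox(dummy, (640, 640), auto=False)[0].shape)  # (640, 640, 3): full letterbox padding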