ONNX to TensorRT conversion, with TensorRT 10.7.0

I recently started converting PyTorch (.pt) models to ONNX and then to TensorRT. Most material found online targets older TensorRT versions, so this post records how the new API is used.

Reference: trt-samples-for-hackathon-cn/cookbook/01-SimpleDemo/TensorRT-10 at master · NVIDIA/trt-samples-for-hackathon-cn · GitHub

For now this is just a record of the working code; explanations may be added later.
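Since the .pt → ONNX step is mentioned above but not shown, here is a minimal, hedged sketch of it using torch.onnx.export. The checkpoint-loading line, file names, and opset are assumptions (a YOLOv7-style checkpoint stores the model under a "model" key); adjust them to your own model. The rest of this post assumes an input tensor named "input" with shape 1×3×640×640.

```python
import torch

# Assumed YOLOv7-style checkpoint layout; replace with your own model loading code.
ckpt = torch.load("best640.pt", map_location="cpu")
model = ckpt["model"].float().eval()

dummy = torch.zeros(1, 3, 640, 640)              # static 1x3x640x640 input, matching the TRT profile below
torch.onnx.export(
    model, dummy, "best640.onnx",
    opset_version=12,                            # assumption; pick an opset your TensorRT version supports
    input_names=["input"],                       # must match input_tensor_name used below
    output_names=["output"],
)
```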

1. ONNX to TensorRT

from collections import OrderedDict  # keep the order of the tensors implicitly
from pathlib import Path

import numpy as np
import tensorrt as trt
from cuda import cudart
import torch
import cupy as cp
import onnxruntime as ort

# yapf:disable

trt_file = Path("./best640.trt")
input_tensor_name = "input"
data = np.arange(3 * 640 * 640, dtype=np.float32).reshape(1, 3, 640, 640)                  # Inference input data
# data = np.load('input_data_coat.npy')
def run():
    logger = trt.Logger(trt.Logger.ERROR)                                       # Create Logger, available levels: VERBOSE, INFO, WARNING, ERROR, INTERNAL_ERROR
    if trt_file.exists():                                                       # Load the engine from file and skip the building process if it already exists
        with open(trt_file, "rb") as f:
            engine_bytes = f.read()
        if engine_bytes is None:
            print("Fail getting serialized engine")
            return
        print("Succeed getting serialized engine")
    else:                                                                       # Build a serialized network from scratch
        builder = trt.Builder(logger)                                           # Create Builder
        config = builder.create_builder_config()                                # Create BuilderConfig to set attributes of the network
        network = builder.create_network()                                      # Create Network
        profile = builder.create_optimization_profile()                         # Create OptimizationProfile if using Dynamic-Shape mode
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)     # Set workspace for the building process (all GPU memory is used by default)

        # onnx
        onnx_file = Path("./best640.onnx")
        onnx_parser = trt.OnnxParser(network, logger)
        with open(onnx_file, 'rb') as f:
            onnx_model = f.read()
            if not onnx_parser.parse(onnx_model):
                print("ERROR: Failed to parse ONNX model")
                for error in range(onnx_parser.num_errors):
                    print(onnx_parser.get_error(error))
                return

        # input_tensor = network.add_input(input_tensor_name, trt.float16, [-1, -1, -1, -1])  # Set input tensor of the network
        # profile.set_shape(input_tensor.name, [1, 3, 224, 224], [1, 3, 224, 224], [1, 3, 224, 224])  # Set dynamic shape range of the input tensor
        # config.add_optimization_profile(profile)                                # Add the Optimization Profile into the BuilderConfig

        # input_tensor_name2 = network.get_input(0).name
        # input_tensor = network.add_input(input_tensor_name, trt.float32, [-1, -1, -1, -1])
        profile.set_shape(input_tensor_name, [1, 3, 640, 640], [1, 3, 640, 640], [1, 3, 640, 640])  # Set dynamic shape range of the input tensor
        config.add_optimization_profile(profile)                                # Add the Optimization Profile into the BuilderConfig

        # identity_layer = network.add_identity(input_tensor)                     # Here is only an identity layer in this simple network, which the output is exactly equal to input
        # network.mark_output(identity_layer.get_output(0))                       # Mark the tensor for output

        engine_bytes = builder.build_serialized_network(network, config)        # Create a serialized network from the network
        if engine_bytes is None:
            print("Fail building engine")
            return
        print("Succeed building engine")
        with open(trt_file, "wb") as f:                                         # Save the serialized network as binaray file
            f.write(engine_bytes)
            print(f"Succeed saving engine ({trt_file})")

    engine = trt.Runtime(logger).deserialize_cuda_engine(engine_bytes)          # Create inference engine
    if engine is None:
        print("Fail getting engine for inference")
        return
    print("Succeed getting engine for inference")
    context = engine.create_execution_context()                                 # Create Execution Context from the engine (analogy to a GPU context, or a CPU process)

    tensor_name_list = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]

    context.set_input_shape(input_tensor_name, data.shape)                      # Set runtime size of input tensor if using Dynamic-Shape mode

    for name in tensor_name_list:                                               # Print information of input / output tensors
        mode = engine.get_tensor_mode(name)
        data_type = engine.get_tensor_dtype(name)
        buildtime_shape = engine.get_tensor_shape(name)
        runtime_shape = context.get_tensor_shape(name)
        print(f"{'Input ' if mode == trt.TensorIOMode.INPUT else 'Output'}->{data_type}, {buildtime_shape}, {runtime_shape}, {name}")

    buffer = OrderedDict()                                                      # Prepare the memory buffer on host and device
    for name in tensor_name_list:
        data_type = engine.get_tensor_dtype(name)
        runtime_shape = context.get_tensor_shape(name)
        n_byte = trt.volume(runtime_shape) * np.dtype(trt.nptype(data_type)).itemsize
        host_buffer = np.empty(runtime_shape, dtype=trt.nptype(data_type))
        device_buffer = cudart.cudaMalloc(n_byte)[1]
        buffer[name] = [host_buffer, device_buffer, n_byte]


    buffer[input_tensor_name][0] = np.ascontiguousarray(data)                   # Set runtime data, MUST use np.ascontiguousarray, it is a SERIOUS lesson

    for name in tensor_name_list:
        context.set_tensor_address(name, buffer[name][1])                       # Bind address of device buffer to context

    for name in tensor_name_list:                                               # Copy input data from host to device
        if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
            cudart.cudaMemcpy(buffer[name][1], buffer[name][0].ctypes.data, buffer[name][2], cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute_async_v3(0)                                                 # Do inference computation on the default CUDA stream (stream handle 0)

    for name in tensor_name_list:                                               # Copy output data from device to host
        if engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
            cudart.cudaMemcpy(buffer[name][0].ctypes.data, buffer[name][1], buffer[name][2], cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    # t1 = torch.tensor(cp.asnumpy(buffer['output'][0]), device='cuda')
    for name in tensor_name_list[0:2]:
        print(name)
        print(buffer[name][0])

    for _, device_buffer, _ in buffer.values():                                 # Free the GPU memory buffer after all work
        cudart.cudaFree(device_buffer)

if __name__ == "__main__":
    # os.system("rm -rf *.trt")

    run()                                                                       # Build a TensorRT engine and do inference
    run()                                                                       # Load a TensorRT engine and do inference

    print("Finish")

2. Using the TensorRT engine

The following uses YOLOv7 as an example. One problem came up in use: with the original letterbox settings the image was scaled proportionally and came out at 448*640, and my first attempt to build a dynamic-shape TensorRT engine for that did not succeed (a sketch of what such a profile would look like follows this paragraph). I then changed the letterbox arguments so the image is scaled proportionally and the remaining area is padded with a border (auto=False, scaleFill=False, as in the code below), giving a fixed 640*640 input. With that, the static-shape TensorRT engine works, and the box positions are almost unaffected.
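For reference, a dynamic-shape build would require the ONNX model to be exported with dynamic height/width axes and the optimization profile in run() above to cover a range of sizes instead of a single one. This is only a sketch under that assumption (the 384 minimum is arbitrary), not what was actually used here:

```python
# Replacement for the profile.set_shape(...) call in run() above,
# assuming the ONNX input "input" was exported with dynamic H/W.
profile.set_shape(input_tensor_name,
                  [1, 3, 384, 384],    # min shape
                  [1, 3, 640, 640],    # opt shape
                  [1, 3, 640, 640])    # max shape
config.add_optimization_profile(profile)
# At runtime, before allocating buffers:
# context.set_input_shape(input_tensor_name, (1, 3, 448, 640))
```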

For YOLOv7 detection, inference time per image dropped from 0.21 s with the .pt model, to 0.018 s with ONNX, to about 0.01 s with TensorRT. A usage and timing sketch is given after the Detection class below.

import cv2
import numpy as np
import torch
import tensorrt as trt
from collections import OrderedDict
from cuda import cudart
# non_max_suppression and scale_coords come from the YOLOv7 utils (utils.general);
# enlarge_bbox is a project-specific helper not shown here.
from utils.general import non_max_suppression, scale_coords

class Detection:
    def __init__(self):
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        trt_file = './best640.trt'
        logger = trt.Logger(trt.Logger.ERROR)
        with open(trt_file, "rb") as f:
            engine_bytes = f.read()   
        self.engine = trt.Runtime(logger).deserialize_cuda_engine(engine_bytes)          # Create inference engine
        self.context = self.engine.create_execution_context()                                 # Create Execution Context from the engine (analogy to a GPU context, or a CPU process)
        self.tensor_name_list = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
        self.context.set_input_shape("input", (1, 3, 640, 640))                      # Set runtime size of input tensor if using Dynamic-Shape mode
        for name in self.tensor_name_list:                                               # Print information of input / output tensors
            mode = self.engine.get_tensor_mode(name)
            data_type = self.engine.get_tensor_dtype(name)
            buildtime_shape = self.engine.get_tensor_shape(name)
            runtime_shape = self.context.get_tensor_shape(name)
            # print(f"{'Input ' if mode == trt.TensorIOMode.INPUT else 'Output'}->{data_type}, {buildtime_shape}, {runtime_shape}, {name}")
        self.buffer = OrderedDict()                                                      # Prepare the memory buffer on host and device
        for name in self.tensor_name_list:
            data_type = self.engine.get_tensor_dtype(name)
            runtime_shape = self.context.get_tensor_shape(name)
            n_byte = trt.volume(runtime_shape) * np.dtype(trt.nptype(data_type)).itemsize
            host_buffer = np.empty(runtime_shape, dtype=trt.nptype(data_type))
            device_buffer = cudart.cudaMalloc(n_byte)[1]
            self.buffer[name] = [host_buffer, device_buffer, n_byte]
    
    def infer(self, data):
        self.buffer["input"][0] = np.ascontiguousarray(data)                   # Set runtime data, MUST use np.ascontiguousarray, it is a SERIOUS lesson
        for name in self.tensor_name_list:
            self.context.set_tensor_address(name, self.buffer[name][1])                       # Bind address of device buffer to context

        for name in self.tensor_name_list:                                               # Copy input data from host to device
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                cudart.cudaMemcpy(self.buffer[name][1], self.buffer[name][0].ctypes.data, self.buffer[name][2], cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

        self.context.execute_async_v3(0)                                                 # Do inference computation on the default CUDA stream (stream handle 0)

        for name in self.tensor_name_list:                                               # Copy output data from device to host
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
                cudart.cudaMemcpy(self.buffer[name][0].ctypes.data, self.buffer[name][1], self.buffer[name][2], cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

        return self.buffer['output'][0]

    def get_detection_result(self, img):
        img_resize_init = self.letterbox(img, (640, 640), stride=32)[0]
        img_resize = img_resize_init[:, :, ::-1].transpose(2, 0, 1)
        img_resize = np.ascontiguousarray(img_resize)

        img_cuda = torch.from_numpy(img_resize).to(self.device)
        img_cuda = img_cuda.half()
        img_cuda /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img_cuda.ndimension() == 3:
            img_cuda = img_cuda.unsqueeze(0)

        img_numpy = img_cuda.cpu().numpy()
        pred = self.infer(img_numpy.astype(np.float16))
        # pred = self.session.run(None, {'input': img_numpy.astype(np.float16)})
        pred = torch.tensor(pred).to(self.device)

        pred = non_max_suppression(pred, 0.25, 0.45, classes=None, agnostic=None)

        img_raw, bbox_raw = None, []
        for i, det in enumerate(pred):
            if len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(img_numpy.shape[2:], det[:, :4], img.shape).round()
                x1, y1, x2, y2 = map(int, det[-1][:4])
                bbox_raw = [x1, y1, x2, y2]
                bbox_raw = enlarge_bbox(bbox_raw)
                x1, y1, x2, y2 = bbox_raw
                img_raw = img[y1:y2, x1:x2]
        return img_raw, bbox_raw

    def letterbox(self, img, new_shape=(640, 640), color=(114, 114, 114), auto=False, scaleFill=False, scaleup=True, stride=32):
        # Resize and pad image while meeting stride-multiple constraints
        shape = img.shape[:2]  # current shape [height, width]
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        # Scale ratio (new / old)
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        if not scaleup:  # only scale down, do not scale up (for better test mAP)
            r = min(r, 1.0)

        # Compute padding
        ratio = r, r  # width, height ratios
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
        if auto:  # minimum rectangle
            dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
        elif scaleFill:  # stretch
            dw, dh = 0.0, 0.0
            new_unpad = (new_shape[1], new_shape[0])
            ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios
        dw /= 2  # divide padding into 2 sides
        dh /= 2
        if shape[::-1] != new_unpad:  # resize
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
        top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
        left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
        return img, ratio, (dw, dh)
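A minimal usage sketch for the class above. The image path is a placeholder, and the timing loop is just one way to arrive at a per-image figure like the ones quoted earlier:

```python
import time
import cv2

detector = Detection()
img = cv2.imread("test.jpg")                      # placeholder path

crop, bbox = detector.get_detection_result(img)   # first call also serves as warm-up
print(bbox)

start = time.perf_counter()
for _ in range(100):
    detector.get_detection_result(img)
print((time.perf_counter() - start) / 100, "s per image")
```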


    

3. Other ways to convert an ONNX model to TensorRT

Using the onnx-tensorrt library

First make sure the onnx-tensorrt package is installed and configured in your environment; its backend interface can then be used to build an engine directly from the ONNX model (the exact attributes of the object returned by prepare() may vary between onnx-tensorrt versions):

```python
import tensorrt as trt
from onnx import ModelProto
from onnx_tensorrt.backend import prepare

# Load the ONNX model
model_path = "your_model.onnx"
with open(model_path, 'rb') as f:
    model_proto = ModelProto()
    model_proto.ParseFromString(f.read())

# Build the TensorRT engine
trt_engine = prepare(model_proto)

# Save the serialized engine
output_file = "your_model.trt"
with open(output_file, 'wb') as f:
    f.write(trt_engine.engine.serialize())
```

Using the trtexec command-line tool

A simpler way is the official trtexec command-line utility:

```bash
trtexec --onnx=your_model.onnx --saveEngine=your_model.trt --fp16
```

This reads the specified .onnx file and compiles it into a .trt engine optimized for the current hardware. Adding --fp16 enables half-precision support, which speeds up inference and reduces memory usage.

Data preprocessing notes

One issue worth noting in practice: images loaded with OpenCV or PIL are stored as integer pixel values (uint8), while most deep learning frameworks expect normalized floating-point input (float32). Convert the data before feeding it to the network, otherwise mismatched input can produce abnormal results such as NaN outputs:

```python
image = cv2.imread('input_image.jpg')
image = image.astype(np.float32) / 255.0  # normalize to 0.0 - 1.0
```