从pytorch到onnx到tensorrt 推理
简介
本文记录本人在Jetson NX板子上学习推理加速onnx、TensorRT模型的过程和资源。本人的TensorRT版本为8.4.1.5。
下载Jetson平台可用的包
https://elinux.org/Jetson_Zoo
onnx-gpu 安装
从zoo下载gpu包。
测试:
import onnxruntime as ort
import tensorrt
# Print the device onnxruntime reports ("GPU" when the GPU package is the one installed).
print(ort.get_device())
# Print the execution providers this onnxruntime installation can use.
print(ort.get_available_providers())
# Print the version of the TensorRT Python package that was imported.
print(tensorrt.__version__ )
It should look like:
Python 3.8.20 | packaged by conda-forge | (default, Sep 30 2024, 17:47:05)
[GCC 13.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import onnxruntime as ort
>>> import tensorrt
>>> print(ort.get_device())
GPU
>>> print(ort.get_available_providers())
['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
>>> print(tensorrt.__version__ )
8.4.1.5
Otherwise, you may need to run pip uninstall onnxruntime
(to remove the CPU version of onnxruntime), then reinstall the GPU version. If you run into problems with CUDA, please check this one
trt安装
板子上先查一下有没有
dpkg -l | grep TensorRT
有的话,假如是装在系统的python3.6里,使用以下命令把它软链接(不是复制)到虚拟环境的site-packages:
sudo ln -s /usr/lib/python3.6/dist-packages/tensorrt* /home/jetson/.local/lib/python3.8/site-packages
onnx optimization
(可选)可以简化一点点onnx模型结构:https://github.com/Howell-Yang/onnx2trt
Onnx 转 trt
一般的转换:
/usr/src/tensorrt/bin/trtexec --onnx=example.onnx --saveEngine=example.engine
fp16量化(量化需要注意量化和反量化,新手不建议量化):
/usr/src/tensorrt/bin/trtexec --onnx=example.onnx --fp16 --saveEngine=example.engine
--onnx代表输入的模型文件,--fp16代表使用半精度浮点类型数据,--saveEngine代表保存的推理引擎名称
执行结束后可以得到类似如下的性能结果报告:
[03/18/2025-17:49:58] [I] === Performance summary ===
[03/18/2025-17:49:58] [I] Throughput: 11.7634 qps
[03/18/2025-17:49:58] [I] Latency: min = 87.5422 ms, max = 94.814 ms, mean = 92.471 ms, median = 92.236 ms, percentile(99%) = 94.814 ms
[03/18/2025-17:49:58] [I] Enqueue Time: min = 1.50806 ms, max = 5.00293 ms, mean = 2.02811 ms, median = 1.89932 ms, percentile(99%) = 5.00293 ms
[03/18/2025-17:49:58] [I] H2D Latency: min = 0.205322 ms, max = 0.312988 ms, mean = 0.257786 ms, median = 0.258057 ms, percentile(99%) = 0.312988 ms
[03/18/2025-17:49:58] [I] GPU Compute Time: min = 82.1777 ms, max = 85.1783 ms, mean = 82.9545 ms, median = 82.6403 ms, percentile(99%) = 85.1783 ms
[03/18/2025-17:49:58] [I] D2H Latency: min = 5.04346 ms, max = 9.95288 ms, mean = 9.25878 ms, median = 9.35913 ms, percentile(99%) = 9.95288 ms
[03/18/2025-17:49:58] [I] Total Host Walltime: 3.23036 s
[03/18/2025-17:49:58] [I] Total GPU Compute Time: 3.15227 s
[03/18/2025-17:49:58] [W] * GPU compute time is unstable, with coefficient of variance = 1.0244%.
[03/18/2025-17:49:58] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability.
[03/18/2025-17:49:58] [I] Explanations of the performance metrics are printed in the verbose logs.
[03/18/2025-17:49:58] [I]
&&&& PASSED TensorRT.trtexec [TensorRT v8401] # /usr/src/tensorrt/bin/trtexec --onnx=pspnet_50_ade20k.onnx --saveEngine=pspnet_50_ade20k.engine
可变batch推理
我了个豆啊,可变batch推理可以参考 https://blog.csdn.net/sinat_39307513/article/details/114820347 ,以后有条件可以搞一波
推理测试
思路:务必按照以下次序来验证模型是否正常:
- pytorch 模型测试
- onnx gpu 模型测试
- tensorrt cuda 模型测试
而且上一阶段的代码基本上可以复用于下一阶段的代码,特别是预处理
onnx gpu 推理测试
最终模型的输出的shape是(1,150,640,480)
#!/usr/bin/env python3
import onnx
import numpy as np
import onnxruntime as ort
import cv2
from skimage.transform import resize
model_path = 'pspnet_640_480.onnx'
image_path = "test.png"

# Validate the ONNX graph before building an inference session from it.
onnx_model = onnx.load(model_path)
onnx.checker.check_model(onnx_model)

# Read the test image. cv2.imread does not raise on a missing or unreadable
# file, it returns None, so fail here with a clear message instead of letting
# the resize call below blow up on a None input.
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError("cannot read image: %s" % image_path)

# Resize to the input size used by the model (author's note: resize gives float64),
# then cast to float32 and subtract the per-channel dataset mean.
img = resize(img, (640, 480), mode='reflect', anti_aliasing=True, preserve_range=True)
img = img.astype(np.float32)
img -= np.array([104.00699, 116.66877, 122.67892])  # Mean value of dataset
# Convert HWC -> CHW
img = img.transpose(2, 0, 1)
# Shape becomes (1, C, H, W)
img = np.expand_dims(img, axis=0)

# Run the model through ONNX Runtime, requesting the CUDA execution provider.
sess = ort.InferenceSession(model_path, providers=['CUDAExecutionProvider'])
input_name = sess.get_inputs()[0].name
# Only the first output of the session is kept and printed.
outputs = sess.run(None, {input_name: img})[0]
print(outputs)
tensorrt cuda 推理测试
最终模型的输出的shape是(1,1,150,640,480)(比onnx的输出多了最前面的一维,因为下面的forward在binding shape前面又拼了一个batch_size)
#!/usr/bin/env python3
"""
Take in an image (rgb or rgb-d)
Use CNN to do semantic segmentation
Output a point cloud with semantic color registered
\author Xuan Zhang
\date May - July 2018
"""
import torch
import cv2
import tensorrt as trt
import numpy as np
from skimage.transform import resize
def trt_version():
    """Return the version string exposed by the imported TensorRT package."""
    version = trt.__version__
    return version
def torch_device_from_trt(device):
    """Map a TensorRT tensor location onto the matching torch device.

    Args:
        device: A ``trt.TensorLocation`` value.

    Returns:
        ``torch.device("cuda")`` for ``TensorLocation.DEVICE`` and
        ``torch.device("cpu")`` for ``TensorLocation.HOST``.

    Raises:
        TypeError: If ``device`` is neither of the two locations above.
    """
    if device == trt.TensorLocation.DEVICE:
        return torch.device("cuda")
    if device == trt.TensorLocation.HOST:
        return torch.device("cpu")
    # Raise (not return) the error so an unsupported location cannot be
    # mistaken for a device by the caller; this matches torch_dtype_from_trt.
    raise TypeError("%s is not supported by torch" % device)
def torch_dtype_from_trt(dtype):
    """Map a TensorRT data type onto the matching torch dtype.

    Args:
        dtype: A TensorRT data type (``trt.int8``, ``trt.bool``, ``trt.int32``,
            ``trt.float16`` or ``trt.float32``).

    Returns:
        The corresponding ``torch`` dtype.

    Raises:
        TypeError: If ``dtype`` has no mapping here.
    """
    if dtype == trt.int8:
        return torch.int8
    # Compare the major version as an integer: a plain string comparison
    # orders '10.x' before '7.0' and would wrongly reject bool on newer
    # releases. The version guard stays in front of ``trt.bool`` so the
    # attribute is only touched on versions where the check passes.
    major_version = int(trt_version().split(".")[0])
    if major_version >= 7 and dtype == trt.bool:
        return torch.bool
    if dtype == trt.int32:
        return torch.int32
    if dtype == trt.float16:
        return torch.float16
    if dtype == trt.float32:
        return torch.float32
    raise TypeError("%s is not supported by torch" % dtype)
class TRTModule(torch.nn.Module):
    """Wrap a deserialized TensorRT engine so it can be called like a torch module.

    Args:
        engine: Deserialized TensorRT engine, or None to build the module
            without an execution context.
        input_names: Binding names of the engine inputs.
        output_names: Binding names of the engine outputs.
    """

    def __init__(self, engine=None, input_names=None, output_names=None):
        super().__init__()
        self.engine = engine
        # Always define the attribute so a module created without an engine
        # still has a `context` (None) instead of lacking it entirely.
        self.context = None
        if self.engine is not None:
            # Create the execution context from the engine.
            self.context = self.engine.create_execution_context()
        self.input_names = input_names
        self.output_names = output_names

    def forward(self, *inputs):
        """Run the engine on ``inputs`` and return the output tensor(s).

        Args:
            *inputs: Torch tensors. Tensor ``i`` feeds ``input_names[i]``;
                when fewer tensors than input names are given, the first
                tensor is reused for the remaining inputs.

        Returns:
            A single tensor when the engine has one output name, otherwise a
            tuple of tensors in ``output_names`` order.
        """
        batch_size = inputs[0].shape[0]
        bindings = [None] * (len(self.input_names) + len(self.output_names))

        # Allocate one output tensor per output binding.
        outputs = [None] * len(self.output_names)
        for i, output_name in enumerate(self.output_names):
            # Look up the binding index from the binding name.
            idx = self.engine.get_binding_index(output_name)
            # Translate the binding's TensorRT dtype into a torch dtype.
            dtype = torch_dtype_from_trt(self.engine.get_binding_dtype(idx))
            print(dtype)
            # NOTE(review): batch_size is prepended to the binding shape. If
            # the binding shape already contains the batch axis, the result
            # carries one extra leading dimension. Left unchanged so callers
            # keep seeing the same output shape.
            shape = (batch_size,) + tuple(self.engine.get_binding_shape(idx))
            print(shape, i, output_name)
            device = torch_device_from_trt(self.engine.get_location(idx))
            print(device)
            output = torch.empty(size=shape, dtype=dtype, device=device)
            outputs[i] = output
            # Bind the output buffer's data pointer.
            bindings[idx] = output.data_ptr()

        # Hold references to the contiguous inputs until the call has been
        # enqueued: `.contiguous()` may return a fresh copy, and taking
        # `data_ptr()` of an unreferenced temporary leaves a pointer to
        # memory that can be released or reused right away.
        live_inputs = []
        for i, input_name in enumerate(self.input_names):
            idx = self.engine.get_binding_index(input_name)
            # Use the matching positional tensor; fall back to the first one
            # so a single image can still feed every declared input.
            source = inputs[i] if i < len(inputs) else inputs[0]
            tensor = source.contiguous()
            live_inputs.append(tensor)
            bindings[idx] = tensor.data_ptr()

        # Enqueue inference on torch's current CUDA stream.
        self.context.execute_async_v2(
            bindings=bindings,
            stream_handle=torch.cuda.current_stream().cuda_stream,
        )

        outputs = tuple(outputs)
        if len(outputs) == 1:
            outputs = outputs[0]
        return outputs
engine_path = "pspnet_640_480.engine"
image_path = "test.png"

logger = trt.Logger(trt.Logger.INFO)
with open(engine_path, "rb") as f, trt.Runtime(logger) as runtime:
    # Read the serialized engine file and deserialize it into an engine object.
    engine = runtime.deserialize_cuda_engine(f.read())
# Deserialization reports failure by returning None; stop here rather than
# failing later on the first attribute access.
if engine is None:
    raise RuntimeError("failed to deserialize TensorRT engine: %s" % engine_path)

# Print name, dtype and shape of every binding, and whether it is an input.
for idx in range(engine.num_bindings):
    is_input = engine.binding_is_input(idx)
    name = engine.get_binding_name(idx)
    op_type = engine.get_binding_dtype(idx)
    shape = engine.get_binding_shape(idx)
    print('input id:',idx,' is input: ', is_input,' binding name:', name, ' shape:', shape, 'type: ', op_type)

# Binding names are specific to this engine (they show up in the listing above).
trt_model = TRTModule(engine, ["input.1"], ["638"])

# cv2.imread returns None for a missing or unreadable file instead of raising.
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError("cannot read image: %s" % image_path)

# Same preprocessing as the ONNX test: resize (author's note: gives float64),
# cast to float32 and subtract the per-channel dataset mean.
img = resize(img, (640, 480), mode='reflect', anti_aliasing=True, preserve_range=True)
img = img.astype(np.float32)
img -= np.array([104.00699, 116.66877, 122.67892])  # Mean value of dataset
# Convert HWC -> CHW
img = img.transpose(2, 0, 1)
# Shape becomes (1, C, H, W)
img = np.expand_dims(img, axis=0)

# Move the input to the GPU, since the bindings are raw tensor data pointers.
img_input = torch.from_numpy(img)
img_input = img_input.to('cuda')

# Run the model.
result_trt = trt_model(img_input)
print(result_trt.shape)