I have recently started converting PyTorch (.pt) models to ONNX and then to TensorRT. Everything I could find online targets old TensorRT versions, so this post records the workflow with the new API.
The code is recorded first; explanations will follow later.
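Step zero, recorded here for completeness, is the pt-to-ONNX export. A minimal sketch of how this step can look (not the original code; the checkpoint layout, opset, and tensor names are assumptions to adapt to your model):

import torch

model = torch.load("best.pt", map_location="cpu")["model"].float().eval()  # hypothetical YOLOv7-style checkpoint that stores the model under "model"
dummy = torch.zeros(1, 3, 640, 640)
torch.onnx.export(
    model, dummy, "best640.onnx",
    opset_version=12,  # assumption: pick an opset your TensorRT version supports
    input_names=["input"],  # must match input_tensor_name in the script below
    output_names=["output"],
    # dynamic_axes={"input": {0: "batch"}},  # uncomment to export with a dynamic batch axis
)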
1. ONNX to TensorRT
from collections import OrderedDict # keep the order of the tensors implicitly
from pathlib import Path
import numpy as np
import tensorrt as trt
from cuda import cudart
import torch
import cupy as cp
import onnxruntime as ort
# yapf:disable
trt_file = Path("./best640.trt")
input_tensor_name = "input"
data = np.arange(3 * 640 * 640, dtype=np.float32).reshape(1, 3, 640, 640) # Inference input data
# data = np.load('input_data_coat.npy')
def run():
    logger = trt.Logger(trt.Logger.ERROR)  # Create Logger, available levels: VERBOSE, INFO, WARNING, ERROR, INTERNAL_ERROR
    if trt_file.exists():  # Load the engine from file and skip the build if it already exists
        with open(trt_file, "rb") as f:
            engine_bytes = f.read()
        if engine_bytes is None:
            print("Failed to get serialized engine")
            return
        print("Succeeded getting serialized engine")
    else:  # Build a serialized network from scratch
        builder = trt.Builder(logger)  # Create Builder
        config = builder.create_builder_config()  # Create BuilderConfig to set attributes of the network
        network = builder.create_network()  # Create Network (on TensorRT 8.x, pass 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()  # Create OptimizationProfile, needed for Dynamic-Shape mode
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # Set workspace for the build (all GPU memory is used by default)
        # Parse the ONNX model into the network
        onnx_file = Path("./best640.onnx")
        onnx_parser = trt.OnnxParser(network, logger)
        with open(onnx_file, "rb") as f:
            onnx_model = f.read()
        if not onnx_parser.parse(onnx_model):
            print("ERROR: Failed to parse ONNX model")
            for error in range(onnx_parser.num_errors):
                print(onnx_parser.get_error(error))
            return
        # For a true dynamic-shape engine, export the ONNX model with dynamic axes and give
        # set_shape three different min / opt / max shapes; here all three are identical (static)
        profile.set_shape(input_tensor_name, [1, 3, 640, 640], [1, 3, 640, 640], [1, 3, 640, 640])
        config.add_optimization_profile(profile)  # Add the OptimizationProfile to the BuilderConfig
        engine_bytes = builder.build_serialized_network(network, config)  # Build a serialized engine from the network
        if engine_bytes is None:
            print("Failed to build engine")
            return
        print("Succeeded building engine")
        with open(trt_file, "wb") as f:  # Save the serialized engine as a binary file
            f.write(engine_bytes)
        print(f"Succeeded saving engine ({trt_file})")
    engine = trt.Runtime(logger).deserialize_cuda_engine(engine_bytes)  # Create the inference engine
    if engine is None:
        print("Failed to get engine for inference")
        return
    print("Succeeded getting engine for inference")
    context = engine.create_execution_context()  # Create Execution Context from the engine (analogous to a CPU process, but on the GPU)
    tensor_name_list = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    context.set_input_shape(input_tensor_name, data.shape)  # Set the runtime shape of the input tensor (needed in Dynamic-Shape mode)
    for name in tensor_name_list:  # Print information of input / output tensors
        mode = engine.get_tensor_mode(name)
        data_type = engine.get_tensor_dtype(name)
        buildtime_shape = engine.get_tensor_shape(name)
        runtime_shape = context.get_tensor_shape(name)
        print(f"{'Input ' if mode == trt.TensorIOMode.INPUT else 'Output'}->{data_type}, {buildtime_shape}, {runtime_shape}, {name}")
    buffer = OrderedDict()  # Prepare the memory buffers on host and device
    for name in tensor_name_list:
        data_type = engine.get_tensor_dtype(name)
        runtime_shape = context.get_tensor_shape(name)
        n_byte = trt.volume(runtime_shape) * np.dtype(trt.nptype(data_type)).itemsize
        host_buffer = np.empty(runtime_shape, dtype=trt.nptype(data_type))
        device_buffer = cudart.cudaMalloc(n_byte)[1]
        buffer[name] = [host_buffer, device_buffer, n_byte]
    buffer[input_tensor_name][0] = np.ascontiguousarray(data)  # Set runtime data; it MUST be contiguous (np.ascontiguousarray), a lesson learned the hard way
    for name in tensor_name_list:
        context.set_tensor_address(name, buffer[name][1])  # Bind the device buffer address to the context
    for name in tensor_name_list:  # Copy input data from host to device
        if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
            cudart.cudaMemcpy(buffer[name][1], buffer[name][0].ctypes.data, buffer[name][2], cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_async_v3(0)  # Do inference (0 = default CUDA stream)
    for name in tensor_name_list:  # Copy output data from device to host
        if engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
            cudart.cudaMemcpy(buffer[name][0].ctypes.data, buffer[name][1], buffer[name][2], cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    # t1 = torch.tensor(cp.asnumpy(buffer['output'][0]), device='cuda')  # e.g. move an output to a torch CUDA tensor via cupy
    for name in tensor_name_list[0:2]:  # Print the first two I/O tensors
        print(name)
        print(buffer[name][0])
    for _, device_buffer, _ in buffer.values():  # Free the GPU memory buffers after all work
        cudart.cudaFree(device_buffer)

if __name__ == "__main__":
    # os.system("rm -rf *.trt")
    run()  # First call: build a TensorRT engine and do inference
    run()  # Second call: load the saved engine and do inference
    print("Finish")
2. Using the TensorRT engine
The following uses YOLOv7 as the example. One problem came up in practice: with the original letterbox arguments (auto=True, minimum-rectangle padding that keeps the aspect ratio), the image was resized to 448*640. I first tried to convert the TensorRT engine to a dynamic-shape version to handle this, without success. Setting auto=False instead pads the remainder with borders so every image comes out 640*640; with that, the static-shape TensorRT version works, and the box positions are almost unaffected (see the shape example at the end of this post). Note that scaleFill=True would instead stretch the image to 640*640 without keeping the aspect ratio.
For YOLOv7 detection, inference time dropped from 0.21 s with the .pt model to 0.018 s with ONNX and 0.01 s with TensorRT.
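For reference, numbers like these come from a plain wall-clock loop. A minimal sketch of such a measurement (the warmup and iteration counts are my own choices, not from the original):

import time

def benchmark(infer_fn, inp, warmup=10, iters=100):
    for _ in range(warmup):  # warm-up: the first calls include lazy initialization
        infer_fn(inp)
    t0 = time.perf_counter()
    for _ in range(iters):
        infer_fn(inp)
    print(f"mean latency: {(time.perf_counter() - t0) / iters * 1000:.2f} ms")

# e.g. benchmark(detector.infer, img_numpy.astype(np.float16)); Detection.infer below ends
# with a synchronous device-to-host copy, so wall-clock timing is valid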
# Depends on the imports above plus cv2; non_max_suppression and scale_coords come from
# YOLOv7's utils.general, and enlarge_bbox is a custom helper.
class Detection:

    def __init__(self):
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        trt_file = './best640.trt'
        logger = trt.Logger(trt.Logger.ERROR)
        with open(trt_file, "rb") as f:  # Load the serialized engine built in part 1
            engine_bytes = f.read()
        self.engine = trt.Runtime(logger).deserialize_cuda_engine(engine_bytes)  # Create the inference engine
        self.context = self.engine.create_execution_context()  # Create Execution Context from the engine
        self.tensor_name_list = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
        self.context.set_input_shape("input", (1, 3, 640, 640))  # Set the runtime shape of the input tensor
        # (the I/O tensor info printing from part 1 is omitted here)
        self.buffer = OrderedDict()  # Prepare the memory buffers on host and device
        for name in self.tensor_name_list:
            data_type = self.engine.get_tensor_dtype(name)
            runtime_shape = self.context.get_tensor_shape(name)
            n_byte = trt.volume(runtime_shape) * np.dtype(trt.nptype(data_type)).itemsize
            host_buffer = np.empty(runtime_shape, dtype=trt.nptype(data_type))
            device_buffer = cudart.cudaMalloc(n_byte)[1]
            self.buffer[name] = [host_buffer, device_buffer, n_byte]
    def infer(self, data):
        self.buffer["input"][0] = np.ascontiguousarray(data)  # Set runtime data; it MUST be contiguous, a lesson learned the hard way
        for name in self.tensor_name_list:
            self.context.set_tensor_address(name, self.buffer[name][1])  # Bind the device buffer address to the context
        for name in self.tensor_name_list:  # Copy input data from host to device
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                cudart.cudaMemcpy(self.buffer[name][1], self.buffer[name][0].ctypes.data, self.buffer[name][2], cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
        self.context.execute_async_v3(0)  # Do inference (0 = default CUDA stream)
        for name in self.tensor_name_list:  # Copy output data from device to host
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
                cudart.cudaMemcpy(self.buffer[name][0].ctypes.data, self.buffer[name][1], self.buffer[name][2], cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
        return self.buffer['output'][0]
    def get_detection_result(self, img):
        img_resize_init = self.letterbox(img, (640, 640), stride=32)[0]
        img_resize = img_resize_init[:, :, ::-1].transpose(2, 0, 1)  # BGR -> RGB, HWC -> CHW
        img_resize = np.ascontiguousarray(img_resize)
        img_cuda = torch.from_numpy(img_resize).to(self.device)
        img_cuda = img_cuda.half()  # the engine here was built from an FP16 ONNX model
        img_cuda /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img_cuda.ndimension() == 3:
            img_cuda = img_cuda.unsqueeze(0)
        img_numpy = img_cuda.cpu().numpy()
        pred = self.infer(img_numpy.astype(np.float16))
        # pred = self.session.run(None, {'input': img_numpy.astype(np.float16)})  # the equivalent onnxruntime call
        pred = torch.tensor(pred).to(self.device)
        pred = non_max_suppression(pred, 0.25, 0.45, classes=None, agnostic=None)  # from YOLOv7 utils.general
        img_raw, bbox_raw = None, []
        for i, det in enumerate(pred):
            if len(det):
                # Rescale boxes from img_size to the original image size
                det[:, :4] = scale_coords(img_numpy.shape[2:], det[:, :4], img.shape).round()
                x1, y1, x2, y2 = map(int, det[-1][:4])
                bbox_raw = [x1, y1, x2, y2]
                bbox_raw = enlarge_bbox(bbox_raw)  # custom helper (not shown)
                x1, y1, x2, y2 = bbox_raw
                img_raw = img[y1:y2, x1:x2]
        return img_raw, bbox_raw
    def letterbox(self, img, new_shape=(640, 640), color=(114, 114, 114), auto=False, scaleFill=False, scaleup=True, stride=32):
        # Resize and pad image while meeting stride-multiple constraints
        shape = img.shape[:2]  # current shape [height, width]
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)
        # Scale ratio (new / old)
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        if not scaleup:  # only scale down, do not scale up (for better test mAP)
            r = min(r, 1.0)
        # Compute padding
        ratio = r, r  # width, height ratios
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
        if auto:  # minimum rectangle
            dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
        elif scaleFill:  # stretch
            dw, dh = 0.0, 0.0
            new_unpad = (new_shape[1], new_shape[0])
            ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios
        dw /= 2  # divide padding into 2 sides
        dh /= 2
        if shape[::-1] != new_unpad:  # resize
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
        top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
        left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
        return img, ratio, (dw, dh)
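A minimal usage sketch of the class, plus the letterbox shape example promised above (the image path is a placeholder, and the 1080x1920 frame size is my own illustrative choice):

if __name__ == "__main__":
    detector = Detection()
    frame = cv2.imread("test.jpg")  # placeholder image path
    crop, bbox = detector.get_detection_result(frame)
    print(bbox)

    # The effect of the auto flag on the letterboxed shape, for a 1080x1920 input:
    dummy = np.zeros((1080, 1920, 3), dtype=np.uint8)
    print(detector.letterbox(dummy, (640, 640), auto=True)[0].shape)   # (384, 640, 3): minimum-rectangle padding
    print(detector.letterbox(dummy, (640, 640), auto=False)[0].shape)  # (640, 640, 3): full letterbox padding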