Dynamic-input inference with ONNX Runtime and TensorRT

Dynamic-input inference with ONNX Runtime

ONNX Runtime dynamic-input inference with LeNet

Export an ONNX model like the following:
[figure: Netron view of the exported LeNet model]
As shown, the model's input batch dimension is dynamic.
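
A dynamic batch axis like this is produced at export time. For reference, a minimal PyTorch export sketch (assuming a trained LeNet instance named model; the file and axis names are illustrative):

import torch

model.eval()  # hypothetical trained LeNet instance
dummy = torch.randn(1, 1, 28, 28)
torch.onnx.export(model, dummy, "lenet.onnx",
                  input_names=["input"], output_names=["output"],
                  dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}})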

ONNX dynamic-input inference (Python):

import cv2
import numpy as np
import onnxruntime
from pathlib import Path


onnx_session = onnxruntime.InferenceSession("lenet.onnx", providers=['CPUExecutionProvider'])

image_files = Path('./images').glob('*.jpg')
input_tensor = []
for image_file in image_files:
    image = cv2.imread(str(image_file), -1)  # IMREAD_UNCHANGED keeps the single grayscale channel
    blob = cv2.dnn.blobFromImage(image, 1/255., size=(28,28))  # 1x1x28x28 blob, scaled to [0,1]
    input_tensor.append(np.squeeze(blob, axis=0))  # drop the blob's batch axis; the list is stacked later

input_name = []
for node in onnx_session.get_inputs():
    input_name.append(node.name)

output_name = []
for node in onnx_session.get_outputs():
    output_name.append(node.name)

inputs = {}
for name in input_name:
    inputs[name] = np.array(input_tensor)  # stack to (N, 1, 28, 28); N is the dynamic batch

outputs = onnx_session.run(None, inputs)[0]
print(np.argmax(outputs, axis=1))
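
A quick way to confirm the batch axis is dynamic is to print the input metadata; ONNX Runtime reports a symbolic dimension by name (or as None), e.g.:

for node in onnx_session.get_inputs():
    print(node.name, node.shape)  # e.g. input ['batch', 1, 28, 28]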

ONNX dynamic-input inference (C++):

#include <iostream>
#include <opencv2/opencv.hpp>
#include <onnxruntime_cxx_api.h>


int main(int argc, char* argv[])
{
	Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "lenet");
	Ort::SessionOptions session_options;
	session_options.SetIntraOpNumThreads(1);
	session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);

	const wchar_t* model_path = L"lenet.onnx";
	Ort::Session session(env, model_path, session_options);
	Ort::AllocatorWithDefaultOptions allocator;

	std::vector<const char*>  input_node_names;
	for (size_t i = 0; i < session.GetInputCount(); i++)
	{
		input_node_names.push_back(session.GetInputName(i, allocator));
	}

	std::vector<const char*> output_node_names;
	for (size_t i = 0; i < session.GetOutputCount(); i++)
	{
		output_node_names.push_back(session.GetOutputName(i, allocator));
	}

	std::vector<std::string> filenames;
	std::vector<cv::Mat> images;
	cv::glob("./images/*.jpg", filenames);
	for (const auto& filename : filenames)
	{
		cv::Mat image = cv::imread(filename, -1);
		images.push_back(image);
	}

	const size_t input_tensor_size = images.size() * 1 * 28 * 28;
	std::vector<float> input_tensor_values(input_tensor_size);
	for (size_t n = 0; n < images.size(); n++)
	{
		images[n].convertTo(images[n], CV_32F, 1.0 / 255); // scale to [0,1]; assumes 28x28 single-channel images
		for (int i = 0; i < 28; i++)
		{
			for (int j = 0; j < 28; j++)
			{
				input_tensor_values[n* 28 * 28 + i * 28 + j] = images[n].at<float>(i, j);
			}
		}
	}
	
	std::vector<int64_t> input_node_dims = { int(images.size()), 1, 28, 28 };
	auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
	Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), input_node_dims.size());

	std::vector<Ort::Value> inputs;
	inputs.push_back(std::move(input_tensor));

	std::vector<Ort::Value> outputs = session.Run(Ort::RunOptions{ nullptr }, input_node_names.data(), inputs.data(), input_node_names.size(), output_node_names.data(), output_node_names.size());

	const float* rawOutput = outputs[0].GetTensorData<float>();
	std::vector<int64_t> outputShape = outputs[0].GetTensorTypeAndShapeInfo().GetShape();
	size_t count = outputs[0].GetTensorTypeAndShapeInfo().GetElementCount();
	std::vector<float> preds(rawOutput, rawOutput + count);

	for (size_t i = 0; i < images.size(); i++)
	{
		// argmax over the 10 class logits of sample i
		std::cout << std::max_element(preds.begin() + 10 * i, preds.begin() + 10 * (i + 1)) - preds.begin() - 10 * i << std::endl;
	}

	return 0;
}

ONNX Runtime dynamic-input inference with YOLOv5

Download the YOLOv5 v7.0 code from https://github.com/ultralytics/yolov5/releases/tag/v7.0 and export the model with:

python export.py --weights yolov5n.pt --include onnx --dynamic

This exports the following ONNX model:
[figure: Netron view of the exported YOLOv5n model]
As shown, the model's input batch dimension as well as its height and width are dynamic.
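
The dynamic axes can also be read straight from the graph with the onnx package (a small sketch; the symbolic dimension names depend on how the model was exported):

import onnx

model = onnx.load('yolov5n_dynamic.onnx')
for inp in model.graph.input:
    dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)  # e.g. images ['batch', 3, 'height', 'width']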

ONNX dynamic-input inference (Python):

import cv2
import numpy as np
import onnxruntime
from pathlib import Path


class_names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
        'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
        'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
        'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
        'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
        'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
        'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
        'hair drier', 'toothbrush']  # the 80 COCO class names
input_shape = (640, 640) 
score_threshold = 0.2  
nms_threshold = 0.5
confidence_threshold = 0.2   


def nms(boxes, scores, score_threshold, nms_threshold):
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    areas = (y2 - y1 + 1) * (x2 - x1 + 1)
    keep = []
    index = scores.argsort()[::-1] 

    while index.size > 0:
        i = index[0]
        keep.append(i)
        x11 = np.maximum(x1[i], x1[index[1:]]) 
        y11 = np.maximum(y1[i], y1[index[1:]])
        x22 = np.minimum(x2[i], x2[index[1:]])
        y22 = np.minimum(y2[i], y2[index[1:]])
        w = np.maximum(0, x22 - x11 + 1)                              
        h = np.maximum(0, y22 - y11 + 1) 
        overlaps = w * h
        ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
        idx = np.where(ious <= nms_threshold)[0]
        index = index[idx + 1]
    return keep


def xywh2xyxy(x):
    y = np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2
    y[:, 1] = x[:, 1] - x[:, 3] / 2
    y[:, 2] = x[:, 0] + x[:, 2] / 2
    y[:, 3] = x[:, 1] + x[:, 3] / 2
    return y


def filter_box(outputs):  # filter out low-confidence boxes
    outputs = np.squeeze(outputs)
    outputs = outputs[outputs[..., 4] > confidence_threshold]
    classes_scores = outputs[..., 5:]
     
    boxes = []
    scores = []
    class_ids = []
    for i in range(len(classes_scores)):
        class_id = np.argmax(classes_scores[i])
        outputs[i][4] *= classes_scores[i][class_id]
        outputs[i][5] = class_id
        if outputs[i][4] > score_threshold:
            boxes.append(outputs[i][:6])
            scores.append(outputs[i][4])
            class_ids.append(outputs[i][5])
            
    boxes = np.array(boxes)
    boxes = xywh2xyxy(boxes)
    scores = np.array(scores)
    indices = nms(boxes, scores, score_threshold, nms_threshold) 
    output = boxes[indices]
    return output


def letterbox(im, new_shape=(416, 416), color=(114, 114, 114)):
    # Resize and pad image while meeting stride-multiple constraints
    shape = im.shape[:2]  # current shape [height, width]

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    
    # Compute padding
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))    
    dw, dh = (new_shape[1] - new_unpad[0])/2, (new_shape[0] - new_unpad[1])/2  # wh padding 
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    
    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return im


def scale_boxes(boxes, shape):
    # Rescale boxes (xyxy) from input_shape to shape
    gain = min(input_shape[0] / shape[0], input_shape[1] / shape[1])  # gain  = old / new
    pad = (input_shape[1] - shape[1] * gain) / 2, (input_shape[0] - shape[0] * gain) / 2  # wh padding
    boxes[..., [0, 2]] -= pad[0]  # x padding
    boxes[..., [1, 3]] -= pad[1]  # y padding
    boxes[..., :4] /= gain
    boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1])  # x1, x2
    boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0])  # y1, y2
    return boxes


def draw(image, box_data):
    box_data = scale_boxes(box_data, image.shape)
    boxes = box_data[...,:4].astype(np.int32) 
    scores = box_data[...,4]
    classes = box_data[...,5].astype(np.int32)
   
    for box, score, cl in zip(boxes, scores, classes):
        x1, y1, x2, y2 = box
        cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 1)
        cv2.putText(image, '{0} {1:.2f}'.format(class_names[cl], score), (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 1)


if __name__=="__main__":
    image_files = Path('./images').glob('*.jpg')
    images = []
    input_tensor = []
    for image_file in image_files:
        image = cv2.imread(str(image_file), -1)
        images.append(image)
        input = letterbox(image, input_shape)
        input = input[:, :, ::-1].transpose(2, 0, 1).astype(dtype=np.float32)  # BGR to RGB and HWC to CHW
        input = input / 255.0
        input_tensor.append(input)
    
    onnx_session = onnxruntime.InferenceSession('yolov5n_dynamic.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])  # providers are tried in listed order
        
    input_name = []
    for node in onnx_session.get_inputs():
        input_name.append(node.name)

    output_name = []
    for node in onnx_session.get_outputs():
        output_name.append(node.name)

    inputs = {}
    for name in input_name:
        inputs[name] = np.array(input_tensor)  # stack letterboxed images to (N, 3, 640, 640)
        
    outputs = onnx_session.run(None, inputs)[0]
    
    for i in range(outputs.shape[0]):
        boxes = filter_box(outputs[i])
        draw(images[i], boxes)
        cv2.imwrite('result'+str(i)+'.jpg', images[i])

ONNX dynamic-input inference (C++):

#include <iostream>
#include <opencv2/opencv.hpp>
#include <onnxruntime_cxx_api.h>


const std::vector<std::string> class_names = {
	"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
	"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
	"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
	"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
	"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
	"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
	"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
	"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
	"hair drier", "toothbrush" };			//类别名称

const int input_width = 640;
const int input_height = 640;
const float score_threshold = 0.2;
const float nms_threshold = 0.5;
const float confidence_threshold = 0.2;
const int num_classes = class_names.size();
const int output_numprob = 5 + num_classes;
const int output_numbox = 3 * (input_width / 8 * input_height / 8 + input_width / 16 * input_height / 16 + input_width / 32 * input_height / 32);
std::vector<std::string> filenames;


// LetterBox: resize keeping aspect ratio, then pad
void LetterBox(const cv::Mat & image, cv::Mat & outImage,
	const cv::Size & newShape = cv::Size(640, 640), const cv::Scalar & color = cv::Scalar(114, 114, 114))
{
	cv::Size shape = image.size();
	float r = std::min((float)newShape.height / (float)shape.height, (float)newShape.width / (float)shape.width);
	float ratio[2]{ r, r };
	int new_un_pad[2] = { (int)std::round((float)shape.width * r),(int)std::round((float)shape.height * r) };

	auto dw = (float)(newShape.width - new_un_pad[0]) / 2;
	auto dh = (float)(newShape.height - new_un_pad[1]) / 2;

	if (shape.width != new_un_pad[0] || shape.height != new_un_pad[1])
		cv::resize(image, outImage, cv::Size(new_un_pad[0], new_un_pad[1]));
	else
		outImage = image.clone();

	int top = int(std::round(dh - 0.1f));
	int bottom = int(std::round(dh + 0.1f));
	int left = int(std::round(dw - 0.1f));
	int right = int(std::round(dw + 0.1f));

	cv::Vec4d params;
	params[0] = ratio[0];
	params[1] = ratio[1];
	params[2] = left;
	params[3] = top;

	cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color);
}


// preprocessing
void pre_process(std::vector<cv::Mat> & images, std::vector<float> & inputs)
{
	cv::Vec4d params;
	cv::Mat letterbox;
	std::vector<cv::Mat> split_images;

	for (size_t i = 0; i < images.size(); i++)
	{
		LetterBox(images[i], letterbox, cv::Size(input_width, input_height));
		cv::cvtColor(letterbox, letterbox, cv::COLOR_BGR2RGB);
		letterbox.convertTo(letterbox, CV_32FC3, 1.0f / 255.0f);
		cv::split(letterbox, split_images);
		for (int c = 0; c < letterbox.channels(); ++c)
		{
			std::vector<float> split_image_data = split_images[c].reshape(1, 1);
			inputs.insert(inputs.end(), split_image_data.begin(), split_image_data.end());
		}
	}
}


// network inference
void process(const wchar_t* model, std::vector<float> & inputs, std::vector<Ort::Value> & outputs)
{
	Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "yolov5n");
	Ort::SessionOptions session_options;
	session_options.SetIntraOpNumThreads(12);//set the number of intra-op threads
	session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);//enable graph optimizations

	//CUDA option set
	OrtCUDAProviderOptions cuda_option;
	cuda_option.device_id = 0;
	cuda_option.arena_extend_strategy = 0;
	cuda_option.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchExhaustive;
	cuda_option.gpu_mem_limit = SIZE_MAX;
	cuda_option.do_copy_in_default_stream = 1;
	session_options.AppendExecutionProvider_CUDA(cuda_option);
	Ort::Session session(env, model, session_options);
	Ort::AllocatorWithDefaultOptions allocator;//default allocator; handles allocation and deallocation automatically

	std::vector<const char*>  input_node_names;
	for (size_t i = 0; i < session.GetInputCount(); i++)
	{
		input_node_names.push_back("images");
	}

	std::vector<const char*> output_node_names;
	for (size_t i = 0; i < session.GetOutputCount(); i++)
	{
		output_node_names.push_back("output0");
	}

	// create input tensor object from data values
	std::vector<int64_t> input_node_dims = { (int)filenames.size(), 3, input_height, input_width };//NCHW
	auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
	Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, inputs.data(), inputs.size(), input_node_dims.data(), input_node_dims.size());

	std::vector<Ort::Value> ort_inputs;
	ort_inputs.push_back(std::move(input_tensor));//move to avoid an extra copy

	// score model & input tensor, get back output tensor	
	outputs = session.Run(Ort::RunOptions{ nullptr }, input_node_names.data(), ort_inputs.data(), input_node_names.size(), output_node_names.data(), output_node_names.size());

	// session and env are cleaned up automatically by their destructors
}


//NMS
void nms(std::vector<cv::Rect> & boxes, std::vector<float> & scores, float score_threshold, float nms_threshold, std::vector<int> & indices)
{
	assert(boxes.size() == scores.size());

	struct BoxScore
	{
		cv::Rect box;
		float score;
		int id;
	};
	std::vector<BoxScore> boxes_scores;
	for (size_t i = 0; i < boxes.size(); i++)
	{
		BoxScore box_conf;
		box_conf.box = boxes[i];
		box_conf.score = scores[i];
		box_conf.id = i;
		if (scores[i] > score_threshold)	boxes_scores.push_back(box_conf);
	}

	std::sort(boxes_scores.begin(), boxes_scores.end(), [](BoxScore a, BoxScore b) { return a.score > b.score; });

	std::vector<float> area(boxes_scores.size());
	for (size_t i = 0; i < boxes_scores.size(); ++i)
	{
		area[i] = boxes_scores[i].box.width * boxes_scores[i].box.height;
	}

	std::vector<bool> isSuppressed(boxes_scores.size(), false);
	for (size_t i = 0; i < boxes_scores.size(); ++i)
	{
		if (isSuppressed[i])  continue;
		for (size_t j = i + 1; j < boxes_scores.size(); ++j)
		{
			if (isSuppressed[j])  continue;

			float x1 = (std::max)(boxes_scores[i].box.x, boxes_scores[j].box.x);
			float y1 = (std::max)(boxes_scores[i].box.y, boxes_scores[j].box.y);
			float x2 = (std::min)(boxes_scores[i].box.x + boxes_scores[i].box.width, boxes_scores[j].box.x + boxes_scores[j].box.width);
			float y2 = (std::min)(boxes_scores[i].box.y + boxes_scores[i].box.height, boxes_scores[j].box.y + boxes_scores[j].box.height);
			float w = (std::max)(0.0f, x2 - x1);
			float h = (std::max)(0.0f, y2 - y1);
			float inter = w * h;
			float ovr = inter / (area[i] + area[j] - inter);

			if (ovr >= nms_threshold)  isSuppressed[j] = true;
		}
	}

	for (int i = 0; i < boxes_scores.size(); ++i)
	{
		if (!isSuppressed[i])	indices.push_back(boxes_scores[i].id);
	}
}


// rescale box to the original image size
void scale_boxes(cv::Rect & box, cv::Size size)
{
	float gain = std::min(input_width * 1.0 / size.width, input_height * 1.0 / size.height);
	int pad_w = (input_width - size.width * gain) / 2;
	int pad_h = (input_height - size.height * gain) / 2;
	box.x -= pad_w;
	box.y -= pad_h;
	box.x /= gain;
	box.y /= gain;
	box.width /= gain;
	box.height /= gain;
}


// draw one detection
void draw_result(cv::Mat & image, std::string label, cv::Rect box)
{
	cv::rectangle(image, box, cv::Scalar(255, 0, 0), 1);
	cv::putText(image, label, cv::Point(box.x, box.y), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 0, 255), 1);
}


// post-processing
void post_process(std::vector<cv::Mat> & images, std::vector<cv::Mat> & results, std::vector<Ort::Value> & outputs)
{
	for (int n = 0; n < images.size(); ++n)
	{
		std::vector<cv::Rect> boxes;
		std::vector<float> scores;
		std::vector<int> class_ids;

		for (int i = 0; i < output_numbox; ++i)
		{
			float* ptr = const_cast<float*> (outputs[0].GetTensorData<float>()) + i * output_numprob + n * output_numbox * output_numprob;
			float objness = ptr[4];
			if (objness < confidence_threshold)
				continue;

			float* classes_scores = ptr + 5;
			int class_id = std::max_element(classes_scores, classes_scores + num_classes) - classes_scores;
			float max_class_score = classes_scores[class_id];
			float score = max_class_score * objness;
			if (score < score_threshold)
				continue;

			float x = ptr[0];
			float y = ptr[1];
			float w = ptr[2];
			float h = ptr[3];
			int left = int(x - 0.5 * w);
			int top = int(y - 0.5 * h);
			int width = int(w);
			int height = int(h);
			cv::Rect box = cv::Rect(left, top, width, height);
			scale_boxes(box, images[n].size());
			boxes.push_back(box);
			scores.push_back(score);
			class_ids.push_back(class_id);
		}

		std::vector<int> indices;
		nms(boxes, scores, score_threshold, nms_threshold, indices);
		for (int i = 0; i < indices.size(); i++)
		{
			int idx = indices[i];
			cv::Rect box = boxes[idx];
			std::string label = class_names[class_ids[idx]] + ":" + cv::format("%.2f", scores[idx]); // "name:score" label
			draw_result(results[n], label, box);
		}
	}
}


int main(int argc, char* argv[])
{
	std::vector<cv::Mat> images;
	cv::glob("./images/*.jpg", filenames);
	for (const auto& filename : filenames)
	{
		cv::Mat image = cv::imread(filename, -1);
		images.push_back(image);
	}
	std::vector<cv::Mat> results = images;

	std::vector<float> inputs;
	pre_process(images, inputs);

	const wchar_t* model = L"yolov5n_dynamic.onnx";
	std::vector<Ort::Value> outputs;
	process(model, inputs, outputs);

	post_process(images, results, outputs);

	for (size_t i = 0; i < results.size(); i++)
	{
		cv::imwrite("result" + std::to_string(i) + ".jpg", results[i]);
	}

	return 0;
}

Dynamic-input inference with TensorRT

TensorRT dynamic-input inference with LeNet

Convert the model with the trtexec tool:

 ./trtexec.exe --onnx=lenet.onnx --saveEngine=lenet.engine --minShapes=input:1x1x28x28 --optShapes=input:2x1x28x28 --maxShapes=input:8x1x28x28
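
The min/opt/max shapes baked into the engine can be verified from Python (a sketch using the TensorRT 8.x binding-index API; newer releases replace get_profile_shape with get_tensor_profile_shape):

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
with open("lenet.engine", "rb") as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
    print(engine.get_profile_shape(0, 0))  # [min, opt, max], e.g. [(1,1,28,28), (2,1,28,28), (8,1,28,28)]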

TensorRT dynamic-input inference (Python):

import cv2
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
from pathlib import Path


logger = trt.Logger(trt.Logger.WARNING)
with open("lenet.engine", "rb") as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

image_files = Path('./images').glob('*.jpg')
input_tensor = []
for image_file in image_files:
    image = cv2.imread(str(image_file), -1)
    blob = cv2.dnn.blobFromImage(image, 1/255., size=(28,28))
    input_tensor.append(np.squeeze(blob, axis=0))
    
h_input = cuda.pagelocked_empty(trt.volume((len(input_tensor),1,28,28)), dtype=np.float32)  # page-locked host input buffer
h_output = cuda.pagelocked_empty(trt.volume((len(input_tensor),10)), dtype=np.float32)  # page-locked host output buffer
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
stream = cuda.Stream()
np.copyto(h_input, np.array(input_tensor).ravel())

with engine.create_execution_context() as context:
    context.set_input_shape("input", (len(input_tensor),1,28,28))
    cuda.memcpy_htod_async(d_input, h_input, stream)
    context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()
    pred = np.argmax(h_output.reshape(len(input_tensor),10), axis=1)
    print(pred)
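
Note that once set_input_shape has been called, the output binding's shape resolves to concrete values and can be queried instead of being hardcoded (a sketch with the TensorRT 8.x binding API; binding index 1 is assumed to be the output):

context.set_input_shape("input", (len(input_tensor), 1, 28, 28))
print(context.get_binding_shape(1))  # resolves to (len(input_tensor), 10)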

TensorRT dynamic-input inference (C++):

#include <iostream>
#include <fstream>
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <NvOnnxParser.h>


inline const char* severity_string(nvinfer1::ILogger::Severity t)
{
	switch (t)
	{
	case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: return "internal_error";
	case nvinfer1::ILogger::Severity::kERROR:   return "error";
	case nvinfer1::ILogger::Severity::kWARNING: return "warning";
	case nvinfer1::ILogger::Severity::kINFO:    return "info";
	case nvinfer1::ILogger::Severity::kVERBOSE: return "verbose";
	default: return "unknow";
	}
}


class TRTLogger : public nvinfer1::ILogger
{
public:
	virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override
	{
		if (severity <= Severity::kINFO)
		{
			if (severity == Severity::kWARNING)
				printf("\033[33m%s: %s\033[0m\n", severity_string(severity), msg);
			else if (severity <= Severity::kERROR)
				printf("\033[31m%s: %s\033[0m\n", severity_string(severity), msg);
			else
				printf("%s: %s\n", severity_string(severity), msg);
		}
	}
} logger;


std::vector<unsigned char> load_file(const std::string & file)
{
	std::ifstream in(file, std::ios::in | std::ios::binary);
	if (!in.is_open())
		return {};

	in.seekg(0, std::ios::end);
	size_t length = in.tellg();

	std::vector<uint8_t> data;
	if (length > 0)
	{
		in.seekg(0, std::ios::beg);
		data.resize(length);
		in.read((char*)& data[0], length);
	}
	in.close();
	return data;
}


int main()
{
	std::vector<std::string> filenames;
	std::vector<cv::Mat> images;
	cv::glob("./images/*.jpg", filenames);
	for (const auto& filename : filenames)
	{
		cv::Mat image = cv::imread(filename, -1);
		images.push_back(image);
	}

	TRTLogger logger;
	auto engine_data = load_file("lenet.engine");
	nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
	nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
	if (engine == nullptr)
	{
		printf("Deserialize cuda engine failed.\n");
		runtime->destroy();
		return -1;
	}

	nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
	execution_context->setBindingDimensions(0, nvinfer1::Dims4{ (int)images.size(), 1, 28, 28 }); // bind the actual batch size
	cudaStream_t stream = nullptr;
	cudaStreamCreate(&stream);

	int input_numel = images.size() * 1 * 28 * 28;
	int output_numel = images.size() * 10;
	float* input_data_host = nullptr;
	cudaMallocHost(&input_data_host, input_numel * sizeof(float));
	for (size_t n = 0; n < images.size(); n++)
	{
		images[n].convertTo(images[n], CV_32FC1, 1.0f / 255.0f);
		float* pimage = (float*)images[n].data;
		for (int i = 0; i < 28 * 28; i++)
		{
			input_data_host[n * 28 * 28 + i] = pimage[i];
		}
	}

	float* input_data_device = nullptr;
	float* output_data_host = new float[output_numel];
	float* output_data_device = nullptr;
	cudaMalloc(&input_data_device, input_numel * sizeof(float));
	cudaMalloc(&output_data_device, output_numel * sizeof(float));
	cudaMemcpyAsync(input_data_device, input_data_host, input_numel * sizeof(float), cudaMemcpyHostToDevice, stream);
	float* bindings[] = { input_data_device, output_data_device };

	bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);
	if (!success)
		printf("enqueueV2 failed.\n");
	cudaMemcpyAsync(output_data_host, output_data_device, output_numel * sizeof(float), cudaMemcpyDeviceToHost, stream);
	cudaStreamSynchronize(stream);

	for (size_t i = 0; i < images.size(); i++)
	{
		// argmax over the 10 class logits of sample i
		std::cout << std::max_element(output_data_host + 10 * i, output_data_host + 10 * (i + 1)) - output_data_host - 10 * i << std::endl;
	}

	cudaStreamDestroy(stream);
	cudaFreeHost(input_data_host);
	cudaFree(input_data_device);
	cudaFree(output_data_device);
	delete[] output_data_host;
	execution_context->destroy();
	engine->destroy();
	runtime->destroy();

	return 0;
}

TensorRT dynamic-input inference with YOLOv5

Convert the model with the trtexec tool:

trtexec.exe --onnx=yolov5n.onnx --saveEngine=yolov5n.engine --minShapes=images:1x3x480x480 --optShapes=images:1x3x640x640 --maxShapes=images:8x3x960x960
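
Keep in mind that the 25200x85 output shape hardcoded below only holds for 640x640 input; at other resolutions the box count follows the same stride-8/16/32 grid formula used in the C++ code. A small helper (assuming the input sides are multiples of 32):

def num_boxes(h, w):
    # 3 anchors per grid cell at strides 8, 16 and 32
    return 3 * sum((h // s) * (w // s) for s in (8, 16, 32))

assert num_boxes(640, 640) == 25200  # matches the buffers allocated below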

TensorRT dynamic-input inference (Python):

import cv2
import numpy as np
import tensorrt as trt
import pycuda.autoinit 
import pycuda.driver as cuda  
from pathlib import Path


class_names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
        'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
        'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
        'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
        'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
        'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
        'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
        'hair drier', 'toothbrush']  # the 80 COCO class names
input_shape = (640, 640) 
score_threshold = 0.2  
nms_threshold = 0.5
confidence_threshold = 0.2   


def nms(boxes, scores, score_threshold, nms_threshold):
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    areas = (y2 - y1 + 1) * (x2 - x1 + 1)
    keep = []
    index = scores.argsort()[::-1] 

    while index.size > 0:
        i = index[0]
        keep.append(i)
        x11 = np.maximum(x1[i], x1[index[1:]]) 
        y11 = np.maximum(y1[i], y1[index[1:]])
        x22 = np.minimum(x2[i], x2[index[1:]])
        y22 = np.minimum(y2[i], y2[index[1:]])
        w = np.maximum(0, x22 - x11 + 1)                              
        h = np.maximum(0, y22 - y11 + 1) 
        overlaps = w * h
        ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
        idx = np.where(ious <= nms_threshold)[0]
        index = index[idx + 1]
    return keep


def xywh2xyxy(x):
    y = np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2
    y[:, 1] = x[:, 1] - x[:, 3] / 2
    y[:, 2] = x[:, 0] + x[:, 2] / 2
    y[:, 3] = x[:, 1] + x[:, 3] / 2
    return y


def filter_box(outputs):  # filter out low-confidence boxes
    outputs = np.squeeze(outputs).astype(dtype=np.float32)
    outputs = outputs[outputs[..., 4] > confidence_threshold]
    classes_scores = outputs[..., 5:]
     
    boxes = []
    scores = []
    class_ids = []
    for i in range(len(classes_scores)):
        class_id = np.argmax(classes_scores[i])
        outputs[i][4] *= classes_scores[i][class_id]
        outputs[i][5] = class_id
        if outputs[i][4] > score_threshold:
            boxes.append(outputs[i][:6])
            scores.append(outputs[i][4])
            class_ids.append(outputs[i][5])
            
    boxes = np.array(boxes)
    boxes = xywh2xyxy(boxes)
    scores = np.array(scores)
    indices = nms(boxes, scores, score_threshold, nms_threshold) 
    output = boxes[indices]
    return output


def letterbox(im, new_shape=(416, 416), color=(114, 114, 114)):
    # Resize and pad image while meeting stride-multiple constraints
    shape = im.shape[:2]  # current shape [height, width]

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    
    # Compute padding
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))    
    dw, dh = (new_shape[1] - new_unpad[0])/2, (new_shape[0] - new_unpad[1])/2  # wh padding 
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    
    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return im


def scale_boxes(input_shape, boxes, shape):
    # Rescale boxes (xyxy) from input_shape to shape
    gain = min(input_shape[0] / shape[0], input_shape[1] / shape[1])  # gain  = old / new
    pad = (input_shape[1] - shape[1] * gain) / 2, (input_shape[0] - shape[0] * gain) / 2  # wh padding
    boxes[..., [0, 2]] -= pad[0]  # x padding
    boxes[..., [1, 3]] -= pad[1]  # y padding
    boxes[..., :4] /= gain
    boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1])  # x1, x2
    boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0])  # y1, y2
    return boxes


def draw(image, box_data):
    box_data = scale_boxes(input_shape, box_data, image.shape)
    boxes = box_data[...,:4].astype(np.int32) 
    scores = box_data[...,4]
    classes = box_data[...,5].astype(np.int32)
   
    for box, score, cl in zip(boxes, scores, classes):
        x1, y1, x2, y2 = box
        cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 1)
        cv2.putText(image, '{0} {1:.2f}'.format(class_names[cl], score), (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 1)


if __name__=="__main__":
    logger = trt.Logger(trt.Logger.WARNING)
    with open("yolov5n.engine", "rb") as f, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    image_files = Path('./images').glob('*.jpg')
    images = []
    input_tensor = []
    for image_file in image_files:
        image = cv2.imread(str(image_file), -1)
        images.append(image)
        input = letterbox(image, input_shape)
        input = input[:, :, ::-1].transpose(2, 0, 1).astype(dtype=np.float32)  # BGR to RGB and HWC to CHW
        input = input / 255.0
        input_tensor.append(input)

    inputs_host = cuda.pagelocked_empty(trt.volume((len(input_tensor),3,640,640)), dtype=np.float32)
    outputs_host = cuda.pagelocked_empty(trt.volume((len(input_tensor),25200,85)), dtype=np.float32)
    inputs_device = cuda.mem_alloc(inputs_host.nbytes)
    outputs_device = cuda.mem_alloc(outputs_host.nbytes)
    stream = cuda.Stream()
    np.copyto(inputs_host, np.array(input_tensor).ravel())

    with engine.create_execution_context() as context:
        context.set_input_shape("images", (len(input_tensor),3,640,640))
        cuda.memcpy_htod_async(inputs_device, inputs_host, stream)
        context.execute_async_v2(bindings=[int(inputs_device), int(outputs_device)], stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(outputs_host, outputs_device, stream)
        stream.synchronize()  
        outputs_host = outputs_host.reshape(len(input_tensor),25200,85)
        for i in range(len(input_tensor)):
            boxes = filter_box(outputs_host[i])
            draw(images[i], boxes)
            cv2.imwrite('result'+str(i)+'.jpg', images[i])

TensorRT dynamic-input inference (C++):

#include <iostream>
#include <fstream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>
#include <NvInfer.h>
#include <NvInferRuntime.h>


const std::vector<std::string> class_names = {
	"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
	"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
	"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
	"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
	"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
	"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
	"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
	"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
	"hair drier", "toothbrush" };			//类别名称

const int input_width = 640;
const int input_height = 640;
const float score_threshold = 0.2;
const float nms_threshold = 0.5;
const float confidence_threshold = 0.2;
const int num_classes = class_names.size();
const int output_numprob = 5 + num_classes;
const int output_numbox = 3 * (input_width / 8 * input_height / 8 + input_width / 16 * input_height / 16 + input_width / 32 * input_height / 32);
int input_numel;
int output_numel;
std::vector<std::string> filenames;


inline const char* severity_string(nvinfer1::ILogger::Severity t)
{
	switch (t)
	{
	case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: return "internal_error";
	case nvinfer1::ILogger::Severity::kERROR:   return "error";
	case nvinfer1::ILogger::Severity::kWARNING: return "warning";
	case nvinfer1::ILogger::Severity::kINFO:    return "info";
	case nvinfer1::ILogger::Severity::kVERBOSE: return "verbose";
	default: return "unknow";
	}
}


class TRTLogger : public nvinfer1::ILogger
{
public:
	virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override
	{
		if (severity <= Severity::kINFO)
		{
			if (severity == Severity::kWARNING)
				printf("\033[33m%s: %s\033[0m\n", severity_string(severity), msg);
			else if (severity <= Severity::kERROR)
				printf("\033[31m%s: %s\033[0m\n", severity_string(severity), msg);
			else
				printf("%s: %s\n", severity_string(severity), msg);
		}
	}
} logger;


std::vector<unsigned char> load_file(const std::string& file)
{
	std::ifstream in(file, std::ios::in | std::ios::binary);
	if (!in.is_open())
		return {};

	in.seekg(0, std::ios::end);
	size_t length = in.tellg();

	std::vector<uint8_t> data;
	if (length > 0)
	{
		in.seekg(0, std::ios::beg);
		data.resize(length);
		in.read((char*)&data[0], length);
	}
	in.close();
	return data;
}


// LetterBox: resize keeping aspect ratio, then pad
void LetterBox(const cv::Mat& image, cv::Mat& outImage,
	const cv::Size& newShape = cv::Size(640, 640), const cv::Scalar& color = cv::Scalar(114, 114, 114))
{
	cv::Size shape = image.size();
	float r = std::min((float)newShape.height / (float)shape.height, (float)newShape.width / (float)shape.width);
	float ratio[2]{ r, r };
	int new_un_pad[2] = { (int)std::round((float)shape.width * r),(int)std::round((float)shape.height * r) };

	auto dw = (float)(newShape.width - new_un_pad[0]) / 2;
	auto dh = (float)(newShape.height - new_un_pad[1]) / 2;

	if (shape.width != new_un_pad[0] || shape.height != new_un_pad[1])
		cv::resize(image, outImage, cv::Size(new_un_pad[0], new_un_pad[1]));
	else
		outImage = image.clone();

	int top = int(std::round(dh - 0.1f));
	int bottom = int(std::round(dh + 0.1f));
	int left = int(std::round(dw - 0.1f));
	int right = int(std::round(dw + 0.1f));

	cv::Vec4d params;
	params[0] = ratio[0];
	params[1] = ratio[1];
	params[2] = left;
	params[3] = top;

	cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color);
}


// preprocessing
void pre_process(std::vector<cv::Mat>& images, float* input_data_host)
{
	cv::Mat letterbox;
	int image_area = input_width * input_height;

	for (size_t i = 0; i < images.size(); i++)
	{
		LetterBox(images[i], letterbox, cv::Size(input_width, input_height));
		letterbox.convertTo(letterbox, CV_32FC3, 1.0f / 255.0f);
		float* pimage = (float*)letterbox.data;
		float* phost_b = input_data_host + image_area * (3 * i + 0);
		float* phost_g = input_data_host + image_area * (3 * i + 1);
		float* phost_r = input_data_host + image_area * (3 * i + 2);
		for (int k = 0; k < image_area; ++k, pimage += 3)
		{
			*phost_r++ = pimage[0];
			*phost_g++ = pimage[1];
			*phost_b++ = pimage[2];
		}
	}
}


// network inference
void process(std::string model, float* input_data_host, float* output_data_host)
{
	TRTLogger logger;
	auto engine_data = load_file(model);
	auto runtime = nvinfer1::createInferRuntime(logger);
	auto engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());

	cudaStream_t stream = nullptr;
	cudaStreamCreate(&stream);
	auto execution_context = engine->createExecutionContext();
	execution_context->setBindingDimensions(0, nvinfer1::Dims4{ (int)filenames.size(), 3, input_height, input_width });//NCHW; bind the actual batch size

	float* input_data_device = nullptr;
	cudaMalloc(&input_data_device, sizeof(float) * input_numel);
	cudaMemcpyAsync(input_data_device, input_data_host, sizeof(float) * input_numel, cudaMemcpyHostToDevice, stream);

	float* output_data_device = nullptr;
	cudaMalloc(&output_data_device, sizeof(float) * output_numel);

	float* bindings[] = { input_data_device, output_data_device };
	execution_context->enqueueV2((void**)bindings, stream, nullptr);
	cudaMemcpyAsync(output_data_host, output_data_device, sizeof(float) * output_numel, cudaMemcpyDeviceToHost, stream);
	cudaStreamSynchronize(stream);

	cudaStreamDestroy(stream);
	cudaFree(input_data_device);
	cudaFree(output_data_device);
	execution_context->destroy();
	engine->destroy();
	runtime->destroy();
}


//NMS
void nms(std::vector<cv::Rect>& boxes, std::vector<float>& scores, float score_threshold, float nms_threshold, std::vector<int>& indices)
{
	assert(boxes.size() == scores.size());

	struct BoxScore
	{
		cv::Rect box;
		float score;
		int id;
	};
	std::vector<BoxScore> boxes_scores;
	for (size_t i = 0; i < boxes.size(); i++)
	{
		BoxScore box_conf;
		box_conf.box = boxes[i];
		box_conf.score = scores[i];
		box_conf.id = i;
		if (scores[i] > score_threshold)	boxes_scores.push_back(box_conf);
	}

	std::sort(boxes_scores.begin(), boxes_scores.end(), [](BoxScore a, BoxScore b) { return a.score > b.score; });

	std::vector<float> area(boxes_scores.size());
	for (size_t i = 0; i < boxes_scores.size(); ++i)
	{
		area[i] = boxes_scores[i].box.width * boxes_scores[i].box.height;
	}

	std::vector<bool> isSuppressed(boxes_scores.size(), false);
	for (size_t i = 0; i < boxes_scores.size(); ++i)
	{
		if (isSuppressed[i])  continue;
		for (size_t j = i + 1; j < boxes_scores.size(); ++j)
		{
			if (isSuppressed[j])  continue;

			float x1 = (std::max)(boxes_scores[i].box.x, boxes_scores[j].box.x);
			float y1 = (std::max)(boxes_scores[i].box.y, boxes_scores[j].box.y);
			float x2 = (std::min)(boxes_scores[i].box.x + boxes_scores[i].box.width, boxes_scores[j].box.x + boxes_scores[j].box.width);
			float y2 = (std::min)(boxes_scores[i].box.y + boxes_scores[i].box.height, boxes_scores[j].box.y + boxes_scores[j].box.height);
			float w = (std::max)(0.0f, x2 - x1);
			float h = (std::max)(0.0f, y2 - y1);
			float inter = w * h;
			float ovr = inter / (area[i] + area[j] - inter);

			if (ovr >= nms_threshold)  isSuppressed[j] = true;
		}
	}

	for (int i = 0; i < boxes_scores.size(); ++i)
	{
		if (!isSuppressed[i])	indices.push_back(boxes_scores[i].id);
	}
}


// rescale box to the original image size
void scale_boxes(cv::Rect& box, cv::Size size)
{
	float gain = std::min(input_width * 1.0 / size.width, input_height * 1.0 / size.height);
	int pad_w = (input_width - size.width * gain) / 2;
	int pad_h = (input_height - size.height * gain) / 2;
	box.x -= pad_w;
	box.y -= pad_h;
	box.x /= gain;
	box.y /= gain;
	box.width /= gain;
	box.height /= gain;
}


// draw one detection
void draw_result(cv::Mat& image, std::string label, cv::Rect box)
{
	cv::rectangle(image, box, cv::Scalar(255, 0, 0), 1);
	cv::putText(image, label, cv::Point(box.x, box.y), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 0, 255), 1);
}


// post-processing
void post_process(std::vector<cv::Mat>& images, std::vector<cv::Mat>& results, float* output_data_host)
{
	for (int n = 0; n < images.size(); ++n)
	{
		std::vector<cv::Rect> boxes;
		std::vector<float> scores;
		std::vector<int> class_ids;

		for (int i = 0; i < output_numbox; ++i)
		{
			float* ptr = output_data_host + i * output_numprob + n * output_numbox * output_numprob;
			float objness = ptr[4];
			if (objness < confidence_threshold)
				continue;

			float* classes_scores = ptr + 5;
			int class_id = std::max_element(classes_scores, classes_scores + num_classes) - classes_scores;
			float max_class_score = classes_scores[class_id];
			float score = max_class_score * objness;
			if (score < score_threshold)
				continue;

			float x = ptr[0];
			float y = ptr[1];
			float w = ptr[2];
			float h = ptr[3];
			int left = int(x - 0.5 * w);
			int top = int(y - 0.5 * h);
			int width = int(w);
			int height = int(h);

			cv::Rect box = cv::Rect(left, top, width, height);
			scale_boxes(box, images[n].size());
			boxes.push_back(box);
			scores.push_back(score);
			class_ids.push_back(class_id);
		}

		std::vector<int> indices;
		nms(boxes, scores, score_threshold, nms_threshold, indices);

		for (int i = 0; i < indices.size(); i++)
		{
			int idx = indices[i];
			cv::Rect box = boxes[idx];
			std::string label = class_names[class_ids[idx]] + ":" + cv::format("%.2f", scores[idx]); // "name:score" label
			draw_result(results[n], label, box);
		}
	}
}


int main(int argc, char* argv[])
{
	std::vector<cv::Mat> images;
	cv::glob("./images/*.jpg", filenames);
	for (const auto& filename : filenames)
	{
		cv::Mat image = cv::imread(filename, -1);
		images.push_back(image);
	}
	std::vector<cv::Mat> results = images;

	float* inputs = nullptr;
	float* outputs = nullptr;
	input_numel = images.size() * 3 * input_width * input_height;
	output_numel = images.size() * output_numbox * output_numprob;
	cudaMallocHost(&inputs, sizeof(float) * input_numel);
	cudaMallocHost(&outputs, sizeof(float) * output_numel);

	pre_process(images, inputs);

	std::string model = "yolov5n.engine";
	process(model, inputs, outputs);

	post_process(images, results, outputs);

	for (size_t i = 0; i < results.size(); i++)
	{
		cv::imwrite("result" + std::to_string(i) + ".jpg", results[i]);
	}

	cudaFreeHost(inputs);
	cudaFreeHost(outputs);

	return 0;
}
