Learning result_of

    result_of is a small but very useful component that lets a programmer determine the return type of a call expression. It is used mainly in generic programming and by other Boost library components, and it has been adopted into TR1.

result_of lives in the namespace boost. To use the result_of component, include the header <boost/utility/result_of.hpp>:

    #include <boost/utility/result_of.hpp>

    using namespace boost;
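The interface is a single class template: for a callable type F and argument types A1, ..., An, boost::result_of<F(A1, ..., An)>::type names the type that calling an object of type F with those arguments would return. For plain functions and function pointers the type is read from the signature; for class-type function objects the TR1 protocol consults a result_type typedef (or a nested result<> template, shown further below). A minimal sketch, assuming a hypothetical functor Doubler that is not part of the original example:

#include <boost/utility/result_of.hpp>
#include <iostream>

// Hypothetical function object: result_of finds its return type through
// the result_type typedef (the TR1 protocol for class types).
struct Doubler
{
    typedef double result_type;
    double operator()(int x) const { return 2.0 * x; }
};

double half(double x) { return x / 2.0; }   // plain function for comparison

int main()
{
    // Class type: result_of reads Doubler::result_type.
    boost::result_of<Doubler(int)>::type a = Doubler()(21);        // double

    // Function pointer type: the return type comes from the signature.
    boost::result_of<double(*)(double)>::type b = half(9.0);       // double

    std::cout << a << " " << b << std::endl;   // prints: 42 4.5
    return 0;
}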


In everyday code result_of is rarely needed. It becomes useful inside template wrappers where the types being handled are not known in advance: result_of deduces the return type of the callable, so an otherwise identical algorithm does not have to be written separately for each function type. Here is an example of using result_of:

#include <iostream>
#include <string>
#include <boost/utility/result_of.hpp>

using namespace std;
using namespace boost;

// Generic version: simply returns its argument unchanged, so it also
// compiles for types such as std::string that have no operator*.
template<class type>
type square(type x)
{
    return x;
}

// Non-template overload for int: deliberately ignores the argument and
// returns 11 * 11 = 121, which makes it easy to see that this overload
// (and not the template) was selected.
int square(int x)
{
    x = 11;
    return (x * x);
}

// test_result does not know the callable's return type in advance;
// result_of<Fun(Arg)>::type deduces it from the callable type and the
// argument type.
template<class Fun, class Arg>
void test_result(const Fun& fun, Arg arg)
{
    typename boost::result_of<Fun(Arg)>::type val = fun(arg);
    std::cout << "val == " << val << std::endl;
}

typedef int    (*TiSquare)(int);
typedef double (*TdSquare)(double);
typedef float  (*TfSquare)(float);
typedef string (*TsSquare)(string);

int main()
{
    int x = 2;
    test_result<TiSquare, int>(&square, x);      // selects the int overload
    double y = 4.4;
    test_result<TdSquare, double>(&square, y);   // selects square<double>
    float z = 4.3f;
    test_result<TfSquare, float>(&square, z);    // selects square<float>
    string w = "abcdefg";
    test_result<TsSquare, string>(&square, w);   // selects square<string>
    return 0;
}

Output:
val == 121
val == 4.4
val == 4.3
val == abcdefg
Press any key to continue . . .
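When a function object's return type depends on its argument types, the TR1 protocol implemented by boost::result_of looks for a nested result<> class template instead of a plain result_type. A minimal sketch, assuming a hypothetical functor Identity that is not part of the original example:

#include <boost/utility/result_of.hpp>
#include <string>
#include <iostream>

// Hypothetical functor whose return type follows its argument type; the
// nested result<> template tells result_of what that type is.
struct Identity
{
    template<class Signature> struct result;            // primary template

    template<class F, class T>
    struct result<F(T)> { typedef T type; };            // Identity(T) returns T

    template<class T>
    T operator()(T x) const { return x; }
};

int main()
{
    boost::result_of<Identity(int)>::type n = Identity()(7);
    boost::result_of<Identity(std::string)>::type s = Identity()(std::string("hi"));
    std::cout << n << " " << s << std::endl;    // prints: 7 hi
    return 0;
}

(On a C++11 compiler, recent Boost versions can also deduce this type with decltype, but the nested result<> form is the portable TR1-style protocol.)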

