ONNX model export
from ultralytics import YOLOE
# Initialize a YOLOE model
model = YOLOE("yoloe-11l-seg.pt")
names = ["person", "bus"]
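# set_classes bakes the text prompt embeddings for these class names into the model, so the exported ONNX graph detects only these classes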
model.set_classes(names, model.get_text_pe(names))
model.export(format="onnx")
The exported ONNX model takes a (1, 3, 640, 640) image tensor as input and produces two outputs: a (1, 38, 8400) detection tensor (4 box coordinates, 2 class scores and 32 mask coefficients for each of the 8400 candidates) and a (1, 32, 160, 160) tensor of mask prototypes.
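If you want to double-check the exported graph, a minimal sketch with onnxruntime (reusing the file name from the export above) prints the input and output metadata:
import onnxruntime
session = onnxruntime.InferenceSession("yoloe-11l-seg.onnx", providers=["CPUExecutionProvider"])
for node in session.get_inputs():
    print("input:", node.name, node.shape, node.type)
for node in session.get_outputs():
    print("output:", node.name, node.shape, node.type)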
ONNX Runtime deployment
import cv2
import numpy as np
import onnxruntime
from utils import *
class_names = ["person", "bus"]
input_shape = (640, 640)
score_threshold = 0.1
nms_threshold = 0.5
def post_process(outputs):
boxes = []
scores = []
class_ids = []
preds = []
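# output0 after squeeze has shape (4 + num_classes + 32, 8400): cx, cy, w, h, then the class scores, then 32 mask coefficients per candidate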
output = np.squeeze(outputs[0])
classes_scores = output[4:(4+len(class_names)), ...]
for i in range(output.shape[1]):
class_id = np.argmax(classes_scores[...,i])
score = classes_scores[class_id][i]
if score > score_threshold:
boxes.append(np.concatenate([output[:4, i], np.array([score, class_id])]))
scores.append(score)
class_ids.append(class_id)
preds.append(output[..., i])
boxes = np.array(boxes)
boxes = xywh2xyxy(boxes)
scores = np.array(scores)
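# note: this NMS is class-agnostic, so boxes of different classes suppress each other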
indices = nms(boxes, scores, score_threshold, nms_threshold)
boxes = boxes[indices]
masks_in = np.array(preds)[indices][..., -32:]
proto = np.squeeze(outputs[1]).astype(dtype=np.float32)
c, mh, mw = proto.shape
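# combine each detection's 32 mask coefficients with the 32 prototype maps: (N, 32) @ (32, mh*mw) -> (N, mh, mw)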
masks = (masks_in @ proto.reshape(c, -1)).reshape(-1, mh, mw)
downsampled_bboxes = boxes.copy()
downsampled_bboxes[:, 0] *= mw / input_shape[0]
downsampled_bboxes[:, 2] *= mw / input_shape[0]
downsampled_bboxes[:, 3] *= mh / input_shape[1]
downsampled_bboxes[:, 1] *= mh / input_shape[1]
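# crop_mask zeroes out activations outside each box, using the box coordinates scaled above to the mh x mw prototype grid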
masks = crop_mask(masks, downsampled_bboxes)
boxes = scale_boxes(boxes, input_shape, image.shape)
resized_masks = []
for mask in masks:
mask = cv2.resize(mask, input_shape, interpolation=cv2.INTER_LINEAR)
mask = scale_mask(mask, input_shape, image.shape)
resized_masks.append(mask)
resized_masks = np.array(resized_masks)
resized_masks = resized_masks > 0
return boxes, resized_masks
if __name__=="__main__":
onnx_session = onnxruntime.InferenceSession('yoloe-11l-seg.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
input_name = []
for node in onnx_session.get_inputs():
input_name.append(node.name)
image = cv2.imread('bus.jpg', -1)
input = letterbox(image, input_shape)
input = input[:, :, ::-1].transpose(2, 0, 1).astype(dtype=np.float32)  # BGR to RGB and HWC to CHW
input = input / 255.0
input_tensor = []
input_tensor.append(input)
inputs = {}
inputs[input_name[0]] = np.array(input_tensor)
outputs = onnx_session.run(None, inputs)
boxes, resized_masks = post_process(outputs)
result = draw_result(image, boxes, resized_masks)
cv2.imwrite('result.jpg', result)
utils.py
'''
Author: taifyang
Date: 2024-06-12 22:23:07
LastEditors: taifyang 58515915+taifyang@users.noreply.github.com
LastEditTime: 2024-11-22 22:43:15
Description: utilities functions
'''
import cv2
import numpy as np
'''
description: Non-Maximum Suppression
param {*} boxes detect bounding boxes
param {*} scores detect scores
param {*} score_threshold detect score threshold
param {*} nms_threshold IOU threshold
return {*} detect indices
'''
def nms(boxes, scores, score_threshold, nms_threshold):
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
areas = (y2 - y1 + 1) * (x2 - x1 + 1)
keep = []
index = scores.argsort()[::-1]
while index.size > 0:
i = index[0]
keep.append(i)
x11 = np.maximum(x1[i], x1[index[1:]])
y11 = np.maximum(y1[i], y1[index[1:]])
x22 = np.minimum(x2[i], x2[index[1:]])
y22 = np.minimum(y2[i], y2[index[1:]])
w = np.maximum(0, x22 - x11 + 1)
h = np.maximum(0, y22 - y11 + 1)
overlaps = w * h
ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
idx = np.where(ious <= nms_threshold)[0]
index = index[idx + 1]
return keep
'''
description: convert xywh bounding boxes to x1y1x2y2 bounding boxes
param {*} x xywh bounding boxes
return {*} x1y1x2y2 bounding boxes
'''
def xywh2xyxy(x):
y = np.copy(x)
y[:, 0] = x[:, 0] - x[:, 2] / 2
y[:, 1] = x[:, 1] - x[:, 3] / 2
y[:, 2] = x[:, 0] + x[:, 2] / 2
y[:, 3] = x[:, 1] + x[:, 3] / 2
return y
'''
description: letterbox image process
param {*} im input image
param {*} new_shape output shape
param {*} color filled color
return {*} output image
'''
def letterbox(im, new_shape=(416, 416), color=(114, 114, 114)):
# Resize and pad image while meeting stride-multiple constraints
shape = im.shape[:2] # current shape [height, width]
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
# Compute padding
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = (new_shape[1] - new_unpad[0])/2, (new_shape[0] - new_unpad[1])/2 # wh padding
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
if shape[::-1] != new_unpad: # resize
im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
return im
'''
description: scale boxes
param {*} boxes bounding boxes
param {*} input_shape input image shape
param {*} output_shape output image shape
return {*} scaled boxes
'''
def scale_boxes(boxes, input_shape, output_shape):
# Rescale boxes (xyxy) from input_shape back to the original image shape
gain = min(input_shape[0] / output_shape[0], input_shape[1] / output_shape[1]) # gain = old / new
pad = (input_shape[1] - output_shape[1] * gain) / 2, (input_shape[0] - output_shape[0] * gain) / 2 # wh padding
boxes[..., [0, 2]] -= pad[0] # x padding
boxes[..., [1, 3]] -= pad[1] # y padding
boxes[..., :4] /= gain
boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, output_shape[1]) # x1, x2
boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, output_shape[0]) # y1, y2
return boxes
'''
description: crop mask
param {*} masks input masks
param {*} boxes bounding boxes
return {*} cropped masks
'''
def crop_mask(masks, boxes):
n, h, w = masks.shape
x1, y1, x2, y2 = np.split(boxes[..., :4], 4, axis=1)
x1, y1, x2, y2 = np.expand_dims(x1, 2), np.expand_dims(y1, 2), np.expand_dims(x2, 2), np.expand_dims(y2, 2)
r = np.arange(w)[None, None, :]
c = np.arange(h)[None, :, None]
cropped_masks = masks * ((r >= x1) & (r < x2) & (c >= y1) & (c < y2))
return cropped_masks
'''
description: scale mask
param {*} mask input masks
param {*} input_shape input image shape
param {*} output_shape output image shape
return {*} scaled masks
'''
def scale_mask(mask, input_shape, output_shape):
gain = min(input_shape[0] / output_shape[0], input_shape[1] / output_shape[1]) # gain = old / new
pad = (input_shape[1] - output_shape[1] * gain) / 2, (input_shape[0] - output_shape[0] * gain) / 2 # wh padding
mask = mask[int(pad[1]):mask.shape[0]-int(pad[1]), int(pad[0]):mask.shape[1]-int(pad[0])]
mask = cv2.resize(mask, (output_shape[1], output_shape[0]), interpolation=cv2.INTER_LINEAR)
return mask
'''
description: draw result
param {*} image input image
param {*} preds prediction result
param {*} masks masks
return {*} output image
'''
def draw_result(image, preds, masks=[]):
image_copy = image.copy()
boxes = preds[...,:4].astype(np.int32)
scores = preds[...,4]
classes = preds[...,5].astype(np.int32)
for mask in masks:
image_copy[mask] = [np.random.randint(0,256), np.random.randint(0,256), np.random.randint(0,256)]
result = (image*0.5 + image_copy*0.5).astype(np.uint8)
for box, score, cl in zip(boxes, scores, classes):
x1, y1, x2, y2 = box
cv2.rectangle(result, (x1, y1), (x2, y2), (255, 0, 0), 1)
cv2.putText(result, 'class:{0} score:{1:.2f}'.format(cl, score), (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 1)
return result
TensorRT deployment
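The script below assumes a serialized engine file yoloe-11l-seg.engine already exists. One straightforward way to build it from the exported ONNX model (assuming the trtexec tool shipped with TensorRT is on the PATH) is:
trtexec --onnx=yoloe-11l-seg.onnx --saveEngine=yoloe-11l-seg.engine
Add --fp16 for a half-precision engine; the engine can also be built programmatically with the TensorRT builder API.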
import cv2
import numpy as np
import tensorrt as trt
from common import *
from utils import *
class_names = ["person", "bus"]
input_shape = (640, 640)
score_threshold = 0.1
nms_threshold = 0.5
def post_process(outputs):
boxes = []
scores = []
class_ids = []
preds = []
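# TensorRT returns flat host buffers: reshape outputs[0] back to (4 + num_classes + 32, 8400) and outputs[1] to the (32, 160, 160) mask prototypes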
output = np.squeeze(outputs[0].host.astype(np.float32).reshape(38, 8400))
classes_scores = output[4:(4+len(class_names)), ...]
for i in range(output.shape[1]):
class_id = np.argmax(classes_scores[...,i])
score = classes_scores[class_id][i]
if score > score_threshold:
boxes.append(np.concatenate([output[:4, i], np.array([score, class_id])]))
scores.append(score)
class_ids.append(class_id)
preds.append(output[..., i])
boxes = np.array(boxes)
boxes = xywh2xyxy(boxes)
scores = np.array(scores)
indices = nms(boxes, scores, score_threshold, nms_threshold)
boxes = boxes[indices]
masks_in = np.array(preds)[indices][..., -32:]
proto = np.squeeze(outputs[1].host.astype(dtype=np.float32).reshape(32, 160, 160))
c, mh, mw = proto.shape
masks = (masks_in @ proto.reshape(c, -1)).reshape(-1, mh, mw)
downsampled_bboxes = boxes.copy()
downsampled_bboxes[:, 0] *= mw / input_shape[0]
downsampled_bboxes[:, 2] *= mw / input_shape[0]
downsampled_bboxes[:, 3] *= mh / input_shape[1]
downsampled_bboxes[:, 1] *= mh / input_shape[1]
masks = crop_mask(masks, downsampled_bboxes)
boxes = scale_boxes(boxes, input_shape, image.shape)
resized_masks = []
for mask in masks:
mask = cv2.resize(mask, input_shape, interpolation=cv2.INTER_LINEAR)
mask = scale_mask(mask, input_shape, image.shape)
resized_masks.append(mask)
resized_masks = np.array(resized_masks)
resized_masks = resized_masks > 0
return boxes, resized_masks
if __name__=="__main__":
image = cv2.imread('bus.jpg', -1)
input = letterbox(image, input_shape)
input = input[:, :, ::-1].transpose(2, 0, 1).astype(dtype=np.float32)  # BGR to RGB and HWC to CHW
input = input / 255.0
logger = trt.Logger(trt.Logger.WARNING)
with open("yoloe-11l-seg.engine", "rb") as f, trt.Runtime(logger) as runtime:
engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
inputs, outputs, bindings, stream = allocate_buffers(engine)
np.copyto(inputs[0].host, input.ravel())
do_inference(context, engine, bindings, inputs, outputs, stream)
boxes, resized_masks = post_process(outputs)
result = draw_result(image, boxes, resized_masks)
cv2.imwrite('result.jpg', result)
utils.py (identical to the utils.py listed in the ONNX Runtime section above)
common.py
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os
import ctypes
from typing import Optional, List
import numpy as np
import tensorrt as trt
from cuda import cuda, cudart
try:
# Sometimes python does not understand FileNotFoundError
FileNotFoundError
except NameError:
FileNotFoundError = IOError
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
def check_cuda_err(err):
if isinstance(err, cuda.CUresult):
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("Cuda Error: {}".format(err))
if isinstance(err, cudart.cudaError_t):
if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError("Cuda Runtime Error: {}".format(err))
else:
raise RuntimeError("Unknown error type: {}".format(err))
def cuda_call(call):
err, res = call[0], call[1:]
check_cuda_err(err)
if len(res) == 1:
res = res[0]
return res
def GiB(val):
return val * 1 << 30
def add_help(description):
parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
args, _ = parser.parse_known_args()
def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[], err_msg=""):
"""
Parses sample arguments.
Args:
description (str): Description of the sample.
subfolder (str): The subfolder containing data relevant to this sample
find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path.
Returns:
str: Path of data directory.
"""
# Standard command-line arguments for all samples.
kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data")
parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"-d",
"--datadir",
help="Location of the TensorRT sample data directory, and any additional data directories.",
action="append",
default=[kDEFAULT_DATA_ROOT],
)
args, _ = parser.parse_known_args()
def get_data_path(data_dir):
# If the subfolder exists, append it to the path, otherwise use the provided path as-is.
data_path = os.path.join(data_dir, subfolder)
if not os.path.exists(data_path):
if data_dir != kDEFAULT_DATA_ROOT:
print("WARNING: " + data_path + " does not exist. Trying " + data_dir + " instead.")
data_path = data_dir
# Make sure data directory exists.
if not (os.path.exists(data_path)) and data_dir != kDEFAULT_DATA_ROOT:
print(
"WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(
data_path
)
)
return data_path
data_paths = [get_data_path(data_dir) for data_dir in args.datadir]
return data_paths, locate_files(data_paths, find_files, err_msg)
def locate_files(data_paths, filenames, err_msg=""):
"""
Locates the specified files in the specified data directories.
If a file exists in multiple data directories, the first directory is used.
Args:
data_paths (List[str]): The data directories.
filename (List[str]): The names of the files to find.
Returns:
List[str]: The absolute paths of the files.
Raises:
FileNotFoundError if a file could not be located.
"""
found_files = [None] * len(filenames)
for data_path in data_paths:
# Find all requested files.
for index, (found, filename) in enumerate(zip(found_files, filenames)):
if not found:
file_path = os.path.abspath(os.path.join(data_path, filename))
if os.path.exists(file_path):
found_files[index] = file_path
# Check that all files were found
for f, filename in zip(found_files, filenames):
if not f or not os.path.exists(f):
raise FileNotFoundError(
"Could not find {:}. Searched in data paths: {:}\n{:}".format(filename, data_paths, err_msg)
)
return found_files
class HostDeviceMem:
"""Pair of host and device memory, where the host memory is wrapped in a numpy array"""
def __init__(self, size: int, dtype: np.dtype):
nbytes = size * dtype.itemsize
host_mem = cuda_call(cudart.cudaMallocHost(nbytes))
pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))
self._host = np.ctypeslib.as_array(ctypes.cast(host_mem, pointer_type), (size,))
self._device = cuda_call(cudart.cudaMalloc(nbytes))
self._nbytes = nbytes
@property
def host(self) -> np.ndarray:
return self._host
@host.setter
def host(self, arr: np.ndarray):
if arr.size > self.host.size:
raise ValueError(
f"Tried to fit an array of size {arr.size} into host memory of size {self.host.size}"
)
np.copyto(self.host[:arr.size], arr.flat, casting='safe')
@property
def device(self) -> int:
return self._device
@property
def nbytes(self) -> int:
return self._nbytes
def __str__(self):
return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"
def __repr__(self):
return self.__str__()
def free(self):
cuda_call(cudart.cudaFree(self.device))
cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
# If engine uses dynamic shapes, specify a profile to find the maximum input & output size.
def allocate_buffers(engine: trt.ICudaEngine, profile_idx: Optional[int] = None):
inputs = []
outputs = []
bindings = []
stream = cuda_call(cudart.cudaStreamCreate())
tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
for binding in tensor_names:
# get_tensor_profile_shape returns (min_shape, optimal_shape, max_shape)
# Pick out the max shape to allocate enough memory for the binding.
shape = engine.get_tensor_shape(binding) if profile_idx is None else engine.get_tensor_profile_shape(binding, profile_idx)[-1]
shape_valid = np.all([s >= 0 for s in shape])
if not shape_valid and profile_idx is None:
raise ValueError(f"Binding {binding} has dynamic shape, " +\
"but no profile was specified.")
size = trt.volume(shape)
if engine.has_implicit_batch_dimension:
size *= engine.max_batch_size
dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(binding)))
# Allocate host and device buffers
bindingMemory = HostDeviceMem(size, dtype)
# Append the device buffer to device bindings.
bindings.append(int(bindingMemory.device))
# Append to the appropriate list.
if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
inputs.append(bindingMemory)
else:
outputs.append(bindingMemory)
return inputs, outputs, bindings, stream
# Frees the resources allocated in allocate_buffers
def free_buffers(inputs: List[HostDeviceMem], outputs: List[HostDeviceMem], stream: cudart.cudaStream_t):
for mem in inputs + outputs:
mem.free()
cuda_call(cudart.cudaStreamDestroy(stream))
# Wrapper for cudaMemcpy which infers copy size and does error checking
def memcpy_host_to_device(device_ptr: int, host_arr: np.ndarray):
nbytes = host_arr.size * host_arr.itemsize
cuda_call(cudart.cudaMemcpy(device_ptr, host_arr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice))
# Wrapper for cudaMemcpy which infers copy size and does error checking
def memcpy_device_to_host(host_arr: np.ndarray, device_ptr: int):
nbytes = host_arr.size * host_arr.itemsize
cuda_call(cudart.cudaMemcpy(host_arr, device_ptr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
def _do_inference_base(inputs, outputs, stream, execute_async):
# Transfer input data to the GPU.
kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
[cuda_call(cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)) for inp in inputs]
# Run inference.
execute_async()
# Transfer predictions back from the GPU.
kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
[cuda_call(cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)) for out in outputs]
# Synchronize the stream
cuda_call(cudart.cudaStreamSynchronize(stream))
# Return only the host outputs.
return [out.host for out in outputs]
def do_inference(context, engine, bindings, inputs, outputs, stream):
def execute_async_func():
context.execute_async_v3(stream_handle=stream)
# Setup context tensor address.
num_io = engine.num_io_tensors
for i in range(num_io):
context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
return _do_inference_base(inputs, outputs, stream, execute_async_func)