YOLOv5 + TensorRT Deployment
Personal study notes; if you spot any mistakes, please point them out!
Model Optimization and Serialization
Serialization
First, derive a logger class from nvinfer1::ILogger, which TensorRT uses to report build-time and runtime messages:
#include <ctime>
#include <iostream>
#include <NvInfer.h>

// Logger required by the TensorRT API; messages below the chosen
// severity are filtered out.
class TrtLogger : public nvinfer1::ILogger
{
	nvinfer1::ILogger::Severity _verbosity;
	std::ostream* _ostream;
public:
	TrtLogger(Severity verbosity = Severity::kWARNING, std::ostream& ostream = std::cout)
		: _verbosity(verbosity), _ostream(&ostream)
	{
	}
	void log(Severity severity, const char* msg) noexcept override
	{
		if (severity <= _verbosity)
		{
			// Format the current UTC time as a timestamp for the log line
			time_t rawtime = std::time(nullptr);
			char buf[256];
			tm gmtm;
			gmtime_s(&gmtm, &rawtime); // MSVC-specific; use gmtime_r on POSIX
			strftime(buf, sizeof buf, "%Y-%m-%d %H:%M:%S", &gmtm);
			const char* sevstr = (severity == Severity::kINTERNAL_ERROR ? "    BUG" : severity == Severity::kERROR
				? "  ERROR" : severity == Severity::kWARNING ? "WARNING" : severity == Severity::kINFO
				? "   INFO" : "UNKNOWN");
			(*_ostream) << "[" << buf << " " << sevstr << "] " << msg << std::endl;
		}
	}
};
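For instance, constructing the logger with Severity::kINFO also surfaces TensorRT's informational build messages, which the default kWARNING above suppresses:

// Log INFO-level and more severe messages to std::cout
TrtLogger verboseLogger(nvinfer1::ILogger::Severity::kINFO);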
Parse the ONNX file with IParser, create and optimize the network with IBuilder, and finally write the serialized engine to disk in binary form:
nvinfer1::ICudaEngine* serializeEngine(const std::string onnxModel)
{
	// Instantiate the logger
	TrtLogger TLogger;
	// Create the builder
	nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(TLogger);
	// Set the maximum batch size (ignored in explicit-batch mode)
	builder->setMaxBatchSize(1);
	// Request explicit-batch mode
	const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
	// Create the network definition
	nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
	// Create the ONNX parser
	nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, TLogger);
	// Parse the ONNX file; report anything at WARNING severity or above
	if (!parser->parseFromFile(onnxModel.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kWARNING)))
	{
		std::cerr << "failed to parse " << onnxModel << std::endl;
		return nullptr;
	}
	// Configure the optimization settings
	nvinfer1::IBuilderConfig* builderConfig = builder->createBuilderConfig();
	// Limit the builder's scratch workspace to 1 GiB
	builderConfig->setMaxWorkspaceSize(1ULL << 30);
	// Build the TensorRT engine
	nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *builderConfig);
	// Derive the engine file name from the ONNX file name
	std::string engineName = onnxModel.substr(0, onnxModel.find_last_of(".")) + ".engine";
	// Serialize the engine
	nvinfer1::IHostMemory* modelStream = engine->serialize();
	// Open the output file in binary write mode
	std::ofstream output_stream(engineName.c_str(), std::ios_base::out | std::ios_base::binary);
	// Write the serialized engine data to the file
	output_stream.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
	output_stream.close();
	modelStream->destroy();
	parser->destroy();
	network->destroy();
	builderConfig->destroy();
	builder->destroy();
	return engine;
}
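If the GPU has fast FP16 support, half precision can be enabled on the same IBuilderConfig, which often reduces latency considerably. A minimal sketch, assuming it is inserted just before the buildEngineWithConfig() call above:

// Optional: build an FP16 engine when the platform supports it
if (builder->platformHasFastFp16())
{
	builderConfig->setFlag(nvinfer1::BuilderFlag::kFP16);
}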
Deserialization
First, instantiate the logger and create an IRuntime; then read the .engine file in binary mode, and finally deserialize it back into an engine through the runtime:
nvinfer1::ICudaEngine* deserializeEngine(std::string enginePath)
{
	// Instantiate the logger
	TrtLogger TLogger;
	// Create the IRuntime
	nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(TLogger);
	// Open the engine file in binary read mode
	std::ifstream input_stream(enginePath, std::ios_base::in | std::ios_base::binary);
	if (!input_stream.is_open())
	{
		std::cerr << "read " << enginePath << " error!" << std::endl;
		return nullptr;
	}
	// Seek to the end of the file
	input_stream.seekg(0, input_stream.end);
	// Get the file size
	size_t size = input_stream.tellg();
	// Seek back to the beginning
	input_stream.seekg(0, input_stream.beg);
	// Allocate a buffer for the engine data
	std::vector<char> engineData(size);
	// Read the engine data
	input_stream.read(engineData.data(), size);
	// Deserialize the engine through the runtime
	nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engineData.data(), size, nullptr);
	return engine;
}
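As a side note, the same .engine file can also be generated or sanity-checked offline with TensorRT's bundled trtexec tool, e.g. trtexec --onnx=yolov5s.onnx --saveEngine=yolov5s.engine (file names here are placeholders); this is a handy way to verify a conversion independently of the code above.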
Model Inference
Image Preprocessing
Resize the image while preserving its aspect ratio and pad the shorter side, so that the result exactly matches the engine's input size; then convert it to the expected input format.
void processImage(cv::Mat& img, cv::Mat& output, cv::Size size, int& new_h, int& new_w)
{
	double w_rate = size.width / (img.cols * 1.0);
	double h_rate = size.height / (img.rows * 1.0);
	// Resize by the smaller of the two ratios, then pad the remaining side
	if (w_rate > h_rate)
	{
		new_h = size.height;
		new_w = img.cols * h_rate;
		// Split the horizontal padding between left and right,
		// giving the extra pixel to the right when the total is odd
		int padLeft = (size.width - new_w) / 2;
		int padRight = size.width - new_w - padLeft;
		cv::resize(img, output, cv::Size(new_w, new_h), 0, 0, cv::INTER_AREA);
		// Pad the narrower side with YOLOv5's gray value (114, 114, 114)
		cv::copyMakeBorder(output, output, 0, 0, padLeft, padRight, cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));
	}
	else
	{
		new_w = size.width;
		new_h = img.rows * w_rate;
		int padTop = (size.height - new_h) / 2;
		int padBottom = size.height - new_h - padTop;
		cv::resize(img, output, cv::Size(new_w, new_h), 0, 0, cv::INTER_AREA);
		cv::copyMakeBorder(output, output, padTop, padBottom, 0, 0, cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));
	}
	// Convert BGR to RGB
	cv::cvtColor(output, output, cv::COLOR_BGR2RGB);
	// Normalize pixel values to [0, 1]
	output.convertTo(output, CV_32FC3, 1 / 255.);
}
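For example, with a 1280×720 input image and a 640×640 engine input: w_rate = 0.5 and h_rate ≈ 0.889, so the else branch runs, the image is resized to 640×360, and 140 rows of gray padding are added to both the top and the bottom.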
Inference and Post-processing
Inference
Create an IExecutionContext to manage the inference pass, and allocate the host and device buffers.
float* inference(std::string path, cv::Mat input, int& classesNum, int& boxNum)
{
	std::string engineName = path.substr(0, path.find_last_of(".")) + ".engine";
	// Check whether a serialized engine already exists
	std::ifstream f(engineName);
	// Obtain the engine
	nvinfer1::ICudaEngine* engine;
	if (!f.good())
	{
		// Build the engine from ONNX and serialize it to a .engine file
		f.close();
		engine = serializeEngine(path);
	}
	else
	{
		// Deserialize the engine from the .engine file
		engine = deserializeEngine(engineName);
	}
	if (engine == nullptr)
	{
		std::cerr << "engine made failed" << std::endl;
		return nullptr;
	}
	// Create an IExecutionContext to manage inference
	nvinfer1::IExecutionContext* context = engine->createExecutionContext();
	if (context == nullptr)
	{
		std::cerr << "context made failed" << std::endl;
		return nullptr;
	}
	// Look up the input/output binding indices by tensor name
	int inputIndex = engine->getBindingIndex("images");
	int outputIndex = engine->getBindingIndex("output0");
	nvinfer1::Dims dims_input = engine->getBindingDimensions(inputIndex);
	int input_size = dims_input.d[0] * dims_input.d[1] * dims_input.d[2] * dims_input.d[3];
	nvinfer1::Dims dims_output = engine->getBindingDimensions(outputIndex);
	int output_size = dims_output.d[0] * dims_output.d[1] * dims_output.d[2]; // batch x boxes x (coords, conf, class scores)
	// Number of classes (each box carries 4 coordinates + 1 objectness + class scores)
	classesNum = dims_output.d[2] - 5;
	boxNum = dims_output.d[1];
	// Buffers for the input/output bindings
	void* GpuMemoryArray[2];
	void* CpuMemoryArray[2];
	// Sizes of the input and output buffers in bytes
	int memorySize[2] = { input_size * (int)sizeof(float), output_size * (int)sizeof(float) };
	// Allocate device memory
	cudaMalloc(&GpuMemoryArray[inputIndex], memorySize[0]);
	cudaMalloc(&GpuMemoryArray[outputIndex], memorySize[1]);
	// Allocate host memory (the output buffer is returned; the caller frees it)
	CpuMemoryArray[inputIndex] = malloc(memorySize[0]);
	CpuMemoryArray[outputIndex] = malloc(memorySize[1]);
	// Create a CUDA stream
	cudaStream_t cudaStream;
	cudaStreamCreate(&cudaStream);
	// Wrap the three planes of the host input buffer as cv::Mat headers
	std::vector<cv::Mat> inputWrappers{};
	inputWrappers.emplace_back(dims_input.d[2], dims_input.d[3], CV_32FC1, CpuMemoryArray[inputIndex]);
	inputWrappers.emplace_back(dims_input.d[2], dims_input.d[3], CV_32FC1, (char*)CpuMemoryArray[inputIndex] + sizeof(float) * dims_input.d[2] * dims_input.d[3]);
	inputWrappers.emplace_back(dims_input.d[2], dims_input.d[3], CV_32FC1, (char*)CpuMemoryArray[inputIndex] + 2 * sizeof(float) * dims_input.d[2] * dims_input.d[3]);
	// Split the interleaved HWC image into the planar CHW layout TensorRT expects
	cv::split(input, inputWrappers);
	// Copy the image data to the GPU
	cudaMemcpyAsync(GpuMemoryArray[inputIndex], CpuMemoryArray[inputIndex], memorySize[0], cudaMemcpyHostToDevice, cudaStream);
	// Run inference asynchronously on the stream
	context->enqueueV2(GpuMemoryArray, cudaStream, nullptr);
	// Copy the results back from device to host
	cudaMemcpyAsync(CpuMemoryArray[outputIndex], GpuMemoryArray[outputIndex], memorySize[1], cudaMemcpyDeviceToHost, cudaStream);
	cudaStreamSynchronize(cudaStream);
	// Release resources that are no longer needed
	cudaStreamDestroy(cudaStream);
	cudaFree(GpuMemoryArray[inputIndex]);
	cudaFree(GpuMemoryArray[outputIndex]);
	free(CpuMemoryArray[inputIndex]);
	context->destroy();
	engine->destroy();
	return (float*)CpuMemoryArray[outputIndex];
}
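The binding names "images" and "output0" match what YOLOv5's export.py writes into the ONNX graph. If an engine was exported with different tensor names, they can be listed through the engine itself; a small sketch:

// List every binding with its name and direction, useful when the
// input/output tensor names of an engine are unknown
void printBindings(nvinfer1::ICudaEngine* engine)
{
	for (int i = 0; i < engine->getNbBindings(); i++)
	{
		std::cout << i << ": " << engine->getBindingName(i)
			<< (engine->bindingIsInput(i) ? " (input)" : " (output)") << std::endl;
	}
}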
Post-processing
Post-processing consists mainly of NMS and drawing the boxes (DrawBox).
NMS
static float IOU(float lbox[], float rbox[])
{
	// Intersection rectangle in (left, top, right, bottom) form
	float interBox[] = {
		(std::max)(lbox[0],rbox[0]),
		(std::max)(lbox[1],rbox[1]),
		(std::min)(lbox[2],rbox[2]),
		(std::min)(lbox[3],rbox[3]),
	};
	// No overlap
	if ((interBox[0] > interBox[2]) || (interBox[1] > interBox[3]))
		return 0.0f;
	float interBoxS = (interBox[2] - interBox[0]) * (interBox[3] - interBox[1]);
	float unionBoxS = ((lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interBoxS);
	return interBoxS / unionBoxS;
}
static void nms(std::vector<Result>& res, float prod[], const int clsNum, const int boxNum, const float conf_thresh, const float nms_thresh = 0.5)
{
	// Group candidate boxes by class id
	std::map<int, std::vector<Result>> m;
	for (int i = 0; i < boxNum; i++)
	{
		int index = i * (clsNum + 5);
		// Pointer to the highest class score
		float* max_obj = std::max_element(prod + index + 5, prod + index + 5 + clsNum);
		// Confidence = objectness * best class score
		float c = prod[index + 4] * (*max_obj);
		// Skip boxes below the confidence threshold
		if (!(c > conf_thresh)) continue;
		// Class id of the highest score
		int cls = max_obj - prod - index - 5;
		if (m.count(cls) == 0) m.emplace(cls, std::vector<Result>());
		// Build the result: convert (cx, cy, w, h) to (left, top, right, bottom)
		Result result;
		result.bbox[0] = prod[index + 0] - prod[index + 2] / 2.0;
		result.bbox[1] = prod[index + 1] - prod[index + 3] / 2.0;
		result.bbox[2] = prod[index + 0] + prod[index + 2] / 2.0;
		result.bbox[3] = prod[index + 1] + prod[index + 3] / 2.0;
		result.conf = c;
		result.class_id = cls;
		// Save the result
		m[cls].push_back(result);
	}
	// Per-class non-maximum suppression
	for (auto it = m.begin(); it != m.end(); it++)
	{
		auto& dets = it->second;
		std::sort(dets.begin(), dets.end(), compare_conf);
		for (size_t i = 0; i < dets.size(); ++i)
		{
			auto& item = dets[i];
			res.push_back(item);
			for (size_t j = i + 1; j < dets.size(); ++j)
			{
				// Remove boxes whose IOU with the kept box exceeds the threshold
				if (IOU(item.bbox, dets[j].bbox) > nms_thresh)
				{
					dets.erase(dets.begin() + j);
					--j;
				}
			}
		}
	}
}
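The Result struct and the compare_conf comparator used above are defined elsewhere in the project; a minimal definition consistent with how they are used here might look like this (the field names follow the usage above):

// Hypothetical definitions matching the usage in nms() and detect()
struct Result
{
	float bbox[4];   // left, top, right, bottom
	float conf;      // objectness * best class score
	float class_id;  // predicted class index
};

// Sort detections by descending confidence
static bool compare_conf(const Result& a, const Result& b)
{
	return a.conf > b.conf;
}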
DrawBox
Because the image was letterboxed before TensorRT inference, the box coordinates must first be mapped back to the original image scale before drawing.
void getRect(const cv::Mat& img, float boxs[], int new_w, int new_h)
{
	// Scale factors from the resized image back to the original
	float rate_h = (float)img.rows / new_h;
	float rate_w = (float)img.cols / new_w;
	// Subtract the padding offset, then undo the resize
	boxs[0] = (boxs[0] - (INPUT_W - new_w) / 2.0) * rate_w;
	boxs[1] = (boxs[1] - (INPUT_H - new_h) / 2.0) * rate_h;
	boxs[2] = (boxs[2] - (INPUT_W - new_w) / 2.0) * rate_w;
	boxs[3] = (boxs[3] - (INPUT_H - new_h) / 2.0) * rate_h;
}
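Continuing the 1280×720 example: new_w = 640 and new_h = 360, so rate_w = rate_h = 2, and a detection at y = 300 in the padded 640×640 image maps back to (300 - 140) * 2 = 320 in the original image; the x coordinates are simply doubled, since no horizontal padding was added.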
Finally, draw the class label and box on the original image using the transformed coordinates:
void detect(cv::Mat& img, std::string modelPath)
{
	// Preprocessed image
	cv::Mat dst;
	// Engine input size
	cv::Size size(INPUT_W, INPUT_H);
	// Resized image size before padding
	int new_w = 0, new_h = 0;
	// Preprocess the image
	processImage(img, dst, size, new_h, new_w);
	int clsNum = 0, boxNum = 0;
	// Load the engine and run inference
	float* prod = inference(modelPath, dst, clsNum, boxNum);
	// Storage for the NMS results
	std::vector<Result> result_obj;
	// Non-maximum suppression
	nms(result_obj, prod, clsNum, boxNum, CONF_THRESH, NMS_THRESH);
	for (size_t i = 0; i < result_obj.size(); i++)
	{
		// Map the box coordinates back to the original image size
		getRect(img, result_obj[i].bbox, new_w, new_h);
		cv::Point p1(int(result_obj[i].bbox[0]), int(result_obj[i].bbox[1]));
		cv::Point p2(int(result_obj[i].bbox[2]), int(result_obj[i].bbox[3]));
		cv::rectangle(img, p1, p2, cv::Scalar(0, 0, 255), 2);
		std::string label = classes[(int)result_obj[i].class_id] + " " + cv::format("%.2f", result_obj[i].conf);
		cv::putText(img, label, cv::Point(int(result_obj[i].bbox[0]), int(result_obj[i].bbox[1])), cv::FONT_HERSHEY_SIMPLEX, 0.75, cv::Scalar(0, 255, 0), 1);
	}
	// Free the output buffer allocated in inference()
	free(prod);
	cv::imshow("detect", img);
	cv::waitKey(0);
}
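A possible driver putting it all together; the constants INPUT_W, INPUT_H, CONF_THRESH, NMS_THRESH and the classes label table are assumed to be defined at the top of the file, and the file names here are placeholders:

// Hypothetical entry point; adjust the model and image paths as needed
int main()
{
	cv::Mat img = cv::imread("test.jpg");
	if (img.empty())
	{
		std::cerr << "failed to load image" << std::endl;
		return -1;
	}
	detect(img, "yolov5s.onnx");
	return 0;
}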
This article walked through deploying a YOLOv5 model with TensorRT: serializing and deserializing the engine, preprocessing the input image, running inference, and post-processing the results with NMS and box drawing.