There are two ways to generate a TensorRT engine:
1. Parse an ONNX file into a network definition and build the engine from it. This is the usual route when every operator in the model is already supported by TensorRT.
2. Build the network directly with the TensorRT API (and plugins where needed). This is more flexible and is the way to go for custom operators.
The overall workflow:
1. Create a builder (it needs a logger).
2. Create a builder config from the builder.
3. Create the network definition.
4. Populate the network: either parse an ONNX file, or add layers/plugins through the API.
5. Build the engine.
6. Serialize the engine to a file. Options such as dynamic input shapes must be configured before the build step.
Notes:
1. Dynamic batch-size inference:
When using the ONNX parser, an optimization profile must be added to the config; when building with the API/plugins, the dynamic dimension is declared at addInput (and a profile is still required).
An engine built with a dynamic batch size must call setBindingDimensions before inference, otherwise execution fails at run time. See the sketch below.
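A minimal sketch of just these two pieces, assuming builder, config, engine and execution_context already exist and the input tensor is named "image" as in the full example further down:

// Build time: one optimization profile describing the allowed batch range.
// (With the layer API, also declare the dynamic dimension at addInput, e.g. Dims4(-1, 1, 3, 3).)
auto profile = builder->createOptimizationProfile();
profile->setDimensions("image", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1, 1, 3, 3));
profile->setDimensions("image", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4(4, 1, 3, 3));
profile->setDimensions("image", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4(4, 1, 3, 3));
config->addOptimizationProfile(profile);
// Run time: set the actual batch (within [kMIN, kMAX]) on the context before enqueue.
int input_index = engine->getBindingIndex("image");
execution_context->setBindingDimensions(input_index, nvinfer1::Dims4(2, 1, 3, 3));

The complete example, tensorrt_helloworld.cpp (built by the CMakeLists.txt further down):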
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <NvOnnxParser.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <math.h>
#include <assert.h>
using namespace std;
using namespace nvonnxparser;
using namespace nvinfer1;
// define logger
class TRTLogger : public nvinfer1::ILogger{
public:
virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override{
if(severity <= Severity::kVERBOSE){ // kVERBOSE is the least severe level, so every message is printed
printf("%d: %s\n", static_cast<int>(severity), msg);
}
}
};
nvinfer1::Weights make_weights(float* ptr, int n){
nvinfer1::Weights w;
w.count = n; // The number of weights in the array.
w.type = nvinfer1::DataType::kFLOAT;
w.values = ptr;
return w;
}
//#define USE_PLUGIN // build the network with the layer API instead of the ONNX parser
//#define INT8_INFER // build an INT8 engine instead of FP16
const int set_max_batch = 4;
const int inputsize_h = 3;
const int inputsize_w = 3;
const int input_channel = 1;
const int output_channel = 1;
int build_trt(){
TRTLogger logger;
nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(logger); // builder
nvinfer1::IBuilderConfig *config = builder->createBuilderConfig(); // builder config
nvinfer1::INetworkDefinition *network = builder->createNetworkV2(1); // flag 1 = explicit batch
#ifdef USE_PLUGIN
/* *********** Define the network here. *****************
1. Use the layer API to build the network: image -> conv(3*3 + bias) -> sigmoid -> output
***************************************************************/
// define conv weight
float layer1_weight_values[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
float layer1_bias_values[] = {0};
// network input: name, dtype, shape (batch is fixed here; use -1 in d[0] for a dynamic batch)
nvinfer1::ITensor *input = network->addInput("image", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims4(set_max_batch, input_channel, inputsize_h, inputsize_w));
nvinfer1::Weights layer1_conv_weight = make_weights(layer1_weight_values, 3 * 3);
nvinfer1::Weights layer1_bias_weight = make_weights(layer1_bias_values, 1);
// add conv layer
auto layer1 = network->addConvolutionNd(*input, 1, nvinfer1::DimsHW(3, 3), layer1_conv_weight, layer1_bias_weight);
assert(layer1 != nullptr);
layer1->setStrideNd(nvinfer1::DimsHW{1,1});
layer1->setPaddingNd(nvinfer1::DimsHW{1,1});
// add sigmoid layer, use last layer output as input
auto layer2 = network->addActivation(*(layer1->getOutput(0)), nvinfer1::ActivationType::kSIGMOID);
// mark output
network->markOutput(*(layer2->getOutput(0)));
#else
// onnxparser to network
nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);
if(!parser->parseFromFile("../demo.onnx", 1)){
printf("Failed to parser demo.onnx\n");
exit(1);
}
// Dynamic inputs require an optimization profile; if the model has several dynamic inputs, the profile must set kMIN/kOPT/kMAX dimensions for each of them.
auto input_tensor=network->getInput(0);
auto input_dims = input_tensor->getDimensions();
auto profile = builder->createOptimizationProfile();
input_dims.d[0] = 1;
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
input_dims.d[0] = set_max_batch;
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
config->addOptimizationProfile(profile);
#endif
#ifdef INT8_INFER
config->setFlag(BuilderFlag::kINT8);
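// Note: the kINT8 flag alone is not enough for a usable engine; TensorRT also needs an INT8 calibrator (config->setInt8Calibrator(...)) or explicit per-tensor dynamic ranges.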
#else
config->setFlag(BuilderFlag::kFP16);
#endif
config->setMaxWorkspaceSize(1<<28);
printf("Workspace Size = %.2f MB\n", (1 << 28) / 1024.0f / 1024.0f); // 256Mib
builder->setMaxBatchSize(set_max_batch);
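// Note: on TensorRT 8.x setMaxWorkspaceSize()/setMaxBatchSize() are deprecated; config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1 << 28) is the replacement, and the max batch size is ignored for explicit-batch networks.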
//generate engine
nvinfer1::ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
if(engine == nullptr){
printf("Build engine failed.\n");
network->destroy();
config->destroy();
builder->destroy();
return -1;
}
// serialize to file
nvinfer1::IHostMemory *model_data = engine->serialize();
FILE *f = fopen("../engine.trt", "wb");
if(f != nullptr){
fwrite(model_data->data(), 1, model_data->size(), f);
fclose(f);
}
// destroy in reverse order of creation
model_data->destroy();
engine->destroy();
network->destroy();
config->destroy();
builder->destroy();
printf("Done.\n");
return 0;
}
vector<unsigned char> load_file(const string& file){
ifstream in(file, ios::in | ios::binary);
if (!in.is_open())
return {};
in.seekg(0, ios::end);
size_t length = in.tellg();
std::vector<uint8_t> data;
if (length > 0){
in.seekg(0, ios::beg);
data.resize(length);
in.read((char*)&data[0], length);
//in.read((char*)data.data(), length);
}
in.close();
return data;
}
int infer_trt(const std::string &trt_file){
TRTLogger logger;
auto engine_data=load_file(trt_file);
nvinfer1::IRuntime *runtime = nvinfer1::createInferRuntime(logger);
nvinfer1::ICudaEngine *engine = runtime->deserializeCudaEngine(engine_data.data(),engine_data.size());
if(engine == nullptr){
printf("Deserialize cuda engine failed.\n");
runtime->destroy();
return -1;
}
nvinfer1::IExecutionContext *execution_context = engine->createExecutionContext();
// With a dynamic batch size the binding dimensions must be set before enqueue, otherwise execution fails at run time.
const int inputIndex = engine->getBindingIndex("image");
// The host buffers below hold 18 floats = 2 batches of 1x3x3, so run with batch = 2 (within the profile's [1, set_max_batch] range).
execution_context->setBindingDimensions(inputIndex, nvinfer1::Dims4(2, input_channel, inputsize_h, inputsize_w));
cudaStream_t stream = nullptr;
cudaStreamCreate(&stream);
// ---------- 2. data to GPU ----------------------------
float input_data_host[] = { // two batches of 1x3x3, all ones
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,  // batch 0
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; // batch 1
float* input_data_device = nullptr;
float output_data_host[18];
float* output_data_device = nullptr;
cudaMalloc(&input_data_device, sizeof(input_data_host));
cudaMalloc(&output_data_device, sizeof(output_data_host));
cudaMemcpyAsync(input_data_device, input_data_host, sizeof(input_data_host), cudaMemcpyHostToDevice, stream);
// bindings: array of device pointers ordered by binding index (input, output)
float* bindings[] = {input_data_device, output_data_device};
// infer
bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr); // asynchronous inference
if(!success){ printf("enqueueV2 failed\n"); }
cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host),cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream); // wait for the stream to finish
for(int i = 0; i< 18; i++){
std::cout << output_data_host[i] << " ";
}
std::cout << std::endl;
// 4. release resources
printf("Clean memory\n");
cudaStreamDestroy(stream);
cudaFree(input_data_device);
cudaFree(output_data_device);
execution_context->destroy();
engine->destroy();
runtime->destroy();
return 0;
}
int main(int argc, char ** argv){
if(build_trt()!=0){
return -1;
}
//infer_trt(argv[1]);
infer_trt("../engine.trt");
return 0;
}
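In infer_trt the host and device buffer sizes are hard-coded to 18 floats (2 x 1 x 3 x 3). When the batch is chosen at run time it is safer to derive the sizes from the context after setBindingDimensions; a small sketch (volume is a hypothetical helper, not part of TensorRT):

// Hypothetical helper: element count of a binding's dimensions.
size_t volume(const nvinfer1::Dims &d){
    size_t v = 1;
    for(int i = 0; i < d.nbDims; i++) v *= d.d[i];
    return v;
}
// After setBindingDimensions, query the concrete shape and allocate accordingly:
// nvinfer1::Dims in_dims = execution_context->getBindingDimensions(inputIndex);
// cudaMalloc(&input_data_device, volume(in_dims) * sizeof(float));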
CMakeLists.txt
cmake_minimum_required(VERSION 3.10) # any reasonably recent CMake works
project(trt_api)
add_definitions(-w)
find_package(CUDA REQUIRED)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Release)
#cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
include_directories(/home/a/TensorRT-8.5.1.7/include)
link_directories(/home/a/TensorRT-8.5.1.7/lib)
cuda_add_executable(helloapi tensorrt_helloworld.cpp)
target_link_libraries(helloapi nvinfer)
target_link_libraries(helloapi cudart)
target_link_libraries(helloapi nvonnxparser)
add_definitions(-O2)
Exporting demo.onnx:
import torch
import torch.nn as nn
import torch.onnx
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(1, 1, 3, padding=1)
        # self.myselu = MYSELU(3)  # custom-plugin variant
        self.myselu = nn.Sigmoid()
        self.conv.weight.data.fill_(1)
        self.conv.bias.data.fill_(0)

    def forward(self, x):
        x = self.conv(x)
        x = self.myselu(x)
        return x
model = Model().eval()
input = torch.tensor([
    # batch 0
    [
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
    ],
], dtype=torch.float32).view(1, 1, 3, 3)
print(input.shape)
output = model(input)
print(f"inference output = \n{output}")
torch.onnx.export(
    model,
    (input,),                # args passed to the model; must be a tuple, hence the parentheses
    "demo.onnx",             # output file path
    verbose=True,            # print detailed export info
    input_names=["image"],   # name the input/output nodes so they are easy to find later
    output_names=["output"],
    opset_version=11,        # which opset the operators are exported with (symbolic_opset11)
    # dynamic_axes marks dimensions as dynamic (exported as -1 in ONNX); usually only the
    # batch dimension is made dynamic and the others are kept fixed.
    dynamic_axes={
        "image": {0: "batch"},
        "output": {0: "batch"},
    },
    # For custom plugins the ONNX checker may need to be disabled:
    # enable_onnx_checker=False
    operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK
)
print("Done.!")