1: Conversion pipeline: .pth → .onnx → .engine. The .onnx → .engine step uses TensorRT (the .pth → .onnx export is typically done with torch.onnx.export); a build sketch follows this list.
2: Precision: float32 / float16 / int8, chosen at build time via builder flags (see the sketch after this list).
3: For explicit vs. implicit batch, see:
【TensorRT】execute_async VS execute_async_v2_context.execute_async_v2 (昌山小屋's blog, CSDN)
Developer Guide :: NVIDIA Deep Learning TensorRT Documentation
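For reference, here is a minimal sketch of the .onnx → .engine step with the TensorRT C++ API (TensorRT 7-era API, matching the inference code below). The file names and workspace size are placeholders, and the commented-out FP16/INT8 flags show where the precision choice from point 2 is made:

#include <fstream>
#include "NvInfer.h"
#include "NvOnnxParser.h"
using namespace nvinfer1;

// Build model.engine from model.onnx (file names are placeholders).
void buildEngine(ILogger& gLogger) {
    IBuilder* builder = createInferBuilder(gLogger);
    // Explicit-batch network definition (required by the ONNX parser).
    const auto flags = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(flags);
    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, gLogger);
    parser->parseFromFile("model.onnx", static_cast<int>(ILogger::Severity::kWARNING));

    IBuilderConfig* config = builder->createBuilderConfig();
    config->setMaxWorkspaceSize(1 << 28);       // 256 MiB of builder scratch space
    // Precision: float32 is the default; uncomment one of these for float16 or int8.
    // config->setFlag(BuilderFlag::kFP16);
    // config->setFlag(BuilderFlag::kINT8);     // int8 also requires a calibrator
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

    // Serialize to disk; this is the .engine file the inference code loads.
    IHostMemory* serialized = engine->serialize();
    std::ofstream out("model.engine", std::ios::binary);
    out.write(reinterpret_cast<const char*>(serialized->data()), serialized->size());
}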
The basic inference steps are as follows:
1: Initialization: deserialize the engine, read each output binding's dimensions, and allocate matching host memory.
FODLINE::FODLINE(const char* modelPath) {
    int DEVICE = wether_GPU();   // select the CUDA device (helper defined elsewhere)
    size_t size{0};
    static Logger gLogger;
    char* trtModelStream{nullptr};
    const std::string engine_file_path{modelPath};
    cout << "path: " << engine_file_path << endl;

    // Read the serialized .engine file into memory.
    std::ifstream file(engine_file_path, std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }

    // Deserialize the engine and create an execution context.
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;
    // Note: the engine must outlive the context; keep engine and runtime as
    // class members if you intend to release them cleanly later.

    // Binding 0 is the input; bindings 1..N-1 are the output branches.
    // Compute each output's element count and allocate a host buffer for it.
    int num = engine->getNbBindings();
    for (int i = 1; i < num; i++) {
        auto out_dims = engine->getBindingDimensions(i);
        auto output_size1 = 1;
        for (int j = 0; j < out_dims.nbDims; j++) {
            output_size1 *= out_dims.d[j];
        }
        output_eng.push_back(new float[output_size1]);
        output_size.push_back(output_size1);
    }
}
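The constructor heap-allocates one host buffer per output branch. A matching destructor is not shown in the original, so the following is only an assumed sketch of what it would release:

FODLINE::~FODLINE() {
    // Free the host output buffers allocated in the constructor.
    for (float* buf : output_eng)
        delete[] buf;
    // If engine and runtime were kept as members, release them here too:
    // destroy the context first, then the engine, then the runtime.
}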
2: Pass the image in and copy the data to the GPU for computation. The 5 here is the total number of bindings, input and output branches combined; usually the first binding is the input and the rest are output branches, but this depends on how the ONNX model was defined and converted to TensorRT. You can also write a for loop over the bindings instead of listing the four outputs one by one, as in the sketch after the code. Once the outputs have been copied back, everything that follows is post-processing; adjust it to fit your code. If it is still unclear, see ULTRL-DETECT: lane detection in C++ - Gitee.com.
void FODLINE::doInference(IExecutionContext& context, float* input, vector<float*> output,
                          vector<int> output_size, cv::Size input_shape) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to the engine.
    // The engine requires exactly IEngine::getNbBindings() buffers.
    assert(engine.getNbBindings() == 5);
    void* buffers[5];

    // To bind the buffers, we need the names of the input and output tensors.
    // Indices are guaranteed to be less than IEngine::getNbBindings().
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    assert(engine.getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT);
    const int outputIndex  = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    const int outputIndex1 = engine.getBindingIndex(OUTPUT_BLOB_NAME1);
    const int outputIndex2 = engine.getBindingIndex(OUTPUT_BLOB_NAME2);
    const int outputIndex3 = engine.getBindingIndex(OUTPUT_BLOB_NAME3);
    assert(engine.getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT);
    int mBatchSize = engine.getMaxBatchSize();   // max batch size (unused here)

    // Create GPU buffers on the device.
    CHECK(cudaMalloc(&buffers[inputIndex], 3 * input_shape.height * input_shape.width * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex],  output_size[0] * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex1], output_size[1] * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex2], output_size[2] * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex3], output_size[3] * sizeof(float)));

    // Create a stream.
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input batch to the device, infer asynchronously, and DMA the outputs back to the host.
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input,
                          3 * input_shape.height * input_shape.width * sizeof(float),
                          cudaMemcpyHostToDevice, stream));
    // Implicit-batch API; for an explicit-batch engine use context.enqueueV2(buffers, stream, nullptr).
    context.enqueue(1, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output[0], buffers[outputIndex],  output_size[0] * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output[1], buffers[outputIndex1], output_size[1] * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output[2], buffers[outputIndex2], output_size[2] * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output[3], buffers[outputIndex3], output_size[3] * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // Release the stream and device buffers.
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
    CHECK(cudaFree(buffers[outputIndex1]));
    CHECK(cudaFree(buffers[outputIndex2]));
    CHECK(cudaFree(buffers[outputIndex3]));
}
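As mentioned above, the four explicit output branches can be replaced by a loop over the bindings, so the same function works for any number of outputs. A minimal sketch of that variant, under the same assumption as the constructor (binding 0 is the input, everything else is an output, in engine order):

void FODLINE::doInference(IExecutionContext& context, float* input, vector<float*> output,
                          vector<int> output_size, cv::Size input_shape) {
    const ICudaEngine& engine = context.getEngine();
    const int num = engine.getNbBindings();
    std::vector<void*> buffers(num);

    // Binding 0 is the input; allocate its device buffer.
    const size_t inputBytes = 3 * input_shape.height * input_shape.width * sizeof(float);
    CHECK(cudaMalloc(&buffers[0], inputBytes));
    // Bindings 1..num-1 are outputs; allocate one device buffer each.
    for (int i = 1; i < num; i++)
        CHECK(cudaMalloc(&buffers[i], output_size[i - 1] * sizeof(float)));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    CHECK(cudaMemcpyAsync(buffers[0], input, inputBytes, cudaMemcpyHostToDevice, stream));
    context.enqueue(1, buffers.data(), stream, nullptr);
    for (int i = 1; i < num; i++)
        CHECK(cudaMemcpyAsync(output[i - 1], buffers[i], output_size[i - 1] * sizeof(float),
                              cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    for (int i = 0; i < num; i++)
        CHECK(cudaFree(buffers[i]));
}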
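And a hypothetical driver, to show where pre- and post-processing sit around doInference. The 800x288 input size, the [0, 1] normalization, and the direct member access (context, output_eng, output_size would need to be public or wrapped by an accessor) are all placeholders; adapt them to your model:

FODLINE model("lane.engine");                       // placeholder engine path
cv::Mat img = cv::imread("test.jpg");
cv::Mat resized;
cv::resize(img, resized, cv::Size(800, 288));       // placeholder network input size

// Pre-processing: HWC uint8 BGR -> CHW float, scaled to [0, 1]
// (insert your own mean/std normalization here if the model expects it).
std::vector<float> blob(3 * 288 * 800);
for (int c = 0; c < 3; c++)
    for (int h = 0; h < 288; h++)
        for (int w = 0; w < 800; w++)
            blob[c * 288 * 800 + h * 800 + w] = resized.at<cv::Vec3b>(h, w)[c] / 255.0f;

// output_eng / output_size were filled in the constructor.
model.doInference(*model.context, blob.data(), model.output_eng, model.output_size, cv::Size(800, 288));
// model.output_eng[i] now holds the i-th output branch; post-process it per your model.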