TensorRT

1: Conversion pipeline: .pth → .onnx → .engine. The .pth → .onnx export is typically done with torch.onnx.export, and the .onnx → .engine step with TensorRT itself (the ONNX parser API or the trtexec tool); a build sketch follows the reference links below.

2: Supported inference precisions: float32, float16, and int8.

3: For the difference between explicit and implicit batch, refer to:

【TensorRT】execute_async VS execute_async_v2 (昌山小屋's blog, CSDN)

Developer Guide :: NVIDIA Deep Learning TensorRT Documentation
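
To make items 1-3 concrete, here is a minimal build sketch for the .onnx → .engine step (assuming TensorRT 8.x with the nvonnxparser headers available; buildEngine and its parameters are illustrative names, not part of the project code below). It creates the explicit-batch network that ONNX models require and shows where the FP16/INT8 precision flags go:

#include <fstream>
#include <memory>
#include "NvInfer.h"
#include "NvOnnxParser.h"

bool buildEngine(const char* onnxPath, const char* enginePath, nvinfer1::ILogger& logger)
{
    using namespace nvinfer1;
    auto builder = std::unique_ptr<IBuilder>(createInferBuilder(logger));
    // ONNX models must be parsed into an explicit-batch network (item 3 above).
    const auto flags = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = std::unique_ptr<INetworkDefinition>(builder->createNetworkV2(flags));
    auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, logger));
    if (!parser->parseFromFile(onnxPath, static_cast<int>(ILogger::Severity::kWARNING)))
        return false;

    auto config = std::unique_ptr<IBuilderConfig>(builder->createBuilderConfig());
    // float32 is the default; float16 and int8 are opt-in flags (item 2 above).
    if (builder->platformHasFastFp16())
        config->setFlag(BuilderFlag::kFP16);
    // int8 additionally requires a calibrator: config->setInt8Calibrator(...).

    auto serialized = std::unique_ptr<IHostMemory>(builder->buildSerializedNetwork(*network, *config));
    if (!serialized)
        return false;
    std::ofstream out(enginePath, std::ios::binary);
    out.write(static_cast<const char*>(serialized->data()), serialized->size());
    return true;
}

The same conversion can also be done from the command line, e.g. trtexec --onnx=model.onnx --saveEngine=model.engine --fp16.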

The basic inference steps are as follows:

1: Initialization: deserialize the model, query the input and output sizes, and allocate the corresponding host memory.

FODLINE::FODLINE(const char* modelPath)
{
    int DEVICE = wether_GPU();  // project helper that checks for a usable GPU
    static Logger gLogger;

    // Read the serialized .engine file into host memory.
    size_t size{0};
    char* trtModelStream{nullptr};
    const std::string engine_file_path{modelPath};
    cout << "path: " << engine_file_path << endl;
    std::ifstream file(engine_file_path, std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }

    // Deserialize the engine and create an execution context. Note that the
    // runtime and engine must outlive the context; storing them as class
    // members (and freeing them in the destructor) is recommended.
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Binding 0 is assumed to be the input; allocate a host buffer for every
    // output binding, sized from its dimensions.
    int num = engine->getNbBindings();
    for (int i = 1; i < num; i++) {
        auto out_dims = engine->getBindingDimensions(i);
        int output_size1 = 1;
        for (int j = 0; j < out_dims.nbDims; j++) {
            output_size1 *= out_dims.d[j];
        }
        output_eng.push_back(new float[output_size1]);
        output_size.push_back(output_size1);
    }
}
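
Note that the constructor's new float[] buffers, runtime, and engine are never freed. A matching cleanup sketch (an assumption about the class layout: it presumes runtime and engine are promoted to class members instead of the locals above, and TensorRT 8.x, where deleting TensorRT objects directly is valid):

FODLINE::~FODLINE()
{
    for (float* buf : output_eng)   // host output buffers from the constructor
        delete[] buf;
    delete context;                 // destroy the context before its engine
    delete engine;                  // assumed promoted to class members
    delete runtime;
}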

2: Pass the image in and copy the data to CUDA for inference. The 5 here is the total number of bindings (input plus output branches): usually the first binding is the input and the rest are output branches, but this depends on the ONNX model and how it was converted to TensorRT. You can also write a for loop over the bindings instead of listing the four outputs one by one as below; a sketch of that variant follows the function. Once the outputs are copied back, what remains is post-processing, which you can adapt to your own code. If anything is still unclear, see the Gitee repo ULTRL-DETECT (lane detection in C++): ULTRL-DETECT: c++实现车道线检测 - Gitee.com


void FODLINE::doInference(IExecutionContext& context, float* input, vector<float*> output, vector<int> output_size, cv::Size input_shape) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to the input and output device buffers to pass to the engine.
    // The engine requires exactly getNbBindings() buffers: here 1 input + 4 outputs.
    assert(engine.getNbBindings() == 5);
    void* buffers[5];

    // To bind the buffers we need the tensor names; the returned indices are
    // guaranteed to be less than getNbBindings().
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    assert(engine.getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT);
    const int outputIndex  = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    const int outputIndex1 = engine.getBindingIndex(OUTPUT_BLOB_NAME1);
    const int outputIndex2 = engine.getBindingIndex(OUTPUT_BLOB_NAME2);
    const int outputIndex3 = engine.getBindingIndex(OUTPUT_BLOB_NAME3);
    assert(engine.getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT);

    // Create GPU buffers on the device.
    CHECK(cudaMalloc(&buffers[inputIndex], 3 * input_shape.height * input_shape.width * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex],  output_size[0] * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex1], output_size[1] * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex2], output_size[2] * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex3], output_size[3] * sizeof(float)));

    // Create a CUDA stream.
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the device, infer asynchronously with batch size 1
    // (implicit-batch API; explicit-batch engines use enqueueV2 instead),
    // and DMA the outputs back to the host.
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, 3 * input_shape.height * input_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(1, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output[0], buffers[outputIndex],  output_size[0] * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output[1], buffers[outputIndex1], output_size[1] * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output[2], buffers[outputIndex2], output_size[2] * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output[3], buffers[outputIndex3], output_size[3] * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // Release the stream and device buffers.
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
    CHECK(cudaFree(buffers[outputIndex1]));
    CHECK(cudaFree(buffers[outputIndex2]));
    CHECK(cudaFree(buffers[outputIndex3]));
}
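
As noted above, the four named outputs can be replaced with a loop over the bindings, so the function no longer hard-codes five buffers. A sketch of that variant (doInferenceLooped is a hypothetical name; it assumes the input is binding 0, matching how the constructor fills output_size):

void FODLINE::doInferenceLooped(IExecutionContext& context, float* input,
                                const vector<float*>& output,
                                const vector<int>& output_size, cv::Size input_shape) {
    const ICudaEngine& engine = context.getEngine();
    const int nbBindings = engine.getNbBindings();
    vector<void*> buffers(nbBindings);

    // Binding 0 is assumed to be the input; every other binding i is an output
    // whose host buffer and element count sit at index (i - 1), as in the ctor.
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const size_t inputBytes = 3 * input_shape.height * input_shape.width * sizeof(float);
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    for (int i = 0; i < nbBindings; i++) {
        if (i == inputIndex) continue;
        CHECK(cudaMalloc(&buffers[i], output_size[i - 1] * sizeof(float)));
    }

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    // Implicit-batch call as above; explicit-batch engines would use
    // context.enqueueV2(buffers.data(), stream, nullptr) instead.
    context.enqueue(1, buffers.data(), stream, nullptr);
    for (int i = 0; i < nbBindings; i++) {
        if (i == inputIndex) continue;
        CHECK(cudaMemcpyAsync(output[i - 1], buffers[i], output_size[i - 1] * sizeof(float),
                              cudaMemcpyDeviceToHost, stream));
    }
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    for (void* buf : buffers)
        CHECK(cudaFree(buf));
}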
