This is still the fully connected example built at model compile time, now extended with the inference stage:
// TensorRT includes
#include <NvInfer.h>
#include <NvInferRuntime.h>
// CUDA includes
#include <cuda_runtime.h>
// system includes
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <fstream>
#include <vector>
using namespace std;
// Code from the previous section
class TRTLogger : public nvinfer1::ILogger{
public:
virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override{
if(severity <= Severity::kINFO){
printf("%d: %s\n", severity, msg);
}
}
} logger;
nvinfer1::Weights make_weights(float* ptr, int n){
nvinfer1::Weights w;
w.count = n;
w.type = nvinfer1::DataType::kFLOAT;
w.values = ptr;
return w;
}
bool build_model(){
TRTLogger logger;
// Basic components needed for building
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
nvinfer1::INetworkDefinition* network = builder->createNetworkV2(1);
// Build the network
/*
Network definition:
image
|
linear (fully connected) input = 3, output = 2, bias = True w=[[1.0, 2.0, 0.5], [0.1, 0.2, 0.5]], b=[0.3, 0.8]
|
sigmoid
|
prob
*/
const int num_input = 3;
const int num_output = 2;
float layer1_weight_values[] = {1.0, 2.0, 0.5, 0.1, 0.2, 0.5};
float layer1_bias_values[] = {0.3, 0.8};
nvinfer1::ITensor* input = network->addInput("image", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4(1, num_input, 1, 1));
nvinfer1::Weights layer1_weight = make_weights(layer1_weight_values, 6);
nvinfer1::Weights layer1_bias = make_weights(layer1_bias_values, 2);
auto layer1 = network->addFullyConnected(*input, num_output, layer1_weight, layer1_bias);
auto prob = network->addActivation(*layer1->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
// Mark prob as the network output
network->markOutput(*prob->getOutput(0));
printf("Workspace Size = %.2f MB\n", (1 << 28) / 1024.0f / 1024.0f);
config->setMaxWorkspaceSize(1 << 28);
builder->setMaxBatchSize(1);
nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
if(engine == nullptr){
printf("Build engine failed.\n");
return false;
}
// Serialize the engine and save it to a file
nvinfer1::IHostMemory* model_data = engine->serialize();
FILE* f = fopen("engine.trtmodel", "wb");
fwrite(model_data->data(), 1, model_data->size(), f);
fclose(f);
// Destroy objects in the reverse order of their creation
model_data->destroy();
engine->destroy();
network->destroy();
config->destroy();
builder->destroy();
printf("Done.\n");
return true;
}
vector<unsigned char> load_file(const string& file){
ifstream in(file, ios::in | ios::binary);  // open in binary mode
if (!in.is_open())
return {};
in.seekg(0, ios::end);  // seek to the end of the file
size_t length = in.tellg();  // get the file length
std::vector<uint8_t> data;
if (length > 0){
in.seekg(0, ios::beg);  // seek back to the beginning
data.resize(length);
in.read((char*)&data[0], length);  // read the raw bytes into the vector
}
in.close();
return data;
}
void inference(){
// ------------------------------ 1. Load the serialized model ----------------------------
TRTLogger logger;
// engine_data is a vector<unsigned char> holding the serialized model
auto engine_data = load_file("engine.trtmodel");
// Before running inference, create a runtime instance. Like the builder, the runtime needs a logger:
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
// With the model bytes read into engine_data, deserialize them to obtain the engine
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
if(engine == nullptr){
printf("Deserialize cuda engine failed.\n");
runtime->destroy();
return;
}
// execution context
nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
cudaStream_t stream = nullptr;
// Create a CUDA stream so that this batch's inference is issued independently (asynchronously)
cudaStreamCreate(&stream);
/*
Network definition:
image
|
linear (fully connected) input = 3, output = 2, bias = True w=[[1.0, 2.0, 0.5], [0.1, 0.2, 0.5]], b=[0.3, 0.8]
|
sigmoid
|
prob
*/
// ------------------------------ 2. Prepare the input data and copy it to the GPU ----------------------------
float input_data_host[] = {1, 2, 3};
float* input_data_device = nullptr;
float output_data_host[2];
float* output_data_device = nullptr;
//device memory
cudaMalloc(&input_data_device, sizeof(input_data_host));
cudaMalloc(&output_data_device, sizeof(output_data_host));
cudaMemcpyAsync(input_data_device, input_data_host, sizeof(input_data_host), cudaMemcpyHostToDevice, stream);
// Use an array of pointers to pass the GPU addresses of the input and output.
float* bindings[] = {input_data_device, output_data_device};  // input and output device pointers
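// Note: the order of pointers in bindings[] follows the engine's binding indices;
// for this network the single input ("image") is binding 0 and the marked output is binding 1,
// which engine->getBindingIndex("image") can be used to confirm.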
// ------------------------------ 3. Run inference and copy the result back to the CPU ----------------------------
bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);  // enqueueV2 launches inference on the stream
// copy the result from the GPU back to the CPU
cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
printf("output_data_host = %f, %f\n", output_data_host[0], output_data_host[1]);
// ------------------------------ 4. Release resources ----------------------------
// what was allocated first is released last (reverse order)
printf("Clean memory\n");
cudaFree(input_data_device);
cudaFree(output_data_device);
cudaStreamDestroy(stream);
execution_context->destroy();
engine->destroy();
runtime->destroy();
// ------------------------------ 5. Verify the result with a manual computation ----------------------------
const int num_input = 3;
const int num_output = 2;
float layer1_weight_values[] = {1.0, 2.0, 0.5, 0.1, 0.2, 0.5};
float layer1_bias_values[] = {0.3, 0.8};
printf("手动验证计算结果:\n");
for(int io = 0; io < num_output; ++io){
float output_host = layer1_bias_values[io];
for(int ii = 0; ii < num_input; ++ii){
output_host += layer1_weight_values[io * num_input + ii] * input_data_host[ii];
}
// sigmoid
float prob = 1 / (1 + exp(-output_host));
printf("output_prob[%d] = %f\n", io, prob);
}
}
int main(){
if(!build_model()){
return -1;
}
inference();
return 0;
}
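One thing the listing above leaves out is error checking on the CUDA runtime calls (cudaMalloc, cudaMemcpyAsync and cudaStreamCreate all return a cudaError_t). A minimal sketch of how such checks could be wrapped is shown below; the checkRuntime macro name is only an illustration, not part of TensorRT or CUDA:
// Minimal sketch of a CUDA error-checking helper (hypothetical name: checkRuntime).
// Uses <cuda_runtime.h> and <stdio.h>, which are already included above.
#define checkRuntime(call)                                              \
    do{                                                                 \
        cudaError_t err_ = (call);                                      \
        if(err_ != cudaSuccess){                                        \
            printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                   cudaGetErrorString(err_));                           \
        }                                                               \
    }while(0)
// Example usage inside inference():
// checkRuntime(cudaMalloc(&input_data_device, sizeof(input_data_host)));
// checkRuntime(cudaMemcpyAsync(input_data_device, input_data_host, sizeof(input_data_host), cudaMemcpyHostToDevice, stream));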
Overall steps:
1. Open the serialized trt engine file in binary mode and read its bytes into a vector
auto engine_data = load_file("engine.trtmodel");
where load_file is:
vector<unsigned char> load_file(const string& file){
ifstream in(file, ios::in | ios::binary);
if (!in.is_open())
return {};
in.seekg(0, ios::end);
size_t length = in.tellg();
std::vector<uint8_t> data;
if (length > 0){
in.seekg(0, ios::beg);
data.resize(length);
in.read((char*)&data[0], length);
}
in.close();
return data;
}
2. Create the runtime instance
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
3. Deserialize engine_data to obtain the engine:
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
4. Create the execution context:
nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
5. Create a CUDA stream so the batch runs asynchronously
cudaStream_t stream = nullptr;
cudaStreamCreate(&stream);
6. Prepare the input data and copy it to the device
float input_data_host[] = {1,2,3};
float* input_data_device = nullptr;
float output_data_host[2];
float* output_data_device = nullptr;
cudaMalloc(&input_data_device,sizeof(input_data_host));
cudaMalloc(&output_data_device,sizeof(output_data_host));
// copy the input from host to device
cudaMemcpyAsync(input_data_device,input_data_host,sizeof(input_data_host),cudaMemcpyHostToDevice,stream);
// use an array of pointers to pass the GPU addresses of the input and output
float* bindings[] = {input_data_device, output_data_device};
7. Run inference and copy the result back to the host
bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);
cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
printf("output_data_host = %f, %f\n", output_data_host[0], output_data_host[1]);