从零开始 TensorRT（6）C++ 篇：原生 API 构建网络

最新推荐文章于 2025-06-11 13:00:00 发布

秋山丶雪绪

最新推荐文章于 2025-06-11 13:00:00 发布

阅读量1.2k

点赞数 14

分类专栏： TensorRT 文章标签： c++ 深度学习 TensorRT

本文链接：https://blog.csdn.net/weixin_43605641/article/details/136183188

版权

TensorRT 专栏收录该内容

8 篇文章

订阅专栏

前言

学习资料：
B站视频配套代码 cookbook

示例：TensorRT 原生 API 构建网络

参考源码：cookbook → 01-SimpleDemo → TensorRT8.5

直接运行官方源码时需要根据自身环境修改 xxx/cookbook/include/Makefile.inc 中的部分路径。

源码

对参考源码进行了一些简化和修改：

去除从文件读取引擎部分
网络只有一个输入张量和一个输出张量，内存和显存操作部分去除了循环操作
使用 CMakeLists 替换 Makefile
提取 cookbookHelper.cuh 中需要用到的部分，改写为 log.h

（1）文件结构

.
└── SimpleDemo
    ├── CMakeLists.txt
    ├── log.h
    └── main.cpp

（2）`log.h`

#include <iostream>
#include <cmath>
#include <iomanip>
#include <NvInfer.h>
#include <cuda_fp16.h>


// Wrap a CUDA runtime call so that a failure is reported together with the call site.
#define CHECK(call) check(call, __LINE__, __FILE__)

// Report a failed CUDA runtime call on std::cout.
//   e      - status code returned by the CUDA runtime call
//   iLine  - line number of the call site
//   szFile - file name of the call site
// Returns true when e is cudaSuccess, false otherwise (after printing the error name).
inline bool check(cudaError_t e, int iLine, const char *szFile)
{
    const bool succeeded = (e == cudaSuccess);
    if (!succeeded)
    {
        std::cout << "CUDA runtime API error " << cudaGetErrorName(e) << " at line " << iLine << " in file " << szFile << std::endl;
    }
    return succeeded;
}

class Logger : public nvinfer1::ILogger
{
public:
    Severity reportableSeverity;

    Logger(Severity severity = Severity::kINFO):
        reportableSeverity(severity) {}

    void log(Severity severity, const char *msg) noexcept override
    {
        if (severity > reportableSeverity)
        {
            return;
        }
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR:
            std::cerr << "INTERNAL_ERROR: ";
            break;
        case Severity::kERROR:
            std::cerr << "ERROR: ";
            break;
        case Severity::kWARNING:
            std::cerr << "WARNING: ";
            break;
        case Severity::kINFO:
            std::cerr << "INFO: ";
            break;
        default:
            std::cerr << "VERBOSE: ";
            break;
        }
        std::cerr << msg << std::endl;
    }
};


// get the size in byte of a TensorRT data type
// Return the size in bytes of one element of the given TensorRT data type.
// Types that are not listed explicitly are treated as 4 bytes wide.
__inline__ size_t dataTypeToSize(nvinfer1::DataType dataType)
{
    switch (dataType)
    {
    // one-byte types
    case nvinfer1::DataType::kINT8:
    case nvinfer1::DataType::kBOOL:
    case nvinfer1::DataType::kUINT8:
    case nvinfer1::DataType::kFP8:
        return 1;
    // two-byte types
    case nvinfer1::DataType::kHALF:
        return 2;
    // four-byte types, plus the fallback for anything else
    case nvinfer1::DataType::kFLOAT:
    case nvinfer1::DataType::kINT32:
    default:
        return 4;
    }
}


// print data in the array
// Recursively print the elements of a row-major array, one dimension per call.
//   pArray - pointer to the first element of the whole array
//   dim    - shape of the array
//   iDim   - index of the dimension handled by this call
//   iStart - flat offset of the first element of the current sub-array
// Every call finishes with a line break, so the deeper the dimension that ends,
// the more blank lines separate the printed rows.
template<typename T>
void printArrayRecursion(const T *pArray, nvinfer1::Dims32 dim, int iDim, int iStart)
{
    const int lastDim = dim.nbDims - 1;

    if (iDim == lastDim)
    {
        // Innermost dimension: print the row itself.
        for (int col = 0; col < dim.d[iDim]; ++col)
        {
            std::cout << std::fixed << std::setprecision(3) << std::setw(6) << double(pArray[iStart + col]) << " ";
        }
        std::cout << std::endl;
        return;
    }

    // Number of elements in one slice of the current dimension.
    int stride = 1;
    for (int axis = iDim + 1; axis < dim.nbDims; ++axis)
    {
        stride *= dim.d[axis];
    }

    for (int slice = 0; slice < dim.d[iDim]; ++slice)
    {
        printArrayRecursion<T>(pArray, dim, iDim + 1, iStart + slice * stride);
    }
    std::cout << std::endl;
}

// Print the shape of an array and, optionally, summary statistics and its contents.
//   pArray            - pointer to the elements; dereferenced directly by this function,
//                       each element is converted to double before use
//   dim               - shape; the element count is the product of dim.d[0 .. nbDims-1]
//   name              - label printed in front of the shape
//   bPrintInformation - when true, print absSum/mean/var/max/min/diff followed by the
//                       first and the last n elements
//   bPrintArray       - when true, print every element through printArrayRecursion
//   n                 - number of leading/trailing elements to print; clamped to
//                       [0, element count] so short arrays are never read out of bounds
// The statistics section is skipped when the element count is not positive (for example
// when a dimension is 0 or still -1), because there is no element to read.
template<typename T>
void printArrayInformation(const T *pArray, nvinfer1::Dims32 dim, std::string name = std::string(""), bool bPrintInformation = true, bool bPrintArray = false, int n = 10)
{
    // print shape information
    std::cout << std::endl;
    std::cout << name << ": (";
    for (int i = 0; i < dim.nbDims; ++i)
    {
        std::cout << dim.d[i] << ", ";
    }
    std::cout << ")" << std::endl;

    // number of elements, batch dimension included
    int nElement = 1;
    for (int i = 0; i < dim.nbDims; ++i)
    {
        nElement *= dim.d[i];
    }

    // print statistic information of the array
    if (bPrintInformation && nElement > 0)
    {
        const double first = double(pArray[0]);

        double sum      = first;
        double absSum   = std::fabs(first);
        double sum2     = first * first;
        double diff     = 0.0; // sum of absolute differences between neighbouring elements
        double maxValue = first;
        double minValue = first;
        double previous = first;
        for (int i = 1; i < nElement; ++i)
        {
            const double value = double(pArray[i]);
            sum += value;
            absSum += std::fabs(value);
            sum2 += value * value;
            maxValue = value > maxValue ? value : maxValue;
            minValue = value < minValue ? value : minValue;
            // std::fabs keeps the computation in double; an unqualified abs() may pick
            // the integer overload and truncate the difference.
            diff += std::fabs(value - previous);
            previous = value;
        }
        const double mean = sum / nElement;
        const double var  = sum2 / nElement - mean * mean;

        std::cout << "absSum=" << std::fixed << std::setprecision(4) << std::setw(7) << absSum << ",";
        std::cout << "mean=" << std::fixed << std::setprecision(4) << std::setw(7) << mean << ",";
        std::cout << "var=" << std::fixed << std::setprecision(4) << std::setw(7) << var << ",";
        std::cout << "max=" << std::fixed << std::setprecision(4) << std::setw(7) << maxValue << ",";
        std::cout << "min=" << std::fixed << std::setprecision(4) << std::setw(7) << minValue << ",";
        std::cout << "diff=" << std::fixed << std::setprecision(4) << std::setw(7) << diff << ",";
        std::cout << std::endl;

        // print first n element and last n element, never more than the array holds
        const int nShown = n < 0 ? 0 : (n < nElement ? n : nElement);
        for (int i = 0; i < nShown; ++i)
        {
            std::cout << std::fixed << std::setprecision(5) << std::setw(8) << double(pArray[i]) << ", ";
        }
        std::cout << std::endl;
        for (int i = nElement - nShown; i < nElement; ++i)
        {
            std::cout << std::fixed << std::setprecision(5) << std::setw(8) << double(pArray[i]) << ", ";
        }
        std::cout << std::endl;
    }

    // print the data of the array
    if (bPrintArray)
    {
        printArrayRecursion<T>(pArray, dim, 0, 0);
    }

    return;
}
// Explicit instantiations of printArrayInformation for the element types
// float, half, char, int and bool.
template void printArrayInformation(const float *, nvinfer1::Dims32, std::string, bool, bool, int);
template void printArrayInformation(const half *, nvinfer1::Dims32, std::string, bool, bool, int);
template void printArrayInformation(const char *, nvinfer1::Dims32, std::string, bool, bool, int);
template void printArrayInformation(const int *, nvinfer1::Dims32, std::string, bool, bool, int);
template void printArrayInformation(const bool *, nvinfer1::Dims32, std::string, bool, bool, int);

（3）`main.cpp`

#include "log.h"
#include <fstream>
#include <memory>
#include <string>
#include <vector>


// Path of the serialized engine (.plan) file written by run().
const std::string trtFile {"./model.plan"};
// Logger passed to the TensorRT builder and runtime; constructed with the kERROR threshold.
static Logger gLogger(nvinfer1::ILogger::Severity::kERROR);

void run() {
    nvinfer1::ICudaEngine *engine = nullptr;
    // 构建期
    nvinfer1::IBuilder             *builder = nvinfer1::createInferBuilder(gLogger);
    nvinfer1::INetworkDefinition   *network = builder->createNetworkV2(1U << int(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
    nvinfer1::IOptimizationProfile *profile = builder->createOptimizationProfile();
    nvinfer1::IBuilderConfig       *config  = builder->createBuilderConfig();
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1 << 30);

    nvinfer1::ITensor *inputTensor = network->addInput("inputT0", nvinfer1::DataType::kFLOAT, nvinfer1::Dims32 {3, {-1, -1, -1}});
    profile->setDimensions(inputTensor->getName(), nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims32 {3, {1, 1, 1}});
    profile->setDimensions(inputTensor->getName(), nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims32 {3, {3, 4, 5}});
    profile->setDimensions(inputTensor->getName(), nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims32 {3, {6, 8, 10}});
    config->addOptimizationProfile(profile);

    nvinfer1::IIdentityLayer *identityLayer = network->addIdentity(*inputTensor);
    network->markOutput(*identityLayer->getOutput(0));
    // 生成序列化网络
    nvinfer1::IHostMemory *engineString = builder->buildSerializedNetwork(*network, *config);
    if (engineString == nullptr || engineString->size() == 0) {
        std::cout << "Failed building serialized engine!" << std::endl;
        return;
    }
    std::cout << "Succeeded building serialized engine!" << std::endl;

    // 运行期
    nvinfer1::IRuntime *runtime {nvinfer1::createInferRuntime(gLogger)};
    engine = runtime->deserializeCudaEngine(engineString->data(), engineString->size());
    if (engine == nullptr) {
        std::cout << "Failed building engine!" << std::endl;
        return;
    }
    std::cout << "Succeeded building engine!" << std::endl;

    // 保存引擎 <fstream>
    std::ofstream engineFile(trtFile, std::ios::binary);
    if (!engineFile) {
        std::cout << "Failed opening file to write" << std::endl;
        return;
    }
    engineFile.write(static_cast<char *>(engineString->data()), engineString->size());
    if (engineFile.fail()) {
        std::cout << "Failed saving .plan file!" << std::endl;
        return;
    }
    std::cout << "Succeeded saving .plan file!" << std::endl;

    long unsigned int nIO = engine->getNbIOTensors();
    std::vector<std::string> vTensorName(nIO);
    for (int i = 0; i < nIO; ++i) {
        vTensorName[i] = std::string(engine->getIOTensorName(i));
    }
        
    // GPU进程
    nvinfer1::IExecutionContext *context = engine->createExecutionContext();
    // 输入形状
    context->setInputShape(vTensorName[0].c_str(), nvinfer1::Dims32 {3, {3, 4, 5}});

    std::vector<int> vTensorSize(nIO, 0);
    for (int i = 0; i < nIO; ++i) {
        nvinfer1::Dims32 dim = context->getTensorShape(vTensorName[i].c_str());
        int size = 1;
        for (int j = 0; j < dim.nbDims; ++j) {
            size *= dim.d[j];
        }
        vTensorSize[i] = size * dataTypeToSize(engine->getTensorDataType(vTensorName[i].c_str()));
    }

    void* inputHost = (void *)new char[vTensorSize[0]];
    void* outputHost = (void *)new char[vTensorSize[1]];
    void* inputDevice;
    void* outputDevice;
    CHECK(cudaMalloc(&inputDevice, vTensorSize[0]));
    CHECK(cudaMalloc(&outputDevice, vTensorSize[1]));

    // 输入数据
    float *pData = (float *)inputHost;
    for (int i = 0; i < vTensorSize[0] / dataTypeToSize(engine->getTensorDataType(vTensorName[0].c_str())); ++i) {
        pData[i] = float(i);
    }
    // H2D
    CHECK(cudaMemcpy(inputDevice, inputHost, vTensorSize[0], cudaMemcpyHostToDevice));
    // 设置地址
    context->setTensorAddress(vTensorName[0].c_str(), inputDevice);
    context->setTensorAddress(vTensorName[1].c_str(), outputDevice);
    // 推理
    context->enqueueV3(0);
    // D2H
    CHECK(cudaMemcpy(outputHost, outputDevice, vTensorSize[1], cudaMemcpyDeviceToHost));

    printArrayInformation((float *)inputHost, context->getTensorShape(vTensorName[0].c_str()), vTensorName[0], true, true);
    printArrayInformation((float *)outputHost, context->getTensorShape(vTensorName[1].c_str()), vTensorName[1], true, true);
    
    delete[] (char *)inputHost;
    delete[] (char *)outputHost;
    CHECK(cudaFree(inputDevice));
    CHECK(cudaFree(outputDevice));
    return;
}

// Select GPU 0 and run the demo.
// Returns 1 when the device cannot be selected (CHECK has already printed the CUDA
// error in that case), 0 otherwise.
int main() {
    if (!CHECK(cudaSetDevice(0))) {
        return 1;
    }
    run();
    return 0;
}

（4）`CMakeLists.txt`

cmake_minimum_required(VERSION 3.10)
project(SimpleDemo)

# Request C++11 through CMake's language-standard variables instead of pushing a raw
# compiler flag through add_definitions(), which is meant for preprocessor definitions.
# Extensions are switched off so the compiler still receives -std=c++11 (not gnu++11).
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Put the executable into <source dir>/bin.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin)

# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt ("xxx" is a placeholder: replace it with the directory that holds your TensorRT package)
include_directories(xxx/TensorRT-8.6.1.6/include)
link_directories(xxx/TensorRT-8.6.1.6/lib)

add_executable(demo main.cpp)
target_link_libraries(demo PRIVATE nvinfer cudart)

源码解析

本示例与 Python 示例从零开始 TensorRT（2）Python 篇：原生 API 构建网络对应，C++ 和 Python 中使用的 API 大同小异，流程也基本一致，因此不再对 main.cpp 中的 TensorRT 部分进行赘述。而对于 C++ 零基础而言，log.h 中代码的语法相对更难。

`Logger`

日志记录器 Logger 在构建 TensorRT 部分是必不可少的，并且不像在 Python 中简单的使用 logger = trt.Logger(trt.Logger.ERROR) 而是要手动写一个类。

"Logger 类继承自 nvinfer1::ILogger, 并且继承方式为 public"
"当不使用 public 时, 用 class 关键字定义的类默认为 private 继承"
"其区别是使用 public 后基类 nvinfer1::ILogger 的公有成员在 Logger 中仍为公有, 并且 Logger 对象可以隐式转换为 nvinfer1::ILogger 的引用或指针, 因此实例对象 gLogger 才能直接传给 createInferBuilder 等接口"
class Logger : public nvinfer1::ILogger
{
public:
    Severity reportableSeverity;
    
	"构造函数, 类每次实例化时运行, 类似Python中的__init__"
	"reportableSeverity(severity)是通过成员初始化列表来初始化成员, 例如: a(value_a), b(value_b)"
	"其效果类似直接在函数体内写赋值语句, 区别是成员变量在进入构造函数体之前就已完成初始化, 而不是先默认初始化、再在函数体内赋值"
	"当成员变量是类的对象时, 使用成员初始化列表会更高效"
	"如果使用赋值语句, 成员变量会先被默认构造, 再执行一次赋值, 会引入额外的构造和赋值开销"
    Logger(Severity severity = Severity::kINFO):
        reportableSeverity(severity) {}

	"先看父类 nvinfer1::ILogger 中对 log 函数的定义"
	"virtual void log(Severity severity, AsciiChar const* msg) noexcept = 0;"
	"virtual 代表虚函数, 可以在子类中重写函数体"
	"=0 代表纯虚函数, 必须在子类中重写函数体, 因此在c++中使用TensorRT需要自定义logger类"
	"noexcept 代表此函数不会抛出异常, 编译器可以据此做更多优化"
	"子类中 override 代表函数是被重写的, 增加可读性, 并且当基类中不存在签名匹配的虚函数时编译器会报错"
    // Write one message to std::cerr, prefixed with a label for its severity.
    // Messages whose severity compares greater than reportableSeverity are dropped.
    void log(Severity severity, const char *msg) noexcept override
    {
        if (severity > reportableSeverity)
        {
            return;
        }
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR:
            std::cerr << "INTERNAL_ERROR: ";
            break;
        case Severity::kERROR:
            std::cerr << "ERROR: ";
            break;
        case Severity::kWARNING:
            std::cerr << "WARNING: ";
            break;
        case Severity::kINFO:
            std::cerr << "INFO: ";
            break;
        default: // every severity without its own case is labelled VERBOSE
            std::cerr << "VERBOSE: ";
            break;
        }
        std::cerr << msg << std::endl;
    }
};

`CHECK`

"宏定义"
"#define 宏名(参数) 具体函数调用"
"__LINE__, __FILE__ 为预定义宏, 预处理时会被替换为当前的行号和文件名"
#define CHECK(call) check(call, __LINE__, __FILE__)

"inline 代表内联函数, 编译器会把函数体内的代码插入到函数调用的地方, 而不是像普通函数一样通过函数调用"
"1.节省函数调用的开销, 包括参数传递、栈帧的创建和销毁等"
"2.若函数体较大且在多处被调用, 会导致代码膨胀, 可执行文件变大"
"3.可能会增加编译的时间"
"4.编译器可能会忽略内联, 通常因为函数体过大、递归调用等"

"普通函数调用过程"
"1.通过寄存器或栈传递参数"
"2.为被调用的函数分配内存空间栈帧, 栈帧中包含局部变量、函数参数等必要信息"
"3.跳转到被调用函数的入口点"
"4.执行函数"
"5.通过寄存器或栈传递返回值"
"6.清理栈帧, 释放空间"
"7.跳转调用函数位置执行后续代码"
// Report a failed CUDA runtime call on std::cout.
//   e      - status code returned by the CUDA runtime call
//   iLine  - line number of the call site
//   szFile - file name of the call site
// Returns true when e is cudaSuccess, false otherwise (after printing the error name).
inline bool check(cudaError_t e, int iLine, const char *szFile)
{
    const bool succeeded = (e == cudaSuccess);
    if (!succeeded)
    {
        std::cout << "CUDA runtime API error " << cudaGetErrorName(e) << " at line " << iLine << " in file " << szFile << std::endl;
    }
    return succeeded;
}

"__inline__ 是编译器的扩展, 将函数作为内联函数处理, 不同的编译器可能对内联函数的处理不同"
"__inline__ 和 inline 的作用相同, 通常优先使用 inline, inline 是C++标准中的关键字, 更具可移植性和通用性"
"size_t 用于表示对象大小或索引的数据类型, 无符号整型, 大小可容纳任何对象大小(内存字节数)"
__inline__ size_t dataTypeToSize(nvinfer1::DataType dataType)

`printArray`

"template 为定义模板的关键字, 模板可以用来生成一族类、函数、变量等, 它们的代码逻辑相同但所用的类型不同"
template<typename T>
void printArrayRecursion(const T *pArray, nvinfer1::Dims32 dim, int iDim, int iStart) {}

template<typename T>
void printArrayInformation(const T *pArray, nvinfer1::Dims32 dim, std::string name = std::string(""), bool bPrintInformation = true, bool bPrintArray = false, int n = 10)
{
	"此处 T 可以由实参 pArray 的类型推导出来, 函数名后的模板参数<T>可以省略, 显式写出只是让所用的实例更明确"
	printArrayRecursion<T>(pArray, dim, 0, 0);
}
"显式实例化, 让编译器在此处就为指定类型生成函数实例, 而不必等到被调用时; 配合 extern template 声明可以减少其他编译单元中的重复实例化和编译时间"
"模板函数通常在被调用时隐式实例化, 编译器根据调用时的参数类型生成对应的函数实例, 同一类型在一个编译单元内只会实例化一次, 而不是每次调用都生成新的实例"
template void printArrayInformation(const float *, nvinfer1::Dims32, std::string, bool, bool, int);

"此时参数类型确定, 可以省略函数名后的模板参数<float>"
printArrayInformation((float *)inputHost, context->getTensorShape(vTensorName[0].c_str()), vTensorName[0], true, true);

关于 include

在 C++ 代码中，#include 用于将其他文件的内容包含到当前文件中。而在对示例进行简化时，有些难以确认 API 属于哪个导入的文件。例如在 Python 中，代码 os.path.join 就能直接知道此 API 属于 os，需要 import os。而在 C++ 中，例如 std::cout、std::fixed、std::setprecision 即使都归属于命名空间 std，需要导入的文件却有所不同。比如 std::cout 的声明在 iostream，std::setprecision 的声明在 iomanip，std::fixed 在标准中属于头文件 ios（在 libstdc++ 中具体位于 bits/ios_base.h），包含 iostream 时会一并引入。
通常来说 #include 的是头文件，头文件中一般只有声明，还需要包含实现的库文件。在 CMakeLists 中就使用了 include_directories 添加特定的头文件搜索路径，link_directories 添加特定的库文件搜索路径，target_link_libraries 添加了特定的库文件。
在示例中，nvinfer1::createInferBuilder 的声明在 NvInfer.h，cudaMalloc 的声明在 cuda_runtime_api.h，但 NvInfer.h 通过多层的 #include 包含了 cuda_runtime_api.h，而两者的实现分别在 nvinfer 和 cudart 中。
总而言之，感觉在 C++ 中似乎只能通过教程示例等来确定路径、文件、API 等的对应关系。原本示例中使用 using namespace nvinfer1; 省略命名空间后，直接阅读代码更加难以得知 API 的归属。