This is still the fully connected example built at model compile time, now extended with the inference stage:
// TensorRT includes
#include <NvInfer.h>
#include <NvInferRuntime.h>
// CUDA includes
#include <cuda_runtime.h>
// system includes
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <fstream>
#include <vector>
using namespace std;
// Code from the previous section
class TRTLogger : public nvinfer1::ILogger{
public:
virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override{
if(severity <= Severity::kINFO){
printf("%d: %s\n", severity, msg);
}
}
} logger;
nvinfer1::Weights make_weights(float* ptr, int n){
nvinfer1::Weights w;
w.count = n;
w.type = nvinfer1::DataType::kFLOAT;
w.values = ptr;
return w;
}
bool build_model(){
TRTLogger logger;
// Basic components needed for building
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
nvinfer1::INetworkDefinition* network = builder->createNetworkV2(1);
// Build the network
/*
Network definition:
image
|
linear (fully connected) input = 3, output = 2, bias = True w=[[1.0, 2.0, 0.5], [0.1, 0.2, 0.5]], b=[0.3, 0.8]
|
sigmoid
|
prob
*/
const int num_input = 3;
const int num_output = 2;
float layer1_weight_values[] = {1.0, 2.0, 0.5, 0.1, 0.2, 0.5};
float layer1_bias_values[] = {0.3, 0.8};
nvinfer1::ITensor* input = network->addInput("image", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4(1, num_input, 1, 1));
nvinfer1::Weights layer1_weight = make_weights(layer1_weight_values, 6);
nvinfer1::Weights layer1_bias = make_weights(layer1_bias_values, 2);
auto layer1 = network->addFullyConnected(*input, num_output, layer1_weight, layer1_bias);
auto prob = network->addActivation(*layer1->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
// Mark prob as the network output
network->markOutput(*prob->getOutput(0));
printf("Workspace Size = %.2f MB\n", (1 << 28) / 1024.0f / 1024.0f);
config->setMaxWorkspaceSize(1 << 28);
builder->setMaxBatchSize(1);
nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
if(engine == nullptr){
printf("Build engine failed.\n");
return false;
}
// Serialize the engine and save it to a file
nvinfer1::IHostMemory* model_data = engine->serialize();
FILE* f = fopen("engine.trtmodel", "wb");
fwrite(model_data->data(), 1, model_data->size(), f);
fclose(f);
// Destroy objects in the reverse order of their creation
model_data->destroy();
engine->destroy();
network->destroy();
config->destroy();
builder->destroy();
printf("Done.\n");
return true;
}
vector<unsigned char> load_file(const string& file){
ifstream in(file, ios::in | ios::binary);  // open in binary mode
if (!in.is_open())
return {};
in.seekg(0, ios::end);  // seek to the end of the file
size_t length = in.tellg();  // get the file length
std::vector<uint8_t> data;
if (length > 0){
in.seekg(0, ios::beg);  // seek back to the beginning
data.resize(length);
in.read((char*)&data[0], length);  // read the raw bytes into the vector
}
in.close();
return data;
}
void inference(){
// ------------------------------ 1. Load the serialized model ----------------------------
TRTLogger logger;
// engine_data is a vector<unsigned char> holding the serialized model
auto engine_data = load_file("engine.trtmodel");
// Before running inference, create a runtime instance. Like the builder, the runtime needs a logger:
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
// With the model bytes read into engine_data, deserialize them to obtain the engine
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
if(engine == nullptr){
printf("Deserialize cuda engine failed.\n");
runtime->destroy();
return;
}
// execution context
nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
cudaStream_t stream = nullptr;
// Create a CUDA stream so that this batch's inference is issued independently (asynchronously)
cudaStreamCreate(&stream);
/*
Network definition:
image
|
linear (fully connected) input = 3, output = 2, bias = True w=[[1.0, 2.0, 0.5], [0.1, 0.2, 0.5]], b=[0.3, 0.8]
|
sigmoid
|
prob
*/
// ------------------------------ 2. Prepare the input data and copy it to the GPU ----------------------------
float input_data_host[] = {1, 2, 3};
float* input_data_device = nullptr;
float output_data_host[2];
float* output_data_device = nullptr;
//device memory
cudaMalloc(&input_data_device, sizeof(input_data_host));
cudaMalloc(&output_data_device, sizeof(output_data_host));
cudaMemcpyAsync(input_data_device, input_data_host, sizeof(input_data_host), cudaMemcpyHostToDevice, stream);
// Use an array of pointers to pass the GPU addresses of the input and output.
float* bindings[] = {input_data_device, output_data_device};  // input and output device pointers
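// Note: the order of pointers in bindings[] follows the engine's binding indices;
// for this network the single input ("image") is binding 0 and the marked output is binding 1,
// which engine->getBindingIndex("image") can be used to confirm.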
// ------------------------------ 3. Run inference and copy the result back to the CPU ----------------------------
bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);  // enqueueV2 launches inference on the stream
// copy the result from the GPU back to the CPU
cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
printf("output_data_host = %f, %f\n", output_data_host[0], output_data_host[1]);
// ------------------------------ 4. Release resources ----------------------------
// what was allocated first is released last (reverse order)
printf("Clean memory\n");
cudaFree(input_data_device);
cudaFree(output_data_device);
cudaStreamDestroy(stream);
execution_context->destroy();
engine->destroy();
runtime->destroy();
// ------------------------------ 5. Verify the result with a manual computation ----------------------------
const int num_input = 3;
const int num_output = 2;
float layer1_weight_values[] = {1.0, 2.0, 0.5, 0.1, 0.2, 0.5};
float layer1_bias_values[] = {0.3, 0.8};
printf("手动验证计算结果:\n");
for(int io = 0; io < num_output; ++io){
float output_host = layer1_bias_values[io];
for(int ii = 0; ii < num_input; ++ii){
output_host += layer1_weight_values[io * num_input + ii] * input_data_host[ii];
}
// sigmoid
float prob = 1 / (1 + exp(-output_host));
printf("output_prob[%d] = %f\n", io, prob);
}
}
int main(){
if(!build_model()){
return -1;
}
inference();
return 0;
}
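One thing the listing above leaves out is error checking on the CUDA runtime calls (cudaMalloc, cudaMemcpyAsync and cudaStreamCreate all return a cudaError_t). A minimal sketch of how such checks could be wrapped is shown below; the checkRuntime macro name is only an illustration, not part of TensorRT or CUDA:
// Minimal sketch of a CUDA error-checking helper (hypothetical name: checkRuntime).
// Uses <cuda_runtime.h> and <stdio.h>, which are already included above.
#define checkRuntime(call)                                              \
    do{                                                                 \
        cudaError_t err_ = (call);                                      \
        if(err_ != cudaSuccess){                                        \
            printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                   cudaGetErrorString(err_));                           \
        }                                                               \
    }while(0)
// Example usage inside inference():
// checkRuntime(cudaMalloc(&input_data_device, sizeof(input_data_host)));
// checkRuntime(cudaMemcpyAsync(input_data_device, input_data_host, sizeof(input_data_host), cudaMemcpyHostToDevice, stream));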
Overall steps:
1. Open the serialized trt engine file in binary mode and read its bytes into a vector
auto engine_data = load_file("engine.trtmodel");
where load_file is:
vector<unsigned char> load_file(const string& file){
ifstream in(file, ios::in | ios::binary);
if (!in.is_open())
return {};
in.seekg(0, ios::end);
size_t length = in.tellg();
std::vector<uint8_t> data;
if (length > 0){
in.seekg(0, ios::beg);
data.resize(length);
in.read((char*)&data[0], length);
}
in.close();
return data;
}
2. Create the runtime instance
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
3. Deserialize engine_data to obtain the engine:
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
4. Create the execution context:
nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
5. Create a CUDA stream so the batch runs asynchronously
cudaStream_t stream = nullptr;
cudaStreamCreate(&stream);
6. Prepare the input data and copy it to the device
float input_data_host[] = {1,2,3};
float* input_data_device = nullptr;
float output_data_host[2];
float* output_data_device = nullptr;
cudaMalloc(&input_data_device,sizeof(input_data_host));
cudaMalloc(&output_data_device,sizeof(output_data_host));
// copy the input from host to device
cudaMemcpyAsync(input_data_device,input_data_host,sizeof(input_data_host),cudaMemcpyHostToDevice,stream);
// use an array of pointers to pass the GPU addresses of the input and output
float* bindings[] = {input_data_device, output_data_device};
7. Run inference and copy the result back to the host
bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);
cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
printf("output_data_host = %f, %f\n", output_data_host[0], output_data_host[1]);