This article describes how to deploy and run LSTM inference with C++ and CMake in a Linux development environment. The reason for not configuring everything directly inside an IDE such as VS Code is that, in the author's view, CMake is the better choice: the build logic can be written out explicitly, and path-related problems are avoided because the relevant paths are passed straight to CMake.
Readers familiar with LSTM know that it is a classic deep network for sequence processing. In the usual setup, the input has three dimensions: batch size, sequence length and feature dimension (batch_size, seq_length, dims); the output used here has two dimensions: batch size and output dimension (batch_size, out_dim).
Based on this, the author trained a two-layer LSTM with the PyTorch deep learning framework. Matching the code below, the input tensor has shape (1, 5, 20) and the output tensor has shape (1, 10); the code is based on the reference links. The focus of this article is running C++ inference on the ONNX model exported from the LSTM and building the code with CMake. The concrete steps are described below.
1. PyTorch Implementation of the LSTM Model
import torch
import torch.nn as nn


class LSTM(nn.Module):
    def __init__(self, input_size, output_size, out_channels, num_layers, device):
        super(LSTM, self).__init__()
        self.device = device
        self.input_size = input_size
        self.hidden_size = input_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.lstm = nn.LSTM(input_size=self.input_size,
                            hidden_size=self.hidden_size,
                            num_layers=self.num_layers,
                            batch_first=True)
        self.out_channels = out_channels
        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        if self.out_channels == 1:
            out = out[:, -1, :]   # keep only the last time step
            out = self.fc(out)
            return out
        return out


batch_size = 1
input_size = 20
seq_len = 5
output_size = 10
num_layers = 2
out_channels = 1

model = LSTM(input_size, output_size, out_channels, num_layers, "cpu")
model.eval()

input_names = ["input"]    # name of the input node
output_names = ["output"]  # name of the output node

x = torch.randn((batch_size, seq_len, input_size))
print(x.shape)
y = model(x)
print(y.shape)

torch.onnx.export(model, x, 'lstm.onnx', verbose=True, input_names=input_names, output_names=output_names,
                  dynamic_axes={'input': [0], 'output': [0]})

import onnx

model = onnx.load("lstm.onnx")
print("load model done.")
onnx.checker.check_model(model)
print(onnx.helper.printable_graph(model.graph))
print("check model done.")
Running this script produces an lstm.onnx file locally. Remember to set the input and output node names according to your own needs; the C++ deployment code below relies on them.
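Before writing the C++ side, it can help to confirm the exported node names and shapes directly from the ONNX file. The sketch below is a minimal example of doing this with the ONNX Runtime C++ API; it is not part of the original code, and GetInputNameAllocated/GetOutputNameAllocated require ONNX Runtime 1.13 or later (older releases such as 1.9 expose GetInputName/GetOutputName with an allocator instead).

// inspect_io.cpp -- minimal sketch for listing a model's input/output names and shapes.
// Assumes ONNX Runtime >= 1.13 (GetInputNameAllocated / GetOutputNameAllocated).
#include <iostream>
#include <onnxruntime_cxx_api.h>

int main()
{
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "inspect");
    Ort::SessionOptions opts;
    Ort::Session session(env, "lstm.onnx", opts);
    Ort::AllocatorWithDefaultOptions alloc;

    for (size_t i = 0; i < session.GetInputCount(); ++i) {
        auto name = session.GetInputNameAllocated(i, alloc);
        auto shape = session.GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape();
        std::cout << "input  " << i << ": " << name.get() << " [";
        for (auto d : shape) std::cout << d << " ";   // -1 marks a dynamic axis (e.g. the batch)
        std::cout << "]\n";
    }
    for (size_t i = 0; i < session.GetOutputCount(); ++i) {
        auto name = session.GetOutputNameAllocated(i, alloc);
        auto shape = session.GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape();
        std::cout << "output " << i << ": " << name.get() << " [";
        for (auto d : shape) std::cout << d << " ";
        std::cout << "]\n";
    }
    return 0;
}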
2. Testing the ONNX Model in Python
This section runs inference on the ONNX model in Python, so that the result can later be compared with the C++ inference result. The code is as follows:
import onnx
import onnxruntime as ort
import numpy as np

batch_size = 1
input_size = 20
seq_len = 5

input_data = []
for i in range(batch_size):
    data1 = []
    for j in range(seq_len):
        data2 = []
        for k in range(input_size):
            data2.append(1.0 * (k + 1) * j / 20)
        data1.append(data2)
    input_data.append(data1)
print(input_data)

ort_session = ort.InferenceSession('lstm.onnx')
outputs = ort_session.run(None, {'input': np.array(input_data).astype(np.float32)})
print('onnx result:', outputs[0])
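When comparing these numbers with the C++ output of the next section, exact equality should not be expected; small floating-point differences are normal. As a sketch (not part of the original code), a tolerance-based comparison helper on the C++ side could look like this, where the reference values are assumed to be copied from the Python run:

#include <cmath>
#include <cstddef>
#include <vector>

// Returns true if two result vectors agree element-wise within an absolute tolerance.
bool approxEqual(const std::vector<float>& a, const std::vector<float>& b, float tol = 1e-5f)
{
    if (a.size() != b.size()) return false;
    for (std::size_t i = 0; i < a.size(); ++i)
        if (std::fabs(a[i] - b[i]) > tol) return false;
    return true;
}

// Usage sketch: fill `python_result` with the values printed by the Python script and
// pass the vector returned by testOnnxLSTM() (Section 3) as the second argument.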
3. C++ Inference Code
A simple C++ implementation for inference and deployment (lstm.cpp) is shown below:
#include <iostream>
//#include <cuda_provider_factory.h>
#include <onnxruntime_cxx_api.h>

using namespace std;
using namespace Ort;

const int batch_size = 1;
const int input_size = 20;
const int seq_len = 5;
const int output_size = 10;

std::vector<float> testOnnxLSTM(std::vector<std::vector<std::vector<float>>>& inputs)
{
    // Set the level to ORT_LOGGING_LEVEL_VERBOSE to see in the console whether the CPU or the GPU is used
    //Ort::Env env(ORT_LOGGING_LEVEL_VERBOSE, "test");
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "Default");

    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(1);  // number of intra-op threads; increase it to speed up execution
    // The second argument is the GPU device_id (0); keep this line commented out to run on the CPU
    //OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);

    auto memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);

    const char* model_path = "../lstm.onnx";
    std::cout << model_path << std::endl;
    Ort::Session session(env, model_path, session_options);

    const char* input_names[] = {"input"};    // must match the input node name set in Section 1
    const char* output_names[] = {"output"};  // must match the output node name set in Section 1

    std::array<float, batch_size * seq_len * input_size> input_matrix;
    std::array<float, batch_size * output_size> output_matrix;

    std::array<int64_t, 3> input_shape{batch_size, seq_len, input_size};
    std::array<int64_t, 2> output_shape{batch_size, output_size};

    for (int i = 0; i < batch_size; i++)
        for (int j = 0; j < seq_len; j++)
            for (int k = 0; k < input_size; k++)
                input_matrix[i * seq_len * input_size + j * input_size + k] = inputs[i][j][k];

    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_matrix.data(), input_matrix.size(), input_shape.data(), input_shape.size());

    try
    {
        Ort::Value output_tensor = Ort::Value::CreateTensor<float>(memory_info, output_matrix.data(), output_matrix.size(), output_shape.data(), output_shape.size());
        session.Run(Ort::RunOptions{ nullptr }, input_names, &input_tensor, 1, output_names, &output_tensor, 1);
    }
    catch (const std::exception& e)
    {
        std::cout << e.what() << std::endl;
    }

    std::cout << "get data from LSTM onnx: \n";
    std::vector<float> ret;
    for (int i = 0; i < output_size; i++) {
        ret.emplace_back(output_matrix[i]);
        std::cout << ret[i] << "\t";
    }
    std::cout << "\n";

    return ret;
}

int main(int argc, char const *argv[])
{
    std::vector<std::vector<std::vector<float>>> data;
    for (int i = 0; i < batch_size; i++) {
        std::vector<std::vector<float>> t1;
        for (int j = 0; j < seq_len; j++) {
            std::vector<float> t2;
            for (int k = 0; k < input_size; k++) {
                t2.push_back(1.0 * (k + 1) * j / 20);
            }
            t1.push_back(t2);
            t2.clear();
        }
        data.push_back(t1);
        t1.clear();
    }

    std::cout << data.size() << " " << data[0].size() << " " << data[0][0].size() << std::endl;

    for (auto& i : data) {
        for (auto& j : i) {
            for (auto& k : j) {
                std::cout << k << "\t";
            }
            std::cout << "\n";
        }
        std::cout << "\n";
    }

    auto ret = testOnnxLSTM(data);

    return 0;
}
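The code above pre-allocates output_matrix, which requires knowing the output shape in advance. As an alternative (a sketch, not part of the original code), the Run() overload that returns the output tensors can be used; ONNX Runtime then allocates the outputs itself and reports their shape, which is convenient when an output dimension is dynamic. The snippet assumes session, input_names, output_names and input_tensor are set up exactly as above:

// Sketch: let ONNX Runtime allocate the output instead of binding output_matrix.
std::vector<Ort::Value> outputs = session.Run(Ort::RunOptions{nullptr},
                                              input_names, &input_tensor, 1,
                                              output_names, 1);

// Query the actual output shape reported by the runtime.
auto info = outputs[0].GetTensorTypeAndShapeInfo();
std::vector<int64_t> out_shape = info.GetShape();   // e.g. {1, 10} for this model
size_t out_count = info.GetElementCount();

// Copy the raw float data into a std::vector.
float* out_data = outputs[0].GetTensorMutableData<float>();
std::vector<float> ret(out_data, out_data + out_count);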
4. Writing the CMakeLists File
The contents of CMakeLists.txt are as follows:
cmake_minimum_required(VERSION 3.16)
project(lstm)

# Path to the extracted ONNX Runtime package, passed in with -DONNXRUNTIME_DIR=...
set(ONNXRUNTIME_DIR "" CACHE PATH "Path to built ONNX Runtime directory.")
message(STATUS "ONNXRUNTIME_DIR: ${ONNXRUNTIME_DIR}")

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")

add_executable(lstm lstm.cpp)
target_compile_features(lstm PRIVATE cxx_std_11)
target_include_directories(lstm PRIVATE "${ONNXRUNTIME_DIR}/include")
target_link_libraries(lstm "${ONNXRUNTIME_DIR}/lib/libonnxruntime.so")
5. Building with CMake
To build the C++ code with CMake, first change into your project directory and then run the following commands:
mkdir build
cd build
cmake .. -DONNXRUNTIME_DIR=/path/to/your/onnxruntime-linux-x64-1.9.0 -DCMAKE_BUILD_TYPE=Release
cmake --build . -j4
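After the build finishes, the lstm executable is placed in the build directory. Because lstm.cpp loads the model through the relative path ../lstm.onnx, run the binary from inside build (./lstm) with lstm.onnx sitting in the project root; if the loader cannot find libonnxruntime.so at runtime, adding the ONNX Runtime lib directory (the one containing libonnxruntime.so) to LD_LIBRARY_PATH is one way to resolve it.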
6. Multi-Input LSTM Model (C++ Deployment)
If a model has more than one input, the input data must be assembled accordingly. Taking two input sequences as an example, the sample code (lstm_multi.cpp) is as follows:
#include <iostream>
//#include <cuda_provider_factory.h>
#include <onnxruntime_cxx_api.h>

using namespace std;
using namespace Ort;

const int batch_size = 1;
const int input1_size = 1;   // feature dimension of the first input
const int seq1_len = 10;     // sequence length of the first input
const int input2_size = 4;   // feature dimension of the second input
const int seq2_len = 20;     // sequence length of the second input
const int out_seq_len = 20;  // sequence length of the output
const int output_size = 1;   // feature dimension of the output

std::vector<float> testOnnxLSTM(std::vector<std::vector<std::vector<float>>>& input1, std::vector<std::vector<std::vector<float>>>& input2)
{
    // Set the level to ORT_LOGGING_LEVEL_VERBOSE to see in the console whether the CPU or the GPU is used
    //Ort::Env env(ORT_LOGGING_LEVEL_VERBOSE, "test");
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "Default");

    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(1);  // number of intra-op threads; increase it to speed up execution
    // The second argument is the GPU device_id (0); keep this line commented out to run on the CPU
    //OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);

    auto memory1_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
    auto memory2_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);

    const char* model_path = "../cmp.onnx";
    std::cout << model_path << std::endl;
    Ort::Session session(env, model_path, session_options);

    const char* input_names[] = {"input_1", "input_2"};  // must match the model's input node names
    const char* output_names[] = {"output"};             // must match the model's output node name

    std::array<float, batch_size * seq1_len * input1_size> input1_matrix;
    std::array<float, batch_size * seq2_len * input2_size> input2_matrix;
    std::array<float, batch_size * out_seq_len * output_size> output_matrix;

    std::array<int64_t, 3> input1_shape{batch_size, seq1_len, input1_size};
    std::array<int64_t, 3> input2_shape{batch_size, seq2_len, input2_size};
    std::array<int64_t, 3> output_shape{batch_size, out_seq_len, output_size};

    for (int i = 0; i < batch_size; i++)
        for (int j = 0; j < seq1_len; j++)
            for (int k = 0; k < input1_size; k++)
                input1_matrix[i * seq1_len * input1_size + j * input1_size + k] = input1[i][j][k];

    for (int i = 0; i < batch_size; i++)
        for (int j = 0; j < seq2_len; j++)
            for (int k = 0; k < input2_size; k++)
                input2_matrix[i * seq2_len * input2_size + j * input2_size + k] = input2[i][j][k];

    Ort::Value input1_tensor = Ort::Value::CreateTensor<float>(memory1_info, input1_matrix.data(), input1_matrix.size(), input1_shape.data(), input1_shape.size());
    Ort::Value input2_tensor = Ort::Value::CreateTensor<float>(memory2_info, input2_matrix.data(), input2_matrix.size(), input2_shape.data(), input2_shape.size());

    std::vector<Ort::Value> inputs;
    inputs.push_back(std::move(input1_tensor));
    inputs.push_back(std::move(input2_tensor));

    try
    {
        Ort::Value output_tensor = Ort::Value::CreateTensor<float>(memory1_info, output_matrix.data(), output_matrix.size(), output_shape.data(), output_shape.size());
        session.Run(Ort::RunOptions{nullptr}, input_names, inputs.data(), inputs.size(), output_names, &output_tensor, 1);
    }
    catch (const std::exception& e)
    {
        std::cout << e.what() << std::endl;
    }

    std::cout << "get data from LSTM onnx: \n";
    std::vector<float> ret;
    for (int i = 0; i < out_seq_len; i++) {
        ret.emplace_back(output_matrix[i]);
        std::cout << ret[i] << "\t";
    }
    std::cout << "\n";

    return ret;
}

int main(int argc, char const *argv[])
{
    std::vector<std::vector<float>> t1;
    std::vector<float> t2;

    // generate test data for input1
    std::vector<std::vector<std::vector<float>>> input1;
    for (int i = 0; i < batch_size; i++) {
        for (int j = 0; j < seq1_len; j++) {
            for (int k = 0; k < input1_size; k++) {
                t2.push_back(1.0 * (k + 1) * j / 20);
            }
            t1.push_back(t2);
            t2.clear();
        }
        input1.push_back(t1);
        t1.clear();
    }

    std::cout << "dim0: " << input1.size() << " ,dim1: " << input1[0].size() << " ,dim2: " << input1[0][0].size() << std::endl;

    // print input1
    for (auto& i : input1) {
        for (auto& j : i) {
            for (auto& k : j) {
                std::cout << k << "\t";
            }
            std::cout << "\n";
        }
        std::cout << "\n";
    }

    t1.clear();
    t2.clear();

    // generate test data for input2
    std::vector<std::vector<std::vector<float>>> input2;
    for (int i = 0; i < batch_size; i++) {
        for (int j = 0; j < seq2_len; j++) {
            for (int k = 0; k < input2_size; k++) {
                t2.push_back(1.0 * (k + 1) * j / 20);
            }
            t1.push_back(t2);
            t2.clear();
        }
        input2.push_back(t1);
        t1.clear();
    }

    std::cout << "dim0: " << input2.size() << " ,dim1: " << input2[0].size() << " ,dim2: " << input2[0][0].size() << std::endl;

    // print input2
    for (auto& i : input2) {
        for (auto& j : i) {
            for (auto& k : j) {
                std::cout << k << "\t";
            }
            std::cout << "\n";
        }
        std::cout << "\n";
    }

    // inference
    auto ret = testOnnxLSTM(input1, input2);

    return 0;
}
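The commented-out OrtSessionOptionsAppendExecutionProvider_CUDA line above comes from the older C-style API. As a hedged sketch (not part of the original code, and assuming a GPU build of ONNX Runtime, roughly 1.8 or newer), the CUDA execution provider can also be enabled through the C++ API when building the session options:

// Sketch: enabling the CUDA execution provider through the C++ API.
// Falls back to the CPU if the provider cannot be created.
#include <iostream>
#include <onnxruntime_cxx_api.h>

Ort::SessionOptions makeSessionOptions(bool use_gpu)
{
    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(1);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);

    if (use_gpu) {
        OrtCUDAProviderOptions cuda_options{};  // default-initialized provider options
        cuda_options.device_id = 0;             // GPU device id
        try {
            session_options.AppendExecutionProvider_CUDA(cuda_options);
        } catch (const Ort::Exception& e) {
            std::cout << "CUDA provider unavailable, falling back to CPU: " << e.what() << std::endl;
        }
    }
    return session_options;
}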
7. References
[1] https://blog.youkuaiyun.com/wydxry/article/details/132909712
[2] https://github.com/microsoft/onnxruntime