[ONNX Runtime Inference] C++ Deployment of an LSTM Time-Series Prediction Model (with Multiple Inputs), Built with CMake

This article explains in detail how to build an LSTM model with PyTorch in a Linux development environment, export it to ONNX format, and then deploy and run it with C++ and CMake, covering the model implementation, the CMakeLists configuration, and the handling of models with multiple inputs.


  This article focuses on how to deploy and run LSTM inference with C++ and CMake in a Linux development environment. Rather than configuring everything directly inside an editor such as VS Code, the author prefers CMake: the build rules are written explicitly in CMake's own syntax, and path problems are avoided because the required paths are passed to CMake on the command line.

  Readers familiar with LSTMs know that the LSTM is a classic deep network for sequence data. A typical LSTM takes a three-dimensional input of shape (batch_size, seq_length, dims), i.e. batch size, sequence length and feature dimension, and in the setup used here produces a two-dimensional output of shape (batch_size, out_dim).
  Following this idea, the author trained a two-layer LSTM network with PyTorch; in the code below the input has shape (1, 5, 20) and the output has shape (1, 10) (the model code follows reference [1]). The focus of this article is running C++ inference on the ONNX model exported from this LSTM, built with CMake. The concrete steps are described below.
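The following minimal sketch (not part of the original article) illustrates these shapes with the same sizes used in the code below: an `nn.LSTM` with `batch_first=True` maps (batch_size, seq_len, input_size) to one hidden state per time step, and the last step is kept as the sequence summary.

import torch
import torch.nn as nn

# Shape check only: (batch_size, seq_len, input_size) -> (batch_size, hidden_size)
lstm = nn.LSTM(input_size=20, hidden_size=20, num_layers=2, batch_first=True)
x = torch.randn(1, 5, 20)      # (batch_size, seq_len, input_size)
out, (h, c) = lstm(x)          # out: (1, 5, 20), one hidden state per time step
last = out[:, -1, :]           # (1, 20), only the last time step is kept
print(out.shape, last.shape)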



1. LSTM Model Implementation in PyTorch

import torch
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, input_size, output_size, out_channels, num_layers, device):
        super(LSTM, self).__init__()
        self.device = device
        self.input_size = input_size
        self.hidden_size = input_size
        self.num_layers = num_layers
        self.output_size = output_size

        self.lstm = nn.LSTM(input_size=self.input_size,
                            hidden_size=self.hidden_size,
                            num_layers=self.num_layers,
                            batch_first=True)

        self.out_channels = out_channels

        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x):
        # x: (batch_size, seq_len, input_size) -> out: (batch_size, seq_len, hidden_size)
        out, _ = self.lstm(x)

        if self.out_channels == 1:
            # keep only the last time step and project it to the output size
            out = out[:, -1, :]
            out = self.fc(out)
            return out

        return out


batch_size = 1
input_size = 20
seq_len = 5
output_size = 10
num_layers = 2
out_channels = 1

model = LSTM(input_size, output_size, out_channels, num_layers, "cpu")
model.eval() 

input_names = ["input"]     # input interface name set at export; the C++ code below uses it
output_names = ["output"]   # output interface name set at export; the C++ code below uses it

x = torch.randn((batch_size, seq_len, input_size))
print(x.shape)
y = model(x)
print(y.shape)

torch.onnx.export(model, x, 'lstm.onnx', verbose=True, input_names=input_names, output_names=output_names,
  dynamic_axes={'input':[0], 'output':[0]} )

import onnx
model = onnx.load("lstm.onnx")
print("load model done.")
onnx.checker.check_model(model)
print(onnx.helper.printable_graph(model.graph))
print("check model done.")

Running the script above generates an lstm.onnx file in the current directory. Remember to set the input and output interface names according to your own needs; the C++ deployment code below relies on them.
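If you are unsure which names ended up in the exported file, a quick check (a minimal sketch using the onnxruntime Python API, not part of the original article) is to print the input/output names and shapes recorded in lstm.onnx; these must match the strings hardcoded in the C++ code:

import onnxruntime as ort

# Print the input/output names and shapes stored in the exported model.
sess = ort.InferenceSession("lstm.onnx")
for inp in sess.get_inputs():
    print("input :", inp.name, inp.shape, inp.type)
for out in sess.get_outputs():
    print("output:", out.name, out.shape, out.type)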

2. Testing the ONNX Model in Python

This section runs the ONNX model from Python so that its output can be compared against the C++ inference result later. The code is as follows:

import onnx
import onnxruntime as ort
import numpy as np


batch_size = 1
input_size = 20
seq_len = 5
input_data = []
for i in range(batch_size):
    data1 = []
    for j in range(seq_len):
        data2 = []
        for k in range(input_size):
            data2.append(1.0 * (k + 1) * j / 20)
        data1.append(data2)
    input_data.append(data1)
print(input_data)

ort_session = ort.InferenceSession('lstm.onnx')
outputs = ort_session.run(None, {'input': np.array(input_data).astype(np.float32)})
print('onnx result:', outputs[0])
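To confirm that the exported graph matches the PyTorch model numerically, the two outputs can be compared. This is only a sketch: it assumes it runs in the same Python session in which the PyTorch `model` from Section 1 was created (the weights are random and never saved to disk), and before that variable is overwritten by `onnx.load`.

import numpy as np
import torch

# Run the original PyTorch model on the same input and compare with ONNX Runtime.
x = torch.tensor(np.array(input_data, dtype=np.float32))
with torch.no_grad():
    torch_out = model(x).numpy()   # `model` is the PyTorch LSTM instance from Section 1

print("max abs diff:", np.abs(torch_out - outputs[0]).max())
print("allclose   :", np.allclose(torch_out, outputs[0], atol=1e-5))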

3. C++ Inference Code

  A minimal C++ program for inference and deployment (lstm.cpp) is shown below:

#include <iostream>
//#include <cuda_provider_factory.h>
#include <onnxruntime_cxx_api.h>
using namespace std;
using namespace Ort;

const int batch_size = 1;
const int input_size = 20;
const int seq_len = 5;
const int output_size = 10;


std::vector<float> testOnnxLSTM(std::vector<std::vector<std::vector<float>>>& inputs) 
{
    // Switch to VERBOSE logging to see in the console whether the CPU or GPU is used
    //Ort::Env env(ORT_LOGGING_LEVEL_VERBOSE, "test");
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "Default");
    Ort::SessionOptions session_options;

    session_options.SetIntraOpNumThreads(1); // number of intra-op threads; raise it to speed up on multi-core CPUs
    // The second argument is the GPU device_id (0); leave this line commented out to run on the CPU
    //OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    auto memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);

    const char* model_path = "../lstm.onnx";
    std::cout << model_path << std::endl;

    Ort::Session session(env, model_path, session_options);

    const char* input_names[] = {"input"};    // must match the input name chosen in Section 1
    const char* output_names[] = {"output"};  // must match the output name chosen in Section 1

    std::array<float, batch_size * seq_len * input_size> input_matrix;
    std::array<float, batch_size * output_size> output_matrix;

    std::array<int64_t, 3> input_shape{batch_size, seq_len, input_size};
    std::array<int64_t, 2> output_shape{batch_size, output_size};

    for (int i = 0; i < batch_size; i++)
        for (int j = 0; j < seq_len; j++)
            for (int k = 0; k < input_size; k++)
                input_matrix[i * seq_len * input_size + j * input_size + k] = inputs[i][j][k];

    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_matrix.data(), input_matrix.size(), input_shape.data(), input_shape.size());

    try
    {
        Ort::Value output_tensor = Ort::Value::CreateTensor<float>(memory_info, output_matrix.data(), output_matrix.size(), output_shape.data(), output_shape.size());
        session.Run(Ort::RunOptions{ nullptr }, input_names, &input_tensor, 1, output_names, &output_tensor, 1); 
    }
    catch (const std::exception& e)
    {
        std::cout << e.what() << std::endl;
    }

    std::cout << "get data from LSTM onnx: \n";
    std::vector<float> ret;
    for (int i = 0; i < output_size; i++) {
        ret.emplace_back(output_matrix[i]);
        std::cout << ret[i] << "\t";
    }
    std::cout << "\n";

    return ret;
}


int main(int argc, char const *argv[])
{
    std::vector<std::vector<std::vector<float>>> data;
    for (int i = 0; i < batch_size; i++) {
       std::vector<std::vector<float>> t1;
       for (int j = 0; j < seq_len; j++) {
           std::vector<float> t2;
           for (int k = 0; k < input_size; k++) {
               t2.push_back(1.0 * (k + 1) * j / 20);
           }
           t1.push_back(t2);
           t2.clear();
       }
       data.push_back(t1);
       t1.clear();
    }
    std::cout << data.size() << " " << data[0].size() << " " << data[0][0].size() << std::endl;

    for (auto& i : data) {
        for (auto& j : i) {
            for (auto& k : j) {
               std::cout << k << "\t";
            }
            std::cout << "\n";
        }
        std::cout << "\n";
    }
    auto ret = testOnnxLSTM(data);
    return 0;
}

4. Writing the CMakeLists File

The contents of CMakeLists.txt are as follows:

cmake_minimum_required(VERSION 3.16)
project(lstm)

set(ONNXRUNTIME_DIR "" CACHE PATH "Path to built ONNX Runtime directory.")
message(STATUS "ONNXRUNTIME_DIR: ${ONNXRUNTIME_DIR}")

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")

add_executable(lstm lstm.cpp)

target_include_directories(lstm PRIVATE "${ONNXRUNTIME_DIR}/include")
target_compile_features(lstm PRIVATE cxx_std_11)
target_link_libraries(lstm "${ONNXRUNTIME_DIR}/lib/libonnxruntime.so")

5. Building with CMake

  To build the C++ code with CMake, first change into your project folder and run the following commands:

mkdir build
cd build
cmake .. -DONNXRUNTIME_DIR=/path/to/your/onnxruntime-linux-x64-1.9.0 -DCMAKE_BUILD_TYPE=Release
cmake --build . -j4

After the build finishes, run the executable from inside build/ (for example ./lstm), since lstm.cpp opens the model through the relative path ../lstm.onnx.

6. Multi-Input LSTM Model (C++ Deployment)

  If a model has multiple inputs, the input tensors have to be prepared and passed to the session together. This section uses a model with two sequence inputs as an example; the C++ code (lstm_multi.cpp) follows the export sketch below.
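The C++ code in this section loads a model file cmp.onnx with two inputs named input_1 and input_2; the export script for that model is not included in the original article. As a hypothetical sketch, the following PyTorch snippet shows one way such a two-input model could be defined and exported with names and shapes matching the constants used in lstm_multi.cpp (the TwoInputLSTM class and its internals are assumptions, not the author's actual model):

import torch
import torch.nn as nn

class TwoInputLSTM(nn.Module):
    """Hypothetical two-input model matching the shapes assumed by lstm_multi.cpp:
    input_1: (1, 10, 1), input_2: (1, 20, 4), output: (1, 20, 1)."""
    def __init__(self):
        super().__init__()
        self.lstm1 = nn.LSTM(input_size=1, hidden_size=8, num_layers=1, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=4, hidden_size=8, num_layers=1, batch_first=True)
        self.fc = nn.Linear(16, 1)

    def forward(self, x1, x2):
        _, (h1, _) = self.lstm1(x1)                            # h1: (num_layers, batch, 8)
        out2, _ = self.lstm2(x2)                               # out2: (batch, 20, 8)
        ctx = h1[-1].unsqueeze(1).repeat(1, out2.size(1), 1)   # broadcast over the 20 output steps
        return self.fc(torch.cat([out2, ctx], dim=-1))         # (batch, 20, 1)

model = TwoInputLSTM().eval()
x1 = torch.randn(1, 10, 1)
x2 = torch.randn(1, 20, 4)
torch.onnx.export(model, (x1, x2), "cmp.onnx",
                  input_names=["input_1", "input_2"],
                  output_names=["output"],
                  dynamic_axes={"input_1": [0], "input_2": [0], "output": [0]})

With a cmp.onnx in place, the C++ side looks like this: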

#include <iostream>
//#include <cuda_provider_factory.h>
#include <onnxruntime_cxx_api.h>
using namespace std;
using namespace Ort;

const int batch_size = 1;
const int input1_size = 1;   // feature dimension of the first input
const int seq1_len = 10;     // sequence length of the first input
const int input2_size = 4;   // feature dimension of the second input
const int seq2_len = 20;     // sequence length of the second input
const int out_seq_len = 20;  // output sequence length
const int output_size = 1;   // output feature dimension


std::vector<float> testOnnxLSTM(std::vector<std::vector<std::vector<float>>>& input1, std::vector<std::vector<std::vector<float>>>& input2) 
{
    // Switch to VERBOSE logging to see in the console whether the CPU or GPU is used
    //Ort::Env env(ORT_LOGGING_LEVEL_VERBOSE, "test");
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "Default");
    Ort::SessionOptions session_options;

    session_options.SetIntraOpNumThreads(1); // number of intra-op threads; raise it to speed up on multi-core CPUs
    // The second argument is the GPU device_id (0); leave this line commented out to run on the CPU
    //OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    auto memory1_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
    auto memory2_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);

    const char* model_path = "../cmp.onnx";
    std::cout << model_path << std::endl;

    Ort::Session session(env, model_path, session_options);

    const char* input_names[] = {"input_1", "input_2"};   // must match the input names defined when exporting the model
    const char* output_names[] = {"output"};              // must match the output name defined when exporting the model

    std::array<float, batch_size * seq1_len * input1_size> input1_matrix;
    std::array<float, batch_size * seq2_len * input2_size> input2_matrix;
    std::array<float, batch_size * out_seq_len * output_size> output_matrix;

    std::array<int64_t, 3> input1_shape{batch_size, seq1_len, input1_size};
    std::array<int64_t, 3> input2_shape{batch_size, seq2_len, input2_size};
    std::array<int64_t, 3> output_shape{batch_size, out_seq_len, output_size};

    for (int i = 0; i < batch_size; i++)
        for (int j = 0; j < seq1_len; j++)
            for (int k = 0; k < input1_size; k++)
                input1_matrix[i * seq1_len * input1_size + j * input1_size + k] = input1[i][j][k];

    for (int i = 0; i < batch_size; i++)
        for (int j = 0; j < seq2_len; j++)
            for (int k = 0; k < input2_size; k++)
                input2_matrix[i * seq2_len * input2_size + j * input2_size + k] = input2[i][j][k];

    Ort::Value input1_tensor = Ort::Value::CreateTensor<float>(memory1_info, input1_matrix.data(), input1_matrix.size(), input1_shape.data(), input1_shape.size());
    Ort::Value input2_tensor = Ort::Value::CreateTensor<float>(memory2_info, input2_matrix.data(), input2_matrix.size(), input2_shape.data(), input2_shape.size());
    std::vector<Ort::Value> inputs;
    inputs.push_back(std::move(input1_tensor));
    inputs.push_back(std::move(input2_tensor));

    try
    {
        Ort::Value output_tensor = Ort::Value::CreateTensor<float>(memory1_info, output_matrix.data(), output_matrix.size(), output_shape.data(), output_shape.size());
        session.Run(Ort::RunOptions{nullptr}, input_names, inputs.data(), inputs.size(), output_names, &output_tensor, 1); 
    }
    catch (const std::exception& e)
    {
        std::cout << e.what() << std::endl;
    }

    std::cout << "get data from LSTM onnx: \n";
    std::vector<float> ret;
    for (int i = 0; i < out_seq_len; i++) {
        ret.emplace_back(output_matrix[i]);
        std::cout << ret[i] << "\t";
    }
    std::cout << "\n";

    return ret;
}


int main(int argc, char const *argv[])
{
    std::vector<std::vector<float>> t1;
    std::vector<float> t2;
    // fill input1 with deterministic test data
    std::vector<std::vector<std::vector<float>>> input1;
    for (int i = 0; i < batch_size; i++) {

       for (int j = 0; j < seq1_len; j++) {

           for (int k = 0; k < input1_size; k++) {
               t2.push_back(1.0 * (k + 1) * j / 20);
           }
           t1.push_back(t2);
           t2.clear();
       }
       input1.push_back(t1);
       t1.clear();
    }
    std::cout << "dim0: " << input1.size() << " ,dim1: " << input1[0].size() << " ,dim2: " << input1[0][0].size() << std::endl;
    // print input1
    for (auto& i : input1) {
        for (auto& j : i) {
            for (auto& k : j) {
               std::cout << k << "\t";
            }
            std::cout << "\n";
        }
        std::cout << "\n";
    }
    t1.clear();
    t2.clear();
    // fill input2 with deterministic test data
    std::vector<std::vector<std::vector<float>>> input2;
    for (int i = 0; i < batch_size; i++) {
       for (int j = 0; j < seq2_len; j++) {
           for (int k = 0; k < input2_size; k++) {
               t2.push_back(1.0 * (k + 1) * j / 20);
           }
           t1.push_back(t2);
           t2.clear();
       }
       input2.push_back(t1);
       t1.clear();
    }
    std::cout << "dim0: " << input2.size() << " ,dim1: " << input2[0].size() << " ,dim2: " << input2[0][0].size() << std::endl;
    // print input2
    for (auto& i : input2) {
        for (auto& j : i) {
            for (auto& k : j) {
               std::cout << k << "\t";
            }
            std::cout << "\n";
        }
        std::cout << "\n";
    }
    // inference
    auto ret = testOnnxLSTM(input1, input2);
    return 0;
}
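For a cross-check of the C++ output, the two-input model can also be run from Python, mirroring Section 2. This sketch assumes the hypothetical cmp.onnx from the export sketch above and fills the inputs with the same formula used in main() of lstm_multi.cpp:

import numpy as np
import onnxruntime as ort

# Build the same test data as main() in lstm_multi.cpp: value = (k + 1) * j / 20.
x1 = np.array([[[(k + 1) * j / 20 for k in range(1)] for j in range(10)]], dtype=np.float32)
x2 = np.array([[[(k + 1) * j / 20 for k in range(4)] for j in range(20)]], dtype=np.float32)

sess = ort.InferenceSession("cmp.onnx")
outputs = sess.run(None, {"input_1": x1, "input_2": x2})
print("onnx result:", outputs[0].reshape(-1))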



7. References

[1] https://blog.youkuaiyun.com/wydxry/article/details/132909712
[2] https://github.com/microsoft/onnxruntime
