Good news: Apollo has released an Orin build: https://github.com/ApolloAuto/apollo/issues/15090
There is also support for the RTX 4090:
https://github.com/ApolloAuto/apollo/issues/14821
I recently started trying to build the perception module on Orin, and this post records the process.
For building the other modules, see: running Apollo on orin (arm64/aarch64) porting notes (Scott_D_'s blog on CSDN).
Orin is arm64 and comes with CUDA 11.4+, TensorRT 8 and cuDNN 8.2.4, but part of Apollo's perception code is written against the TensorRT 7 API, so some wrappers have to be written by hand to make the TensorRT 8 interfaces look like the TensorRT 7 ones.
In principle the approach described here also helps if you want to run Apollo on an RTX 4090: the 4090 needs CUDA 12.x, which likewise pairs with TensorRT 8, so the same interface rework is required.
I. Environment Setup
Install CUDA 11.4 and cuDNN 8.2.4 (the arm builds) inside the Apollo container.
On the cuDNN download page, pick cuDNN Runtime Library for Ubuntu20.04 aarch64sbsa (Deb) and
cuDNN Developer Library for Ubuntu20.04 aarch64sbsa (Deb).
Copy the two .deb packages into the Docker container and install them with dpkg -i cuDNN-xxxx, the runtime library first, then the developer library.
Install TensorRT 8; there are plenty of tutorials for that.
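Before kicking off the build, it can be worth sanity-checking that the toolkit versions inside the container match what this post assumes. A minimal standalone check program (not part of Apollo, purely illustrative; link it against cudart, cudnn and nvinfer):

#include <cstdio>

#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <cudnn.h>

int main() {
  int cuda_rt = 0;
  cudaRuntimeGetVersion(&cuda_rt);                           // e.g. 11040 for CUDA 11.4
  std::printf("CUDA runtime : %d\n", cuda_rt);
  std::printf("cuDNN        : %zu\n", cudnnGetVersion());    // e.g. 8204 for cuDNN 8.2.4
  std::printf("TensorRT     : %d\n", getInferLibVersion());  // e.g. 8xxx for TensorRT 8
  return 0;
}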
II. Build Process
bash apollo.sh build_opt_gpu
1. Error: unrecognized option "-msse4.1"
Comment out line 10 of /apollo/modules/perception/lidar/lib/ground_detector/spatio_temporal_ground_detector/BUILD:
copts = ["-msse4.1"],
because this compiler flag is x86-specific and not supported on the arm architecture.
2. Error: undeclared type "__m128i"
__m128i is a data type from the SSE2 intrinsics, which arm does not provide. Add the header sse2neon.h under modules/perception/common/i_lib/core, include it from i_basic.h, and list sse2neon.h in the hdrs of the corresponding BUILD target.
Download: https://codeload.github.com/DLTcollab/sse2neon/tar.gz/refs/tags/v1.5.0
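A minimal sketch of what the include switch in i_basic.h could look like (the architecture guard and the x86 fallback include are assumptions, adjust to how the header actually pulls in the SSE intrinsics):

// In modules/perception/common/i_lib/core/i_basic.h (sketch, not the exact
// upstream layout): on arm, sse2neon maps the SSE intrinsics, including
// __m128i, onto NEON; on x86 keep the native SSE2 header.
#if defined(__aarch64__) || defined(__arm__)
#include "modules/perception/common/i_lib/core/sse2neon.h"  // from sse2neon v1.5.0
#else
#include <emmintrin.h>  // native SSE2 intrinsics on x86
#endif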
3. Error: 'DimsNCHW' in namespace 'nvinfer1' does not name a type
This one needs the TensorRT interfaces reworked; the code below comes from "Can I upgrade the CUDA and TensorRT version inside the apollo docker?" · Issue #14858 · ApolloAuto/apollo · GitHub.
First, create a new file arm_wrappers.h under modules/perception/inference/tensorrt/:
#pragma once

#include <NvInferLegacyDims.h>

// TensorRT 8 removed nvinfer1::DimsNCHW and nvinfer1::DimsCHW, so recreate
// them here as thin wrappers over Dims4 / Dims3 for the perception code that
// still uses them.
namespace nvinfer1 {

class DimsNCHW : public Dims4 {
 public:
  DimsNCHW() : Dims4() {}
  DimsNCHW(int32_t batch_size, int32_t channels, int32_t height, int32_t width)
      : Dims4(batch_size, channels, height, width) {}

  int32_t& n() { return d[0]; }
  int32_t n() const { return d[0]; }
  int32_t& c() { return d[1]; }
  int32_t c() const { return d[1]; }
  int32_t& h() { return d[2]; }
  int32_t h() const { return d[2]; }
  int32_t& w() { return d[3]; }
  int32_t w() const { return d[3]; }
};

class DimsCHW : public Dims3 {
 public:
  DimsCHW() : Dims3() {}
  DimsCHW(int32_t channels, int32_t height, int32_t width)
      : Dims3(channels, height, width) {}

  int32_t& c() { return d[0]; }
  int32_t c() const { return d[0]; }
  int32_t& h() { return d[1]; }
  int32_t h() const { return d[1]; }
  int32_t& w() { return d[2]; }
  int32_t w() const { return d[2]; }
};

}  // namespace nvinfer1
Include this file from modules/perception/inference/tensorrt/batch_stream.h and modules/perception/inference/tensorrt/rt_common.h, and add it to the hdrs of their BUILD targets.
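The include itself is a one-liner, for example near the top of rt_common.h (and likewise in batch_stream.h):

// modules/perception/inference/tensorrt/rt_common.h (and batch_stream.h):
// bring back nvinfer1::DimsNCHW / DimsCHW, which TensorRT 8 removed.
#include "modules/perception/inference/tensorrt/arm_wrappers.h"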
Then modify modules/perception/inference/tensorrt/plugins/softmax_plugin.h, replacing its contents with:
/******************************************************************************
* Copyright 2018 The Apollo Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*****************************************************************************/
#pragma once
#include "modules/perception/inference/tensorrt/rt_common.h"
namespace apollo {
namespace perception {
namespace inference {
#ifndef TENSORRT_8
class SoftmaxPlugin : public nvinfer1::IPlugin {
 public:
  SoftmaxPlugin(const SoftmaxParameter &param, nvinfer1::Dims in_dims) {
    input_dims_.nbDims = in_dims.nbDims;
    for (int i = 0; i < in_dims.nbDims; i++) {
      input_dims_.d[i] = in_dims.d[i];
      input_dims_.type[i] = in_dims.type[i];
    }
    axis_ = param.axis() - 1;
    CHECK_GE(axis_, 0);
    CHECK_LE(axis_ + 1, input_dims_.nbDims);
    inner_num_ = 1;
    for (int i = axis_ + 1; i < input_dims_.nbDims; i++) {
      inner_num_ *= input_dims_.d[i];
    }
    outer_num_ = 1;
    for (int i = 0; i < axis_; i++) {
      outer_num_ *= input_dims_.d[i];
    }
    cudnnCreateTensorDescriptor(&input_desc_);
    cudnnCreateTensorDescriptor(&output_desc_);
  }

  SoftmaxPlugin() {}
  ~SoftmaxPlugin() {
    cudnnDestroyTensorDescriptor(input_desc_);
    cudnnDestroyTensorDescriptor(output_desc_);
  }

  virtual int initialize() {
    cudnnCreate(&cudnn_);  // initialize cudnn and cublas
    cublasCreate(&cublas_);
    return 0;
  }
  virtual void terminate() {
    cublasDestroy(cublas_);
    cudnnDestroy(cudnn_);
  }

  int getNbOutputs() const override { return 1; }

  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
                                     int nbInputDims) override {
    nvinfer1::Dims out_dims = inputs[0];
    return out_dims;
  }

  void configure(const nvinfer1::Dims *inputDims, int nbInputs,
                 const nvinfer1::Dims *outputDims, int nbOutputs,
                 int maxBatchSize) override {
    input_dims_ = inputDims[0];
  }

  size_t getWorkspaceSize(int maxBatchSize) const override { return 0; }

  int enqueue(int batchSize, const void *const *inputs, void **outputs,
              void *workspace, cudaStream_t stream) override;

  size_t getSerializationSize() override { return 0; }

  void serialize(void *buffer) override {
    char *d = reinterpret_cast<char *>(buffer), *a = d;
    size_t size = getSerializationSize();
    CHECK_EQ(d, a + size);
  }

 private:
  cudnnHandle_t cudnn_;
  cublasHandle_t cublas_;

  nvinfer1::Dims input_dims_;
  int axis_;
  int inner_num_;
  int outer_num_;
  cudnnTensorDescriptor_t input_desc_;
  cudnnTensorDescriptor_t output_desc_;
};
#else
class SoftmaxPlugin : public nvinfer1::IPluginV2Ext {
 public:
  SoftmaxPlugin(const SoftmaxParameter &param, nvinfer1::Dims in_dims) {
    input_dims_.nbDims = in_dims.nbDims;
    for (int i = 0; i < in_dims.nbDims; i++) {
      input_dims_.d[i] = in_dims.d[i];
    }
    axis_ = param.axis() - 1;
    CHECK_GE(axis_, 0);
    CHECK_LE(axis_ + 1, input_dims_.nbDims);
    inner_num_ = 1;
    for (int i = axis_ + 1; i < input_dims_.nbDims; i++) {
      inner_num_ *= input_dims_.d[i];
    }
    outer_num_ = 1;
    for (int i = 0; i < axis_; i++) {
      outer_num_ *= input_dims_.d[i];
    }
    cudnnCreateTensorDescriptor(&input_desc_);
    cudnnCreateTensorDescriptor(&output_desc_);
  }

  SoftmaxPlugin() {}
  ~SoftmaxPlugin() {
    cudnnDestroyTensorDescriptor(input_desc_);
    cudnnDestroyTensorDescriptor(output_desc_);
  }

  virtual int32_t initialize() noexcept {
    cudnnCreate(&cudnn_);  // initialize cudnn and cublas
    cublasCreate(&cublas_);
    return 0;
  }
  virtual void terminate() noexcept {
    cublasDestroy(cublas_);
    cudnnDestroy(cudnn_);
  }

  int32_t getNbOutputs() const noexcept override { return 1; }

  nvinfer1::Dims getOutputDimensions(int32_t index, const nvinfer1::Dims *inputs,
                                     int32_t nbInputDims) noexcept override {
    nvinfer1::Dims out_dims = inputs[0];
    return out_dims;
  }

  void configureWithFormat(const nvinfer1::Dims *inputDims, int32_t nbInputs,
                           const nvinfer1::Dims *outputDims, int32_t nbOutputs,
                           nvinfer1::DataType type, nvinfer1::PluginFormat format,
                           int32_t maxBatchSize) noexcept override {
    input_dims_ = inputDims[0];
  }

  size_t getWorkspaceSize(int32_t maxBatchSize) const noexcept override {
    return 0;
  }

  int32_t enqueue(int32_t batchSize, const void *const *inputs,
                  void *const *outputs, void *workspace,
                  cudaStream_t stream) noexcept override;

  size_t getSerializationSize() const noexcept override { return 0; }

  void serialize(void *buffer) const noexcept override {
    char *d = reinterpret_cast<char *>(buffer), *a = d;
    size_t size = getSerializationSize();
    CHECK_EQ(d, a + size);
  }

  nvinfer1::AsciiChar const *getPluginType() const noexcept override {
    return plugin_type;
  }
  nvinfer1::AsciiChar const *getPluginVersion() const noexcept override {
    return plugin_version;
  }
  void setPluginNamespace(const nvinfer1::AsciiChar *libNamespace) noexcept override {
    plugin_namespace = const_cast<nvinfer1::AsciiChar *>(libNamespace);
  }
  nvinfer1::AsciiChar const *getPluginNamespace() const noexcept override {
    return const_cast<nvinfer1::AsciiChar *>(plugin_namespace);
  }

  bool supportsFormat(nvinfer1::DataType type,
                      nvinfer1::PluginFormat format) const noexcept override {
    return true;
  }

  void destroy() noexcept override { delete this; }

  nvinfer1::IPluginV2Ext *clone() const noexcept override {
    SoftmaxPlugin *p = new SoftmaxPlugin();
    cudnnCreate(&(p->cudnn_));  // initialize cudnn and cublas
    cublasCreate(&(p->cublas_));
    p->axis_ = axis_;
    p->inner_num_ = inner_num_;
    p->outer_num_ = outer_num_;
    p->plugin_namespace = plugin_namespace;
    (p->input_dims_).nbDims = input_dims_.nbDims;
    for (int i = 0; i < input_dims_.nbDims; i++) {
      (p->input_dims_).d[i] = input_dims_.d[i];
    }
    cudnnCreateTensorDescriptor(&(p->input_desc_));
    cudnnCreateTensorDescriptor(&(p->output_desc_));
    return p;
  }

  bool isOutputBroadcastAcrossBatch(int32_t outputIndex,
                                    bool const *inputIsBroadcasted,
                                    int32_t nbInputs) const noexcept override {
    return false;
  }
  bool canBroadcastInputAcrossBatch(int32_t inputIndex) const noexcept override {
    return false;
  }

  nvinfer1::DataType getOutputDataType(int32_t index,
                                       nvinfer1::DataType const *inputTypes,
                                       int32_t nbInputs) const noexcept {
    return nvinfer1::DataType::kFLOAT;
  }

  void configurePlugin(nvinfer1::Dims const *inputDims, int32_t nbInputs,
                       nvinfer1::Dims const *outputDims, int32_t nbOutputs,
                       nvinfer1::DataType const *inputTypes,
                       nvinfer1::DataType const *outputTypes,
                       bool const *inputIsBroadcast, bool const *outputIsBroadcast,
                       nvinfer1::PluginFormat floatFormat,
                       int32_t maxBatchSize) noexcept {}

 private:
  cudnnHandle_t cudnn_;
  cublasHandle_t cublas_;

  nvinfer1::Dims input_dims_;
  int axis_;
  int inner_num_;
  int outer_num_;
  nvinfer1::AsciiChar *plugin_namespace;
  const nvinfer1::AsciiChar *plugin_type = "";
  const nvinfer1::AsciiChar *plugin_version = "";
  cudnnTensorDescriptor_t input_desc_;
  cudnnTensorDescriptor_t output_desc_;
};
#endif
} // namespace inference
} // namespace perception
} // namespace apollo
The main changes are:
- The plugin derives from nvinfer1::IPluginV2Ext instead of nvinfer1::IPlugin.
- Every virtual function from initialize() onward has to be declared noexcept.
- serialize and getSerializationSize additionally have to be declared const.
- The outputs parameter of enqueue becomes void *const *; make the matching change to the enqueue definition in the corresponding .cu file, updating its outputs parameter in the same way (a sketch follows below).
- Add getter/setter functions for the private members (the plugin type, version and namespace accessors).
- Add the destroy and clone functions.
Apply the same kind of changes to the other plugin header files.
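As an example of the matching .cu change, here is a sketch of what the TENSORRT_8 enqueue definition (assumed to live in softmax_plugin.cu) has to look like; only the signature changes, the cuDNN softmax body stays as it was:

// softmax_plugin.cu, TENSORRT_8 branch (sketch): the definition must match the
// header exactly, including noexcept and the void *const * outputs type.
int32_t SoftmaxPlugin::enqueue(int32_t batchSize, const void *const *inputs,
                               void *const *outputs, void *workspace,
                               cudaStream_t stream) noexcept {
  // ... the existing cudnnSoftmaxForward(...) implementation, unchanged ...
  return 0;
}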
Unfortunately this is where it stalls for now: some of the models and libraries Apollo uses are shipped as prebuilt x86 binaries, so perception cannot be fully built on Orin yet. But if you have your own models for the perception sensor components (the lidar component and so on), you can swap out Apollo's prebuilt libraries and build perception against those.