Accelerating MobileNet SSD with TensorRT breaks down into three problems:
1) image preprocessing;
2) processing the detection results;
3) implementing the depthwise convolution layer.
For 1), image preprocessing, there is not much to say; a minimal sketch is included below for completeness.
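As a side note, the sketch below (OpenCV) assumes the usual chuanqi305 MobileNet-SSD settings of a 300x300 input with mean 127.5 and scale 0.007843; these values are assumptions, so take the actual ones from your own deploy.prototxt:
#include <opencv2/opencv.hpp>
// Convert a BGR frame to the planar CHW float buffer the network expects.
// Input size, mean and scale are assumptions; adjust them to your own model.
void preprocess(const cv::Mat& bgr, float* chw, int inputW = 300, int inputH = 300)
{
    cv::Mat resized, f32;
    cv::resize(bgr, resized, cv::Size(inputW, inputH));
    resized.convertTo(f32, CV_32FC3, 0.007843, -127.5 * 0.007843);  // (x - 127.5) * 0.007843
    for (int c = 0; c < 3; ++c)                                     // HWC -> CHW, keeping BGR order for Caffe models
        for (int h = 0; h < inputH; ++h)
            for (int w = 0; w < inputW; ++w)
                chw[(c * inputH + h) * inputW + w] = f32.at<cv::Vec3f>(h, w)[c];
}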
For 2), result processing: TensorRT already ships a DetectionOutput plugin for SSD, so there is also not much to say about handling the output. The result contains 100 detections, and a simple for loop over them is enough (a sketch follows). This demo can be used as a reference: https://github.com/maomaoyuchengzi/MobileNetSSD-detect
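For reference, here is a minimal sketch of that loop, assuming the standard Caffe-SSD detection_out layout: 7 floats per detection (image_id, label, confidence, xmin, ymin, xmax, ymax), coordinates normalized to [0, 1], keep_top_k = 100:
#include <cstdio>
// detOut is a host copy of the "detection_out" blob; numDet and confThresh are assumptions.
void parseDetections(const float* detOut, int numDet, int imgW, int imgH, float confThresh = 0.5f)
{
    for (int i = 0; i < numDet; ++i) {
        const float* det = detOut + i * 7;
        if (det[2] < confThresh) continue;                 // skip low-confidence boxes
        int label = static_cast<int>(det[1]);
        int xmin  = static_cast<int>(det[3] * imgW);       // scale normalized coords back to pixels
        int ymin  = static_cast<int>(det[4] * imgH);
        int xmax  = static_cast<int>(det[5] * imgW);
        int ymax  = static_cast<int>(det[6] * imgH);
        printf("label=%d conf=%.2f box=[%d,%d,%d,%d]\n", label, det[2], xmin, ymin, xmax, ymax);
    }
}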
For 3), I referred to the PReLU plugin layer and Caffe's GPU forward code for depthwise convolution. Part of the code is pasted below:
class DepthWiseConvPlugin : public IPlugin
{
public:
DepthWiseConvPlugin();
~DepthWiseConvPlugin(){
if (mWeights.values){
free(const_cast<void*>(mWeights.values));
}
if (mbias.values){
free(const_cast<void*>(mbias.values));
}
}
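// Build-time constructor: stores the layer parameters and keeps host-side copies of the Caffe weights and bias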
DepthWiseConvPlugin(DepthWiseConv_Param DWConv_param,const Weights *weights, int nbWeights){
mdepthWiseConv_param.bias_term = DWConv_param.bias_term;
mdepthWiseConv_param.kernel_h = DWConv_param.kernel_h;
mdepthWiseConv_param.kernel_w = DWConv_param.kernel_w;
mdepthWiseConv_param.pad_h = DWConv_param.pad_h;
mdepthWiseConv_param.pad_w = DWConv_param.pad_w;
mdepthWiseConv_param.stride_h = DWConv_param.stride_h;
mdepthWiseConv_param.stride_w = DWConv_param.stride_w;
if(mdepthWiseConv_param.bias_term){
assert(nbWeights==2);
mWeights = weights[0];
mbias = weights[1];
assert(mWeights.type == DataType::kFLOAT || mWeights.type == DataType::kHALF);
mWeights.values = malloc(mWeights.count*type2size(mWeights.type));
memcpy(const_cast<void*>(mWeights.values),weights[0].values,mWeights.count*type2size(mWeights.type));
assert(mbias.type == DataType::kFLOAT || mbias.type == DataType::kHALF);
mbias.values = malloc(mbias.count*type2size(mbias.type));
memcpy(const_cast<void*>(mbias.values),weights[1].values,mbias.count*type2size(mbias.type));
}
else{
assert(nbWeights==1);
mWeights = weights[0];
// no bias term: leave mbias empty instead of reading a non-existent weights[1]
mbias.values = nullptr;
mbias.count = 0;
assert(mWeights.type == DataType::kFLOAT || mWeights.type == DataType::kHALF);
mWeights.values = malloc(mWeights.count*type2size(mWeights.type));
memcpy(const_cast<void*>(mWeights.values),weights[0].values,mWeights.count*type2size(mWeights.type));
}
}
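// Runtime constructor: restores the parameters and weight data from the buffer written by serialize()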
DepthWiseConvPlugin(const void* buffer, size_t size){
const char* d = reinterpret_cast<const char*>(buffer), *a = d;
read<int>(d, m_top_count);
read<int>(d, mdepthWiseConv_param.channels);
read<int>(d, mdepthWiseConv_param.height);
read<int>(d, mdepthWiseConv_param.width);
read<int>(d, mdepthWiseConv_param.kernel_h);
read<int>(d, mdepthWiseConv_param.kernel_w);
read<int>(d, mdepthWiseConv_param.stride_h);
read<int>(d, mdepthWiseConv_param.stride_w);
read<int>(d, mdepthWiseConv_param.pad_h);
read<int>(d, mdepthWiseConv_param.pad_w);
read<int>(d, mdepthWiseConv_param.conved_height);
read<int>(d, mdepthWiseConv_param.conved_width);
read<bool>(d,mdepthWiseConv_param.bias_term);
read<int64_t>(d,mWeights.count);
read<DataType>(d,mWeights.type);
mWeights.values = nullptr;
mWeights.values = malloc(mWeights.count * type2size(mWeights.type));//deserializeToDevice(d,mDeviceKernel,mWeights.count);
memcpy(const_cast<void*>(mWeights.values), d, mWeights.count * type2size(mWeights.type));
d += mWeights.count * type2size(mWeights.type);
if(mdepthWiseConv_param.bias_term)
{
read<int64_t>(d,mbias.count);
read<DataType>(d,mbias.type);
mbias.values = nullptr;
mbias.values = malloc(mbias.count * type2size(mbias.type));
memcpy(const_cast<void*>(mbias.values), d, mbias.count * type2size(mbias.type));
d += mbias.count * type2size(mbias.type);
}
assert(d == a + size);
}
inline int getNbOutputs() const override {
return 1;
}
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override{
assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3);
mdepthWiseConv_param.channels = inputs[0].d[0];
mdepthWiseConv_param.height = inputs[0].d[1];
mdepthWiseConv_param.width = inputs[0].d[2];
// standard convolution output size: (in + 2*pad - kernel) / stride + 1 (integer division already floors)
int h_output = (inputs[0].d[1] + 2*mdepthWiseConv_param.pad_h - mdepthWiseConv_param.kernel_h)/mdepthWiseConv_param.stride_h + 1;
int w_output = (inputs[0].d[2] + 2*mdepthWiseConv_param.pad_w - mdepthWiseConv_param.kernel_w)/mdepthWiseConv_param.stride_w + 1;
mdimstop =DimsCHW(inputs[0].d[0], h_output, w_output);
mdepthWiseConv_param.conved_height = mdimstop.h();
mdepthWiseConv_param.conved_width = mdimstop.w();
m_top_count = mdimstop.c()*mdimstop.h()*mdimstop.w();
// cout<<"depthwise_Conv:"<<"c = "<<inputs[0].d[0]<<";h = "<<h_output<<";w = "<<w_output<<endl;
return DimsCHW(inputs[0].d[0], h_output, w_output);
}
int initialize() override{
return 0;
}
inline void terminate() override{
}
inline size_t getWorkspaceSize(int) const override {
return 0;
}
int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override{
// run the CUDA forward pass of the depthwise convolution (see the kernel sketch below)
DepthwiseConvolutionLayer_Forward_gpu(mWeights.count,mbias.count,m_top_count,mdepthWiseConv_param, (float*)mWeights.values,(float*)mbias.values,(const float*)inputs[0],(float *)outputs[0]);
return 0;
}
size_t getSerializationSize() override{
if(mdepthWiseConv_param.bias_term){
return 12*sizeof(int) + 1*sizeof(bool)+ 2*sizeof(int64_t)+2*sizeof(DataType)+mWeights.count * sizeof(float) + mbias.count * sizeof(float);
}
else{
return 12*sizeof(int) + 1*sizeof(bool)+ 1*sizeof(int64_t)+1*sizeof(DataType)+mWeights.count * sizeof(float);
}
}
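// Write the fields in the same order the deserialization constructor reads them, followed by the raw weight (and optional bias) data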
void serialize(void* buffer) override{
char *d = reinterpret_cast<char*>(buffer), *a = d;
write(d, m_top_count);
// write(d, mdimstop.h());
// write(d, mdimstop.w());
write(d, mdepthWiseConv_param.channels);
write(d, mdepthWiseConv_param.height);
write(d, mdepthWiseConv_param.width);
write(d,mdepthWiseConv_param.kernel_h);
write(d,mdepthWiseConv_param.kernel_w);
write(d,mdepthWiseConv_param.stride_h);
write(d,mdepthWiseConv_param.stride_w);
write(d,mdepthWiseConv_param.pad_h);
write(d,mdepthWiseConv_param.pad_w);
write(d,mdepthWiseConv_param.conved_height);
write(d,mdepthWiseConv_param.conved_width);
write(d,mdepthWiseConv_param.bias_term);
write(d, mWeights.count);
write(d, mWeights.type);
convertAndCopyToBuffer(d,mWeights);
if(mdepthWiseConv_param.bias_term)
{
write(d,mbias.count);
write(d,mbias.type);
convertAndCopyToBuffer(d,mbias);
}
char* rmp = a + getSerializationSize();
assert(d == rmp);
}
void configure(const Dims*inputs, int nbInputs, const Dims* outputs, int nbOutputs, int) override{
// mdimsBottomData = DimsCHW{inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]};
// mCopySize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float);
}
private:
template<typename T> void write(char*& buffer, const T& val)
{
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template<typename T> void read(const char*& buffer, T& val)
{
val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
}
size_t type2size(DataType type) {
// only FP32 weights are handled in this plugin
return sizeof(float);
}
void convertAndCopyToBuffer(char*& buffer, const Weights& weights)
{
memcpy(buffer, weights.values, weights.count * type2size(weights.type));
buffer += weights.count * type2size(weights.type);
}
Weights mWeights;
Weights mbias;
// DimsCHW mdimsBottomData;
DimsCHW mdimstop;
DepthWiseConv_Param mdepthWiseConv_param;
int m_top_count;
// size_t mCopySize;
};
There is more code that I won't paste in full; you can work out the rest yourselves. The serialization and deserialization above have been verified to work, and the serialized engine can be saved to disk. A sketch of the CUDA forward pass that enqueue() calls is given below.
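For reference, here is a minimal sketch of what DepthwiseConvolutionLayer_Forward_gpu might look like, adapted from Caffe's GPU forward code and matching the call in enqueue() above (DepthWiseConv_Param is the author's struct, with the fields that appear in the serialization code). This is only an assumption: it handles FP32 and batch size 1, and because the plugin keeps its weights in host memory it copies them to the device on every call; a real implementation would cache them on the device in initialize() and launch on the stream passed to enqueue():
#include <cuda_runtime.h>
__global__ void DepthwiseConvForward(const int nthreads,
    const float* bottom, const float* weight, const float* bias, bool bias_term,
    int channels, int height, int width, int conved_h, int conved_w,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w,
    float* top)
{
    // one thread per output element; grid-stride loop over the whole top blob
    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
         index += blockDim.x * gridDim.x) {
        const int pw = index % conved_w;
        const int ph = (index / conved_w) % conved_h;
        const int c  = index / conved_w / conved_h;            // batch size 1 assumed
        const float* w_slice  = weight + c * kernel_h * kernel_w;
        const float* in_slice = bottom + c * height * width;   // each channel convolves with its own kernel
        float value = bias_term ? bias[c] : 0.f;
        for (int kh = 0; kh < kernel_h; ++kh) {
            for (int kw = 0; kw < kernel_w; ++kw) {
                const int h_in = ph * stride_h - pad_h + kh;
                const int w_in = pw * stride_w - pad_w + kw;
                if (h_in >= 0 && h_in < height && w_in >= 0 && w_in < width)
                    value += w_slice[kh * kernel_w + kw] * in_slice[h_in * width + w_in];
            }
        }
        top[index] = value;
    }
}
void DepthwiseConvolutionLayer_Forward_gpu(int64_t weight_count, int64_t bias_count,
    int top_count, const DepthWiseConv_Param& p,
    const float* weights_host, const float* bias_host,
    const float* bottom_dev, float* top_dev)
{
    // the plugin keeps the weights in host memory, so copy them to the device first (sketch only)
    float *w_dev = nullptr, *b_dev = nullptr;
    cudaMalloc((void**)&w_dev, weight_count * sizeof(float));
    cudaMemcpy(w_dev, weights_host, weight_count * sizeof(float), cudaMemcpyHostToDevice);
    if (p.bias_term) {
        cudaMalloc((void**)&b_dev, bias_count * sizeof(float));
        cudaMemcpy(b_dev, bias_host, bias_count * sizeof(float), cudaMemcpyHostToDevice);
    }
    const int threads = 256;
    const int blocks  = (top_count + threads - 1) / threads;
    DepthwiseConvForward<<<blocks, threads>>>(top_count, bottom_dev, w_dev, b_dev,
        p.bias_term, p.channels, p.height, p.width, p.conved_height, p.conved_width,
        p.kernel_h, p.kernel_w, p.stride_h, p.stride_w, p.pad_h, p.pad_w, top_dev);
    cudaFree(w_dev);           // implicit sync; cache the weights instead in real code
    if (b_dev) cudaFree(b_dev);
}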
Problems encountered:
1) When running the PluginFactory, the following error was thrown when it reached the detection_out layer:
"Plugin layer output count is not equal to caffe output count"
Solution:
Original prototxt:
layer {
name: "detection_out"
type: "IPlugin"
bottom: "mbox_loc"
bottom: "mbox_conf_flatten"
bottom: "mbox_priorbox"
top: "detection_out"
...
}
Adding one more top blob fixes it. After the modification:
layer {
name: "detection_out"
type: "IPlugin"
bottom: "mbox_loc"
bottom: "mbox_conf_flatten"
bottom: "mbox_priorbox"
top: "detection_out"
#here
top:"out2"
...
}
That resolves the problem. This fix is based on:
https://devtalk.nvidia.com/default/topic/1025153/?comment=5214393
https://github.com/dusty-nv/jetson-inference/issues/171#issuecomment-360982183
Here is the quote from the second reference:
"wa. I've solved this problem, it's because the tensorrt ssd implementation of detection output layer has TWO outputs. Therefore you should add an output blob in the prototxt file. I've built it successfully."
Thanks to the selfless contributors.