Accelerating MobileNet SSD with TensorRT from a Caffe Model

Accelerating MobileNet SSD with TensorRT breaks down into three problems:

1) image preprocessing;

2) processing the detection results;

3) implementing the depthwise convolution layer;

For 1), image preprocessing, there is not much to say; a typical pipeline is sketched below for reference.
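This is a minimal sketch for the common chuanqi305 Caffe MobileNet-SSD model: resize to 300x300, subtract the mean 127.5, and scale by 0.007843. The constants are assumptions that should be checked against your own prototxt:

#include <opencv2/opencv.hpp>

// Resize, mean-subtract, scale, and repack HWC (BGR) into planar CHW,
// which is the layout TensorRT expects for the input binding.
void preprocess(const cv::Mat& img, float* chwOut)   // chwOut holds 3*300*300 floats
{
    cv::Mat resized;
    cv::resize(img, resized, cv::Size(300, 300));
    for (int c = 0; c < 3; ++c)
        for (int h = 0; h < 300; ++h)
            for (int w = 0; w < 300; ++w)
                chwOut[(c * 300 + h) * 300 + w] =
                    (resized.at<cv::Vec3b>(h, w)[c] - 127.5f) * 0.007843f;
}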

For 2), result handling: TensorRT provides an SSD detectout (DetectionOutput) plugin, so there is little special about handling its output either. It returns 100 detections, and a simple for loop over them suffices. This demo is worth consulting: https://github.com/maomaoyuchengzi/MobileNetSSD-detect
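A minimal parsing sketch, assuming the usual Caffe-SSD layout of 7 floats per detection, [image_id, label, confidence, xmin, ymin, xmax, ymax], with coordinates normalized to [0, 1]; detectionOut is the host copy of the plugin's first output:

#include <cstdio>

void parseDetections(const float* detectionOut, int numDetections,
                     int imgW, int imgH, float confThresh)
{
    for (int i = 0; i < numDetections; ++i)   // numDetections is 100 here
    {
        const float* det = detectionOut + i * 7;   // one detection = 7 floats
        if (det[2] < confThresh) continue;         // skip low-confidence boxes
        int label = (int)det[1];
        int x1 = (int)(det[3] * imgW);             // scale back to pixel coords
        int y1 = (int)(det[4] * imgH);
        int x2 = (int)(det[5] * imgW);
        int y2 = (int)(det[6] * imgH);
        printf("label=%d conf=%.2f box=(%d,%d,%d,%d)\n", label, det[2], x1, y1, x2, y2);
    }
}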

For 3), I referred to the PReLU plugin layer and to Caffe's GPU forward code. Part of the code is pasted below:
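One note before the code: the class references a DepthWiseConv_Param struct that the post does not show. Reconstructed from the fields the code uses, it would look roughly like this:

// Reconstructed from the fields used below; not shown in the original post.
struct DepthWiseConv_Param
{
    int channels, height, width;         // input shape (CHW)
    int kernel_h, kernel_w;              // filter size
    int stride_h, stride_w;
    int pad_h, pad_w;
    int conved_height, conved_width;     // output spatial size
    bool bias_term;                      // whether a bias blob is present
};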

class DepthWiseConvPlugin : public IPlugin
{
public:
    DepthWiseConvPlugin();
    ~DepthWiseConvPlugin(){
        if (mWeights.values){
            free(const_cast<void*>(mWeights.values));
        }
        if (mbias.values){
            free(const_cast<void*>(mbias.values));
        }
    }
    DepthWiseConvPlugin(DepthWiseConv_Param DWConv_param,const Weights *weights, int nbWeights){
        mdepthWiseConv_param.bias_term = DWConv_param.bias_term;
        mdepthWiseConv_param.kernel_h = DWConv_param.kernel_h;
        mdepthWiseConv_param.kernel_w = DWConv_param.kernel_w;
        mdepthWiseConv_param.pad_h = DWConv_param.pad_h;
        mdepthWiseConv_param.pad_w = DWConv_param.pad_w;
        mdepthWiseConv_param.stride_h = DWConv_param.stride_h;
        mdepthWiseConv_param.stride_w = DWConv_param.stride_w;

        if(mdepthWiseConv_param.bias_term){
            assert(nbWeights==2);
            // Deep-copy the kernel and bias so the plugin owns its weight memory.
            mWeights = weights[0];
            mbias = weights[1];
            assert(mWeights.type == DataType::kFLOAT || mWeights.type == DataType::kHALF);
            mWeights.values = malloc(mWeights.count*type2size(mWeights.type));
            memcpy(const_cast<void*>(mWeights.values),weights[0].values,mWeights.count*type2size(mWeights.type));

            assert(mbias.type == DataType::kFLOAT || mbias.type == DataType::kHALF);
            mbias.values = malloc(mbias.count*type2size(mbias.type));
            memcpy(const_cast<void*>(mbias.values),weights[1].values,mbias.count*type2size(mbias.type));
        }
        else{
            assert(nbWeights==1);
            mWeights = weights[0];
            // No bias: keep mbias empty so the destructor does not free a stray
            // pointer (the original read weights[1] here, which is out of bounds).
            mbias.values = nullptr;
            mbias.count = 0;
            assert(mWeights.type == DataType::kFLOAT || mWeights.type == DataType::kHALF);
            mWeights.values = malloc(mWeights.count*type2size(mWeights.type));
            memcpy(const_cast<void*>(mWeights.values),weights[0].values,mWeights.count*type2size(mWeights.type));
        }
    }
    DepthWiseConvPlugin(const void* buffer, size_t size){
        const char* d = reinterpret_cast<const char*>(buffer), *a = d;
        read<int>(d, m_top_count);

        read<int>(d, mdepthWiseConv_param.channels);
        read<int>(d, mdepthWiseConv_param.height);
        read<int>(d, mdepthWiseConv_param.width);
        read<int>(d, mdepthWiseConv_param.kernel_h);
        read<int>(d, mdepthWiseConv_param.kernel_w);
        read<int>(d, mdepthWiseConv_param.stride_h);
        read<int>(d, mdepthWiseConv_param.stride_w);
        read<int>(d, mdepthWiseConv_param.pad_h);
        read<int>(d, mdepthWiseConv_param.pad_w);
        read<int>(d, mdepthWiseConv_param.conved_height);
        read<int>(d, mdepthWiseConv_param.conved_width);
        read<bool>(d,mdepthWiseConv_param.bias_term);

        read<int64_t>(d,mWeights.count);
        read<DataType>(d,mWeights.type);

        mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
        memcpy(const_cast<void*>(mWeights.values), d, mWeights.count * type2size(mWeights.type));
        d += mWeights.count * type2size(mWeights.type);

        if(mdepthWiseConv_param.bias_term)
        {
            read<int64_t>(d,mbias.count);
            read<DataType>(d,mbias.type);

            mbias.values = malloc(mbias.count * type2size(mbias.type));
            memcpy(const_cast<void*>(mbias.values), d, mbias.count * type2size(mbias.type));
            d += mbias.count * type2size(mbias.type);
        }
        else
        {
            // No bias: keep mbias empty so the destructor does not free garbage.
            mbias.values = nullptr;
            mbias.count = 0;
        }
        assert(d == a + size);
    }

    inline int getNbOutputs() const override {
        return 1;
    }
    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override{
        assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3);
        mdepthWiseConv_param.channels = inputs[0].d[0];
        mdepthWiseConv_param.height = inputs[0].d[1];
        mdepthWiseConv_param.width = inputs[0].d[2];

        // Standard convolution output size: (in + 2*pad - kernel) / stride + 1.
        int h_output = (inputs[0].d[1] + 2*mdepthWiseConv_param.pad_h - mdepthWiseConv_param.kernel_h)/mdepthWiseConv_param.stride_h + 1;
        int w_output = (inputs[0].d[2] + 2*mdepthWiseConv_param.pad_w - mdepthWiseConv_param.kernel_w)/mdepthWiseConv_param.stride_w + 1;
        mdimstop = DimsCHW(inputs[0].d[0], h_output, w_output);
        mdepthWiseConv_param.conved_height = mdimstop.h();
        mdepthWiseConv_param.conved_width = mdimstop.w();
        m_top_count = mdimstop.c()*mdimstop.h()*mdimstop.w();
        return mdimstop;
    }
    int initialize() override{
      return 0;
    }
    inline void terminate() override{

    }

    inline size_t getWorkspaceSize(int) const override {
        return 0;
    }

    int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override{
        // Launch the depthwise convolution CUDA kernel (implemented separately).
        DepthwiseConvolutionLayer_Forward_gpu(mWeights.count, mbias.count, m_top_count, mdepthWiseConv_param,
                (float*)mWeights.values, (float*)mbias.values,
                (const float*)inputs[0], (float*)outputs[0]);
        return 0;
    }

    size_t getSerializationSize() override{
        if(mdepthWiseConv_param.bias_term){
            return 12*sizeof(int) + 1*sizeof(bool)+ 2*sizeof(int64_t)+2*sizeof(DataType)+mWeights.count * sizeof(float) + mbias.count * sizeof(float);
        }
        else{
            return 12*sizeof(int) + 1*sizeof(bool)+ 1*sizeof(int64_t)+1*sizeof(DataType)+mWeights.count * sizeof(float);
        }
    }
    void serialize(void* buffer) override{
        char *d = reinterpret_cast<char*>(buffer), *a = d;
        write(d, m_top_count);

        write(d, mdepthWiseConv_param.channels);
        write(d, mdepthWiseConv_param.height);
        write(d, mdepthWiseConv_param.width);
        write(d, mdepthWiseConv_param.kernel_h);
        write(d, mdepthWiseConv_param.kernel_w);
        write(d, mdepthWiseConv_param.stride_h);
        write(d, mdepthWiseConv_param.stride_w);
        write(d, mdepthWiseConv_param.pad_h);
        write(d, mdepthWiseConv_param.pad_w);
        write(d, mdepthWiseConv_param.conved_height);
        write(d, mdepthWiseConv_param.conved_width);
        write(d, mdepthWiseConv_param.bias_term);

        write(d, mWeights.count);
        write(d, mWeights.type);
        convertAndCopyToBuffer(d, mWeights);

        if(mdepthWiseConv_param.bias_term)
        {
            write(d, mbias.count);
            write(d, mbias.type);
            convertAndCopyToBuffer(d, mbias);
        }
        assert(d == a + getSerializationSize());
    }

    void configure(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, int) override{
        // Nothing to do: all shape bookkeeping happens in getOutputDimensions().
    }

private:
    template<typename T> void write(char*& buffer, const T& val)
    {
        *reinterpret_cast<T*>(buffer) = val;
        buffer += sizeof(T);
    }

    template<typename T> void read(const char*& buffer, T& val)
    {
        val = *reinterpret_cast<const T*>(buffer);
        buffer += sizeof(T);
    }
    size_t type2size(DataType type) {
        // Only FP32 weights are handled here; kHALF would need sizeof(__half).
        return sizeof(float);
    }
    void convertAndCopyToBuffer(char*& buffer, const Weights& weights)
    {
        memcpy(buffer, weights.values, weights.count * type2size(weights.type));
        buffer += weights.count * type2size(weights.type);
    }

    Weights mWeights;
    Weights mbias;
    DimsCHW mdimstop;
    DepthWiseConv_Param mdepthWiseConv_param;
    int m_top_count;
};

There is more code that I have not pasted in full; you can work through the rest yourselves. The serialization and deserialization above are implemented and working, and the serialized engine can be saved to disk.
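The post omits DepthwiseConvolutionLayer_Forward_gpu itself. Modeled on Caffe's GPU depthwise forward pass, a minimal FP32, batch-size-1 sketch might look like this; the signature matches the call in enqueue() above, but the body is a reconstruction, not the author's code:

#include <cassert>
#include <cuda_runtime.h>

// One thread per output element; each output channel uses its own single
// kernel_h x kernel_w filter (that is what makes the convolution depthwise).
__global__ void DepthwiseConvForwardKernel(int nthreads,
        const float* bottom, const float* weight, const float* bias,
        int channels, int height, int width, int conved_h, int conved_w,
        int kernel_h, int kernel_w, int stride_h, int stride_w,
        int pad_h, int pad_w, bool bias_term, float* top)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= nthreads) return;

    int pw = index % conved_w;                 // output x
    int ph = (index / conved_w) % conved_h;    // output y
    int c  = index / conved_w / conved_h;      // channel

    float sum = 0.f;
    for (int kh = 0; kh < kernel_h; ++kh)
        for (int kw = 0; kw < kernel_w; ++kw)
        {
            int h = ph * stride_h - pad_h + kh;
            int w = pw * stride_w - pad_w + kw;
            if (h >= 0 && h < height && w >= 0 && w < width)   // zero padding
                sum += bottom[(c * height + h) * width + w]
                     * weight[(c * kernel_h + kh) * kernel_w + kw];
        }
    if (bias_term) sum += bias[c];
    top[index] = sum;
}

// Assumes batchSize == 1, matching enqueue() above, and launches on the
// default stream since the original call does not pass `stream` through.
void DepthwiseConvolutionLayer_Forward_gpu(int64_t weight_count, int64_t bias_count,
        int top_count, DepthWiseConv_Param p, const float* weight,
        const float* bias, const float* bottom, float* top)
{
    assert(weight_count == (int64_t)p.channels * p.kernel_h * p.kernel_w);
    assert(!p.bias_term || bias_count == p.channels);
    const int block = 256;
    DepthwiseConvForwardKernel<<<(top_count + block - 1) / block, block>>>(
        top_count, bottom, weight, bias,
        p.channels, p.height, p.width, p.conved_height, p.conved_width,
        p.kernel_h, p.kernel_w, p.stride_h, p.stride_w,
        p.pad_h, p.pad_w, p.bias_term, top);
}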

 

Problems encountered:

1) When running the PluginFactory, the following error was thrown at the detectout layer:

"Plugin layer output count is not equal to caffe output count".

Solution:

Original prototxt:

layer {
  name: "detection_out"
  type: "IPlugin"
  bottom: "mbox_loc"
  bottom: "mbox_conf_flatten"
  bottom: "mbox_priorbox"
  top: "detection_out"
...
}

Adding one more top blob fixes it. After the modification:

layer {
  name: "detection_out"
  type: "IPlugin"
  bottom: "mbox_loc"
  bottom: "mbox_conf_flatten"
  bottom: "mbox_priorbox"
  top: "detection_out"
  #here
  top:"out2"
...
}

That solves the problem. References for this fix: https://devtalk.nvidia.com/default/topic/1025153/?comment=5214393

https://github.com/dusty-nv/jetson-inference/issues/171#issuecomment-360982183

Here is the original text from the second reference:

wa. I've solved this problem, it's because the tensorrt ssd implementation of detection output layer has TWO outputs. Therefore you should add an output blob in the prototxt file. I've built it successfully.
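For context: the Caffe parser checks the number of top blobs in the prototxt against each plugin's getNbOutputs(), and the TensorRT SSD DetectionOutput implementation reports two outputs (the detections plus a keep-count tensor), hence the second top. As a rough illustration, a PluginFactory under the legacy TensorRT 3/4 IPluginFactory API might dispatch like the sketch below; makeDWParamFor() and createDetectOutPlugin() are hypothetical stand-ins for however the layer parameters and the detectout plugin are created in your setup:

#include <cassert>
#include <cstring>
#include "NvInfer.h"
#include "NvCaffeParser.h"

// Hypothetical helpers (not from the original post): fill the depthwise
// parameters for a given layer, and create the detectout plugin.
DepthWiseConv_Param makeDWParamFor(const char* layerName);
nvinfer1::IPlugin* createDetectOutPlugin();
nvinfer1::IPlugin* createDetectOutPlugin(const void* data, size_t length);

class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactory
{
public:
    bool isPlugin(const char* name) override
    {
        // Depthwise layers in the MobileNet-SSD prototxt end in "/dw".
        return strstr(name, "/dw") != nullptr || strcmp(name, "detection_out") == 0;
    }

    // Parse-time path: weights come straight from the caffemodel.
    nvinfer1::IPlugin* createPlugin(const char* name, const nvinfer1::Weights* weights, int nbWeights) override
    {
        assert(isPlugin(name));
        if (strstr(name, "/dw"))
            return new DepthWiseConvPlugin(makeDWParamFor(name), weights, nbWeights);
        return createDetectOutPlugin();   // reports TWO outputs
    }

    // Deserialize path: rebuild each plugin from the serialized engine.
    nvinfer1::IPlugin* createPlugin(const char* name, const void* data, size_t length) override
    {
        assert(isPlugin(name));
        if (strstr(name, "/dw"))
            return new DepthWiseConvPlugin(data, length);
        return createDetectOutPlugin(data, length);
    }
};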

Thanks to the selfless contributors!

 
