1. Thresholding: Threshold
Set every element greater than the threshold to 1 and every element less than or equal to it to 0. The corresponding code is:
int Threshold::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
// apply the threshold to every element
for (int i=0; i<size; i++)
{
ptr[i] = ptr[i] > threshold ? 1.f : 0.f;
}
}
return 0;
}
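To exercise the layer on its own, here is a minimal usage sketch. It assumes ncnn's layer factory ncnn::create_layer and that Threshold reads its threshold from ParamDict id 0; treat the parameter id as an assumption and adjust it if your ncnn version registers it differently (pipeline setup and error handling are omitted).
#include "layer.h"      // ncnn::Layer, ncnn::create_layer
#include "mat.h"        // ncnn::Mat
#include "option.h"     // ncnn::Option
#include "paramdict.h"  // ncnn::ParamDict

int run_threshold_example()
{
    ncnn::Layer* layer = ncnn::create_layer("Threshold");

    ncnn::ParamDict pd;
    pd.set(0, 0.5f); // assumed: param id 0 is the threshold value
    layer->load_param(pd);

    ncnn::Mat blob(4);
    blob[0] = 0.1f; blob[1] = 0.6f; blob[2] = 0.5f; blob[3] = 0.9f;

    ncnn::Option opt;
    layer->forward_inplace(blob, opt); // blob becomes {0, 1, 0, 1}

    delete layer;
    return 0;
}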
2. Hyperbolic tangent activation: tanh
Apply the tanh activation function to every element:
int TanH::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
// map every element through tanh
for (int i=0; i<size; i++)
{
ptr[i] = tanh(ptr[i]);
}
}
return 0;
}
3. Dimension squeezing: squeeze
Remove the dimensions whose extent is 1. For example, a = [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]] has shape 1x1x10; after squeezing, b = squeeze(a) = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] has shape 10.
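Under the hood the layer only calls ncnn::Mat::reshape, which rewrites the shape metadata (w/h/c and dims) and copies data only when channel padding forces it to. A minimal sketch of that bookkeeping with the public Mat API:
#include "mat.h"

void squeeze_sketch()
{
    ncnn::Mat a(10, 1, 1);       // w=10, h=1, c=1, dims=3: the 1x1x10 example above
    ncnn::Mat b = a.reshape(10); // w=10, dims=1: the squeezed result
    // for this shape no data moves; only the shape metadata changes
}
The layer below wraps exactly this reshape, gated by its squeeze_w/squeeze_h/squeeze_c flags: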
int Squeeze::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int dims = bottom_blob.dims;
top_blob = bottom_blob;
// squeeze the channel dimension
if (squeeze_c && dims == 3 && channels == 1)
{
// also squeeze the height dimension
if (squeeze_h && h == 1)
top_blob = bottom_blob.reshape(w, opt.blob_allocator);
else
top_blob = bottom_blob.reshape(w, h, opt.blob_allocator);
}
// squeeze the height dimension
else if (squeeze_h && dims >= 2 && h == 1)
{
// also squeeze the width dimension
if (squeeze_w && w == 1)
top_blob = bottom_blob.reshape(channels, opt.blob_allocator);
else
top_blob = bottom_blob.reshape(w, channels, opt.blob_allocator);
}
// squeeze the width dimension
else if (squeeze_w && dims >= 1 && w == 1)
{
// also squeeze the height dimension
if (squeeze_h && h == 1)
top_blob = bottom_blob.reshape(channels, opt.blob_allocator);
else
top_blob = bottom_blob.reshape(h, channels, opt.blob_allocator);
}
if (top_blob.empty())
return -100;
return 0;
}
4. Tiling: Tile
Tile the input Mat along a given dimension, i.e. replicate the original Mat tiles times along that dimension; for example, a 2x2x1 input tiled along the channel dimension with tiles = 3 becomes 2x2x3 (a width-tiling sketch follows the code).
int Tile::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
// tile along the channel dimension
if (dim == 0)
{
top_blob.create(w, h, channels * tiles, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
const float* ptr = bottom_blob;
int size = bottom_blob.cstep * channels;
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<tiles; p++)
{
float* outptr = top_blob.channel(p * channels);
for (int i=0; i<size; i++)
{
outptr[i] = ptr[i];
}
}
}
// tile along the height dimension
else if (dim == 1)
{
top_blob.create(w, h * tiles, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
int size = w * h;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);
for (int p=0; p<tiles; p++)
{
for (int i=0; i<size; i++)
{
outptr[i] = ptr[i];
}
outptr += size;
}
}
}
// tile along the width dimension
else if (dim == 2)
{
top_blob.create(w * tiles, h, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);
for (int i = 0; i < h; i++)
{
for (int p=0; p<tiles; p++)
{
for (int j = 0; j < w; j++)
{
outptr[j] = ptr[j];
}
outptr += w;
}
ptr += w;
}
}
}
return 0;
}
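To make the copy pattern concrete, here is a standalone sketch of the dim == 2 (width) case, with plain arrays standing in for ncnn::Mat; it tiles each row of a 2x2 input twice, side by side:
#include <cstdio>
#include <vector>

int main()
{
    const int w = 2, h = 2, tiles = 2;
    const float src[h * w] = { 1, 2,
                               3, 4 };
    std::vector<float> dst(h * w * tiles);

    const float* ptr = src;
    float* outptr = dst.data();
    for (int i = 0; i < h; i++)
    {
        // write the current source row tiles times in a row
        for (int p = 0; p < tiles; p++)
        {
            for (int j = 0; j < w; j++)
                outptr[j] = ptr[j];
            outptr += w;
        }
        ptr += w;
    }

    // prints: 1 2 1 2
    //         3 4 3 4
    for (int i = 0; i < h; i++)
    {
        for (int j = 0; j < w * tiles; j++)
            printf("%g ", dst[i * w * tiles + j]);
        printf("\n");
    }
    return 0;
}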
5. Spatial pyramid pooling: SPP (Spatial Pyramid Pooling)
SPP uses pooling to map input images of different sizes onto one and the same output dimension, as the figure shows:
Figure 1: SPP on a single-channel input (from reference [2])
Consider a single-channel input first. The first level applies one global pooling and produces a 1x1 output. The second level divides the input into 2x2 regions, pools each region, and produces a 4x1 output. The third level divides the input into 4x4 regions, pools each region, and produces a 16x1 output. Concatenating these gives a (1+4+16)x1 = 21x1 output; whatever the input size, this scheme always produces the same fixed 21x1 output.
By the same reasoning, a 256-channel input processed this way yields a fixed 21x256 output, as the figure shows:
Figure 2: SPP on a 256-channel input (from reference [2])
With $P$ = pyramid_height levels and level $p$ contributing $4^p$ bins, the total number of bins per channel is
$$N = \sum_{p=0}^{P-1} 4^p$$
The sum is easy to evaluate: multiply both sides by 4,
$$4N = \sum_{p=1}^{P} 4^p$$
then subtract the two equations and rearrange to get $3N = 4^P - 1$, i.e.
$$N = \frac{4^P - 1}{3} = \frac{(1 \ll 2P) - 1}{3}$$
which for $P = 3$ gives $(64 - 1)/3 = 21$ and is exactly the pyramid_num_bins expression in the code.
With this formula in hand, look at the ncnn implementation:
int SPP::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
size_t elemsize = bottom_blob.elemsize;
// 1 + 4 + 16 + ... + 4^(pyramid_height-1) bins per channel
int pyramid_num_bins = ((1 << (pyramid_height * 2)) - 1) / 3;
// the output is a flat blob holding channels * pyramid_num_bins values in total
top_blob.create(pyramid_num_bins * bottom_blob.c, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
float* pyramid_ptr = top_blob;
// all spatial pyramids
for (int p = 0; p < pyramid_height; p++)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
// level p has (1 << p)^2 = 1 << 2p bins in total,
// i.e. 1 << p bins along each of the horizontal and vertical directions
int num_bins = 1 << p;
// kernel and stride size along the height direction for pooling
int kernel_h = ceil(h / (float)num_bins);
int stride_h = kernel_h;
int remainder_h = stride_h * num_bins - h;
int pad_h = (remainder_h + 1) / 2;
// kernel and stride size along the width direction for pooling
int kernel_w = ceil(w / (float)num_bins);
int stride_w = kernel_w;
int remainder_w = stride_w * num_bins - w;
int pad_w = (remainder_w + 1) / 2;
// max value in NxN window
// avg value in NxN window
// output width and height
int outw = num_bins;
int outh = num_bins;
Mat bottom_blob_bordered = bottom_blob;
if (pad_h > 0 || pad_w > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;
w = bottom_blob_bordered.w;
h = bottom_blob_bordered.h;
}
// number of taps in one pooling window
const int maxk = kernel_h * kernel_w;
// kernel offsets
// precompute the offset of every tap inside the window
std::vector<int> _space_ofs(maxk);
int* space_ofs = &_space_ofs[0];
{
int p1 = 0;
int p2 = 0;
int gap = w - kernel_w;
for (int i = 0; i < kernel_h; i++)
{
for (int j = 0; j < kernel_w; j++)
{
space_ofs[p1] = p2;
p1++;
p2++;
}
p2 += gap;
}
}
if (pooling_type == PoolMethod_MAX)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
// take the q-th channel
const Mat m(w, h, bottom_blob_bordered.channel(q));
// pointer into the output blob
float* outptr = pyramid_ptr + outh * outw * q;
// each output element corresponds to one SPP region
for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
const float* sptr = m.row(i*stride_h) + j*stride_w;
float max = sptr[0];
// max pooling takes the maximum of each region
for (int k = 0; k < maxk; k++)
{
float val = sptr[ space_ofs[k] ];
max = std::max(max, val);
}
outptr[j] = max;
}
outptr += outw;
}
}
}
else if (pooling_type == PoolMethod_AVE)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const Mat m(w, h, bottom_blob_bordered.channel(q));
float* outptr = pyramid_ptr + outh * outw * q;
for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
const float* sptr = m.row(i*stride_h) + j*stride_w;
float sum = 0;
// average pooling takes the mean of each region
for (int k = 0; k < maxk; k++)
{
float val = sptr[ space_ofs[k] ];
sum += val;
}
outptr[j] = sum / maxk;
}
outptr += outw;
}
}
}
pyramid_ptr += channels * outh * outw;
}
return 0;
}
Read alongside the formula, the code is quite clear; the ncnn code really is nicely written!
6. The Split layer
Hand the input blob to each of the n output blobs; every copy is shallow.
The corresponding ncnn code is:
// hand the input blob to every output blob (shallow copies)
int Split::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& /*opt*/) const
{
const Mat& bottom_blob = bottom_blobs[0];
for (size_t i=0; i<top_blobs.size(); i++)
{
top_blobs[i] = bottom_blob;
}
return 0;
}
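Because ncnn::Mat assignment only bumps a reference count, all the outputs alias the same memory; nothing is duplicated unless some later stage allocates its own storage. A minimal sketch of that aliasing:
#include "mat.h"

void split_alias_sketch()
{
    ncnn::Mat a(4);
    a.fill(1.f);

    ncnn::Mat b = a; // shallow copy: b.data == a.data, reference count is now 2
    b[0] = 5.f;      // visible through a as well: a[0] is now 5.f
}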
7. The Softmax layer
The principle needs little introduction:
$$\mathrm{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}$$
The catch is that $e^{x}$ grows exponentially with $x$, so for large inputs the exponentials overflow. The usual remedy is to subtract the maximum from every element first; the result is unchanged because the common factor cancels (see reference [3]):
$$\mathrm{softmax}(x)_i = \frac{e^{x_i - m}}{\sum_j e^{x_j - m}}, \qquad m = \max_j x_j$$
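A tiny standalone illustration of why the shift matters, using nothing but std::exp:
#include <cmath>
#include <cstdio>

int main()
{
    // naive: exp(1000) overflows to inf, so the quotient becomes inf/inf = nan
    float naive = std::exp(1000.f) / (std::exp(1000.f) + std::exp(999.f));

    // shifted by the max (1000): mathematically identical, numerically fine
    float stable = std::exp(0.f) / (std::exp(0.f) + std::exp(-1.f));

    printf("naive = %f, stable = %f\n", naive, stable); // nan vs 0.731059
    return 0;
}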
Accordingly, ncnn's softmax first searches for the maximum along the working axis, subtracts it from every element, and only then applies the plain softmax formula:
// softmax
int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
// value = exp( value - global max value )
// sum all value
// value = value / sum
int dims = bottom_top_blob.dims;
size_t elemsize = bottom_top_blob.elemsize;
// one-dimensional input
if (dims == 1) // axis == 0
{
int w = bottom_top_blob.w;
float* ptr = bottom_top_blob;
float max = -FLT_MAX;
// find the maximum
for (int i=0; i<w; i++)
{
max = std::max(max, ptr[i]);
}
float sum = 0.f;
// subtract the max, exponentiate, and accumulate the sum
for (int i=0; i<w; i++)
{
ptr[i] = exp(ptr[i] - max);
sum += ptr[i];
}
// normalize
for (int i=0; i<w; i++)
{
ptr[i] /= sum;
}
return 0;
}
// dims == 2, softmax along axis 0,
// i.e. a softmax over every column
if (dims == 2 && axis == 0)
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
Mat max;
max.create(w, elemsize, opt.workspace_allocator);
if (max.empty())
return -100;
max.fill(-FLT_MAX);
// find the maximum of every column
for (int i=0; i<h; i++)
{
// take one row at a time
const float* ptr = bottom_top_blob.row(i);
// update each column's running maximum
for (int j=0; j<w; j++)
{
max[j] = std::max(max[j], ptr[j]);
}
}
Mat sum;
sum.create(w, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;
sum.fill(0.f);
// process one row at a time:
// subtract the per-column max, exponentiate, and accumulate per-column sums
for (int i = 0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max[j]);
sum[j] += ptr[j];
}
}
// normalize every element by its column's sum
for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);
for (int j=0; j<w; j++)
{
ptr[j] /= sum[j];
}
}
return 0;
}
// softmax along the second dimension, i.e. over every row
if (dims == 2 && axis == 1)
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);
float m = -FLT_MAX;
for (int j=0; j<w; j++)
{
m = std::max(m, ptr[j]);
}
float s = 0.f;
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - m);
s += ptr[j];
}
for (int j=0; j<w; j++)
{
ptr[j] /= s;
}
}
return 0;
}
// the dims == 3 cases mirror the ones above
if (dims == 3 && axis == 0)
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
Mat max;
max.create(w, h, elemsize, opt.workspace_allocator);
if (max.empty())
return -100;
max.fill(-FLT_MAX);
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
for (int i=0; i<size; i++)
{
max[i] = std::max(max[i], ptr[i]);
}
}
Mat sum;
sum.create(w, h, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;
sum.fill(0.f);
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
for (int i=0; i<size; i++)
{
ptr[i] = exp(ptr[i] - max[i]);
sum[i] += ptr[i];
}
}
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
for (int i=0; i<size; i++)
{
ptr[i] /= sum[i];
}
}
return 0;
}
if (dims == 3 && axis == 1)
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
Mat max;
max.create(w, channels, elemsize, opt.workspace_allocator);
if (max.empty())
return -100;
max.fill(-FLT_MAX);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);
for (int i=0; i<h; i++)
{
for (int j=0; j<w; j++)
{
maxptr[j] = std::max(maxptr[j], ptr[j]);
}
ptr += w;
}
}
Mat sum;
sum.create(w, channels, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;
sum.fill(0.f);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);
float* sumptr = sum.row(q);
for (int i=0; i<h; i++)
{
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - maxptr[j]);
sumptr[j] += ptr[j];
}
ptr += w;
}
}
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* sumptr = sum.row(q);
for (int i=0; i<h; i++)
{
for (int j=0; j<w; j++)
{
ptr[j] /= sumptr[j];
}
ptr += w;
}
}
return 0;
}
if (dims == 3 && axis == 2)
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
for (int i=0; i<h; i++)
{
float max = -FLT_MAX;
for (int j=0; j<w; j++)
{
max = std::max(max, ptr[j]);
}
float sum = 0.f;
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max);
sum += ptr[j];
}
for (int j=0; j<w; j++)
{
ptr[j] /= sum;
}
ptr += w;
}
}
return 0;
}
return 0;
}
8. The Slice layer
Split the input blob into N parts along a given axis: a 6*H*W input sliced along axis=0 into parts of lengths 1, 2, and 3 yields outputs of 1*H*W, 2*H*W, and 3*H*W. A slice length of -233 is ncnn's marker for "split whatever remains evenly across the remaining outputs", resolved as sketched below.
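A standalone sketch of how -233 resolves, mirroring the slice-size bookkeeping in the layer; for w = 10 and slices = {3, -233, -233} it prints lengths 3, 3, and 4:
#include <cstdio>
#include <vector>

int main()
{
    const int w = 10;
    std::vector<int> slices = { 3, -233, -233 };

    int q = 0; // current split point
    for (size_t i = 0; i < slices.size(); i++)
    {
        int slice = slices[i];
        if (slice == -233) // spread what is left over the remaining outputs
            slice = (w - q) / (int)(slices.size() - i);
        printf("output %d: length %d\n", (int)i, slice);
        q += slice;
    }
    return 0;
}
With that convention in hand, the layer implementation: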
int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& bottom_blob = bottom_blobs[0];
int dims = bottom_blob.dims;
size_t elemsize = bottom_blob.elemsize;
const int* slices_ptr = slices;
// one-dimensional input
if (dims == 1) // axis == 0
{
int w = bottom_blob.w;
int q = 0;
for (size_t i=0; i<top_blobs.size(); i++)
{
int slice = slices_ptr[i];
// -233: split the remaining length evenly across the remaining outputs
if (slice == -233)
{
slice = (w - q) / (top_blobs.size() - i);
}
// allocate the i-th output
Mat& top_blob = top_blobs[i];
top_blob.create(slice, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
const float* ptr = (const float*)bottom_blob + q;
float* outptr = top_blob;
// fill the i-th output
memcpy(outptr, ptr, slice * elemsize);
// advance the split point
q += slice;
}
return 0;
}
// two-dimensional input, sliced along the first dimension
if (dims == 2 && axis == 0)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int q = 0;
for (size_t i=0; i<top_blobs.size(); i++)
{
int slice = slices_ptr[i];
// -233: split the remaining rows evenly across the remaining outputs
if (slice == -233)
{
slice = (h - q) / (top_blobs.size() - i);
}
// allocate the i-th output
Mat& top_blob = top_blobs[i];
top_blob.create(w, slice, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
int size = w * slice;
const float* ptr = bottom_blob.row(q);
float* outptr = top_blob;
// fill the i-th output
memcpy(outptr, ptr, size * elemsize);
// advance the split point
q += slice;
}
return 0;
}
// the remaining cases follow the same pattern
if (dims == 2 && axis == 1)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int q = 0;
for (size_t i=0; i<top_blobs.size(); i++)
{
int slice = slices_ptr[i];
if (slice == -233)
{
slice = (w - q) / (top_blobs.size() - i);
}
Mat& top_blob = top_blobs[i];
top_blob.create(slice, h, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
#pragma omp parallel for num_threads(opt.num_threads)
for (int j=0; j<h; j++)
{
float* outptr = top_blob.row(j);
const float* ptr = bottom_blob.row(j) + q;
memcpy(outptr, ptr, slice * elemsize);
}
q += slice;
}
return 0;
}
if (dims == 3 && axis == 0)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int q = 0;
for (size_t i=0; i<top_blobs.size(); i++)
{
int slice = slices_ptr[i];
if (slice == -233)
{
slice = (channels - q) / (top_blobs.size() - i);
}
Mat& top_blob = top_blobs[i];
top_blob.create(w, h, slice, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
int size = bottom_blob.cstep * slice;
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob;
memcpy(outptr, ptr, size * elemsize);
q += slice;
}
return 0;
}
if (dims == 3 && axis == 1)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int q = 0;
for (size_t i=0; i<top_blobs.size(); i++)
{
int slice = slices_ptr[i];
if (slice == -233)
{
slice = (h - q) / (top_blobs.size() - i);
}
Mat& top_blob = top_blobs[i];
top_blob.create(w, slice, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<channels; p++)
{
int size = w * slice;
float* outptr = top_blob.channel(p);
const float* ptr = bottom_blob.channel(p).row(q);
memcpy(outptr, ptr, size * elemsize);
}
q += slice;
}
return 0;
}
if (dims == 3 && axis == 2)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int q = 0;
for (size_t i=0; i<top_blobs.size(); i++)
{
int slice = slices_ptr[i];
if (slice == -233)
{
slice = (w - q) / (top_blobs.size() - i);
}
Mat& top_blob = top_blobs[i];
top_blob.create(slice, h, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<channels; p++)
{
float* outptr = top_blob.channel(p);
const Mat m = bottom_blob.channel(p);
for (int j=0; j<h; j++)
{
const float* ptr = m.row(j) + q;
memcpy(outptr, ptr, slice * elemsize);
outptr += slice;
}
}
q += slice;
}
return 0;
}
return 0;
}
9. The Sigmoid function
Apply the sigmoid activation to every element:
$$\sigma(x) = \frac{1}{1 + e^{-x}}$$
The corresponding code is:
// the sigmoid function
int Sigmoid::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
// apply sigmoid to every element
for (int i=0; i<size; i++)
{
ptr[i] = 1.f / (1.f + exp(-ptr[i]));
}
}
return 0;
}
10. ShuffleChannel
Lightweight CNNs lean on group convolution to cut computation, but feature maps are not shared across groups, so each filter sees only part of the features, which weakens the model's representational power. To soften that effect, a shuffle step is inserted after the group convolution's feature maps so that the next group convolution's filters can compute on channels drawn from every group's output; see reference [4].
ncnn's ShuffleChannel boils down to two steps.
First, split the input channels into group groups, each holding $\text{chs\_per\_group} = c / \text{group}$ channels.
Then permute the order: channel $j$ of input group $i$, at source index $i \cdot \text{chs\_per\_group} + j$, moves to destination index $j \cdot \text{group} + i$, i.e. channel $i$ of output group $j$.
As the figure shows:
Figure 3: channel shuffle (from reference [5])
The corresponding ncnn code is:
// regroup and permute the channels
int ShuffleChannel::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int c = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
// number of channels per group
int chs_per_group = c / group;
if (c != chs_per_group * group)
{
// reject invalid group
return -100;
}
top_blob.create(w, h, c, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
const size_t feature_sz = w * h * elemsize;
// iterate over the groups
for (int i = 0; i != group; i++)
{
// iterate over the channels of each group
for (int j = 0; j != chs_per_group; j++)
{
// channel j of group i in src
int src_q = chs_per_group * i + j;
// channel i of group j in dst
int dst_q = group * j + i;
memcpy(top_blob.channel(dst_q), bottom_blob.channel(src_q), feature_sz);
}
}
return 0;
}
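A standalone sketch of the index permutation for c = 6 and group = 2 (so chs_per_group = 3); it prints the pairs 0→0, 1→2, 2→4, 3→1, 4→3, 5→5, interleaving the two groups exactly as in Figure 3:
#include <cstdio>

int main()
{
    const int c = 6, group = 2;
    const int chs_per_group = c / group;

    for (int i = 0; i < group; i++)
    {
        for (int j = 0; j < chs_per_group; j++)
        {
            int src_q = chs_per_group * i + j; // channel j of input group i
            int dst_q = group * j + i;         // channel i of output group j
            printf("src channel %d -> dst channel %d\n", src_q, dst_q);
        }
    }
    return 0;
}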
11. The SELU activation function
The formula is
$$\mathrm{SELU}(x) = \lambda \begin{cases} x, & x > 0 \\ \alpha\,(e^{x} - 1), & x \le 0 \end{cases}$$
with the usual constants $\alpha \approx 1.6733$ and $\lambda \approx 1.0507$; the code only needs their product, alphaxlambda, on the negative branch:
// the SELU activation
int SELU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
float alphaxlambda = alpha * lambda;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
// compute SELU
for (int i=0; i<size; i++)
{
if (ptr[i] < 0.f)
ptr[i] = (exp(ptr[i]) - 1.f) * alphaxlambda;
else
ptr[i] *= lambda;
}
}
return 0;
}
12. The Scale operation
Very simple; the formula is
$$y = s \cdot x + b$$
where the bias $b$ only appears when bias_term is set, and the scale (and bias) are indexed per element for 1-D input, per row for 2-D input, and per channel for 3-D input. The corresponding ncnn code is:
int Scale::forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const
{
Mat& bottom_top_blob = bottom_top_blobs[0];
const Mat& scale_blob = bottom_top_blobs[1];
int dims = bottom_top_blob.dims;
if (dims == 1)
{
int w = bottom_top_blob.w;
float* ptr = bottom_top_blob;
// with bias
if (bias_term)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=0; i<w; i++)
{
ptr[i] = ptr[i] * scale_blob[i] + bias_data[i];
}
}
else
{
// without bias
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=0; i<w; i++)
{
ptr[i] *= scale_blob[i];
}
}
}
// two-dimensional input
if (dims == 2)
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
if (bias_term)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);
float s = scale_blob[i];
float bias = bias_data[i];
// same pattern as above, per row
for (int j=0; j<w; j++)
{
ptr[j] = ptr[j] * s + bias;
}
}
}
else
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);
float s = scale_blob[i];
// no bias
for (int j=0; j<w; j++)
{
ptr[j] *= s;
}
}
}
}
// three-dimensional input
if (dims == 3)
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
if (bias_term)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float s = scale_blob[q];
float bias = bias_data[q];
for (int i=0; i<size; i++)
{
ptr[i] = ptr[i] * s + bias;
}
}
}
else
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float s = scale_blob[q];
for (int i=0; i<size; i++)
{
ptr[i] *= s;
}
}
}
}
return 0;
}
13. The ROIPooling operation
ROI pooling comes from the Faster R-CNN detector. That family of detectors first proposes a limited set of locations most likely to contain an object, then classifies and regresses each one to obtain the final, precise position; detectors built this way are called two-stage methods. This is not a Faster R-CNN walkthrough, but understanding ROI pooling does take a sketch of that pipeline. The original R-CNN ran a region-proposal step directly on the input image to generate about 2k candidate locations, cropped each image patch, resized it to a fixed size, and pushed it through the network; every one of the 2k regions needed its own forward pass, which was slow to train and wasteful of compute. The follow-up work exploits the fact that each CNN neuron keeps a spatial correspondence with the input: the whole image goes through the CNN once, each ROI is mapped to its position on the shared feature map via that correspondence, and ROI pooling then max-pools these unequal-sized regions into fixed-size feature maps. One problem remained: the candidate regions were still produced by classical image processing and could not be trained on the GPU, so Faster R-CNN embedded an RPN network in place of the external region-proposal step, giving an end-to-end training pipeline and a large gain in training efficiency (digressing a bit); see reference [6] for details.
ROI pooling takes three steps:
(1) Map the ROI from the input onto the feature map.
As the figure shows, with an 800x800 input and a 25x25 feature map,
feat_stride = 800 / 25 = 32, so dividing an ROI on the input by 32 gives its position on the feature map. For example:
a 665x665 ROI maps to a 665/32 x 665/32 = 20.78 x 20.78 region on the feature map, which is floored to 20 x 20; going from 20.78 to 20 is the first quantization.
(2) Divide the feature-map ROI into the ROI-pooling output grid.
Since the output must be pool_w x pool_h, the feature-map ROI is split into pool_w x pool_h regions, each of size
$$\text{bin\_w} = \frac{\text{roi\_w}}{\text{pool\_w}}, \qquad \text{bin\_h} = \frac{\text{roi\_h}}{\text{pool\_h}}$$
For a final ROI-pooling size of 7x7, each region measures 20/7 = 2.86 on a side. In practice 2.86 is truncated, which is the second quantization, leaving 2x2 regions (the sketch after this list walks through these numbers).
(3) Max-pool each region.
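A standalone sketch of the two quantizations for the 800x800 input / 665x665 ROI example (plain truncation; the ncnn code below rounds the ROI corners rather than flooring, so the numbers there can differ by one):
#include <cstdio>

int main()
{
    const float spatial_scale = 1.f / 32; // feat_stride = 800 / 25 = 32
    const float roi_size = 665.f;
    const int pooled = 7;

    // first quantization: ROI mapped onto the feature map, then truncated
    float on_fmap = roi_size * spatial_scale; // 20.78
    int q1 = (int)on_fmap;                    // 20

    // second quantization: bin size for a 7x7 output, then truncated
    float bin = (float)q1 / pooled;           // 2.857
    int q2 = (int)bin;                        // 2

    printf("feature-map roi %.2f -> %d, bin %.3f -> %d\n", on_fmap, q1, bin, q2);
    return 0;
}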
The code is as follows:
int ROIPooling::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& bottom_blob = bottom_blobs[0];
int w = bottom_blob.w;
int h = bottom_blob.h;
size_t elemsize = bottom_blob.elemsize;
int channels = bottom_blob.c;
// the second input blob carries the roi
const Mat& roi_blob = bottom_blobs[1];
// allocate the output blob
Mat& top_blob = top_blobs[0];
top_blob.create(pooled_width, pooled_height, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
// For each ROI R = [x y w h]: max pool over R
// read the roi corners: [x1, y1, x2, y2]
const float* roi_ptr = roi_blob;
// map the roi onto the feature map
int roi_x1 = round(roi_ptr[0] * spatial_scale);
int roi_y1 = round(roi_ptr[1] * spatial_scale);
int roi_x2 = round(roi_ptr[2] * spatial_scale);
int roi_y2 = round(roi_ptr[3] * spatial_scale);
// width and height of the roi on the feature map
int roi_w = std::max(roi_x2 - roi_x1 + 1, 1);
int roi_h = std::max(roi_y2 - roi_y1 + 1, 1);
// split the feature-map roi into pooled_width x pooled_height regions
// and compute the size of each region
float bin_size_w = (float)roi_w / (float)pooled_width;
float bin_size_h = (float)roi_h / (float)pooled_height;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);
for (int ph = 0; ph < pooled_height; ph++)
{
for (int pw = 0; pw < pooled_width; pw++)
{
// Compute pooling region for this output unit:
// start (included) = floor(ph * roi_height / pooled_height)
// end (excluded) = ceil((ph + 1) * roi_height / pooled_height)
// bounds of this region
int hstart = roi_y1 + floor((float)(ph) * bin_size_h);
int wstart = roi_x1 + floor((float)(pw) * bin_size_w);
int hend = roi_y1 + ceil((float)(ph + 1) * bin_size_h);
int wend = roi_x1 + ceil((float)(pw + 1) * bin_size_w);
// clip to the blob
hstart = std::min(std::max(hstart, 0), h);
wstart = std::min(std::max(wstart, 0), w);
hend = std::min(std::max(hend, 0), h);
wend = std::min(std::max(wend, 0), w);
// is the region empty?
bool is_empty = (hend <= hstart) || (wend <= wstart);
// initial value
float max = is_empty ? 0.f : ptr[hstart * w + wstart];
// max pooling takes the maximum of the region
for (int y = hstart; y < hend; y++)
{
for (int x = wstart; x < wend; x++)
{
int index = y * w + x;
max = std::max(max, ptr[index]);
}
}
outptr[pw] = max;
}
// advance the output pointer
outptr += pooled_width;
}
}
return 0;
}
14. The ROIAlign operation
ROIPooling above has a serious flaw: its two quantizations introduce a noticeable offset, which inevitably hurts the downstream regression task. ROIAlign therefore stays in floating point throughout, with no flooring; after each pooling region's fractional extent is computed, every sampling point inside the region is evaluated by bilinear interpolation and the samples are averaged (that is what the ncnn code does; some other implementations max-pool the interpolated samples instead). The procedure is:
(1) Map the ROI from the input onto the feature map:
as in the figure above, with an 800x800 input and a 25x25 feature map,
feat_stride = 800 / 25 = 32, so dividing an ROI on the input by 32 gives its position on the feature map. For example:
a 665x665 ROI maps to a 665/32 x 665/32 = 20.78 x 20.78 region on the feature map, kept fractional this time.
(2) Divide the feature-map ROI into the output grid:
since the output must be pool_w x pool_h, the feature-map ROI is split into pool_w x pool_h regions, each measuring roi_w/pool_w x roi_h/pool_h;
for a final size of 7x7 that is 20.78/7 = 2.97 on a side, again kept fractional.
(3) Bilinearly interpolate the samples inside each region and average them (that is how the ncnn code implements it; elsewhere the interpolated samples are sometimes max-pooled instead).
The code is:
int ROIAlign::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& bottom_blob = bottom_blobs[0];
int w = bottom_blob.w;
int h = bottom_blob.h;
size_t elemsize = bottom_blob.elemsize;
int channels = bottom_blob.c;
const Mat& roi_blob = bottom_blobs[1];
Mat& top_blob = top_blobs[0];
top_blob.create(pooled_width, pooled_height, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
// For each ROI R = [x y w h]: avg pool over R
const float* roi_ptr = roi_blob;
// map the roi onto the feature map
float roi_x1 = roi_ptr[0] * spatial_scale;
float roi_y1 = roi_ptr[1] * spatial_scale;
float roi_x2 = roi_ptr[2] * spatial_scale;
float roi_y2 = roi_ptr[3] * spatial_scale;
// roi size
float roi_w = std::max(roi_x2 - roi_x1, 1.f);
float roi_h = std::max(roi_y2 - roi_y1, 1.f);
// size of each bin
float bin_size_w = roi_w / (float)pooled_width;
float bin_size_h = roi_h / (float)pooled_height;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);
for (int ph = 0; ph < pooled_height; ph++)
{
for (int pw = 0; pw < pooled_width; pw++)
{
// Compute pooling region for this output unit:
// start (included) = ph * roi_height / pooled_height
// end (excluded) = (ph + 1) * roi_height / pooled_height
// bounds of this bin
float hstart = roi_y1 + ph * bin_size_h;
float wstart = roi_x1 + pw * bin_size_w;
float hend = roi_y1 + (ph + 1) * bin_size_h;
float wend = roi_x1 + (pw + 1) * bin_size_w;
// clip to the blob
hstart = std::min(std::max(hstart, 0.f), (float)h);
wstart = std::min(std::max(wstart, 0.f), (float)w);
hend = std::min(std::max(hend, 0.f), (float)h);
wend = std::min(std::max(wend, 0.f), (float)w);
// number of sampling points along each side of the bin
int bin_grid_h = ceil(hend - hstart);
int bin_grid_w = ceil(wend - wstart);
// is the bin empty?
bool is_empty = (hend <= hstart) || (wend <= wstart);
// total number of samples
int area = bin_grid_h * bin_grid_w;
float sum = 0.f;
for (int by = 0; by < bin_grid_h; by++)
{
float y = hstart + (by + 0.5f) * bin_size_h / (float)bin_grid_h;
for (int bx = 0; bx < bin_grid_w; bx++)
{
float x = wstart + (bx + 0.5f) * bin_size_w / (float)bin_grid_w;
// bilinear interpolate at (x,y)
float v = bilinear_interpolate(ptr, w, h, x, y);
sum += v;
}
}
// average the samples
outptr[pw] = is_empty ? 0.f : (sum / (float)area);
}
outptr += pooled_width;
}
}
return 0;
}
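The loop above calls bilinear_interpolate, a helper in ncnn's roialign.cpp that is not reproduced here. A sketch of what such a helper computes, blending the four pixels around the fractional sample point; the border clamping here is an assumption, so consult the ncnn source for its exact edge handling:
#include <algorithm>
#include <cmath>

// Sample a single-channel w x h image at fractional coordinates (x, y).
float bilinear_interpolate_sketch(const float* ptr, int w, int h, float x, float y)
{
    int x0 = (int)std::floor(x);
    int y0 = (int)std::floor(y);
    int x1 = std::min(x0 + 1, w - 1);
    int y1 = std::min(y0 + 1, h - 1);
    x0 = std::max(x0, 0);
    y0 = std::max(y0, 0);

    float dx = x - std::floor(x); // horizontal weight
    float dy = y - std::floor(y); // vertical weight

    float v00 = ptr[y0 * w + x0]; // top-left
    float v01 = ptr[y0 * w + x1]; // top-right
    float v10 = ptr[y1 * w + x0]; // bottom-left
    float v11 = ptr[y1 * w + x1]; // bottom-right

    return v00 * (1 - dx) * (1 - dy)
         + v01 * dx * (1 - dy)
         + v10 * (1 - dx) * dy
         + v11 * dx * dy;
}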
15. The Reshape operation
Change the blob's shape while keeping its contents. In the target shape, a 0 keeps the input's extent for that dimension and a -1 is inferred from the total element count, as the bookkeeping below shows:
int Reshape::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
size_t elemsize = bottom_blob.elemsize;
int total = bottom_blob.w * bottom_blob.h * bottom_blob.c;
// target shape is one-dimensional
if (ndim == 1)
{
int _w = w;
if (_w == 0)
_w = bottom_blob.w;
// resolve the reshaped extent
if (_w == -1)
_w = total;
// permute from channel-major to pixel-major order
if (permute == 1)
{
top_blob.create(_w, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
// c-h-w to h-w-c
float* ptr = top_blob;
for (int i=0; i<bottom_blob.h; i++)
{
for (int j=0; j<bottom_blob.w; j++)
{
for (int p=0; p<bottom_blob.c; p++)
{
const float* bptr = bottom_blob.channel(p);
*ptr++ = bptr[i*bottom_blob.w + j];
}
}
}
}
else
{
top_blob = bottom_blob.reshape(_w, opt.blob_allocator);
}
}
else if (ndim == 2)
{
int _w = w;
int _h = h;
if (_w == 0)
_w = bottom_blob.w;
if (_h == 0)
_h = bottom_blob.h;
// resolve the reshaped extents
if (_w == -1)
_w = total / _h;
if (_h == -1)
_h = total / _w;
top_blob = bottom_blob.reshape(_w, _h, opt.blob_allocator);
}
else if (ndim == 3)
{
int _w = w;
int _h = h;
int _c = c;
if (_w == 0)
_w = bottom_blob.w;
if (_h == 0)
_h = bottom_blob.h;
if (_c == 0)
_c = bottom_blob.c;
// resolve the reshaped extents
if (_w == -1)
_w = total / _c / _h;
if (_h == -1)
_h = total / _c / _w;
if (_c == -1)
_c = total / _h / _w;
top_blob = bottom_blob.reshape(_w, _h, _c, opt.blob_allocator);
}
if (top_blob.empty())
return -100;
return 0;
}
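The permute == 1 branch above flattens channel-major (c-h-w) storage into h-w-c order. A standalone sketch with a 2x2x2 example, using a plain array in place of ncnn::Mat so there is no cstep padding to worry about:
#include <cstdio>

int main()
{
    const int w = 2, h = 2, c = 2;
    // channel-major storage: channel 0 = {1,2,3,4}, channel 1 = {5,6,7,8}
    const float chw[c * h * w] = { 1, 2, 3, 4,  5, 6, 7, 8 };
    float hwc[c * h * w];

    float* ptr = hwc;
    for (int i = 0; i < h; i++)
        for (int j = 0; j < w; j++)
            for (int p = 0; p < c; p++)
                *ptr++ = chw[p * h * w + i * w + j];

    // prints: 1 5 2 6 3 7 4 8 -- each pixel's channels now sit together
    for (int i = 0; i < c * h * w; i++)
        printf("%g ", hwc[i]);
    printf("\n");
    return 0;
}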
References:
[1] https://github.com/Tencent/ncnn
[2] https://www.bilibili.com/video/av64313133?from=search&seid=6032750934674516839
[3] https://www.jianshu.com/p/afa0ac6b7201
[4] https://www.jianshu.com/p/44db6d72d6eb
[5] Zhang X, Zhou X, Lin M, et al. ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices. arXiv:1707.01083, 2017.
[6] https://www.cnblogs.com/wangyong/p/8523814.html