arm neon 图像旋转

最新推荐文章于 2023-07-19 23:56:51 发布

转载 · 最新推荐文章于 2023-07-19 23:56:51 发布 · 575 阅读

0 ·

CC 4.0 BY-SA版权

原文链接：https://blog.youkuaiyun.com/u010580016/article/details/111124800#t2

文章标签：

#arm #嵌入式硬件 #计算机视觉

arm 专栏收录该内容

6 篇文章

订阅专栏

这篇博客介绍了使用ARM NEON指令集优化的图像旋转算法，通过加载、表格查找和存储操作，高效地实现了8x8像素块的90度旋转，适用于快速处理图像数据。

转自：ARM NEON优化5.图像旋转_亦梦云烟的博客-优快云博客_neon 旋转

输入参数 height 和 width 为原图（旋转前）的高和宽；图像顺时针旋转 90 度，输出图像的尺寸为 width 行 × height 列。

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/*
 * The 8x8 tile path uses NEON when the compiler advertises support for it;
 * otherwise every pixel goes through the scalar path.
 */
#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
#define GRAY_ROTATION90_HAVE_NEON 1
#else
#define GRAY_ROTATION90_HAVE_NEON 0
#endif

/*
 * Scalar path: rotates the source sub-rectangle rows [y0, y1) x cols [x0, x1)
 * by 90 degrees clockwise, i.e. out[x][height - 1 - y] = in[y][x].
 */
static void gray_rotation90_scalar(const uint8_t *in, uint8_t *out,
                                   int height, int width,
                                   int y0, int y1, int x0, int x1)
{
    const size_t src_stride = (size_t)width;
    const size_t dst_stride = (size_t)height;

    for (int y = y0; y < y1; y++) {
        const uint8_t *src_row = in + (size_t)y * src_stride;
        uint8_t *dst_col = out + (size_t)(height - 1 - y);

        for (int x = x0; x < x1; x++) {
            dst_col[(size_t)x * dst_stride] = src_row[x];
        }
    }
}

#if GRAY_ROTATION90_HAVE_NEON
/*
 * NEON path: rotates every full 8x8 tile inside rows [0, tiled_rows) and
 * cols [0, tiled_cols). Both limits must be multiples of 8.
 */
static void gray_rotation90_tiles_neon(const uint8_t *in, uint8_t *out,
                                       int height, int width,
                                       int tiled_rows, int tiled_cols)
{
    /*
     * Four source rows form a 32-byte table (row r, column c at byte 8*r + c).
     * This index picks column 4 of rows 3..0 followed by column 0 of rows 3..0;
     * adding k to every lane selects columns 4+k and k instead.
     */
    static const uint8_t gather_base[8] = {28, 20, 12, 4, 24, 16, 8, 0};
    /*
     * Indices into the 16-byte table {gathered rows 0-3, gathered rows 4-7}:
     * merge_lo assembles output line k (source column k, rows 7..0),
     * merge_hi assembles output line k + 4.
     */
    static const uint8_t merge_lo_idx[8] = {12, 13, 14, 15, 4, 5, 6, 7};
    static const uint8_t merge_hi_idx[8] = {8, 9, 10, 11, 0, 1, 2, 3};

    const uint8x8_t one = vdup_n_u8(1);
    const uint8x8_t merge_lo = vld1_u8(merge_lo_idx);
    const uint8x8_t merge_hi = vld1_u8(merge_hi_idx);
    const size_t src_stride = (size_t)width;
    const size_t dst_stride = (size_t)height;

    uint8x8_t gather[4];
    gather[0] = vld1_u8(gather_base);
    for (int k = 1; k < 4; k++) {
        gather[k] = vadd_u8(gather[k - 1], one);
    }

    for (int y = 0; y < tiled_rows; y += 8) {
        for (int x = 0; x < tiled_cols; x += 8) {
            const uint8_t *src = in + (size_t)y * src_stride + (size_t)x;
            /* Source rows y..y+7 land in output columns height-8-y..height-1-y. */
            uint8_t *dst = out + (size_t)x * dst_stride + (size_t)(height - 8 - y);

            /* Load the whole tile before storing anything. */
            uint8x8x4_t top;
            uint8x8x4_t bottom;
            for (int r = 0; r < 4; r++) {
                top.val[r] = vld1_u8(src + (size_t)r * src_stride);
                bottom.val[r] = vld1_u8(src + (size_t)(r + 4) * src_stride);
            }

            for (int k = 0; k < 4; k++) {
                uint8x8x2_t halves;
                halves.val[0] = vtbl4_u8(top, gather[k]);
                halves.val[1] = vtbl4_u8(bottom, gather[k]);

                vst1_u8(dst + (size_t)k * dst_stride,
                        vtbl2_u8(halves, merge_lo));
                vst1_u8(dst + (size_t)(k + 4) * dst_stride,
                        vtbl2_u8(halves, merge_hi));
            }
        }
    }
}
#endif /* GRAY_ROTATION90_HAVE_NEON */

/**
 * Rotate an 8-bit grayscale image by 90 degrees clockwise.
 *
 * The source is read as `height` rows of `width` bytes; the destination is
 * written as `width` rows of `height` bytes, with
 * out[x * height + (height - 1 - y)] = in[y * width + x].
 *
 * Full 8x8 tiles are processed with NEON table lookups; rows and columns left
 * over when a dimension is not a multiple of 8 are handled by a scalar loop,
 * so no access falls outside the height * width bytes of either buffer.
 *
 * @param in     source pixels, height * width bytes (not modified)
 * @param out    destination pixels, width * height bytes; must not overlap in
 * @param height number of rows in the source image
 * @param width  number of columns in the source image
 * @return 0 on success (including an empty image), -1 if a pointer is NULL or
 *         a dimension is negative
 */
int GrayRotation90_NEON(uint8_t * in, uint8_t* out, int height, int width)
{
    if (!in || !out || height < 0 || width < 0) {
        return -1;
    }
    if (height == 0 || width == 0) {
        return 0;
    }

    int tiled_rows = 0;
    int tiled_cols = 0;

#if GRAY_ROTATION90_HAVE_NEON
    tiled_rows = height - height % 8;
    tiled_cols = width - width % 8;
    gray_rotation90_tiles_neon(in, out, height, width, tiled_rows, tiled_cols);
#endif

    /* Right strip: every row, columns not covered by a full tile. */
    gray_rotation90_scalar(in, out, height, width, 0, height, tiled_cols, width);
    /* Bottom strip: leftover rows under the tiled columns. */
    gray_rotation90_scalar(in, out, height, width, tiled_rows, height, 0, tiled_cols);

    return 0;
}