CMYK to RGBA

半块方便面

于 2023-10-08 15:23:10 发布

阅读量230

点赞数

分类专栏：性能优化文章标签：算法 arm开发

本文链接：https://blog.youkuaiyun.com/qq_35227495/article/details/133681844

版权

性能优化专栏收录该内容

12 篇文章

订阅专栏

1、算法原理

我们首先来看算法原理：
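（原文此处为公式图片，对应的换算关系如下，其中 C、M、Y、K 为存储的四个分量）

$$k = 255 - K,\qquad R = \frac{k\,(255 - C)}{255},\qquad G = \frac{k\,(255 - M)}{255},\qquad B = \frac{k\,(255 - Y)}{255},\qquad A = 255$$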
显而易见，CMYK是存储起来的四个值，通过变换可以转换成RGBA值，其中A值始终为255

2、原始代码

// The original code uses a macro UNROLL8 to unroll code and
// each iteration processes eight pixels.
//
// The macro actually contains a for loop that iterates over a single line
// (the w parameter is the width of the image).

// UNROLL8(w, op1, op2): walks a row of w pixels in groups of eight.
//   w   - number of pixels in the row
//   op1 - statement executed once before every group (including the final
//         partial group, if any)
//   op2 - statement executed for the pixels of each group through REPEAT8
//         (full groups) or CASE8 (the 1..7 left-over pixels); both helper
//         macros are defined elsewhere
//
// Comments inside the definition must be block comments followed by a
// backslash: a `//` line without a trailing backslash ends the macro, and one
// with a trailing backslash swallows the following line.
#define UNROLL8(w, op1, op2) { \
  uint32_t _x; \
  /* For the whole width of the image, eight pixels per iteration.. */ \
  for (_x = (w); _x >= 8; _x -= 8) { \
    op1; \
    /* Repeat for 8 pixels.. */ \
    REPEAT8(op2); \
  } \
  /* For any pixels left over (fewer than eight).. */ \
  if (_x > 0) { \
    op1; \
    CASE8(_x, op2); \
  } \
} // end of macro UNROLL8()

// For each line of the image (h is the image height)
for ( ; h > 0; --h) {
  // Convert one row from CMYK to RGBA, eight pixels per unrolled step
  // (A is always 255)
  UNROLL8(w, NOP, {
    // k is the inverted fourth sample; each colour channel is scaled by it
    k = 255 - pp[3];
    r = (k*(255-pp[0]))/255;
    g = (k*(255-pp[1]))/255;
    b = (k*(255-pp[2]))/255;
    // Write each pixel to memory one at a time
    *cp++ = PACK(r, g, b);
    // The terminating ';' is required here because the statements are
    // wrapped in braces
    pp += samplesperpixel;
  });
  // Per-row pointer adjustments (presumably row padding -- confirm with caller)
  cp += toskew;
  pp += fromskew;
}

这是原始代码，注意：k=255-pp[3]这对于Neon优化非常重要

3、Neon优化

下面的 Neon 代码（见本节末尾）初读起来比较绕，我们逐步进行拆解。

kindices 的作用在于求k，即255-pp[3]，我们接下来看看是如何求 k 的
1. 因为k=255-pp[3]，要求k就要计算 255 - pp[3]
2. 使用vld1q_u8加载4个像素，然后分两次计算并将位数扩大至16位
3. kindices 的值为 6 255 6 255 6 255 … 14 255 14 255 14 255 …
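4. 以 subl（uint16x8_t）举例：它共有 8 个 16 位元素，而 kindices 共有 16 个字节索引，这是如何对应的呢？vqtbl1q_u8 是按字节查表的，每个 16 位元素占两个字节，即第 i 个元素对应字节 2i（低 8 位）和 2i+1（高 8 位）（这一点我们用文章最后的实验说明）。所以索引 6 和 14 取到的刚好是第 3、第 7 个元素的低字节，也就是两个像素各自的 255 - pp[3]；索引 255 超出了查表范围，查表结果为 0。因此得到的结果 kl 和 kh 的值为 k 0 k 0 …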
5. 使用 vmulq_u16 计算出 RGB 值：kl 和 kh 虽然是 uint8x16_t 类型，但相邻的两个字节 k、0 按 16 位（小端）解释时，其值就是 k，所以可以直接与 subl、subh 逐元素相乘

到此就讲解结束了，后面就是比较常规的过程，其中 vuzp1q_u8 只是取出两个向量中偶数下标的字节，也就是每个 16 位结果的低 8 位（有效数值）。从这个例子可以看出，我们要灵活使用寄存器来达到自己的目的。

// Loop over all pixels of the image
uint32_t np = w * h; 
uint32_t* endp = cp + np;

// Indices for VTBL that duplicate each pixels K value
uint8x8_t dupK1 = vcreate_u8(0xff06ff06ff06ff06ull);
uint8x8_t dupK2 = vcreate_u8(0xff0eff0eff0eff0eull);
uint8x16_t kindices = vcombine_u8(dupK1, dupK2);

// Indices to obtain the final results
uint8_t resultIndices[16] = {0,1,2,-1,4,5,6,-1,8,9,10,-1,12,13,14,-1};
while(cp < endp) {
    // 16 copies of 255
    uint8x16_t v255 = vdupq_n_u8 (255);
    // load 4 pixels (each pixel is 4 bytes with the CMYK values)
    uint8x16_t src_u8 = vld1q_u8(pp);
    // perform (255 - x) on each component
    // each vsubl is working on 2 pixels
    uint16x8_t subl = vsubl_u8(vget_low_u8(v255), vget_low_u8(src_u8));
    uint16x8_t subh = vsubl_high_u8(v255, src_u8);
    // duplicate k element from each pixel in subl
    uint8x16_t kl = vqtbl1q_u8(subl, kindices);
    uint8x16_t kh = vqtbl1q_u8(subh, kindices);
    // multiply (255 - x) by (255 - k)
    uint16x8_t ml = vmulq_u16(kl, subl);
    uint16x8_t mh = vmulq_u16(kh, subh);
    // the results we need are in the low 8 bits of the uint16 elements
    // combine results and result (throwing away all the upper halves of all the uint16)
    uint8x16_t idx = vld1q_u8 (resultIndices);
    uint16x8_t resultl = ml / 255; 
    uint16x8_t resulth = mh / 255; 
    uint8x16_t packed = vuzp1q_u8 (vreinterpretq_u8_u16 (resultl),
                                   vreinterpretq_u8_u16 (resulth));
    // wherever the index is -1, we take the value from v255 (we return 255 in alpha)
    uint8x16_t pixels = vqtbx1q_u8 (v255, packed, idx);
    // store the four RGBA pixels and advance the pointers/counters
    vst1q_u8((uint8_t*)cp, pixels);
    cp += 4;
    pp += samplesperpixel * 4;
}

思考：

#include <cstdio>
#include <iostream>
#include "arm_neon.h"

// Demonstrates how vqtbl1q_u8 picks bytes out of a vector of 16-bit lanes,
// using the same kindices table as the CMYK -> RGBA conversion. Prints the
// sixteen result bytes for each of two sample inputs, one line per input.
void test_vqtbl1q_u8(void)
{
    uint16x8_t lt = {0, 1, 2, 3, 4, 5, 6, 7};
    uint16x8_t ht = {1, 2, 3, 4, 5, 6, 7, 8};
    
    // Byte index 6 (first half) / 14 (second half) alternating with 0xff;
    // 0xff is out of range, so the lookup produces 0 for those bytes.
    uint8x8_t dupK1 = vcreate_u8(0xff06ff06ff06ff06ull);
    uint8x8_t dupK2 = vcreate_u8(0xff0eff0eff0eff0eull);
    uint8x16_t kindices = vcombine_u8(dupK1, dupK2);

    // The table lookup operates on bytes, so view the 16-bit lanes as sixteen
    // bytes explicitly instead of relying on implicit vector conversions.
    uint8x16_t lret = vqtbl1q_u8(vreinterpretq_u8_u16(lt), kindices);
    uint8x16_t hret = vqtbl1q_u8(vreinterpretq_u8_u16(ht), kindices);
    
    for(int i = 0; i < 16; ++i){
        printf("%d ", lret[i]);
    }
    printf("\n");

    for(int i = 0; i < 16; ++i){
        printf("%d ", hret[i]);
    }
    printf("\n");
}

// Program entry point: runs the table-lookup demonstration once.
int main(void)
{
    test_vqtbl1q_u8();

    return 0;
}

通过这段代码可以验证第三节的结论。假设将 dupK1 中的 06 换成 03 或 01 等任意奇数，则 lret 的输出变为 0 0 0 0 … 7 0 7 0 …（奇数下标的字节是 16 位元素的高 8 位，在这里都为 0）

编译下面的代码，并打印lret可得到序列1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0 ，可以看到奇数位置就是 0。

    // View the eight 16-bit lanes of ht as sixteen individual bytes
    uint8x16_t lret = vreinterpretq_u8_u16(ht);

再运行下面的代码可以发现：字节序列 1, 0 按 uint16 解释后就是 1，其余同理。因此 Neon 优化代码中的 vmulq_u16(kl, subl) 和 vmulq_u16(kh, subh) 把 kl、kh 当作 uint16x8_t 参与乘法时，得到的正是 k 与各分量的乘积，可以被正确执行

void test_vqtbl1q_u8(void)
{
    uint8x16_t lt = {1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0};
    uint16x8_t hret = vreinterpretq_u16_u8(lt);
    for(int i = 0; i < 8; ++i){
        printf("%d ", hret[i]);
    }
    printf("\n");
}

我们进一步测试：发现字节 3, 2 被解释成了 515，说明下标靠后的字节处于高 8 位、靠前的字节处于低 8 位（小端序），即 00000010(2) 00000011(3)，合在一起就是 2 × 256 + 3 = 515

void test_vqtbl1q_u8(void)
{

    uint8x16_t lt = {1, 0, 3, 2, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0};
    
    uint16x8_t hret = vreinterpretq_u16_u8(lt);

    for(int i = 0; i < 8; ++i){
        printf("%d ", hret[i]);
    }
    printf("\n");
}