CMYK to RGBA

1、 算法原理

我们首先来看算法原理:
在这里插入图片描述
显而易见,CMYK是存储起来的四个值,通过变换可以转换成RGBA值,其中A值始终为255

2、原始代码

// The original code uses a macro UNROLL8 to unroll code and
// each iteration processes eight pixels.
//
// The macro actually contains a for loop that iterates over a single line
// (the w parameter is the width of the image).

#define UNROLL8(w, op1, op2) { \
  uint32_t _x; \
// For the whole width of the image..
  for (_x = w; _x >= 8; _x -= 8) { \
  op1; \
// Repeat for 8 pixels..
  REPEAT8(op2); \
  } \
// For any pixels left over..
  if (_x > 0) { \
    op1; \
    CASE8(_x,op2); \
  } \
} // end of macro UNROLL8()

// For each line of the image (h is the image height)
for( ; h > 0; --h) {
  // Convert 8 pixels from CMYK to RGBA (A is always 255)
  UNROLL8(w, NOP, {
    k = 255 - pp[3];
    r = (k*(255-pp[0]))/255;
    g = (k*(255-pp[1]))/255;
    b = (k*(255-pp[2]))/255;
    // Write each pixel to memory one at a time
    *cp++ = PACK(r, g, b);
    pp += samplesperpixel
  });
  cp += toskew;
  pp += fromskew;
}

这是原始代码,注意:k=255-pp[3]这对于Neon优化非常重要

3、Neon优化

这段代码初读起来非常的绕,我们进行一一拆解

  • kindices 的作用在于求k,即255-pp[3],我们接下来看看是如何求 k 的
    1. 因为k=255-pp[3],要求k就要计算 255 - pp[3]
    2. 使用vld1q_u8加载4个像素,然后分两次计算并将位数扩大至16位
    3. kindices 的值为 6 255 6 255 6 255 … 14 255 14 255 14 255 …
    4. 以subl(uint16x8_t )举例,此时共8个数,我们清楚kindices共16个数,这是如何对应的呢?很好理解,subl的8个数分别对应 01 23 …,即一个数对应两个索引(这一点我们用文章最后的实验说明),所以6和14刚好是 255 - pp[3],得到的结果 kl 和 kh 的值为 k 0 k 0 …
    5. 使用 vmulq_u16计算出RGB值,因为 kl 和 kh 是 uint8x16_t 类型,在计算时 k 0组合值即为k

到此就讲解结束了,后面就是比较常规的过程,其中 vuzp1q_u8 只是取出有效的数值,从这个例子看出,我们要灵活使用寄存器来达到自己的目的。

// Loop over all pixels of the image
uint32_t np = w * h; 
uint32_t* endp = cp + np;

// Indices for VTBL that duplicate each pixels K value
uint8x8_t dupK1 = vcreate_u8(0xff06ff06ff06ff06ull);
uint8x8_t dupK2 = vcreate_u8(0xff0eff0eff0eff0eull);
uint8x16_t kindices = vcombine_u8(dupK1, dupK2);

// Indices to obtain the final results
uint8_t resultIndices[16] = {0,1,2,-1,4,5,6,-1,8,9,10,-1,12,13,14,-1};
while(cp < endp) {
    // 16 copies of 255
    uint8x16_t v255 = vdupq_n_u8 (255);
    // load 4 pixels (each pixel is 4 bytes with the CMYK values)
    uint8x16_t src_u8 = vld1q_u8(pp);
    // perform (255 - x) on each component
    // each vsubl is working on 2 pixels
    uint16x8_t subl = vsubl_u8(vget_low_u8(v255), vget_low_u8(src_u8));
    uint16x8_t subh = vsubl_high_u8(v255, src_u8);
    // duplicate k element from each pixel in subl
    uint8x16_t kl = vqtbl1q_u8(subl, kindices);
    uint8x16_t kh = vqtbl1q_u8(subh, kindices);
    // multiply (255 - x) by (255 - k)
    uint16x8_t ml = vmulq_u16(kl, subl);
    uint16x8_t mh = vmulq_u16(kh, subh);
    // the results we need are in the low 8 bits of the uint16 elements
    // combine results and result (throwing away all the upper halves of all the uint16)
    uint8x16_t idx = vld1q_u8 (resultIndices);
    uint16x8_t resultl = ml / 255; 
    uint16x8_t resulth = mh / 255; 
    uint8x16_t packed = vuzp1q_u8 (vreinterpretq_u8_u16 (resultl),
                                   vreinterpretq_u8_u16 (resulth));
    // wherever the index is -1, we take the value from v255 (we return 255 in alpha)
    uint8x16_t pixels = vqtbx1q_u8 (v255, packed, idx);
    // store the four RGBA pixels and advance the pointers/counters
    vst1q_u8((uint8_t*)cp, pixels);
    cp += 4;
    pp += samplesperpixel * 4;
}

思考:

#include <iostream>
#include "arm_neon.h"

void test_vqtbl1q_u8(void)
{
    uint16x8_t lt = {0, 1, 2, 3, 4, 5, 6, 7};
    uint16x8_t ht = {1, 2, 3, 4, 5, 6, 7, 8};
    
    uint8x8_t dupK1 = vcreate_u8(0xff06ff06ff06ff06ull);
    uint8x8_t dupK2 = vcreate_u8(0xff0eff0eff0eff0eull);
    uint8x16_t kindices = vcombine_u8(dupK1, dupK2);

    uint8x16_t lret = vqtbl1q_u8(lt,kindices);
    uint8x16_t hret = vqtbl1q_u8(ht,kindices);
    
    for(int i = 0; i < 16; ++i){
        printf("%d ", lret[i]);
    }
    printf("\n");

    for(int i = 0; i < 16; ++i){
        printf("%d ", hret[i]);
    }
    printf("\n");
}

int main(){
    test_vqtbl1q_u8();
    return 0;
}

通过这段代码可以验证第三节的结论,假设将dupk1的06换成03或01等任意奇数,则结果即为0 0 0 0 … 7 0 7 0 …

编译下面的代码,并打印lret可得到序列1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0 ,可以看到奇数位置就是 0。

    uint8x16_t lret = vreinterpretq_u8_u16(ht);

同时我们运行下面的代码,可以发现,1,0被翻译成了1,其他的同理,因此neon优化的代码 uint16x8_t ml = vmulq_u16(kl, subl); uint16x8_t mh = vmulq_u16(kh, subh);可以被正确执行

void test_vqtbl1q_u8(void)
{
    uint8x16_t lt = {1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0};
    uint16x8_t hret = vreinterpretq_u16_u8(lt);
    for(int i = 0; i < 8; ++i){
        printf("%d ", hret[i]);
    }
    printf("\n");
}

我们进一步测试:发现 3,2被翻译成了515,因此neon寄存器后面的数处于高位,前面的数处于低位,即 00000010(2) 00000011(3),合在一起就是515

void test_vqtbl1q_u8(void)
{

    uint8x16_t lt = {1, 0, 3, 2, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0};
    
    uint16x8_t hret = vreinterpretq_u16_u8(lt);

    for(int i = 0; i < 8; ++i){
        printf("%d ", hret[i]);
    }
    printf("\n");
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值