1、 算法原理
我们首先来看算法原理:
显而易见,CMYK是存储起来的四个值,通过变换可以转换成RGBA值,其中A值始终为255
2、原始代码
// The original code uses a macro UNROLL8 to unroll code and
// each iteration processes eight pixels.
//
// The macro actually contains a for loop that iterates over a single line
// (the w parameter is the width of the image).
#define UNROLL8(w, op1, op2) { \
uint32_t _x; \
// For the whole width of the image..
for (_x = w; _x >= 8; _x -= 8) { \
op1; \
// Repeat for 8 pixels..
REPEAT8(op2); \
} \
// For any pixels left over..
if (_x > 0) { \
op1; \
CASE8(_x,op2); \
} \
} // end of macro UNROLL8()
// For each line of the image (h is the image height)
for( ; h > 0; --h) {
// Convert 8 pixels from CMYK to RGBA (A is always 255)
UNROLL8(w, NOP, {
k = 255 - pp[3];
r = (k*(255-pp[0]))/255;
g = (k*(255-pp[1]))/255;
b = (k*(255-pp[2]))/255;
// Write each pixel to memory one at a time
*cp++ = PACK(r, g, b);
pp += samplesperpixel
});
cp += toskew;
pp += fromskew;
}
这是原始代码,注意:k=255-pp[3]
这对于Neon优化非常重要
3、Neon优化
这段代码初读起来非常的绕,我们进行一一拆解
- kindices 的作用在于求
k
,即255-pp[3]
,我们接下来看看是如何求 k 的- 因为
k=255-pp[3]
,要求k就要计算 255 - pp[3] - 使用
vld1q_u8
加载4个像素,然后分两次计算并将位数扩大至16位 - kindices 的值为 6 255 6 255 6 255 … 14 255 14 255 14 255 …
- 以subl
(uint16x8_t )
举例,此时共8个数,我们清楚kindices共16个数,这是如何对应的呢?很好理解,subl的8个数分别对应 01 23 …,即一个数对应两个索引(这一点我们用文章最后的实验说明),所以6和14刚好是 255 - pp[3],得到的结果 kl 和 kh 的值为 k 0 k 0 … - 使用
vmulq_u16
计算出RGB值,因为 kl 和 kh 是 uint8x16_t 类型,在计算时 k 0组合值即为k
- 因为
到此就讲解结束了,后面就是比较常规的过程,其中 vuzp1q_u8 只是取出有效的数值,从这个例子看出,我们要灵活使用寄存器来达到自己的目的。
// Loop over all pixels of the image
uint32_t np = w * h;
uint32_t* endp = cp + np;
// Indices for VTBL that duplicate each pixels K value
uint8x8_t dupK1 = vcreate_u8(0xff06ff06ff06ff06ull);
uint8x8_t dupK2 = vcreate_u8(0xff0eff0eff0eff0eull);
uint8x16_t kindices = vcombine_u8(dupK1, dupK2);
// Indices to obtain the final results
uint8_t resultIndices[16] = {0,1,2,-1,4,5,6,-1,8,9,10,-1,12,13,14,-1};
while(cp < endp) {
// 16 copies of 255
uint8x16_t v255 = vdupq_n_u8 (255);
// load 4 pixels (each pixel is 4 bytes with the CMYK values)
uint8x16_t src_u8 = vld1q_u8(pp);
// perform (255 - x) on each component
// each vsubl is working on 2 pixels
uint16x8_t subl = vsubl_u8(vget_low_u8(v255), vget_low_u8(src_u8));
uint16x8_t subh = vsubl_high_u8(v255, src_u8);
// duplicate k element from each pixel in subl
uint8x16_t kl = vqtbl1q_u8(subl, kindices);
uint8x16_t kh = vqtbl1q_u8(subh, kindices);
// multiply (255 - x) by (255 - k)
uint16x8_t ml = vmulq_u16(kl, subl);
uint16x8_t mh = vmulq_u16(kh, subh);
// the results we need are in the low 8 bits of the uint16 elements
// combine results and result (throwing away all the upper halves of all the uint16)
uint8x16_t idx = vld1q_u8 (resultIndices);
uint16x8_t resultl = ml / 255;
uint16x8_t resulth = mh / 255;
uint8x16_t packed = vuzp1q_u8 (vreinterpretq_u8_u16 (resultl),
vreinterpretq_u8_u16 (resulth));
// wherever the index is -1, we take the value from v255 (we return 255 in alpha)
uint8x16_t pixels = vqtbx1q_u8 (v255, packed, idx);
// store the four RGBA pixels and advance the pointers/counters
vst1q_u8((uint8_t*)cp, pixels);
cp += 4;
pp += samplesperpixel * 4;
}
思考:
#include <iostream>
#include "arm_neon.h"
void test_vqtbl1q_u8(void)
{
uint16x8_t lt = {0, 1, 2, 3, 4, 5, 6, 7};
uint16x8_t ht = {1, 2, 3, 4, 5, 6, 7, 8};
uint8x8_t dupK1 = vcreate_u8(0xff06ff06ff06ff06ull);
uint8x8_t dupK2 = vcreate_u8(0xff0eff0eff0eff0eull);
uint8x16_t kindices = vcombine_u8(dupK1, dupK2);
uint8x16_t lret = vqtbl1q_u8(lt,kindices);
uint8x16_t hret = vqtbl1q_u8(ht,kindices);
for(int i = 0; i < 16; ++i){
printf("%d ", lret[i]);
}
printf("\n");
for(int i = 0; i < 16; ++i){
printf("%d ", hret[i]);
}
printf("\n");
}
int main(){
test_vqtbl1q_u8();
return 0;
}
通过这段代码可以验证第三节的结论,假设将dupk1的06换成03或01等任意奇数,则结果即为0 0 0 0 … 7 0 7 0 …
编译下面的代码,并打印lret可得到序列1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0
,可以看到奇数位置就是 0。
uint8x16_t lret = vreinterpretq_u8_u16(ht);
同时我们运行下面的代码,可以发现,1,0被翻译成了1,其他的同理,因此neon优化的代码 uint16x8_t ml = vmulq_u16(kl, subl); uint16x8_t mh = vmulq_u16(kh, subh);
可以被正确执行
void test_vqtbl1q_u8(void)
{
uint8x16_t lt = {1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0};
uint16x8_t hret = vreinterpretq_u16_u8(lt);
for(int i = 0; i < 8; ++i){
printf("%d ", hret[i]);
}
printf("\n");
}
我们进一步测试:发现 3,2被翻译成了515,因此neon寄存器后面的数处于高位,前面的数处于低位,即 00000010(2) 00000011(3)
,合在一起就是515
void test_vqtbl1q_u8(void)
{
uint8x16_t lt = {1, 0, 3, 2, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0};
uint16x8_t hret = vreinterpretq_u16_u8(lt);
for(int i = 0; i < 8; ++i){
printf("%d ", hret[i]);
}
printf("\n");
}