/*
 * cvt_420_422 - convert a planar YUV 4:2:0 frame to packed 4:2:2.
 *
 * Input layout (src): a full-resolution Y plane of width*height bytes,
 * followed by the plane read through `pu` and then the plane read through
 * `pv`, each holding one sample per 2x2 block of luma samples.
 *
 * Output layout (dst): height rows of width*2 bytes in Y0 U Y1 V byte order
 * (commonly called YUYV / YUY2).  Each chroma row is reused for the two luma
 * rows that share it; no chroma interpolation is performed.
 *
 * Parameters:
 *   width   frame width in pixels; expected to be even.  Multiples of 8 run
 *           entirely in the SSE2 loop, any remaining pixel pairs are handled
 *           by a scalar tail loop.  With an odd width the last column is left
 *           unwritten and width/2 is used as the chroma row stride
 *           (NOTE(review): confirm the chroma stride the producer uses for
 *           odd widths).
 *   height  frame height in rows; processed in pairs, so a trailing odd row
 *           is left unwritten.
 *   src     planar 4:2:0 input, at least width*height*3/2 bytes.
 *   dst     packed 4:2:2 output, at least width*height*2 bytes.
 *
 * Degenerate sizes (width <= 0 or height < 2) return without touching dst.
 *
 * Toolchain notes: the assembly uses block-style inline asm with Intel
 * operand order and refers to C locals by name.  It uses rdi and rsi, which
 * are callee-saved in the Microsoft x64 ABI.
 * NOTE(review): this relies on the compiler preserving registers named inside
 * the asm block - confirm for the toolchain in use.
 * Only XMM registers are used (no MMX), so no emms is required on exit.
 */
void cvt_420_422(int width, int height, char *src, char *dst)
{
    /* Guard first: a zero loop count would make the dec/jnz loops below wrap
     * around and run off the end of both buffers. */
    if (width <= 0 || height < 2)
        return;

    /* Do the size arithmetic in 64 bits so large frames cannot overflow int. */
    uint64_t step1 = (uint64_t)width;           /* bytes per Y row            */
    uint64_t step2 = step1 * 2;                 /* bytes per packed output row */
    uint64_t ySize = step1 * (uint64_t)height;  /* bytes in the Y plane        */

    unsigned char *py = (unsigned char *)src;
    unsigned char *pu = py + ySize;
    unsigned char *pv = py + ySize + ySize / 4; /* == ySize * 5 / 4 */
    unsigned char *pPixels = (unsigned char *)dst;

    uint64_t looprow = step1 / 8;               /* 8-pixel groups per row */
    uint64_t loopcol = (uint64_t)height / 2;    /* row pairs              */
    uint64_t chromaStride = step1 / 2;          /* bytes per chroma row   */

    /* Distance from where the inner loop stops to the start of the next row
     * pair.  When width is a multiple of 8 these reduce to step1, step2 and 0. */
    uint64_t srcSkip = step1 + (step1 - looprow * 8);
    uint64_t dstSkip = step2 + (step2 - looprow * 16);
    uint64_t chromaSkip = chromaStride - looprow * 4;

    if (looprow != 0)
    {
        /*
         * Register roles:
         *   rdi = output row 2k        r11 = output row 2k+1
         *   rsi = Y row 2k             r8  = Y row 2k+1
         *   r9  = pu cursor            r10 = pv cursor
         *   rdx = row pairs remaining  rcx = 8-pixel groups remaining in row
         */
        __asm__
        {
            mov rdi, pPixels;
            mov rsi, py;
            mov r8, step1;
            add r8, rsi;
            mov r9, pu;
            mov r10, pv;
            mov r11, step2;
            add r11, rdi;
            mov rdx, loopcol;
        COL:
            mov rcx, looprow;
        ROW:
            /* Load 8 luma bytes from each row and 4 bytes from each chroma plane. */
            movq xmm0, [rsi];
            movq xmm1, [r8];
            movd xmm3, [r9];
            movd xmm4, [r10];
            add rsi, 8;
            add r8, 8;
            add r9, 4;
            add r10, 4;
            /* xmm3 = U0 0 U1 0 U2 0 U3 0 ... (chroma from pu in even bytes) */
            pxor xmm5, xmm5;
            punpcklbw xmm3, xmm5;
            /* xmm4 = 0 0 0 V0 0 0 0 V1 ... (chroma from pv in byte 3 of each dword) */
            punpcklbw xmm5, xmm4;
            pxor xmm4, xmm4;
            punpcklbw xmm4, xmm5;
            /* Interleave luma with xmm3, then OR in xmm4: Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... */
            punpcklbw xmm0, xmm3;
            punpcklbw xmm1, xmm3;
            por xmm0, xmm4;
            por xmm1, xmm4;
            /* Unaligned stores: dst alignment is not known here. */
            movdqu [rdi], xmm0;
            movdqu [r11], xmm1;
            add rdi, 16;
            add r11, 16;
            dec rcx;
            jnz ROW;
            /* Step every cursor to the start of the next row pair. */
            add rdi, dstSkip;
            add r11, dstSkip;
            add rsi, srcSkip;
            add r8, srcSkip;
            add r9, chromaSkip;
            add r10, chromaSkip;
            dec rdx;
            jnz COL;
        }
    }

    /* Scalar tail: pixel pairs that do not fill a whole 8-pixel group. */
    uint64_t tailStart = looprow * 8;
    if (tailStart + 1 < step1)
    {
        for (uint64_t pair = 0; pair < loopcol; ++pair)
        {
            const unsigned char *y0 = py + pair * 2 * step1;
            const unsigned char *y1 = y0 + step1;
            const unsigned char *u = pu + pair * chromaStride;
            const unsigned char *v = pv + pair * chromaStride;
            unsigned char *d0 = pPixels + pair * 2 * step2;
            unsigned char *d1 = d0 + step2;

            for (uint64_t x = tailStart; x + 1 < step1; x += 2)
            {
                unsigned char cu = u[x / 2];
                unsigned char cv = v[x / 2];

                d0[2 * x]     = y0[x];
                d0[2 * x + 1] = cu;
                d0[2 * x + 2] = y0[x + 1];
                d0[2 * x + 3] = cv;

                d1[2 * x]     = y1[x];
                d1[2 * x + 1] = cu;
                d1[2 * x + 2] = y1[x + 1];
                d1[2 * x + 3] = cv;
            }
        }
    }
}