RGB24 转 YUV420(I420):C 语言 + 汇编实现(Windows 平台)

本文介绍一种将 RGB24 格式图像数据转换为 I420 格式的方法,利用 SSSE3 指令集(32 位 MSVC 内联汇编)进行高效处理。该过程分为三步:RGB24 到 ARGB 的转换、由 ARGB 分别计算 Y 分量与 2×2 下采样的 U、V 分量,以及最终的 I420 三平面输出。通过按宽度是否为 16 像素的整数倍选择不同的行处理函数,以及内存对齐等技术提高处理速度。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

以下代码来自libyuv


#include <stdint.h>
#include <stdlib.h>
#include <string.h>


/* Evaluates to 1 when p (a pointer or an integer) is a multiple of a, else 0.
   The test is a bit mask against a - 1, so a must be a power of two. */
#define IS_ALIGNED(p, a) ((((uintptr_t)(p)) & ((a) - 1)) == 0)

/*
 * Declares two uint8_t* locals in the current scope:
 *   var##_mem - the raw malloc() result for (size) + 63 bytes;
 *   var       - that address rounded up to the next multiple of 64.
 * The expansion is a pair of declarations, so it may only appear where
 * declarations are allowed. Release the memory through var##_mem
 * (see free_aligned_buffer_64), never through var.
 */
#define align_buffer_64(var, size)                                   \
	uint8_t* var##_mem = (uint8_t*)malloc((size) + 63); /* NOLINT */ \
	uint8_t* var =                                                   \
		(uint8_t*)(((uintptr_t)(var##_mem) + 63) & ~(uintptr_t)63) /* NOLINT */

/*
 * Releases a buffer created with align_buffer_64(var, ...): frees the raw
 * allocation var##_mem and resets var to NULL.
 * Wrapped in do { } while (0) so the macro acts as a single statement and
 * stays correct under an unbraced if/else. Invoke it with a trailing ';'.
 */
#define free_aligned_buffer_64(var) \
	do {                            \
		free(var##_mem);            \
		var = NULL;                 \
	} while (0)

/* Prefixes the declaration var with the MSVC __declspec(align(16)) attribute,
   requesting 16-byte alignment for the declared object. */
#define SIMD_ALIGNED(var) __declspec(align(16)) var

/* Shifts width right by shift bits, rounding up: ceil(width / 2^shift). */
#define SS(width, shift) (((width) + ((1 << (shift)) - 1)) >> (shift))

/* 16 unsigned bytes, 16-byte aligned, so a table can be loaded with movdqa. */
typedef __declspec(align(16)) uint8_t uvec8[16];

/*
 * pshufb control used by RGB24ToARGBRow_SSSE3: output byte i takes input
 * byte mask[i]. Each output pixel k receives source bytes 3k, 3k+1, 3k+2;
 * its fourth byte is filled from source byte 12+k, which the row function
 * then forces to 0xff by OR-ing with its 0xff000000 mask.
 */
static const uvec8 kShuffleMaskRGB24ToARGB = {
	0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u };

/* 16 signed bytes, 16-byte aligned; used for signed pmaddubsw coefficients. */
typedef __declspec(align(16)) int8_t vec8[16];

/*
 * Per-byte weights used by ARGBToYRow_SSSE3: for each 4-byte pixel the row
 * function computes (13*b0 + 65*b1 + 33*b2 + 0*b3) >> 7.
 * NOTE(review): b0..b2 are presumably B, G, R as in libyuv's ARGB memory
 * layout - confirm against the producer of the data.
 */
static const vec8 kARGBToY = { 13, 65, 33, 0, 13, 65, 33, 0,
13, 65, 33, 0, 13, 65, 33, 0 };

/* Offset of 16 added to every luma byte by ARGBToYRow_SSSE3 after scaling. */
static const uvec8 kAddY16 = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u };

/* 32 unsigned bytes, 32-byte aligned; the element type of kBiasUV128. */
typedef __declspec(align(32)) uint8_t ulvec8[32];

/*
 * ANY12S generates NAMEANY, an any-width wrapper around a kernel ANY_SIMD
 * that reads two source rows (src_ptr and src_ptr + src_stride) and writes
 * one U row and one V row.
 *
 *   MASK    - kernel group size minus one; width is split into
 *             n = width & ~MASK (handled directly by ANY_SIMD) and
 *             r = width & MASK (the remainder).
 *   UVSHIFT - shift applied to pixel counts when computing source byte
 *             offsets and lengths.
 *   BPP     - bytes per source unit.
 *
 * The remainder of both rows is copied into a stack buffer laid out as
 *   temp + 0   : row 0 input      temp + 128 : row 1 input
 *   temp + 256 : U output         temp + 384 : V output
 * Only the first 256 bytes (the inputs) are zeroed. When width is odd and
 * UVSHIFT is 0, the last copied pixel of each row is duplicated so the final
 * horizontal pair has a partner. ANY_SIMD is then run on one full group of
 * MASK + 1 pixels with a row stride of 128, and SS(r, 1) bytes of each
 * result are copied to dst_u / dst_v at offset n >> 1.
 */
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)						 \
	void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \
	uint8_t* dst_v, int width) {												\
	SIMD_ALIGNED(uint8_t temp[128 * 4]);										\
	memset(temp, 0, 128 * 2); /* for msan */									\
	int r = width & MASK;														\
	int n = width & ~MASK;														\
if (n > 0) {																	\
	ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n);								\
}																				\
	memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);      \
	memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \
	SS(r, UVSHIFT) * BPP);                                            \
if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
	memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
	BPP);                                                           \
	memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
	temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
}                                                                        \
	ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \
	memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
	memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
	}

/*
 * ANY11 generates NAMEANY, an any-width wrapper around a one-row-in,
 * one-row-out kernel ANY_SIMD(src, dst, width).
 *
 *   MASK    - kernel group size minus one; width is split into
 *             n = width & ~MASK (handled directly by ANY_SIMD) and
 *             r = width & MASK (the remainder).
 *   UVSHIFT - shift applied to pixel counts when computing source byte
 *             offsets and lengths.
 *   SBPP    - bytes per source unit.
 *   BPP     - bytes per destination pixel.
 *
 * The remaining source bytes are copied to the first half of a 256-byte
 * stack buffer (that half is zeroed first), ANY_SIMD is run on one full
 * group of MASK + 1 pixels writing to the second half, and only r * BPP
 * result bytes are copied to dst_ptr. The kernel therefore never reads or
 * writes past the caller's row.
 *
 * This macro used to be defined twice with identical bodies; a single
 * definition is kept so the copies cannot drift apart.
 */
#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
	void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {       \
		SIMD_ALIGNED(uint8_t temp[128 * 2]);                                  \
		memset(temp, 0, 128); /* for YUY2 and msan */                         \
		int r = width & MASK;                                                 \
		int n = width & ~MASK;                                                \
		if (n > 0) {                                                          \
			ANY_SIMD(src_ptr, dst_ptr, n);                                    \
		}                                                                     \
		memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
		ANY_SIMD(temp, temp + 128, MASK + 1);                                 \
		memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \
	}

/*
 * 32 bytes of 0x80. ARGBToUVRow_SSSE3 loads the first 16 bytes and adds them
 * to the packed signed U/V results, shifting them into the unsigned range
 * (a bias of 128).
 */
static const ulvec8 kBiasUV128 = {
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

/*
 * Expands one row of 3-byte pixels into 4-byte pixels, setting the fourth
 * byte of every output pixel to 0xff.
 *
 * Each loop iteration consumes 48 source bytes (16 pixels) and produces 64
 * destination bytes. The loop body always runs at least once and repeats
 * while the remaining count is still positive after subtracting 16, so the
 * effective width is rounded up to a multiple of 16: callers must either
 * pass such a width or provide buffers large enough for the rounded-up size.
 *
 * The function is naked (no compiler prologue/epilogue): the three arguments
 * are read directly from the stack at [esp + 4], [esp + 8] and [esp + 12].
 * Loads and stores on the row data use movdqu, so src_rgb24 and dst_argb
 * need no particular alignment. Registers written: eax, ecx, edx, xmm0-xmm5.
 */
__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width)
{
	__asm {
		mov       eax, [esp + 4]  // src_rgb24
			mov       edx, [esp + 8]  // dst_argb
			mov       ecx, [esp + 12]  // width
			pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
			pslld     xmm5, 24
			movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB

		convertloop :
		movdqu    xmm0, [eax]  // source bytes 0..15
			movdqu    xmm1, [eax + 16]  // source bytes 16..31
			movdqu    xmm3, [eax + 32]  // source bytes 32..47
			lea       eax, [eax + 48]
			movdqa    xmm2, xmm3
			palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:7] xmm1[8:15] }: source bytes 24..39
			pshufb    xmm2, xmm4
			por       xmm2, xmm5
			palignr   xmm1, xmm0, 12  // xmm1 = { xmm1[0:11] xmm0[12:15] }: source bytes 12..27
			pshufb    xmm0, xmm4
			movdqu[edx + 32], xmm2  // pixels 8..11
			por       xmm0, xmm5
			pshufb    xmm1, xmm4
			movdqu[edx], xmm0  // pixels 0..3
			por       xmm1, xmm5
			palignr   xmm3, xmm3, 4  // rotate right 4 bytes: xmm3 = { xmm3[0:3] xmm3[4:15] }
			pshufb    xmm3, xmm4
			movdqu[edx + 16], xmm1  // pixels 4..7
			por       xmm3, xmm5
			movdqu[edx + 48], xmm3  // pixels 12..15
			lea       edx, [edx + 64]
			sub       ecx, 16
			jg        convertloop
			ret
	}
}

/*
 * Per-byte weights used by ARGBToUVRow_SSSE3 for the U plane: for each
 * averaged 4-byte pixel it computes (112*b0 - 74*b1 - 38*b2 + 0*b3) >> 8
 * before adding the 0x80 bias.
 */
static const vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0,
112, -74, -38, 0, 112, -74, -38, 0 };

/*
 * Per-byte weights used by ARGBToUVRow_SSSE3 for the V plane: for each
 * averaged 4-byte pixel it computes (-18*b0 - 94*b1 + 112*b2 + 0*b3) >> 8
 * before adding the 0x80 bias.
 */
static const vec8 kARGBToV = {
	-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

/*
 * Produces one row of U and one row of V from two rows of 4-byte pixels,
 * subsampling 2x2.
 *
 * Per loop iteration, 16 pixels (64 bytes) are read from the row at src_argb
 * and 16 from the row at src_argb + src_stride_argb. The two rows are
 * averaged byte-wise with pavgb, then horizontally adjacent pixels are
 * averaged, leaving 8 pixels. Those are multiplied by kARGBToU / kARGBToV,
 * summed per pixel, arithmetically shifted right by 8, packed with signed
 * saturation and biased by 0x80. 8 bytes are written to dst_u and 8 to dst_v.
 *
 * Passing src_stride_argb == 0 averages the row with itself. The loop body
 * always runs at least once and repeats while the remaining count is still
 * positive after subtracting 16, so the effective width is rounded up to a
 * multiple of 16.
 *
 * The function is naked: arguments are read from the stack, and esi/edi are
 * saved and restored by hand. Loads of pixel data use movdqu (no alignment
 * requirement). Registers written: eax, ecx, edx, xmm0-xmm7.
 */
__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
	int src_stride_argb,
	uint8_t* dst_u,
	uint8_t* dst_v,
	int width) {
	__asm {
		push       esi
			push       edi
			mov        eax, [esp + 8 + 4]  // src_argb
			mov        esi, [esp + 8 + 8]  // src_stride_argb
			mov        edx, [esp + 8 + 12]  // dst_u
			mov        edi, [esp + 8 + 16]  // dst_v
			mov        ecx, [esp + 8 + 20]  // width
			movdqa     xmm5, xmmword ptr kBiasUV128
			movdqa     xmm6, xmmword ptr kARGBToV
			movdqa     xmm7, xmmword ptr kARGBToU
			sub        edi, edx  // stride from u to v

		convertloop :
		/* step 1 - subsample 16x2 argb pixels to 8x1 */
		movdqu     xmm0, [eax]
			movdqu     xmm4, [eax + esi]
			pavgb      xmm0, xmm4
			movdqu     xmm1, [eax + 16]
			movdqu     xmm4, [eax + esi + 16]
			pavgb      xmm1, xmm4
			movdqu     xmm2, [eax + 32]
			movdqu     xmm4, [eax + esi + 32]
			pavgb      xmm2, xmm4
			movdqu     xmm3, [eax + 48]
			movdqu     xmm4, [eax + esi + 48]
			pavgb      xmm3, xmm4

			lea        eax, [eax + 64]
			movdqa     xmm4, xmm0
			shufps     xmm0, xmm1, 0x88  // even-numbered pixels of xmm0:xmm1
			shufps     xmm4, xmm1, 0xdd  // odd-numbered pixels of xmm0:xmm1
			pavgb      xmm0, xmm4
			movdqa     xmm4, xmm2
			shufps     xmm2, xmm3, 0x88
			shufps     xmm4, xmm3, 0xdd
			pavgb      xmm2, xmm4

			// step 2 - convert to U and V
			// from here down is very similar to Y code except
			// instead of 16 different pixels, its 8 pixels of U and 8 of V
			movdqa     xmm1, xmm0
			movdqa     xmm3, xmm2
			pmaddubsw  xmm0, xmm7  // U
			pmaddubsw  xmm2, xmm7
			pmaddubsw  xmm1, xmm6  // V
			pmaddubsw  xmm3, xmm6
			phaddw     xmm0, xmm2
			phaddw     xmm1, xmm3
			psraw      xmm0, 8
			psraw      xmm1, 8
			packsswb   xmm0, xmm1  // low 8 bytes = U, high 8 bytes = V
			paddb      xmm0, xmm5  // -> unsigned

			// step 3 - store 8 U and 8 V values
			movlps     qword ptr[edx], xmm0  // U
			movhps     qword ptr[edx + edi], xmm0  // V
			lea        edx, [edx + 8]
			sub        ecx, 16
			jg         convertloop

			pop        edi
			pop        esi
			ret
	}
}

/*
 * Computes one luma byte per 4-byte source pixel:
 *   y = ((13*b0 + 65*b1 + 33*b2) >> 7) + 16
 * using the kARGBToY weights and the kAddY16 offset.
 *
 * Each loop iteration reads 64 source bytes (16 pixels) and writes 16
 * destination bytes. The loop body always runs at least once and repeats
 * while the remaining count is still positive after subtracting 16, so the
 * effective width is rounded up to a multiple of 16.
 *
 * The function is naked: arguments are read directly from the stack. Loads
 * and stores on row data use movdqu (no alignment requirement). Registers
 * written: eax, ecx, edx, xmm0-xmm5.
 */
__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
	uint8_t* dst_y,
	int width) {
	__asm {
		mov        eax, [esp + 4] /* src_argb */
			mov        edx, [esp + 8] /* dst_y */
			mov        ecx, [esp + 12] /* width */
			movdqa     xmm4, xmmword ptr kARGBToY
			movdqa     xmm5, xmmword ptr kAddY16

		convertloop :
		movdqu     xmm0, [eax]
			movdqu     xmm1, [eax + 16]
			movdqu     xmm2, [eax + 32]
			movdqu     xmm3, [eax + 48]
			pmaddubsw  xmm0, xmm4  /* weighted byte pairs -> 16-bit sums */
			pmaddubsw  xmm1, xmm4
			pmaddubsw  xmm2, xmm4
			pmaddubsw  xmm3, xmm4
			lea        eax, [eax + 64]
			phaddw     xmm0, xmm1  /* add the two pair sums of each pixel */
			phaddw     xmm2, xmm3
			psrlw      xmm0, 7
			psrlw      xmm2, 7
			packuswb   xmm0, xmm2  /* 16 words -> 16 bytes */
			paddb      xmm0, xmm5  /* + 16 */
			movdqu[edx], xmm0
			lea        edx, [edx + 16]
			sub        ecx, 16
			jg         convertloop
			ret
	}
}

/*
 * Any-width wrappers around the 16-pixel SSSE3 kernels (MASK = 15):
 *   RGB24ToARGBRow_Any_SSSE3 - 3 source bytes -> 4 destination bytes per pixel.
 *   ARGBToUVRow_Any_SSSE3    - 4 source bytes per pixel, two rows in, U and V out.
 *   ARGBToYRow_Any_SSSE3     - 4 source bytes -> 1 destination byte per pixel.
 */
ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)

int RGB24ToI420(const uint8_t* src_rgb24,
int src_stride_rgb24,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) 
{
	int y;

	void(*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width);
	void(*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width);
	void(*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width);

	if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0)
	{
		return -1;
	}

	if (height < 0)
	{
		height = -height;
		src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
		src_stride_rgb24 = -src_stride_rgb24;
	}

	RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;//宽度不是16字节的倍数
	if (IS_ALIGNED(width, 16))
	{
		RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
	}

	ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
	ARGBToYRow = ARGBToYRow_Any_SSSE3;
	if (IS_ALIGNED(width, 16)) {
		ARGBToUVRow = ARGBToUVRow_SSSE3;
		ARGBToYRow = ARGBToYRow_SSSE3;
	}

	const int kRowSize = (width * 4 + 31) & ~31;
	align_buffer_64(row, kRowSize * 2);

	for (y = 0; y < height - 1; y += 2) {

		RGB24ToARGBRow(src_rgb24, row, width);
		RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
		ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
		ARGBToYRow(row, dst_y, width);
		ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
		src_rgb24 += src_stride_rgb24 * 2;
		dst_y += dst_stride_y * 2;
		dst_u += dst_stride_u;
		dst_v += dst_stride_v;
	}
	if (height & 1)
	{
		RGB24ToARGBRow(src_rgb24, row, width);
		ARGBToUVRow(row, 0, dst_u, dst_v, width);
		ARGBToYRow(row, dst_y, width);
	}
	free_aligned_buffer_64(row);
	return 0;
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Suspend.

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值