_mm_set1_epi8干了什么?

本文深入探讨了Intel SSE2 intrinsics中的_mm_set1_epi8函数,解释了其用途和工作原理,以及如何通过单指令多数据流思想实现高效的数据操作。

在qemu中有#define SPLAT(p) _mm_set1_epi8(*(p))

那么,_mm_set1_epi8到底是干什么用的呢?下面这个说明文档说的很详细,算是个记录吧。

Set Intrinsics

The Intel® Streaming SIMD Extensions 2 (Intel® SSE2) intrinsics for integer set operations are listed in this topic. These intrinsics are composite intrinsics because they require more than one instruction to implement them. The prototypes for the Intel® SSE2 intrinsics are in the emmintrin.h header file.

The results of each intrinsic operation are placed in registers. The information about what is placed in each register appears in the tables below, in the detailed explanation of each intrinsic. R, R0, R1, ..., R15 represent the registers in which results are placed.(其实就是通过这些指令给寄存器赋值,利用了单指令多数据流的思想)

Intrinsic Name

Operation

Corresponding
Intel® SSE2 Instruction

_mm_set_epi64

Set two integer values

Composite

_mm_set_epi64x

Set two integer values

Composite

_mm_set_epi32

Set four integer values

Composite

_mm_set_epi16

Set eight integer values

Composite

_mm_set_epi8

Set sixteen integer values

Composite

_mm_set1_epi64

Set two integer values

Composite

_mm_set1_epi64x

Set two integer values

Composite

_mm_set1_epi32

Set four integer values

Composite

_mm_set1_epi16

Set eight integer values

Composite

_mm_set1_epi8

Set sixteen integer values

Composite

_mm_setr_epi64

Set two integer values in reverse order

Composite

_mm_setr_epi32

Set four integer values in reverse order

Composite

_mm_setr_epi16

Set eight integer values in reverse order

Composite

_mm_setr_epi8

Set sixteen integer values in reverse order

Composite

_mm_setzero_si128

Set to zero

Composite

__m128i _mm_set_epi64(__m64 q1, __m64 q0)

Sets the 2 64-bit integer values.

R0

R1

q0

q1

__m128i _mm_set_epi64x(__int64 b, __int64 a)

Sets the 2 64-bit integer values.

R0

R1

a

b

__m128i _mm_set_epi32(int i3, int i2, int i1, int i0)

Sets the 4 signed 32-bit integer values.

R0

R1

R2

R3

i0

i1

i2

i3

__m128i _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)

Sets the 8 signed 16-bit integer values.

R0

R1

...

R7

w0

w1

...

w7

__m128i _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)

Sets the 16 signed 8-bit integer values.

R0

R1

...

R15

b0

b1

...

b15

__m128i _mm_set1_epi64(__m64 q)

Sets the 2 64-bit integer values to q.

R0

R1

q

q

__m128i _mm_set1_epi64x(__int64 a)

Sets the 2 64-bit integer values to a.

R0

R1

a

a

__m128i _mm_set1_epi32(int i)

Sets the 4 signed 32-bit integer values to i.

R0

R1

R2

R3

i

i

i

i

__m128i _mm_set1_epi16(short w)

Sets the 8 signed 16-bit integer values to w.

R0

R1

...

R7

w

w

w

w

__m128i _mm_set1_epi8(char b)

Sets the 16 signed 8-bit integer values to b.

R0

R1

...

R15

b

b

b

b

__m128i _mm_setr_epi64(__m64 q0, __m64 q1)

Sets the 2 64-bit integer values in reverse order.

R0

R1

q0

q1

__m128i _mm_setr_epi32(int i0, int i1, int i2, int i3)

Sets the 4 signed 32-bit integer values in reverse order.

R0

R1

R2

R3

i0

i1

i2

i3

__m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)

Sets the 8 signed 16-bit integer values in reverse order.

R0

R1

...

R7

w0

w1

...

w7

__m128i _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)

Sets the 16 signed 8-bit integer values in reverse order.

R0

R1

...

R15

b0

b1

...

b15

__m128i _mm_setzero_si128()

Sets the 128-bit value to zero.

R

0x0

/*
 * Scalar reference conversion from three planar 8-bit inputs (Y, U, V) to
 * packed 3-byte pixels.
 *
 * Every plane is read with a row pitch of Width bytes. Output row YY starts
 * at RGB + YY * Stride and receives Width * 3 bytes. The three bytes of each
 * pixel are produced with the B_, G_ and R_ weights in that order, so byte 0
 * is the blue result and byte 2 the red result.
 *
 * The floating-point coefficients are pre-scaled by 2^Shift so the loop uses
 * integer arithmetic only; HalfV is added before shifting to round. Negative
 * intermediate sums are shifted right, so this relies on the compiler
 * performing an arithmetic shift for signed values.
 *
 * ClampToByte is defined elsewhere; presumably it saturates to 0..255.
 */
void YUVToRGB(unsigned char *Y, unsigned char *U, unsigned char *V, unsigned char *RGB, int Width, int Height, int Stride)
{
    const int Shift = 13;
    const int HalfV = 1 << (Shift - 1);
    const int B_U_WT = 2.03211f * (1 << Shift);
    const int G_U_WT = -0.39465f * (1 << Shift);
    const int G_V_WT = -0.58060f * (1 << Shift);
    const int R_V_WT = 1.13983 * (1 << Shift);

    for (int YY = 0; YY < Height; YY++) {
        unsigned char *LinePD = RGB + YY * Stride;
        const unsigned char *LinePY = Y + YY * Width;
        const unsigned char *LinePU = U + YY * Width;
        const unsigned char *LinePV = V + YY * Width;

        for (int XX = 0; XX < Width; XX++, LinePD += 3) {
            /* Chroma is stored with a +128 bias; remove it before weighting. */
            const int YV = LinePY[XX];
            const int UV = LinePU[XX] - 128;
            const int VV = LinePV[XX] - 128;

            LinePD[0] = ClampToByte(YV + ((B_U_WT * UV + HalfV) >> Shift));
            LinePD[1] = ClampToByte(YV + ((G_U_WT * UV + G_V_WT * VV + HalfV) >> Shift));
            LinePD[2] = ClampToByte(YV + ((R_V_WT * VV + HalfV) >> Shift));
        }
    }
}

/*
 * First-pass SSE implementation of YUV2RGB: a direct translation of the
 * scalar code above into SSE intrinsics, with no special tricks.
 */

/* Zero-extends the 16 unsigned bytes of Src to 32 bits; Dst[K] holds bytes 4K..4K+3. */
static inline void YUVToRGBSSE_Widen(__m128i Src, __m128i Dst[4])
{
    const __m128i Zero = _mm_setzero_si128();
    const __m128i Lo16 = _mm_unpacklo_epi8(Src, Zero);
    const __m128i Hi16 = _mm_unpackhi_epi8(Src, Zero);

    Dst[0] = _mm_unpacklo_epi16(Lo16, Zero);
    Dst[1] = _mm_unpackhi_epi16(Lo16, Zero);
    Dst[2] = _mm_unpacklo_epi16(Hi16, Zero);
    Dst[3] = _mm_unpackhi_epi16(Hi16, Zero);
}

/* Narrows four vectors of 32-bit lanes back to 16 bytes with unsigned saturation. */
static inline __m128i YUVToRGBSSE_Narrow(const __m128i Src[4])
{
    return _mm_packus_epi16(_mm_packus_epi32(Src[0], Src[1]), _mm_packus_epi32(Src[2], Src[3]));
}

/* Interleaves 16 blue, green and red bytes into 48 bytes at Dest (B0 G0 R0 B1 G1 R1 ...). */
static inline void YUVToRGBSSE_Store(unsigned char *Dest, __m128i Blue, __m128i Green, __m128i Red)
{
    /*
     * _mm_shuffle_epi8 yields zero wherever the mask byte is -1, so the three
     * shuffled planes occupy disjoint bytes and can simply be OR-ed together.
     */
    __m128i Dest1 = _mm_shuffle_epi8(Blue, _mm_setr_epi8(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5));
    Dest1 = _mm_or_si128(Dest1, _mm_shuffle_epi8(Green, _mm_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1)));
    Dest1 = _mm_or_si128(Dest1, _mm_shuffle_epi8(Red, _mm_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1)));

    __m128i Dest2 = _mm_shuffle_epi8(Blue, _mm_setr_epi8(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1));
    Dest2 = _mm_or_si128(Dest2, _mm_shuffle_epi8(Green, _mm_setr_epi8(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10)));
    Dest2 = _mm_or_si128(Dest2, _mm_shuffle_epi8(Red, _mm_setr_epi8(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1)));

    __m128i Dest3 = _mm_shuffle_epi8(Blue, _mm_setr_epi8(-1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1));
    Dest3 = _mm_or_si128(Dest3, _mm_shuffle_epi8(Green, _mm_setr_epi8(-1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1)));
    Dest3 = _mm_or_si128(Dest3, _mm_shuffle_epi8(Red, _mm_setr_epi8(10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15)));

    _mm_storeu_si128((__m128i *)(Dest + 0), Dest1);
    _mm_storeu_si128((__m128i *)(Dest + 16), Dest2);
    _mm_storeu_si128((__m128i *)(Dest + 32), Dest3);
}

/*
 * SSE version of YUVToRGB with the same parameters, plane layout and output
 * byte order. Each row is processed in blocks of 16 pixels: the bytes are
 * widened to 32-bit lanes, weighted exactly like the scalar formula, narrowed
 * back with saturation and interleaved into 48 output bytes. The remaining
 * Width % 16 pixels of each row use the scalar formula.
 *
 * The row pointers are never advanced; both loops index them with XX, so the
 * scalar tail continues at the right sample and writes to LinePD + XX * 3.
 *
 * Uses _mm_shuffle_epi8 (SSSE3) and _mm_mullo_epi32 / _mm_packus_epi32
 * (SSE4.1) in addition to SSE2 intrinsics.
 */
void YUVToRGBSSE_1(unsigned char *Y, unsigned char *U, unsigned char *V, unsigned char *RGB, int Width, int Height, int Stride)
{
    const int Shift = 13;
    const int HalfV = 1 << (Shift - 1);
    const int B_U_WT = 2.03211f * (1 << Shift);
    const int G_U_WT = -0.39465f * (1 << Shift);
    const int G_V_WT = -0.58060f * (1 << Shift);
    const int R_V_WT = 1.13983 * (1 << Shift);

    const __m128i Weight_B_U = _mm_set1_epi32(B_U_WT);
    const __m128i Weight_G_U = _mm_set1_epi32(G_U_WT);
    const __m128i Weight_G_V = _mm_set1_epi32(G_V_WT);
    const __m128i Weight_R_V = _mm_set1_epi32(R_V_WT);
    const __m128i Half = _mm_set1_epi32(HalfV);
    const __m128i C128 = _mm_set1_epi32(128);

    const int BlockSize = 16, Block = Width / BlockSize;

    for (int YY = 0; YY < Height; YY++) {
        unsigned char *LinePD = RGB + YY * Stride;
        const unsigned char *LinePY = Y + YY * Width;
        const unsigned char *LinePU = U + YY * Width;
        const unsigned char *LinePV = V + YY * Width;
        int XX = 0;

        for (; XX < Block * BlockSize; XX += BlockSize) {
            __m128i Y32[4], U32[4], V32[4];
            __m128i B32[4], G32[4], R32[4];

            YUVToRGBSSE_Widen(_mm_loadu_si128((const __m128i *)(LinePY + XX)), Y32);
            YUVToRGBSSE_Widen(_mm_loadu_si128((const __m128i *)(LinePU + XX)), U32);
            YUVToRGBSSE_Widen(_mm_loadu_si128((const __m128i *)(LinePV + XX)), V32);

            for (int K = 0; K < 4; K++) {
                /* Remove the +128 chroma bias after widening so the lanes can go negative. */
                const __m128i UC = _mm_sub_epi32(U32[K], C128);
                const __m128i VC = _mm_sub_epi32(V32[K], C128);

                const __m128i SumB = _mm_mullo_epi32(UC, Weight_B_U);
                const __m128i SumG = _mm_add_epi32(_mm_mullo_epi32(Weight_G_U, UC), _mm_mullo_epi32(Weight_G_V, VC));
                const __m128i SumR = _mm_mullo_epi32(VC, Weight_R_V);

                B32[K] = _mm_add_epi32(Y32[K], _mm_srai_epi32(_mm_add_epi32(Half, SumB), Shift));
                G32[K] = _mm_add_epi32(Y32[K], _mm_srai_epi32(_mm_add_epi32(Half, SumG), Shift));
                R32[K] = _mm_add_epi32(Y32[K], _mm_srai_epi32(_mm_add_epi32(Half, SumR), Shift));
            }

            YUVToRGBSSE_Store(LinePD + XX * 3,
                              YUVToRGBSSE_Narrow(B32),
                              YUVToRGBSSE_Narrow(G32),
                              YUVToRGBSSE_Narrow(R32));
        }

        /* Scalar tail for the last Width % BlockSize pixels of the row. */
        for (; XX < Width; XX++) {
            const int YV = LinePY[XX];
            const int UV = LinePU[XX] - 128;
            const int VV = LinePV[XX] - 128;
            unsigned char *Pixel = LinePD + XX * 3;

            Pixel[0] = ClampToByte(YV + ((B_U_WT * UV + HalfV) >> Shift));
            Pixel[1] = ClampToByte(YV + ((G_U_WT * UV + G_V_WT * VV + HalfV) >> Shift));
            Pixel[2] = ClampToByte(YV + ((R_V_WT * VV + HalfV) >> Shift));
        }
    }
}
03-10
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值