_mm_packus_epi16

本文详细解释了SSE指令集中的_mm_packus_epi16函数,包括其如何将16位整数饱和压缩到8位,并提供了示例代码来展示如何打印__m128i变量。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

Microsoft Specific

Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned integers and saturates.

          
__m128i _mm_packus_epi16 (__m128i a, __m128i b);
PACKUSWB
        

r0 := UnsignedSaturate(a0)
r1 := UnsignedSaturate(a1)
...
r7 := UnsignedSaturate(a7)
r8 := UnsignedSaturate(b0)
r9 := UnsignedSaturate(b1)
...
r15 := UnsignedSaturate(b7)

Header: emmintrin.h

END Microsoft Specific

https://msdn.microsoft.com/library/07ad1wx4(v=vs.110).aspx

---------------------------------------------------------------------------------

What does UnsignedSaturate in SSE instruction mean?

Basically, "saturation" means that values beyond some "max" value get set to "max", and values below a "min" value get set to "min". Usually, "min" and "max" are the values appropriate for some data type.

Thus, for example, if you take arithmetic on unsigned bytes, "128+128" would have to be "256" (which is hex 0x100), which doesn't fit into a byte. Normal integer arithmetic would create an overflow and discard the part that doesn't fit, which means "128+128 -> 0". With saturated arithmetic, "256 > 255" so the result is 255.

Another option would be scaling, which basically "compresses" the values to a smaller range. Saturation just cuts them off.

You can also use this to put larger types into smaller ones, like putting 16 bit values into 8 bit values. Your example most likely does exactly that, although you'll probably know better than I do what kind of types you are dealing with there.

"UnsignedSaturation" most likely has a min of "0" and a "max" of whatever the max of the result type is. Thus, negative inputs get turned into "0".

http://stackoverflow.com/questions/12141075/what-does-unsignedsaturate-in-sse-instruction-mean

------------------------------------------------------------------------------------------------

how to print a __m128i variable?

#include <iostream>
#include <sstream>
#include <emmintrin.h>
#include <stdint.h>

using namespace std;

// Formats the lanes of a 128-bit SSE integer vector as text.
//
// T selects the lane type (for example uint8_t, int16_t or int32_t); the
// vector is treated as sizeof(__m128i) / sizeof(T) lanes, printed from the
// lowest memory address upwards. Every lane is followed by one space, so the
// returned string ends with a trailing space.
template <typename T>
string __m128i_toString(const __m128i var)
{
    const unsigned int laneCount = sizeof(__m128i) / sizeof(T);

    // Store the vector into a correctly typed array rather than reading it
    // through a casted pointer: accessing a __m128i object through a T lvalue
    // breaks the strict-aliasing rule.
    T lanes[laneCount];
    _mm_storeu_si128((__m128i *)lanes, var);

    stringstream sstr;
    for (unsigned int i = 0; i < laneCount; i++)
    {
        // Unary plus applies integer promotion, so 8-bit lanes are printed as
        // numbers instead of characters; wider lanes print their own value.
        sstr << +lanes[i] << " ";
    }
    return sstr.str();
}

int main()
{
    __m128i resultLo = _mm_setr_epi16(800, 700, 600, 500, 400, 300, 200, 100);
    __m128i resultHi = _mm_setr_epi16(0, -100, -200, -300, -400, -500, -600, -700);
    __m128i result = _mm_packus_epi16(resultLo, resultHi);

    cout << __m128i_toString<int16_t>(resultLo) << endl;
    cout << __m128i_toString<int16_t>(resultHi) << endl;
    cout << __m128i_toString<uint8_t>(result) << endl;

    return 0;
}

http://stackoverflow.com/questions/13257166/print-a-m128i-variable

// Scalar reference conversion from three planar 8-bit channels (Y, U, V) to
// packed 3-byte pixels, using 13-bit fixed-point weights.
//
// Y, U, V : source planes; each row is read at offset YY * Width, one sample
//           per pixel in every plane.
// RGB     : destination; row YY starts at RGB + YY * Stride and holds three
//           bytes per pixel, written in the order blue, green, red.
// Width, Height : image size in pixels.
// Stride  : distance in bytes between destination rows.
//
// ClampToByte is defined elsewhere; presumably it clamps an int to 0..255 —
// verify against its definition.
void YUVToRGB(unsigned char *Y, unsigned char *U, unsigned char *V, unsigned char *RGB, int Width, int Height, int Stride)
{
    const int Shift = 13;
    const int HalfV = 1 << (Shift - 1);     // rounding term added before the shift
    const int B_U_WT = 2.03211f * (1 << Shift);
    const int G_U_WT = -0.39465f * (1 << Shift), G_V_WT = -0.58060f * (1 << Shift);
    const int R_V_WT = 1.13983 * (1 << Shift);

    for (int YY = 0; YY < Height; YY++)
    {
        unsigned char *LinePD = RGB + YY * Stride;
        unsigned char *LinePY = Y + YY * Width;
        unsigned char *LinePU = U + YY * Width;
        unsigned char *LinePV = V + YY * Width;
        for (int XX = 0; XX < Width; XX++, LinePD += 3)
        {
            // U and V are stored with a +128 bias; remove it before weighting.
            int YV = LinePY[XX], UV = LinePU[XX] - 128, VV = LinePV[XX] - 128;
            LinePD[0] = ClampToByte(YV + ((B_U_WT * UV + HalfV) >> Shift));
            LinePD[1] = ClampToByte(YV + ((G_U_WT * UV + G_V_WT * VV + HalfV) >> Shift));
            LinePD[2] = ClampToByte(YV + ((R_V_WT * VV + HalfV) >> Shift));
        }
    }
}

// A basic SSE implementation of the YUV-to-RGB conversion: the same arithmetic
// as the scalar version above, translated directly into SSE intrinsics with no
// further tricks.

// Zero-extends quarter Q (0..3) of the sixteen bytes in Src to four 32-bit lanes.
static inline __m128i YuvWidenQuarter(__m128i Src, int Q)
{
    const __m128i Zero = _mm_setzero_si128();
    const __m128i Half16 = (Q < 2) ? _mm_unpacklo_epi8(Src, Zero) : _mm_unpackhi_epi8(Src, Zero);
    return (Q & 1) ? _mm_unpackhi_epi16(Half16, Zero) : _mm_unpacklo_epi16(Half16, Zero);
}

// Builds one 16-byte output vector by byte-shuffling each channel with its mask and OR-ing the results.
static inline __m128i YuvInterleave3(__m128i B, __m128i G, __m128i R, __m128i MaskB, __m128i MaskG, __m128i MaskR)
{
    return _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(B, MaskB), _mm_shuffle_epi8(G, MaskG)),
                        _mm_shuffle_epi8(R, MaskR));
}

// SSE version of YUVToRGB with the same parameters and output layout.
//
// Processes 16 pixels per iteration and converts the remaining
// Width % 16 pixels of every row with the scalar formula. Uses
// _mm_shuffle_epi8 (SSSE3) and _mm_mullo_epi32 / _mm_packus_epi32 (SSE4.1),
// so <smmintrin.h> and a CPU with SSE4.1 are required.
void YUVToRGBSSE_1(unsigned char *Y, unsigned char *U, unsigned char *V, unsigned char *RGB, int Width, int Height, int Stride)
{
    const int Shift = 13;
    const int HalfV = 1 << (Shift - 1);
    const int B_U_WT = 2.03211f * (1 << Shift);
    const int G_U_WT = -0.39465f * (1 << Shift), G_V_WT = -0.58060f * (1 << Shift);
    const int R_V_WT = 1.13983 * (1 << Shift);
    const int BlockSize = 16, Block = Width / BlockSize;

    const __m128i Weight_B_U = _mm_set1_epi32(B_U_WT);
    const __m128i Weight_G_U = _mm_set1_epi32(G_U_WT), Weight_G_V = _mm_set1_epi32(G_V_WT);
    const __m128i Weight_R_V = _mm_set1_epi32(R_V_WT);
    const __m128i Half = _mm_set1_epi32(HalfV);
    const __m128i C128 = _mm_set1_epi32(128);

    // Shuffle masks that spread 16 blue, green and red bytes over three
    // 16-byte vectors in B,G,R,B,G,R,... order; -1 yields a zero byte.
    const __m128i MaskB1 = _mm_setr_epi8(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5);
    const __m128i MaskG1 = _mm_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1);
    const __m128i MaskR1 = _mm_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1);
    const __m128i MaskB2 = _mm_setr_epi8(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1);
    const __m128i MaskG2 = _mm_setr_epi8(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10);
    const __m128i MaskR2 = _mm_setr_epi8(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1);
    const __m128i MaskB3 = _mm_setr_epi8(-1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1);
    const __m128i MaskG3 = _mm_setr_epi8(-1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1);
    const __m128i MaskR3 = _mm_setr_epi8(10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15);

    for (int YY = 0; YY < Height; YY++)
    {
        unsigned char *LinePD = RGB + YY * Stride;
        unsigned char *LinePY = Y + YY * Width;
        unsigned char *LinePU = U + YY * Width;
        unsigned char *LinePV = V + YY * Width;

        // The row pointers stay fixed and every access is indexed by XX, so
        // the vector loop and the scalar tail agree on where each pixel lives.
        for (int XX = 0; XX < Block * BlockSize; XX += BlockSize)
        {
            const __m128i YV = _mm_loadu_si128((const __m128i *)(LinePY + XX));
            const __m128i UV = _mm_loadu_si128((const __m128i *)(LinePU + XX));
            const __m128i VV = _mm_loadu_si128((const __m128i *)(LinePV + XX));

            __m128i B32[4], G32[4], R32[4];
            for (int Q = 0; Q < 4; Q++)
            {
                const __m128i Y32 = YuvWidenQuarter(YV, Q);
                const __m128i U32 = _mm_sub_epi32(YuvWidenQuarter(UV, Q), C128);
                const __m128i V32 = _mm_sub_epi32(YuvWidenQuarter(VV, Q), C128);

                const __m128i TermB = _mm_mullo_epi32(U32, Weight_B_U);
                const __m128i TermG = _mm_add_epi32(_mm_mullo_epi32(U32, Weight_G_U), _mm_mullo_epi32(V32, Weight_G_V));
                const __m128i TermR = _mm_mullo_epi32(V32, Weight_R_V);

                B32[Q] = _mm_add_epi32(Y32, _mm_srai_epi32(_mm_add_epi32(TermB, Half), Shift));
                G32[Q] = _mm_add_epi32(Y32, _mm_srai_epi32(_mm_add_epi32(TermG, Half), Shift));
                R32[Q] = _mm_add_epi32(Y32, _mm_srai_epi32(_mm_add_epi32(TermR, Half), Shift));
            }

            // Saturating packs narrow 32 -> 16 -> 8 bits, clamping to 0..255.
            const __m128i Blue = _mm_packus_epi16(_mm_packus_epi32(B32[0], B32[1]), _mm_packus_epi32(B32[2], B32[3]));
            const __m128i Green = _mm_packus_epi16(_mm_packus_epi32(G32[0], G32[1]), _mm_packus_epi32(G32[2], G32[3]));
            const __m128i Red = _mm_packus_epi16(_mm_packus_epi32(R32[0], R32[1]), _mm_packus_epi32(R32[2], R32[3]));

            // 16 pixels * 3 bytes = 48 output bytes, written as three vectors.
            unsigned char *Dest = LinePD + XX * 3;
            _mm_storeu_si128((__m128i *)(Dest), YuvInterleave3(Blue, Green, Red, MaskB1, MaskG1, MaskR1));
            _mm_storeu_si128((__m128i *)(Dest + BlockSize), YuvInterleave3(Blue, Green, Red, MaskB2, MaskG2, MaskR2));
            _mm_storeu_si128((__m128i *)(Dest + BlockSize * 2), YuvInterleave3(Blue, Green, Red, MaskB3, MaskG3, MaskR3));
        }

        // Scalar tail for the last Width % BlockSize pixels of the row. Each
        // pixel occupies three destination bytes, hence the XX * 3 offset.
        for (int XX = Block * BlockSize; XX < Width; XX++)
        {
            int YV = LinePY[XX], UV = LinePU[XX] - 128, VV = LinePV[XX] - 128;
            unsigned char *Pixel = LinePD + XX * 3;
            Pixel[0] = ClampToByte(YV + ((B_U_WT * UV + HalfV) >> Shift));
            Pixel[1] = ClampToByte(YV + ((G_U_WT * UV + G_V_WT * VV + HalfV) >> Shift));
            Pixel[2] = ClampToByte(YV + ((R_V_WT * VV + HalfV) >> Shift));
        }
    }
}
03-10
if ( v34 > 2 ) { si128 = _mm_load_si128((const __m128i *)&xmmword_18003A4E0); v43 = v33 + 4; v44 = v32 + 1; inserted = _mm_insert_epi16( _mm_insert_epi16( _mm_insert_epi16( _mm_insert_epi16( _mm_insert_epi16( _mm_insert_epi16( _mm_insert_epi16(_mm_cvtsi32_si128((unsigned __int16)v40), v39, 1), (unsigned __int16)v40, 2), v39, 3), (unsigned __int16)v40, 4), v39, 5), (unsigned __int16)v40, 6), v39, 7); v46 = ((unsigned int)(v34 - 3) >> 3) + 1; v47 = _mm_insert_epi16( _mm_insert_epi16( _mm_insert_epi16( _mm_insert_epi16(_mm_load_si128((const __m128i *)&xmmword_18003A570), v37, 0), v37, 2), v37, 4), v37, 6); v48 = _mm_insert_epi16( _mm_insert_epi16( _mm_insert_epi16( _mm_insert_epi16( _mm_insert_epi16( _mm_insert_epi16( _mm_insert_epi16(_mm_cvtsi32_si128((unsigned __int16)v41), v38, 1), (unsigned __int16)v41, 2), v38, 3), (unsigned __int16)v41, 4), v38, 5), (unsigned __int16)v41, 6), v38, 7); do { v49 = _mm_loadl_epi64((const __m128i *)(v44 - 1)); v43 += 16; v44 += 8i64; v50 = _mm_cvtepu8_epi16(v49); v51 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)(v44 - 8))); v52 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)(v44 - 7))); v53 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)(v44 - 6))); v54 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)(v44 - 5))); *((__m128i *)v43 - 1) = _mm_packus_epi32( _mm_srai_epi32( _mm_add_epi32( _mm_add_epi32( _mm_madd_epi16(_mm_unpacklo_epi16(v52, v53), v48), _mm_madd_epi16(_mm_unpacklo_epi16(v50, v51), inserted)), _mm_madd_epi16(_mm_unpacklo_epi16(v54, si128), v47)), 0xAu), _mm_srai_epi32( _mm_add_epi32( _mm_add_epi32( _mm_madd_epi16(_mm_unpackhi_epi16(v52, v53), v48), _mm_madd_epi16(_mm_unpackhi_epi16(v50, v51), inserted)), _mm_madd_epi16(_mm_unpackhi_epi16(v54, si128), v47)), 0xAu)); --v46; } while ( v46 ); } 将上述代码转成c++并解释功能
最新发布
03-28
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值