/*
 * Converts the raw bits of an IEEE 754 half-precision (binary16) number
 * into a single-precision float holding the same value.
 *
 * value   16-bit pattern: 1 sign bit, 5 exponent bits, 10 mantissa bits.
 * return  the equivalent float. Signed zeros, subnormals, normals,
 *         infinities and NaNs are all handled; the sign is preserved and
 *         NaN mantissa bits are moved into the top of the float mantissa.
 *
 * All bit manipulation is performed on uint32_t, so no shift ever moves a
 * set bit into (or out of) the sign position of a signed integer.
 * Assumes float is a 32-bit IEEE 754 binary32 value.
 */
SIMD_INLINE float Float16ToFloat32(uint16_t value)
{
    /* Lets the same 32 bits be viewed as a float or as an unsigned integer. */
    union Bits
    {
        float f;
        uint32_t ui;
    };

    const unsigned SHIFT = 13;      /* mantissa width difference: 23 - 10 */
    const unsigned SHIFT_SIGN = 16; /* sign bit position difference: 31 - 15 */

    const uint32_t INF_N = 0x7F800000u; /* flt32 infinity */
    const uint32_t MAX_N = 0x477FE000u; /* max flt16 normal as a flt32 */
    const uint32_t MIN_N = 0x38800000u; /* min flt16 normal as a flt32 */

    const uint32_t INF_C = INF_N >> SHIFT; /* flt32 infinity, down shifted */
    const uint32_t MAX_C = MAX_N >> SHIFT; /* max flt16 normal, down shifted */
    const uint32_t MIN_C = MIN_N >> SHIFT; /* min flt16 normal, down shifted */
    const uint32_t SIGN_C = 0x8000u;       /* flt16 sign bit */

    const uint32_t MUL_C = 0x33800000u; /* bit pattern of the float 2^-24 */
    const uint32_t SUB_C = 0x003FFu;    /* largest flt16 subnormal magnitude */
    const uint32_t NOR_C = 0x00400u;    /* smallest flt16 normal magnitude */

    const uint32_t MAX_D = INF_C - MAX_C - 1; /* lifts inf/NaN to exponent 0xFF */
    const uint32_t MIN_D = MIN_C - SUB_C - 1; /* exponent re-bias (127 - 15) << 10 */

    /* Split the input into the sign (already in flt32 position) and magnitude. */
    const uint32_t sign = ((uint32_t)value & SIGN_C) << SHIFT_SIGN;
    uint32_t magnitude = (uint32_t)value & (SIGN_C - 1u);

    /* Normal, infinite and NaN inputs: re-bias the exponent for flt32. */
    if (magnitude > SUB_C)
    {
        magnitude += MIN_D;
    }
    /* Anything now above the largest normal had an all-ones flt16 exponent
       (infinity or NaN): push its exponent up to the flt32 all-ones value. */
    if (magnitude > MAX_C)
    {
        magnitude += MAX_D;
    }

    union Bits result;
    if (magnitude < NOR_C)
    {
        /* Zero or subnormal: the value is magnitude * 2^-24, which a float
           represents exactly, so let the FPU build the bit pattern. */
        result.ui = MUL_C;
        result.f *= (float)magnitude;
    }
    else
    {
        /* Exponent and mantissa are final; move them into flt32 position. */
        result.ui = magnitude << SHIFT;
    }

    result.ui |= sign;
    return result.f;
}
SIMD中半精度浮点数转单精度浮点数的方法
最新推荐文章于 2024-12-25 20:25:13 发布