SIMD中半精度浮点数转单精度浮点数的方法

最新推荐文章于 2024-12-25 20:25:13 发布

veaglefly

最新推荐文章于 2024-12-25 20:25:13 发布

阅读量1.1k

点赞数

分类专栏：其他说明

本文链接：https://blog.youkuaiyun.com/u012675743/article/details/105080773

版权

其他说明专栏收录该内容

3 篇文章

订阅专栏

/**
 * Converts an IEEE 754 half-precision (binary16) bit pattern to a float.
 *
 * All bit manipulation is performed on unsigned integers, so the function
 * does not depend on signed-shift or signed-overflow behaviour (the sign
 * bit is moved into bit 31 with an unsigned shift).
 *
 * @param value raw 16-bit encoding: 1 sign bit, 5 exponent bits,
 *              10 mantissa bits.
 * @return the single-precision value with the same sign and magnitude.
 *         Zeros and subnormals are converted exactly, infinities map to
 *         infinities, and NaNs keep their mantissa bits shifted into the
 *         top of the float mantissa.
 */
SIMD_INLINE float Float16ToFloat32(uint16_t value)
{
	union Bits
	{
		float f;
		uint32_t ui;
	};

	const int SHIFT = 13;      // mantissa width difference: 23 - 10
	const int SHIFT_SIGN = 16; // distance between the flt16 and flt32 sign bits

	const uint32_t SIGN_C = 0x8000u; // flt16 sign bit
	const uint32_t NOR_C = 0x0400u;  // min flt16 normal (magnitude bits)
	const uint32_t INF_C = 0x7C00u;  // flt16 infinity (magnitude bits)

	// Exponent bias difference (127 - 15), positioned at the flt16 exponent field.
	const uint32_t EXP_ADJUST = (uint32_t)(127 - 15) << 10;

	const uint32_t sign = ((uint32_t)value & SIGN_C) << SHIFT_SIGN;
	const uint32_t magnitude = (uint32_t)value & (SIGN_C - 1u);
	union Bits result;

	if (magnitude < NOR_C)
	{
		// Zero or subnormal: the value is magnitude * 2^-24. Both factors and
		// the product are exactly representable in a float.
		result.f = (float)magnitude * (1.0f / 16777216.0f);
	}
	else
	{
		uint32_t rebiased = magnitude + EXP_ADJUST;
		if (magnitude >= INF_C)
		{
			// Inf/NaN: the exponent field is now 31 + 112 = 143; a second
			// adjustment of 112 moves it to 255, the flt32 all-ones exponent.
			rebiased += EXP_ADJUST;
		}
		result.ui = rebiased << SHIFT;
	}

	result.ui |= sign;
	return result.f;
}