/**
 * Pack two vectors of four unsigned 32-bit lanes into a single vector of
 * eight unsigned 16-bit lanes.
 *
 * Every lane is narrowed with unsigned saturation (vqmovn_u32), so a value
 * that does not fit in 16 bits is clamped to 0xFFFF instead of wrapping.
 * The narrowed lanes of `a` form the low half of the result and the
 * narrowed lanes of `b` form the high half (vcombine_u16).
 */
inline uint16x8_t v_pack(const uint32x4_t& a, const uint32x4_t& b)
{
    return vcombine_u16(vqmovn_u32(a), vqmovn_u32(b));
}
uint16x4_t = vqmovn_u32(const uint32x4_t) 将每个 lane 的位宽减半(无符号 32 位 → 无符号 16 位),超出 16 位范围的值饱和为 0xFFFF,而不是直接丢弃高位
uint16x8_t = vcombine_u16(const uint16x4_t low, const uint16x4_t high) 将两个 uint16x4_t 拼接成一个 uint16x8_t:第一个参数作为低 4 个 lane,第二个参数作为高 4 个 lane
/**
 * Pack two vectors of eight signed 16-bit lanes into a single vector of
 * sixteen unsigned 8-bit lanes, applying a rounding right shift by 2 first.
 *
 * Each lane goes through vqrshrun_n_s16: it is shifted right by 2 with
 * rounding, then saturated to the unsigned 8-bit range, so negative inputs
 * become 0 and results above 255 become 255. The converted lanes of `a`
 * form the low half of the result and those of `b` the high half.
 */
inline uint8x16_t v_rshr_pack_u(const int16x8_t& a, const int16x8_t& b)
{
    // The intrinsic takes its shift count as an immediate, so it has to be
    // a compile-time constant.
    const int kRoundShift = 2;
    return vcombine_u8(vqrshrun_n_s16(a, kRoundShift),
                       vqrshrun_n_s16(b, kRoundShift));
}
uint8x8_t = vqrshrun_n_s16(const int16x8_t, const int n) 将每个有符号 16 位值带舍入地右移 n 位(n 必须是 1~8 的编译期常量),然后饱和截断为无符号 8 位(位宽减半,负数饱和为 0,大于 255 的值饱和为 255)
inline int16x8_t v_mul_hi(const int16x8_t& a, const int16x8_t& b)
{
return int16x8_t(vcombine_s16(
vshrn_n_s32(vmull_s16(vget_low_s16(a), vget_low_s16(b)), 16),
vshrn_n_s32(vmull_s16(vget_high_s16(a), vget_high_s16(b