1、简单实现
下面代码是Adler-32算法的简单实现,我们来整理一下这段代码的逻辑:
A = 1 + D1 + D2 + ... + Dn (mod 65521)
B = (1 + D1) + (1 + D1 + D2) + ... + (1 + D1 + D2 + ... + Dn) (mod 65521)
= nxD1 + (n-1) x D2 + (n-2) x D3 + ... + Dn + n (mod 65521)
Adler-32(D) = (B x 65536) + A
看到这个公式相信大家已经有了思路,对于A的计算很简单简单的相加,对于B的相加需要一个系数
const uint32_t MOD_ADLER = 65521;
uint32_t adler32(unsigned char *data, size_t len)
/*
where data is the location of the data in physical memory and
len is the length of the data in bytes
*/
{
uint32_t a = 1, b = 0;
size_t index;
// Process each byte of the data in order
for (index = 0; index < len; ++index)
{
a = (a + data[index]) % MOD_ADLER;
b = (b + a) % MOD_ADLER;
}
return (b << 16) | a;
}
(b << 16) | a
等价于 bx65536+a
2、Neon优化代码
下面是neon优化的代码,taps就是第一节所说的系数,该段neon代码是32个元素同时计算,其中有一个重要的部分 s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5));
adacc代表的是A,向左偏移5位,即代表乘以32,这个很好理解,例如:
- 第一轮循环,前32个数参与计算,结果为32xD1 + 31xD2 + … + 1xD32 + 32
- 第二轮循环,按照第一节的公式,D1需要乘以64,恰好adacc等于1+D1 +D2 + … + D32,将其左移5位后再与s2acc相加,即为前32个元素(第一轮循环)正确的计算结果(在第二轮循环中),第二轮 循环的32个元素正常计算即可
- 后续循环同理,最后进行归约即可
其中neon intrinsic 用到了饱和度转换,相当于原算法中的取模运算
static void NEON_accum32(uint32_t *s, const unsigned char *buf,
z_size_t len)
{
/* Please refer to the 'Algorithm' section of:
* https://en.wikipedia.org/wiki/Adler-32
* Here, 'taps' represents the 'n' scalar multiplier of 'B', which
* will be multiplied and accumulated.
*/
static const uint8_t taps[32] = {
32, 31, 30, 29, 28, 27, 26, 25,
24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9,
8, 7, 6, 5, 4, 3, 2, 1 };
/* This may result in some register spilling (and 4 unnecessary VMOVs). */
const uint8x16_t t0 = vld1q_u8(taps);
const uint8x16_t t1 = vld1q_u8(taps + 16);
const uint8x8_t n_first_low = vget_low_u8(t0);
const uint8x8_t n_first_high = vget_high_u8(t0);
const uint8x8_t n_second_low = vget_low_u8(t1);
const uint8x8_t n_second_high = vget_high_u8(t1);
uint32x2_t adacc2, s2acc2, as;
uint16x8_t adler, sum2;
uint8x16_t d0, d1;
uint32x4_t adacc = vdupq_n_u32(0);
uint32x4_t s2acc = vdupq_n_u32(0);
adacc = vsetq_lane_u32(s[0], adacc, 0);
s2acc = vsetq_lane_u32(s[1], s2acc, 0);
/* Think of it as a vectorized form of the code implemented to
* handle the tail (or a DO16 on steroids). But in this case
* we handle 32 elements and better exploit the pipeline.
*/
while (len >= 2) {
d0 = vld1q_u8(buf);
d1 = vld1q_u8(buf + 16);
s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5));
adler = vpaddlq_u8(d0);
adler = vpadalq_u8(adler, d1);
sum2 = vmull_u8(n_first_low, vget_low_u8(d0));
sum2 = vmlal_u8(sum2, n_first_high, vget_high_u8(d0));
sum2 = vmlal_u8(sum2, n_second_low, vget_low_u8(d1));
sum2 = vmlal_u8(sum2, n_second_high, vget_high_u8(d1));
adacc = vpadalq_u16(adacc, adler);
s2acc = vpadalq_u16(s2acc, sum2);
len -= 2;
buf += 32;
}
/* This is the same as before, but we only handle 16 elements as
* we are almost done.
*/
while (len > 0) {
d0 = vld1q_u8(buf);
s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4));
adler = vpaddlq_u8(d0);
sum2 = vmull_u8(n_second_low, vget_low_u8(d0));
sum2 = vmlal_u8(sum2, n_second_high, vget_high_u8(d0));
adacc = vpadalq_u16(adacc, adler);
s2acc = vpadalq_u16(s2acc, sum2);
buf += 16;
len--;
}
/* Combine the accumulated components (adler and sum2). */
adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
as = vpadd_u32(adacc2, s2acc2);
/* Store the results. */
s[0] = vget_lane_u32(as, 0);
s[1] = vget_lane_u32(as, 1);
}
最后解释一下s[0] = vget_lane_u32(as, 0); s[1] = vget_lane_u32(as, 1);
最后两行代码,我们需要注意 as = vpadd_u32(adacc2, s2acc2);
,这段代码将前两个元素相加,后两个元素相加,刚好就是 A 和 B(第一节算法逻辑中的A和B),分别将其放入s数组中,即s[0]=A, s[1]=B
附加一段代码,让大家思考一下数组到数字的转换
#include <iostream>
void test(uint16_t *s){
s[0] = 1;
s[1] = 1;
}
int main(){
uint16_t a[2];
test(a);
uint32_t *s = reinterpret_cast<uint32_t*>(a);
std::cout << *s << std::endl;//65537
return 0;
}