golang 布隆过滤器
源码分析
结构体定义,m和k,通过README可以知道,
m是数组集合大小,而k是hash函数个数
// BloomFilter bundles the two parameters of a Bloom filter with the bit
// storage that backs it.
type BloomFilter struct {
	// m is the number of bits in the filter; k is the number of hashing
	// functions applied to every item.
	m, k uint
	// b is the bit set holding the filter's bits.
	b *bitset.BitSet
}
// New builds a Bloom filter with m bits and k hashing functions.
// The stored m and k are clamped to a minimum of one to avoid panics.
// The backing bit set is created from the caller's m exactly as passed in,
// not from the clamped value.
func New(m uint, k uint) *BloomFilter {
	filter := &BloomFilter{
		m: max(1, m),
		k: max(1, k),
		b: bitset.New(m),
	}
	return filter
}
这里使用了bitset作为数组实现
结构体定义:
// BitSet is a set of bits packed into 64-bit words.
// Its zero value is an empty set of length 0.
type BitSet struct {
	// length is the number of bits the set was created for.
	length uint
	// set holds the bits, 64 per uint64 word.
	set []uint64
}
// New creates a new BitSet with a hint that length bits will be required.
// It returns a pointer to a BitSet whose length field is the requested bit
// count and whose backing slice has wordsNeeded(length) uint64 words.
// NOTE(review): the deferred recover handler is elided ("....") in this
// excerpt; its behavior cannot be determined from here.
func New(length uint) (bset *BitSet) {
defer recover ....
bset = &BitSet{
length,
make([]uint64, wordsNeeded(length)), // compute the length that is actually allocated
}
return bset
}
每个uint64有64个bit,可以表示0~63这64个整数是否存在
比如:
第一次add 0:
数组表示应该是1(1)
第二次add 10
数组表示应该是1024+1=1025(二进制10000000001)
第三次add 64
因为64已经大于63,第一个uint64放不下,只能再追加一个uint64,所以数组应该有两个元素:1025和1
计算hash
// Add records data in the Bloom filter and returns the filter itself so
// that calls can be chained.
func (f *BloomFilter) Add(data []byte) *BloomFilter {
	// Per the original author's note, the hash behind baseHashes is MurmurHash:
	// https://xiaobazhang.github.io/2018/06/19/MurmurHash%E7%AE%97%E6%B3%95/
	hashes := baseHashes(data)

	// Run k times: each item is represented by k bit positions, one per
	// hash index, and every one of them is set here.
	for idx := uint(0); idx < f.k; idx++ {
		pos := f.location(hashes, idx)
		f.b.Set(pos)
	}
	return f
}
估算误判率(false positive rate)
// EstimateFalsePositiveRate measures empirically what the false positive
// rate of this filter (with its m bits and k hash functions) is while it
// stores n entries. Keys are 4-byte big-endian integers: the values
// 0..n-1 are added, then 100,000 further values starting at n+1 are probed
// and the fraction reported as present is returned.
//
// As a side effect the filter is cleared, both before the measurement and
// after it.
func (f *BloomFilter) EstimateFalsePositiveRate(n uint) (fpRate float64) {
	const rounds = uint32(100000)
	stored := uint32(n)
	key := make([]byte, 4)

	// Start from an empty filter and insert the n keys.
	f.ClearAll()
	for v := uint32(0); v < stored; v++ {
		binary.BigEndian.PutUint32(key, v)
		f.Add(key)
	}

	// Probe keys that were never inserted; every hit is a false positive.
	hits := 0
	for probe := uint32(0); probe < rounds; probe++ {
		binary.BigEndian.PutUint32(key, probe+stored+1)
		if f.Test(key) {
			hits++
		}
	}

	f.ClearAll()
	return float64(hits) / float64(rounds)
}
根据n和fp估算m和k
证明公式:https://en.wikipedia.org/wiki/Bloom_filter
// EstimateParameters estimates the number of bits m and the number of hash
// functions k needed to hold n items at false positive rate p:
//
//	m = ceil(-n * ln(p) / (ln 2)^2)
//	k = ceil(ln 2 * m / n)
//
// Based on https://bitbucket.org/ww/bloom/src/829aa19d01d9/bloom.go
// used with permission.
func EstimateParameters(n uint, p float64) (m uint, k uint) {
	items := float64(n)
	ln2 := math.Log(2)

	bits := -1 * items * math.Log(p) / math.Pow(ln2, 2)
	m = uint(math.Ceil(bits))

	hashes := ln2 * float64(m) / items
	k = uint(math.Ceil(hashes))

	return m, k
}
// NewWithEstimates builds a Bloom filter sized for about n items at false
// positive rate fp, using the m and k computed by EstimateParameters.
func NewWithEstimates(n uint, fp float64) *BloomFilter {
	// EstimateParameters yields (m, k), which is exactly New's parameter list.
	return New(EstimateParameters(n, fp))
}