golang 布隆过滤器实现源码分析

最新推荐文章于 2025-05-09 16:47:09 发布

wanhf11

最新推荐文章于 2025-05-09 16:47:09 发布

阅读量1.6k

点赞数

CC 4.0 BY-SA版权

分类专栏： golang

本文链接：https://blog.youkuaiyun.com/qq_17612199/article/details/88775391

golang 专栏收录该内容

36 篇文章

订阅专栏

本文介绍了使用Golang实现的布隆过滤器，包括其结构体定义、构造方法及核心功能如添加数据、估计错误正例率等。还提供了一个根据预期元素数量和错误率来估算参数的方法。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

golang 布隆过滤器

“github.com/willf/bloom”

源码分析

结构体定义中包含 m 和 k 两个参数，通过 README 可以知道：
m 是位数组（bit 集合）的大小，即 bit 位的个数；而 k 是 hash 函数的个数

// BloomFilter is a Bloom filter: a structure used to test whether an item is a
// member of a set.
type BloomFilter struct {
    // m is the size of the filter in bits.
    m uint
    // k is the number of hashing functions applied to each item.
    k uint
    // b is the bit set that holds the filter's bits.
    b *bitset.BitSet
}

// New creates a new Bloom filter with _m_ bits and _k_ hashing functions.
// Both _m_ and _k_ are forced to be at least one to avoid panics.
func New(m uint, k uint) *BloomFilter {
    // Clamp once and reuse the clamped size for the allocation, so the bit set
    // is created for the same number of bits that the filter records, even
    // when the caller passes m == 0.
    size := max(1, m)
    hashes := max(1, k)
    return &BloomFilter{m: size, k: hashes, b: bitset.New(size)}
}

这里使用了bitset作为数组实现
结构体定义：

// A BitSet is a set of bits. The zero value of a BitSet is an empty set of length 0.
type BitSet struct {
    // length is the number of bits in the set.
    length uint
    // set stores the bits packed into uint64 words.
    set    []uint64
}

// New creates a new BitSet with a hint that length bits will be required.
// The returned set records length and is backed by a slice of
// wordsNeeded(length) uint64 words.
func New(length uint) (bset *BitSet) {
    // NOTE(review): the deferred recover handler is elided ("....") in this
    // excerpt; presumably it guards against a panic from the allocation
    // below. Confirm against the bitset source.
    defer recover ....
    bset = &BitSet{
        length,
        make([]uint64, wordsNeeded(length)), // number of uint64 words actually allocated
    }
    return bset
}

用一个 uint64 的 64 个 bit 位表示 0～63 这 64 个整数：第 i 位为 1 表示整数 i 在集合中
比如：

第一次add 0:
数组表示应该是1（1）
第二次add 10
数组表示应该是1024+1=1025（二进制 10000000001）
第三次add 64
因为 64 已经大于 63，第一个 uint64 放不下，所以只能再新增一个 uint64；此时数组应该有两个元素：1025 和 1

计算hash

// Add inserts data into the Bloom filter by setting one bit per hashing
// function. It returns the filter itself so that calls can be chained.
func (f *BloomFilter) Add(data []byte) *BloomFilter {
    hashes := baseHashes(data)
    // Each added item is represented by k bit positions, one for every
    // hashing function of the filter.
    // The original annotation notes that the underlying hash is murmurhash:
    // https://xiaobazhang.github.io/2018/06/19/MurmurHash%E7%AE%97%E6%B3%95/
    var n uint
    for n < f.k {
        f.b.Set(f.location(hashes, n))
        n++
    }
    return f
}

估算误判率（false positive rate，即碰撞率）

// EstimateFalsePositiveRate empirically measures the false positive rate of
// this filter, with its m bits and k hashing functions, while it stores n
// entries. Integers encoded as 4-byte big-endian keys are used as test data:
// the first uint32(n) integers are added, then 100,000 further keys are
// probed and the fraction reported as present is returned.
// As a side effect the filter is cleared, both before and after the run.
func (f *BloomFilter) EstimateFalsePositiveRate(n uint) (fpRate float64) {
    const rounds uint32 = 100000

    f.ClearAll()

    key := make([]byte, 4)
    stored := uint32(n)

    // Fill the filter with the keys 0 .. stored-1.
    for v := uint32(0); v < stored; v++ {
        binary.BigEndian.PutUint32(key, v)
        f.Add(key)
    }

    // Probe keys starting at stored+1; barring uint32 wrap-around none of
    // them was added above, so every hit counts as a false positive.
    falsePositives := 0
    for r := uint32(0); r < rounds; r++ {
        binary.BigEndian.PutUint32(key, r+stored+1)
        if f.Test(key) {
            falsePositives++
        }
    }

    f.ClearAll()
    return float64(falsePositives) / float64(rounds)
}

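根据 n（预期元素个数）和 fp（期望的误判率 p）估算 m 和 k：$m = \left\lceil -\dfrac{n \ln p}{(\ln 2)^2} \right\rceil$，$k = \left\lceil \dfrac{m}{n} \ln 2 \right\rceil$
公式的证明见：https://en.wikipedia.org/wiki/Bloom_filter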

// EstimateParameters estimates requirements for m and k: the number of bits
// and the number of hashing functions for a filter that should hold n items
// with a false positive rate of p.
// Based on https://bitbucket.org/ww/bloom/src/829aa19d01d9/bloom.go
// used with permission.
//
// The formulas are m = ceil(-n*ln(p) / (ln 2)^2) and k = ceil(ln(2) * m/n).
// They are only meaningful for n > 0 and 0 < p < 1. For any other input
// (n == 0, p <= 0, p >= 1, or p is NaN) the function returns m = 1 and k = 1
// instead of converting a NaN, infinite or negative floating-point value to
// uint, the result of which is implementation-dependent.
func EstimateParameters(n uint, p float64) (m uint, k uint) {
    // The negated form also rejects NaN, for which every ordered comparison
    // is false.
    if n == 0 || !(p > 0 && p < 1) {
        return 1, 1
    }
    items := float64(n)
    m = uint(math.Ceil(-1 * items * math.Log(p) / math.Pow(math.Log(2), 2)))
    k = uint(math.Ceil(math.Log(2) * float64(m) / items))
    return m, k
}

// NewWithEstimates builds a Bloom filter sized for about n items at a target
// false positive rate of fp. The bit count and the number of hashing
// functions come from EstimateParameters and are passed straight to New.
func NewWithEstimates(n uint, fp float64) *BloomFilter {
    return New(EstimateParameters(n, fp))
}