哈希表的扩展-布隆过滤器

最新推荐文章于 2025-07-13 19:41:07 发布

原创最新推荐文章于 2025-07-13 19:41:07 发布 · 548 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#布隆过滤器 #哈希表的扩展 #位图

数据结构专栏收录该内容

17 篇文章

订阅专栏

布隆过滤器的简介

什么是布隆过滤器？

布隆过滤器(Bloom Filter)是1970年由布隆提出的。它实际上是一个很长的二进制向量和一系列随机映射函数。布隆过滤器可以用于检索一个元素是否在一个集合中。它的优点是空间效率和查询时间都远远超过一般的算法，缺点是有一定的误识别率和删除困难。

为什么会出现布隆过滤器？

在日常生活中，包括在设计计算机软件时，我们经常需要判断一个元素是否在一个集合中。比如：检查一个单词是否拼写正确（即是否在已知的字典中）；判断一个嫌疑人的名字是否已经在嫌疑名单中；在网络爬虫中，判断一个网址是否已被访问过，等等。最直接的方法是将集合中的全部元素存储在计算机中，遇到一个新元素时，将它和集合中的元素直接比较。计算机中的集合通常用哈希表存储，优点是快速准确，缺点是耗费存储空间。为了节省空间，我们可以通过哈希函数将集合中的每个元素映射到位图（bitmap）中的一个位上，这样既节省了空间，又缩短了查找时间。但是由于哈希冲突，不同的元素经过散列函数之后可能映射到同一个位，因此可能产生误判。布隆过滤器使用多个哈希函数把一个元素映射到多个位上，只有这些位全部为1时才认为元素可能存在，从而降低误判率。

布隆过滤器有哪些应用？
1、Google著名的分布式数据库Bigtable使用布隆过滤器来查找不存在的行或列，以减少磁盘查找IO的次数。
2、Squid网页代理缓存服务器在cache digests中使用了布隆过滤器。
3、Venti文档存储系统也采用布隆过滤器来检测先前存储的数据。
4、SPIN模型检测器使用布隆过滤器在大规模验证问题时跟踪可达状态空间。
5、Google Chrome浏览器使用布隆过滤器加速安全浏览服务。
6、很多Key-Value系统也使用布隆过滤器来加快查询过程，如：HBase、Accumulo、LevelDB。

简单的实现布隆过滤器

BitSet.h
#pragma once

#include <vector>

// Fixed-capacity bitmap: one bit per non-negative integer, packed 32 bits
// per storage word (assumes unsigned int is at least 32 bits wide).
class BitSet
{
public:
    // Allocates (range >> 5) + 1 zeroed words, which is enough to address
    // every bit index in [0, range].
    BitSet(size_t range)
    {
        _a.resize((range >> 5) + 1, 0);
    }

    // Sets the bit for num to 1.
    // Throws std::out_of_range (via std::vector::at) when num maps to a word
    // beyond the allocated storage instead of writing out of bounds.
    void Set(size_t num)
    {
        _a.at(WordIndex(num)) |= Mask(num);
    }

    // Clears the bit for num to 0.
    // Throws std::out_of_range (via std::vector::at) for an unallocated word.
    void ReSet(size_t num)
    {
        _a.at(WordIndex(num)) &= ~Mask(num);
    }

    // Returns true when the bit for num is 1, false when it is 0.
    // Throws std::out_of_range (via std::vector::at) for an unallocated word.
    bool Test(size_t num) const
    {
        return (_a.at(WordIndex(num)) & Mask(num)) != 0;
    }

protected:
    // Words are unsigned so that bit 31 can be set/cleared without shifting
    // into the sign bit of a signed int.
    std::vector<unsigned int> _a;

private:
    // Index of the storage word that holds bit num.
    static size_t WordIndex(size_t num)
    {
        return num >> 5;
    }

    // Single-bit mask for num inside its word; built from 1u so that
    // position 31 is a well-defined unsigned shift.
    static unsigned int Mask(size_t num)
    {
        return 1u << (num % 32);
    }
};

BloomFilter.h
#pragma once

// Hash functor #1 for the Bloom filter: BKDR string hash.
// The template parameter K is not used by the body; keys are taken as
// std::string.
template <typename K>
struct _Func1
{
    // Hashes the NUL-terminated string str: hash = hash * 131 + byte for each
    // byte up to (not including) the terminator. An empty string yields 0.
    // Bytes are widened from plain char, so results for non-ASCII bytes
    // depend on whether char is signed on the platform.
    size_t BKDRHash(const char *str) const
    {
        // No `register` specifier: it was removed from the language in C++17.
        size_t hash = 0;
        while (size_t ch = static_cast<size_t>(*str++))
        {
            hash = hash * 131 + ch;   // other usual multipliers: 31, 1313, 13131, 131313...
        }
        return hash;
    }

    // Hashes key through its c_str(), i.e. up to the first NUL byte.
    size_t operator()(const std::string& key) const
    {
        return BKDRHash(key.c_str());
    }
};

// Hash functor #2 for the Bloom filter: SDBM string hash.
// The template parameter K is not used by the body; keys are taken as
// std::string.
template <typename K>
struct _Func2
{
    // Hashes the NUL-terminated string str: hash = 65599 * hash + byte for
    // each byte up to (not including) the terminator. An empty string yields 0.
    // Bytes are widened from plain char, so results for non-ASCII bytes
    // depend on whether char is signed on the platform.
    size_t SDBMHash(const char *str) const
    {
        // No `register` specifier: it was removed from the language in C++17.
        size_t hash = 0;
        while (size_t ch = static_cast<size_t>(*str++))
        {
            // Shift form of the same step:
            // hash = ch + (hash << 6) + (hash << 16) - hash;
            hash = 65599 * hash + ch;
        }
        return hash;
    }

    // Hashes key through its c_str(), i.e. up to the first NUL byte.
    size_t operator()(const std::string& key) const
    {
        return SDBMHash(key.c_str());
    }
};

// Hash functor #3 for the Bloom filter: RS string hash.
// The template parameter K is not used by the body; keys are taken as
// std::string.
template <typename K>
struct _Func3
{
    // Hashes the NUL-terminated string str: hash = hash * magic + byte, where
    // magic starts at 63689 and is multiplied by 378551 after every byte.
    // Arithmetic wraps modulo the width of size_t. An empty string yields 0.
    // Bytes are widened from plain char, so results for non-ASCII bytes
    // depend on whether char is signed on the platform.
    size_t RSHash(const char *str) const
    {
        // No `register` specifier: it was removed from the language in C++17.
        size_t hash = 0;
        size_t magic = 63689;
        while (size_t ch = static_cast<size_t>(*str++))
        {
            hash = hash * magic + ch;
            magic *= 378551;
        }
        return hash;
    }

    // Hashes key through its c_str(), i.e. up to the first NUL byte.
    size_t operator()(const std::string& key) const
    {
        return RSHash(key.c_str());
    }
};

// Hash functor #4 for the Bloom filter: AP string hash.
// The template parameter K is not used by the body; keys are taken as
// std::string.
template <typename K>
struct _Func4
{
    // Hashes the NUL-terminated string str, alternating between two mixing
    // steps: bytes at even positions use the (<< 7, >> 3) step, bytes at odd
    // positions use the complemented (<< 11, >> 5) step.
    // An empty string yields 0.
    // Bytes are widened from plain char, so results for non-ASCII bytes
    // depend on whether char is signed on the platform.
    size_t APHash(const char *str) const
    {
        // No `register` specifier: it was removed from the language in C++17.
        size_t hash = 0;
        size_t ch;
        // The loop ends when the terminating NUL byte is read.
        for (long i = 0; (ch = static_cast<size_t>(*str++)) != 0; i++)
        {
            if ((i & 1) == 0)
            {
                hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
            }
            else
            {
                hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
            }
        }
        return hash;
    }

    // Hashes key through its c_str(), i.e. up to the first NUL byte.
    size_t operator()(const std::string& key) const
    {
        return APHash(key.c_str());
    }
};

// Hash functor #5 for the Bloom filter: JS string hash.
// The template parameter K is not used by the body; keys are taken as
// std::string.
template <typename K>
struct _Func5
{
    // Hashes the NUL-terminated string str starting from the seed 1315423911
    // and applying hash ^= (hash << 5) + byte + (hash >> 2) for each byte.
    // Bytes are widened from plain char, so results for non-ASCII bytes
    // depend on whether char is signed on the platform.
    size_t JSHash(const char *str) const
    {
        // Special case added by the article's author so that an empty string
        // hashes to 0 rather than to the seed.
        if (!*str)
            return 0;
        // No `register` specifier: it was removed from the language in C++17.
        size_t hash = 1315423911;
        while (size_t ch = static_cast<size_t>(*str++))
        {
            hash ^= ((hash << 5) + ch + (hash >> 2));
        }
        return hash;
    }

    // Hashes key through its c_str(), i.e. up to the first NUL byte.
    size_t operator()(const std::string& key) const
    {
        return JSHash(key.c_str());
    }
};

template <typename K = string
    , typename Func1 = _Func1<K>
    , typename Func2 = _Func2<K>
    , typename Func3 = _Func3<K>
    , typename Func4 = _Func4<K>
    , typename Func5 = _Func5<K >>
class BloomFilter
{
public:
    BloomFilter(const size_t range)
        :_s1(range)
        , _size(range)
    {}

    void Set(const K& key)
    {
        size_t index1 = Func1()(key.c_str()) % _size;
        size_t index2 = Func2()(key.c_str()) % _size;
        size_t index3 = Func3()(key.c_str()) % _size;
        size_t index4 = Func4()(key.c_str()) % _size;
        size_t index5 = Func5()(key.c_str()) % _size;

        _s1.Set(index1);
        _s1.Set(index2);
        _s1.Set(index3);
        _s1.Set(index4);
        _s1.Set(index5);
    }

    bool Test(const K& key)
    {
        size_t index1 = Func1()(key.c_str()) % _size;
        _s1.Test(index1);
        if (_s1.Test(index1) == 0)
            return false;

        size_t index2 = Func2()(key.c_str()) % _size;
        _s1.Test(index2);
        if (_s1.Test(index2) == 0)
            return false;

        size_t index3 = Func3()(key.c_str()) % _size;
        _s1.Test(index3);
        if (_s1.Test(index3) == 0)
            return false;

        size_t index4 = Func4()(key.c_str()) % _size;
        _s1.Test(index4);
        if (_s1.Test(index4) == 0)
            return false;

        size_t index5 = Func1()(key.c_str()) % _size;
        _s1.Test(index5);
        if (_s1.Test(index5) == 0)
            return false;
        return true;
    }
protected:
    BitSet _s1;
    size_t _size;
};

// Demo for BloomFilter: inserts a fixed list of keys into a filter of size
// 1000, then prints one "Is True?:" line per probe key. The bool returned by
// Test() is streamed directly, so each line ends in 1 or 0.
void TestBloomFilter()
{
    BloomFilter<> bf1(1000);

    // Keys added to the filter, in insertion order.
    const char* const inserted[] = {
        "sort",
        "man",
        "left",
        "123",
        "真的",
        "https://hao.360.cn/?a1006",
        "https://hao.360.cn/?a10061",
        "https://hao.360.cn/?a10062",
        "https://hao.360.cn/?a10063",
        "https://hao.360.cn/?a10064"
    };
    const size_t insertedCount = sizeof(inserted) / sizeof(inserted[0]);
    for (size_t i = 0; i < insertedCount; ++i)
    {
        bf1.Set(inserted[i]);
    }

    // Keys looked up afterwards; some were inserted above and some were not.
    const char* const probes[] = {
        "sort",
        "123",
        "left1",
        "真的",
        "假的",
        "https://hao.360.cn/?a1006",
        "https://hao.360.cn/?a10064",
        "https://hao.360.cn/?a10067"
    };
    const size_t probeCount = sizeof(probes) / sizeof(probes[0]);
    for (size_t i = 0; i < probeCount; ++i)
    {
        cout << "Is True?:" << bf1.Test(probes[i]) << endl;
    }
}

Test.cpp
#include <iostream>
#include <string>
#include <cassert>
#include <cstdlib>

using namespace std;

#include "BitSet.h"
#include "BloomFilter.h"

// Program entry point: runs the Bloom filter demo, then reports success to
// the host environment.
int main()
{
    TestBloomFilter();
    return EXIT_SUCCESS;
}