哈希扩展--位图和布隆过滤器

最新推荐文章于 2024-09-23 18:23:13 发布

原创最新推荐文章于 2024-09-23 18:23:13 发布 · 344 阅读

0 ·

CC 4.0 BY-SA版权

数据结构专栏收录该内容

23 篇文章

订阅专栏

本文介绍位图数据结构的原理及应用，并基于位图实现布隆过滤器，用于高效判断元素是否存在。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

位图

位图就是用一个比特位来表示两种状态（0 或 1），这样可以大大节约空间。一个整型数据在32位机中占4个字节，即32个比特位，因此用一个整型数据就可以表示32个“事件”的状态。位图适用于数据规模很大、但每个数据的状态又不多的情况，通常用来判断某个数据是否存在。

代码实现如下：

BitMap.h

#ifndef __BITMAP_H__
#define __BITMAP_H__

#include <stdio.h>
#include <malloc.h>
#include <assert.h>


/* A bit set stored in a heap-allocated array of ints (see BitMapInit). */
typedef struct BitMap
{
    int* _bmp; // backing array that holds the bits
    int _capacity; // size of the backing storage, counted in int elements
    int _size;    // number of valid bits (e.g. if the largest index is 24 the size is 25: indices 0..24)
}BMap;




// Initialise *bp with room for bitcount bits, all cleared.
// The backing array holds bitcount / 32 + 1 ints.
void BitMapInit(BMap* bp, int bitcount);

// Set bit number `which` (0-based) to 1.
// Does nothing when which >= the number of valid bits.
void BitMapSet(BMap* bp, int which);

// Clear bit number `which` (0-based) to 0.
// Does nothing when which >= the number of valid bits.
void BitMapReSet(BMap* bp, int which);

// Test bit number `which`: returns non-zero when it is 1 and 0 when it is 0.
// When which >= the number of valid bits, prints a message and returns 0.
int BitMapTest(BMap* bp, int which);

// Return the number of valid bits recorded at initialisation.
int BitMapSize(BMap* bp);

// Return the total number of bits that are currently 1.
int BitMapCount(BMap* bp);

// Destroy the bitmap: free the backing array and reset all fields.
void BitMapDestroy(BMap* bp);


#endif

BitMap.c

#include "BitMap.h"

//初始化
/*
 * Initialise a bitmap with room for `bitcount` bits, all cleared.
 *
 * bp       - bitmap to initialise; must not be NULL.
 * bitcount - number of bits required; must not be negative.
 *
 * The backing array holds bitcount / 32 + 1 ints (the code assumes 32 bits
 * per int, like the other BitMap functions). On a negative bitcount or an
 * allocation failure the function asserts in debug builds; in release builds
 * it returns with *bp in a well-defined empty state (_bmp == NULL,
 * _capacity == 0, _size == 0), so later calls see "no valid bits" instead of
 * uninitialised fields.
 */
void BitMapInit(BMap* bp, int bitcount)
{
    int words = 0;
    int* bits = NULL;

    assert(bp);

    /* Start from an empty state so every early return leaves *bp consistent. */
    bp->_bmp = NULL;
    bp->_capacity = 0;
    bp->_size = 0;

    if (bitcount < 0)
    {
        assert(0);
        return;
    }

    words = bitcount / 32 + 1;

    /* calloc is used because every bit must start out as 0. */
    bits = (int*)calloc((size_t)words, sizeof(int));
    if (NULL == bits)
    {
        assert(0);
        return;
    }

    bp->_bmp = bits;
    bp->_capacity = words;
    bp->_size = bitcount;
}

//将任意一位置1
/*
 * Set bit number `which` (0-based) to 1.
 *
 * bp    - initialised bitmap; must not be NULL.
 * which - bit index; indices outside [0, _size) are ignored.
 */
void BitMapSet(BMap* bp, int which)
{
    int index = 0;
    int pos = 0;

    assert(bp);
    /* Reject negative indices too: they would address memory before _bmp. */
    if (which < 0 || which >= bp->_size)
        return;

    index = which / 32; /* which int element holds the bit */
    pos = which % 32;   /* position of the bit inside that element */

    /* Work on an unsigned copy: 1 << 31 on a signed int is undefined. */
    bp->_bmp[index] = (int)((unsigned)bp->_bmp[index] | (1u << pos));
}

//将任意一位置零
/*
 * Clear bit number `which` (0-based) to 0.
 *
 * bp    - initialised bitmap; must not be NULL.
 * which - bit index; indices outside [0, _size) are ignored.
 */
void BitMapReSet(BMap* bp, int which)
{
    int index = 0;
    int pos = 0;

    assert(bp);
    /* Reject negative indices too: they would address memory before _bmp. */
    if (which < 0 || which >= bp->_size)
        return;

    index = which / 32; /* which int element holds the bit */
    pos = which % 32;   /* position of the bit inside that element */

    /* Work on an unsigned copy: 1 << 31 on a signed int is undefined. */
    bp->_bmp[index] = (int)((unsigned)bp->_bmp[index] & ~(1u << pos));
}

//判断任意一位是0或1
/*
 * Report whether bit number `which` (0-based) is 0 or 1.
 *
 * bp    - initialised bitmap; must not be NULL.
 * which - bit index.
 *
 * Returns 1 when the bit is set and 0 when it is clear. For an index outside
 * [0, _size) a diagnostic is printed and 0 is returned.
 */
int BitMapTest(BMap* bp, int which)
{
    int index = 0;
    int pos = 0;

    assert(bp);

    /* Reject negative indices too: they would address memory before _bmp. */
    if (which < 0 || which >= bp->_size)
    {
        printf("which超过表示范围!\n");
        return 0;
    }

    index = which / 32;
    pos = which % 32;

    /* Shift the wanted bit down to position 0 on an unsigned copy so the
       result is exactly 0 or 1, never the raw mask or a negative value. */
    return (int)(((unsigned)bp->_bmp[index] >> pos) & 1u);
}

/* Return the number of valid bits stored in the bitmap's _size field. */
int BitMapSize(BMap* bp)
{
    int validBits = 0;

    assert(bp);
    validBits = bp->_size;

    return validBits;
}

//比特位为1的总个数
/*
 * Count the bits that are 1 across the whole backing array.
 *
 * bp - initialised bitmap; must not be NULL.
 *
 * Returns the total number of set bits in all _capacity int elements.
 */
int BitMapCount(BMap* bp)
{
    /* Number of 1 bits in each 4-bit value 0..15. */
    static const unsigned char nibbleOnes[16] =
        { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
    int i = 0;
    int count = 0;

    assert(bp);

    for (; i < bp->_capacity; i++)
    {
        /* Use an unsigned copy: right-shifting a negative int is
           implementation-defined. */
        unsigned word = (unsigned)bp->_bmp[i];

        /* Consume the element four bits at a time until nothing is left. */
        while (word != 0)
        {
            count += nibbleOnes[word & 0x0fu];
            word >>= 4;
        }
    }

    return count;
}

//销毁位图
/*
 * Release the bitmap's backing array and return every field to its
 * empty value (NULL pointer, zero capacity, zero size).
 */
void BitMapDestroy(BMap* bp)
{
    assert(bp);

    free(bp->_bmp);

    bp->_size = 0;
    bp->_capacity = 0;
    bp->_bmp = NULL;
}

test.c

#include "BitMap.h"
#include "BloomFilter.h"

/*
 * Small demo of the bitmap API: creates a 28-bit map, sets bits 3, 4 and 1,
 * clears bit 1 again, then prints bits 0 and 1 and the number of set bits.
 */
void TestBitMap()
{
    BMap bitmap;
    int bitValue = 0;
    int onesTotal = 0;

    BitMapInit(&bitmap, 28);

    /* Bit indices are 0-based. */
    BitMapSet(&bitmap, 3);
    BitMapSet(&bitmap, 4);
    BitMapSet(&bitmap, 1);
    BitMapReSet(&bitmap, 1);

    bitValue = BitMapTest(&bitmap, 0);
    printf("第零位为:%d\n", bitValue);

    bitValue = BitMapTest(&bitmap, 1);
    printf("第一位为:%d\n", bitValue);

    onesTotal = BitMapCount(&bitmap);
    printf("位图中共有1:%d个\n", onesTotal);

    BitMapDestroy(&bitmap);
}

/* Program entry point: runs the bitmap demo and reports success. */
int main()
{
    TestBitMap();

    return 0;
}

布隆过滤器

布隆过滤器用于在海量数据中判断某个元素是否存在，例如：一个单词是否在已知的词典当中；一个嫌疑人的名字是否已经在嫌疑人名单当中。这些数据的特点是占用空间大，直接用哈希表存储开销太高，因此很容易想到用上面的位图来解决。但是数据该怎么放呢？我们可以用哈希函数把数据映射到对应的比特位，查询时用同样的方法计算位置。这样又带来一个问题：哈希冲突。缓解哈希冲突的办法是使用多个哈希函数，把一个元素映射到多个比特位：只要其中有一个比特位为0，该元素就肯定不在集合中；如果所有比特位都为1，元素仍有可能不在集合中（误判），但这种概率相对较低。

代码实现如下：

BloomFilter.h

#pragma once


#include <stdio.h>
#include <assert.h>
#include <malloc.h>
#include "BitMap.h"
#include "comm.h"


/* Element type handled by the filter: a char pointer (the hash functions in
   comm.c read it as a NUL-terminated string). */
typedef char* DataType;
/* Pointer to a hash function that maps a DataType value to an unsigned integer. */
typedef unsigned(*PSTI)(DataType str);


/* Bloom filter built on top of a BMap and an array of hash functions. */
typedef struct BloomFilter
{
    BMap _bp; // underlying bitmap that stores the hashed positions
    int _size;  // number of elements inserted so far
    PSTI _STI[5]; // array of hash function pointers
}BFilter;

//初始化
// Initialise the filter: creates a bitmap of capacity * 5 bits, resets the
// element count and copies `size` hash functions from pSTI.
// size is the number of entries in the pSTI array.
void BloomFilterInit(BFilter* bf, int capacity, PSTI* pSTI, int size);//size表示数组的大小

// Insert an element: sets the bit chosen by each hash function and
// increments the element count.
void BloomFilterInseret(BFilter* bf, DataType data);

// Test whether data may be in the filter. Returns 0 as soon as one hashed
// bit is clear (the element is definitely absent), otherwise 1.
int BloomFilterIsIn(BFilter* bf, DataType data);

// Return the number of elements inserted so far.
int BloomFilterSize(BFilter* bf);

// Destroy the underlying bitmap and reset the element count.
void BloomFilterDestroy(BFilter* bf);

BloomFilter.c

#include "BloomFilter.h"
#include "BitMap.h"
#include "comm.h"

//初始化
/*
 * Initialise a Bloom filter.
 *
 * bf       - filter to initialise; must not be NULL.
 * capacity - expected number of elements; the bitmap gets capacity * 5 bits.
 * pSTI     - array of hash functions to use.
 * size     - number of entries in pSTI.
 *
 * At most as many hash functions as the _STI array can hold are copied.
 * Passing more is a caller error: it asserts in debug builds and the extra
 * entries are ignored in release builds, never written past the array.
 * Slots that receive no function are set to NULL so that no slot is left
 * uninitialised.
 */
void BloomFilterInit(BFilter* bf, int capacity, PSTI* pSTI, int size)
{
    const int slots = (int)(sizeof(bf->_STI) / sizeof(bf->_STI[0]));
    int i = 0;

    assert(bf);
    assert(pSTI || size <= 0);
    assert(size <= slots);

    BitMapInit(&bf->_bp, capacity * 5);
    bf->_size = 0;

    for (; i < slots; i++)
    {
        if (pSTI != NULL && i < size)
            bf->_STI[i] = pSTI[i];
        else
            bf->_STI[i] = NULL;
    }
}


//插入元素
/*
 * Insert an element into the filter.
 *
 * bf   - initialised filter; must not be NULL.
 * data - element to insert; passed unchanged to every hash function.
 *
 * Each non-NULL hash function in _STI is applied to data, the result is
 * reduced modulo the number of valid bits, and that bit is set. The element
 * count is then incremented. When the bitmap has no valid bits nothing can
 * be recorded, so the call is a no-op (this also avoids a modulo by zero).
 */
void BloomFilterInseret(BFilter* bf, DataType data)
{
    const int slots = (int)(sizeof(bf->_STI) / sizeof(bf->_STI[0]));
    unsigned bits = 0;
    int i = 0;

    assert(bf);

    if (bf->_bp._size <= 0)
        return;
    bits = (unsigned)bf->_bp._size;

    for (; i < slots; i++)
    {
        unsigned hashaddr = 0;

        if (NULL == bf->_STI[i])
            continue; /* empty slot: no hash function to call */

        hashaddr = bf->_STI[i](data) % bits;
        /* hashaddr < bits <= INT_MAX, so the conversion to int is exact. */
        BitMapSet(&bf->_bp, (int)hashaddr);
    }

    bf->_size++;
}

//判断数据是否在布隆中--只要算出来一个不在就肯定不在
/*
 * Test whether an element may be present in the filter.
 *
 * bf   - initialised filter; must not be NULL.
 * data - element to look up; passed unchanged to every hash function.
 *
 * Returns 0 as soon as one hashed bit is clear: the element is definitely
 * not in the filter. Returns 1 when every hashed bit is set: the element is
 * probably in the filter, with false positives possible. Returns 0 when the
 * bitmap has no valid bits or no hash function is available, because nothing
 * can have been recorded in that state.
 */
int BloomFilterIsIn(BFilter* bf, DataType data)
{
    const int slots = (int)(sizeof(bf->_STI) / sizeof(bf->_STI[0]));
    unsigned bits = 0;
    int checked = 0;
    int i = 0;

    assert(bf);

    /* Also avoids a modulo by zero below. */
    if (bf->_bp._size <= 0)
        return 0;
    bits = (unsigned)bf->_bp._size;

    for (; i < slots; i++)
    {
        unsigned hashaddr = 0;

        if (NULL == bf->_STI[i])
            continue; /* empty slot: no hash function to call */

        hashaddr = bf->_STI[i](data) % bits;
        /* hashaddr < bits <= INT_MAX, so the conversion to int is exact. */
        if (!BitMapTest(&bf->_bp, (int)hashaddr))
            return 0;

        checked++;
    }

    return checked > 0;
}


/* Return the filter's element count (its _size field). */
int BloomFilterSize(BFilter* bf)
{
    int inserted = 0;

    assert(bf);
    inserted = bf->_size;

    return inserted;
}


/* Tear the filter down: reset the element count and destroy the bitmap. */
void BloomFilterDestroy(BFilter* bf)
{
    assert(bf);

    bf->_size = 0;
    BitMapDestroy(&bf->_bp);
}

comm.h

#pragma once

//将字符串转化为整数
// String hash: hash = hash * 131 + character, for each character up to the
// terminating NUL; the result is masked with 0x7FFFFFFF.
unsigned StrToInt1(const char * str);

// String hash: hash = 65599 * hash + character; the result is masked with
// 0x7FFFFFFF.
unsigned int StrToInt2(char *str);

// RS Hash Function
unsigned int StrToInt3(char *str);

// JS Hash Function
unsigned int StrToInt4(char *str);

// ELF Hash Function
unsigned int StrToInt5(char *str);

// Return data converted to unsigned.
unsigned IntToInt(int data);

comm.c

#include "comm.h"



//将字符串转换为整数
/*
 * Hash a NUL-terminated string to an unsigned integer.
 *
 * Every character is folded in as acc = acc * 131 + character; the final
 * value is masked with 0x7FFFFFFF so the top bit is always clear.
 */
unsigned StrToInt1(const char * str)
{
    const unsigned int multiplier = 131; /* other usual choices: 31 1313 13131 131313 */
    unsigned int acc = 0;
    const char *cursor;

    for (cursor = str; *cursor != '\0'; ++cursor)
        acc = acc * multiplier + *cursor;

    return acc & 0x7FFFFFFF;
}

/* Convert an int key to unsigned; negative values wrap as in any int-to-unsigned conversion. */
unsigned IntToInt(int data)
{
    const unsigned converted = (unsigned)data;

    return converted;
}

/*
 * Hash a NUL-terminated string to an unsigned integer.
 *
 * Every character is folded in as acc = 65599 * acc + character, which is
 * the same value as character + (acc << 6) + (acc << 16) - acc in unsigned
 * arithmetic. The final value is masked with 0x7FFFFFFF.
 */
unsigned int StrToInt2(char *str)
{
    unsigned int acc = 0;
    const char *cursor = str;

    for (; *cursor != '\0'; cursor++)
        acc = 65599u * acc + *cursor;

    return acc & 0x7FFFFFFF;
}

// RS Hash Function
/*
 * RS hash of a NUL-terminated string.
 *
 * Every character is folded in as acc = acc * factor + character, where
 * factor starts at 63689 and is multiplied by 378551 after each character.
 * The final value is masked with 0x7FFFFFFF.
 */
unsigned int StrToInt3(char *str)
{
    const unsigned int step = 378551;
    unsigned int factor = 63689;
    unsigned int acc = 0;
    const char *cursor;

    for (cursor = str; *cursor != '\0'; ++cursor, factor *= step)
        acc = acc * factor + *cursor;

    return acc & 0x7FFFFFFF;
}

// JS Hash Function
/*
 * JS hash of a NUL-terminated string.
 *
 * Starting from 1315423911, every character is mixed in as
 * acc ^= (acc << 5) + character + (acc >> 2). The final value is masked
 * with 0x7FFFFFFF.
 */
unsigned int StrToInt4(char *str)
{
    unsigned int acc = 1315423911;
    const char *cursor = str;

    while (*cursor != '\0')
    {
        const unsigned int mixed = (acc << 5) + *cursor + (acc >> 2);

        acc ^= mixed;
        cursor++;
    }

    return acc & 0x7FFFFFFF;
}

// ELF Hash Function
/*
 * ELF hash of a NUL-terminated string.
 *
 * Every character is folded in as acc = (acc << 4) + character. Whenever
 * any of the bits selected by 0xF0000000 become set, they are XORed back
 * in shifted right by 24 and then cleared. The final value is masked with
 * 0x7FFFFFFF.
 */
unsigned int StrToInt5(char *str)
{
    unsigned int acc = 0;
    const char *cursor;

    for (cursor = str; *cursor != '\0'; ++cursor)
    {
        unsigned int top;

        acc = (acc << 4) + *cursor;
        top = acc & 0xF0000000u;
        if (top == 0)
            continue;

        acc ^= top >> 24;
        acc &= ~top;
    }

    return acc & 0x7FFFFFFF;
}