原理
用于海量数据去重,对数据经多次hash,放入bitmap,由于采用hash算法,可能数据可能重复,所以使用前务必按照公式计算错误率
实现
# coding:utf-8
import redis
import mmh3
REDIS_HOST = "192.168.159.128"
TEN_M = 50 * 1024 * 8
BF_KEY = "bf_key"
redis_client = redis.Redis(host=REDIS_HOST, port=6379, db=0)
seeds = [18, 6, 9, 7, 2, 17, 23, 23]
def init_bloom():
for i in range(0, TEN_M):
redis_client.setbit(BF_KEY, i, 0)
def add_bloom(str):
for seed in seeds:
sign = mmh3.hash(str, seed=seed)
if sign < 0:
offset = (sign & 0xffffffff) % TEN_M
redis_client.setbit(BF_KEY, offset=offset, value=1)
else:
offset = sign % TEN_M
redis_client.setbit(BF_KEY, offset=offset, value=1)
def in_bloom(str):
for seed in seeds:
sign = mmh3.hash(str, seed=seed)
if sign < 0:
offset = (sign & 0xffffffff) % TEN_M
if redis_client.getbit(BF_KEY, offset) != 1:
return False
else:
offset = sign % TEN_M
if redis_client.getbit(BF_KEY, offset) != 1:
return False
return True
if __name__ == '__main__':
print in_bloom("hehe2222222222122222222ffffffffffffffffffffddddddd22222221")
两个有用的公式
计算hash次数:k=(m/n)ln(2)
计算fp(错误率):p=(1-e^kn/m)^k