## 声明计算偏移量所需要的函数,来源:http://www.partow.net/programming/hashfunctions/index.html
def rs_hash(key):
    """Return the RS hash of ``key``.

    Args:
        key: String to hash; every character contributes its ``ord()`` value.

    Returns:
        The hash as a Python int. No 32-bit truncation is applied here, so
        the value grows with the length of ``key``.
    """
    multiplier = 378551
    multiplier_step = 63689
    hash_value = 0
    for char in key:
        hash_value = hash_value * multiplier + ord(char)
        # The multiplier itself is rescaled after every character.
        multiplier *= multiplier_step
    return hash_value
def js_hash(key):
    """Return the JS hash of ``key``.

    Args:
        key: String to hash; every character contributes its ``ord()`` value.

    Returns:
        The hash as a Python int (not truncated to 32 bits). An empty
        ``key`` yields the seed value 1315423911.
    """
    hash_value = 1315423911
    for char in key:
        # Mix a left-shifted copy, the character and a right-shifted copy
        # of the running hash back into itself.
        hash_value ^= (hash_value << 5) + ord(char) + (hash_value >> 2)
    return hash_value
def pjw_hash(key):
    """Return the PJW hash of ``key``.

    Args:
        key: String to hash; every character contributes its ``ord()`` value.

    Returns:
        The hash as an int, masked with ``0x7FFFFFFF`` before returning.
    """
    bits_in_unsigned_int = 4 * 8
    # Integer division keeps the shift amounts as ints (24 and 4).
    three_quarters = bits_in_unsigned_int * 3 // 4
    one_eighth = bits_in_unsigned_int // 8
    # Mask selecting everything from bit 28 upwards.
    high_bits = 0xFFFFFFFF << (bits_in_unsigned_int - one_eighth)
    hash_value = 0
    for char in key:
        hash_value = (hash_value << one_eighth) + ord(char)
        overflow = hash_value & high_bits
        if overflow != 0:
            # Fold the overflowing high bits back into the low bits, then
            # clear them so the running value stays below bit 28.
            hash_value = (hash_value ^ (overflow >> three_quarters)) & ~high_bits
    return hash_value & 0x7FFFFFFF
def elf_hash(key):
    """Return the ELF hash of ``key``.

    Args:
        key: String to hash; every character contributes its ``ord()`` value.

    Returns:
        The hash as a Python int.
    """
    hash_value = 0
    for char in key:
        hash_value = (hash_value << 4) + ord(char)
        top_nibble = hash_value & 0xF0000000
        if top_nibble != 0:
            # Fold bits 28-31 back into the low byte region.
            hash_value ^= top_nibble >> 24
        # Clear bits 28-31 (a no-op when top_nibble is 0).
        hash_value &= ~top_nibble
    return hash_value
def bkdr_hash(key):
    """Return the BKDR hash of ``key``.

    Args:
        key: String to hash; every character contributes its ``ord()`` value.

    Returns:
        The hash as a Python int (not truncated to 32 bits).
    """
    seed = 131  # 31 131 1313 13131 131313 etc..
    hash_value = 0
    for char in key:
        hash_value = hash_value * seed + ord(char)
    return hash_value
def sdbm_hash(key):
    """Return the SDBM hash of ``key``.

    Args:
        key: String to hash; every character contributes its ``ord()`` value.

    Returns:
        The hash as a Python int (not truncated to 32 bits).
    """
    hash_value = 0
    for char in key:
        hash_value = ord(char) + (hash_value << 6) + (hash_value << 16) - hash_value
    return hash_value
def djb_hash(key):
    """Return the DJB hash of ``key``.

    Args:
        key: String to hash; every character contributes its ``ord()`` value.

    Returns:
        The hash as a Python int (not truncated to 32 bits). An empty
        ``key`` yields the seed value 5381.
    """
    hash_value = 5381
    for char in key:
        # (h << 5) + h is h * 33.
        hash_value = ((hash_value << 5) + hash_value) + ord(char)
    return hash_value
def dek_hash(key):
    """Return the DEK hash of ``key``.

    Args:
        key: String to hash; must support ``len()``, and every character
            contributes its ``ord()`` value.

    Returns:
        The hash as a Python int (not truncated to 32 bits).
    """
    # The running hash is seeded with the length of the key.
    hash_value = len(key)
    for char in key:
        hash_value = ((hash_value << 5) ^ (hash_value >> 27)) ^ ord(char)
    return hash_value
def bp_hash(key):
    """Return the BP hash of ``key``.

    Args:
        key: String to hash; every character contributes its ``ord()`` value.

    Returns:
        The hash as a Python int (not truncated to 32 bits).
    """
    hash_value = 0
    for char in key:
        # Parentheses spell out the precedence: the shift happens before the XOR.
        hash_value = (hash_value << 7) ^ ord(char)
    return hash_value
def fnv_hash(key):
    """Return the FNV-style hash of ``key``.

    Args:
        key: String to hash; every character contributes its ``ord()`` value.

    Returns:
        The hash as a Python int (not truncated to 32 bits).
    """
    # NOTE(review): 0x811C9DC5 is the value usually published as the 32-bit
    # FNV offset basis rather than the FNV prime (0x01000193). It is kept
    # unchanged so previously computed hash values stay stable; confirm intent.
    fnv_prime = 0x811C9DC5
    hash_value = 0
    for char in key:
        hash_value *= fnv_prime
        hash_value ^= ord(char)
    return hash_value
def ap_hash(key):
    """Return the AP hash of ``key``.

    Characters at even and odd positions are mixed in with two different
    formulas.

    Args:
        key: String to hash; every character contributes its ``ord()`` value.

    Returns:
        The hash as a Python int. The value is not truncated to 32 bits and
        can be negative, because the odd-position step applies ``~``.
    """
    hash_value = 0xAAAAAAAA
    for index, char in enumerate(key):
        code = ord(char)
        if (index & 1) == 0:
            # Multiplication binds tighter than XOR.
            hash_value ^= (hash_value << 7) ^ (code * (hash_value >> 3))
        else:
            # Addition binds tighter than XOR.
            hash_value ^= ~(((hash_value << 11) + code) ^ (hash_value >> 5))
    return hash_value
## 实现去重过滤类
class BloomFilterRedis:
    """De-duplication filter backed by a Redis bitmap.

    Every item is run through each function in ``hash_list``; each hash is
    reduced to a bit offset in a single Redis bitmap. An item is reported as
    new when at least one of its bits was not yet set.
    """

    # Default hash functions; each one yields one bit offset per item.
    hash_list = [rs_hash, js_hash, pjw_hash, elf_hash, bkdr_hash,
                 sdbm_hash, djb_hash, dek_hash]

    def __init__(self, key, host='127.0.0.1', port=6379, hash_list=hash_list):
        """Create the filter and its Redis connection objects.

        Args:
            key: Name of the Redis key that holds the bitmap.
            host: Redis host passed to ``redis.ConnectionPool``.
            port: Redis port passed to ``redis.ConnectionPool``.
            hash_list: Hash functions to apply to every item; defaults to
                the class-level ``hash_list``.
        """
        # Key of the Redis bitmap.
        self.key = key
        # Redis connection objects.
        self.pool = redis.ConnectionPool(host=host, port=port)
        self.handle = redis.StrictRedis(connection_pool=self.pool, charset='utf-8')
        # Hash functions used to derive the bit offsets.
        self.hash_list = hash_list

    @classmethod
    def random_generator(cls, hash_value):
        """Map a hash value into the range [0, 2**32 - 1]."""
        return hash_value % (1 << 32)

    def do_filter(self, item):
        """Record ``item`` in the bitmap and report whether it is new.

        Args:
            item: Value to check; passed unchanged to every hash function.

        Returns:
            True if at least one of the item's bits was 0 before this call
            (the item is treated as new), False if every bit was already 1
            (the item is treated as a duplicate).
        """
        is_new = False
        for hash_func in self.hash_list:
            # Reduce the raw hash to a bit offset in [0, 2**32 - 1].
            offset = BloomFilterRedis.random_generator(hash_func(item))
            # SETBIT returns the bit's previous value, so one call both
            # tests and sets the bit: a single round trip, and no window
            # between a separate GETBIT and SETBIT.
            if self.handle.setbit(self.key, offset, 1) == 0:
                is_new = True
        # Every bit was already 1: the item is a duplicate.
        return is_new
if __name__ == '__main__':
    # Demo: run one item through a filter stored under the Redis key "bloom"
    # (uses the default host/port of BloomFilterRedis).
    bloom_filter = BloomFilterRedis("bloom")
    bloom_filter.do_filter("one item to check")
python布隆过滤器实现去重详解
最新推荐文章于 2024-03-26 08:21:05 发布