建议和fm算法做对比
算法步骤
1:将M [1],M [2],…M [m]初始化为0;
2:对于从文件“stream_for_fm.txt”中读取的每个元素x,依次执行以下步骤3-6:
3:令h(x)表示元素x的散列值(以二进制形式表示),其中h(x)的哈希参数是随机选取的;设p(y)为y中从右侧数起第一个1比特所在的位置(即秩)(例如,如果y = 1100,则p(y) = 3;如果y = 1111,则p(y) = 1);
4:设置j = h(x) % m + 1; // 将h(x)视为整数,取其对m的余数得到桶(存储区)编号;代码中列表下标从0开始,因此直接使用h(x) % m //
5:设置w = floor(h(x)/ m); // 删除h(x)中的log2(m)个最低有效位 //
6:设置M [j] = max(M [j],p(w));
7:设置s =(M [1] + M [2] +…+ M [m])/ m; 并返回E = 0.39701 * m * 2**s作为文件中不同元素的数量的估计;
def p(y):
    """Return the 1-based position of the lowest set bit of ``y``.

    Counting starts from the least-significant (rightmost) bit, so
    ``p(0b1100) == 3`` and ``p(0b1111) == 1``.

    Args:
        y: An integer.

    Returns:
        The rank of the first 1 bit from the right, or 0 when ``y`` is 0
        (zero has no set bit; the previous bit-by-bit loop never terminated
        for that input).
    """
    # y & -y isolates the lowest set bit as a power of two; its bit length
    # is exactly that bit's 1-based position. For y == 0 this is 0.
    return (y & -y).bit_length()
def random_hash_parameter():
    """Draw the two random integer coefficients of the linear hash.

    Returns:
        A tuple ``(a, b)``; each value is drawn independently and uniformly
        from the inclusive range ``[1, 2**25 + 1]``.
    """
    upper_bound = 2 ** 25 + 1
    multiplier = random.randint(1, upper_bound)
    offset = random.randint(1, upper_bound)
    return multiplier, offset
import math
def delete(s, m):
    """Drop the ``floor(log2(m))`` least-significant bits of ``s``.

    For a power-of-two ``m`` this equals ``floor(s / m)``: the bits that
    select the bucket are removed and the remaining high bits are returned.

    Args:
        s: Integer hash value.
        m: Number of buckets; must be at least 1.

    Returns:
        ``s`` shifted right by ``floor(log2(m))`` bits. The result is 0 when
        ``s`` has no bits above the removed ones.

    Raises:
        ValueError: If ``m`` is smaller than 1.
    """
    if m < 1:
        raise ValueError("m must be a positive integer")
    # bit_length() - 1 is floor(log2(m)) computed exactly on integers, so no
    # floating-point logarithm is needed to obtain an integer bit count.
    shift = int(m).bit_length() - 1
    return s >> shift
# Sanity check: log base 2 of 64, i.e. the number of low bits delete()
# strips when m = 64.
print(math.log(64,2))
# 8 is 0b1000; removing log2(2) = 1 low bit leaves 0b100 = 4.
print(delete(8,2))
6.0
4
LogLog算法对不同元素个数的估计
import random
def loglog(filename="stream_for_fm.txt", m=64):
    """Estimate the number of distinct integers in a file with LogLog.

    Each non-blank line of the file is parsed as an integer ``x`` and hashed
    with a random linear hash ``h(x) = a * x + b``. The low bits of the hash
    pick one of ``m`` buckets; each bucket keeps the largest rank (position
    of the lowest set bit) seen among the remaining high bits.

    Args:
        filename: Path of the text file holding one integer per line.
        m: Number of buckets; must be a positive power of two so that the
            bucket index and the ranked bits do not overlap.

    Returns:
        The estimate ``0.39701 * m * 2 ** s``, where ``s`` is the mean of the
        bucket values.

    Raises:
        ValueError: If ``m`` is not a positive power of two, or if a
            non-blank line cannot be parsed as an integer.
        OSError: If the file cannot be opened.
    """
    if m < 1 or m & (m - 1):
        raise ValueError("m must be a positive power of two")

    M = [0] * m
    a, b = random_hash_parameter()
    with open(filename, 'r') as f:
        for line in f:
            token = line.strip()
            if not token:
                # Blank lines (e.g. a trailing empty line) carry no element.
                continue
            hx = a * int(token) + b
            j = hx % m
            w = delete(hx, m)
            # w == 0 has no set bit and therefore no rank to record; skipping
            # it leaves the bucket unchanged and never calls p() on zero.
            if w:
                M[j] = max(M[j], p(w))
    print(M)
    s = sum(M) / m
    return 0.39701 * m * 2 ** s
# Run the estimator once: loglog() prints its bucket array and returns the
# cardinality estimate, which is printed here.
print(loglog())
[10, 11, 11, 10, 11, 18, 11, 19, 12, 16, 12, 12, 11, 11, 13, 10, 12, 12, 13, 14, 12, 13, 11, 10, 10, 14, 15, 14, 11, 14, 10, 12, 14, 11, 13, 9, 12, 9, 10, 11, 12, 11, 10, 18, 11, 13, 10, 13, 10, 13, 15, 12, 10, 13, 11, 13, 11, 12, 13, 12, 14, 11, 9, 11]
108681.52975006303