I am trying to implement Flajolet Martin algorithm. I have a dataset with over 6000 records but the output of the following code is 4096. Please help me in understanding the mistake being made by me.
import xxhash
import math
def return_trailing_zeroes(s):
s = str(s)
rev = s[::-1]
count = 0
for i in rev:
if i is '0':
count = count + 1
else:
break
return count
def gethash(line):
num=abs(xxhash.xxh32(line).intdigest())
return num
fp=open("/content/drive/MyDrive/Data.txt","r")
h_max=0
for line in fp:
hash_value_1 = gethash(line)
binary_1 = format(hash_value_1, '032b')
t1 = return_trailing_zeroes(binary_1)
if t1>h_max:
h_max=t1
fp.close()
print(2**h_max)
I tried this implementation of HyperLogLog algorithm and the output of the following code is 2560.
def return_trailing_zeroes(s): s = str(s) rev = s[::-1] count = 0 for i in rev: if i is '0': count = count + 1 else: break
return counth1_m=0
h2_m=0
h3_m=0
h4_m=0
fp=open("/content/drive/MyDrive/Data.txt","r")
for line in fp:
hash_value_1 = abs(xxhash.xxh32(line).intdigest())
hash_value_2 = abs(hash32(line))
hash_value_3 = abs(jhashcode.hashcode(line))
hash_value_4 = abs(mmh3.hash(line))
binary_1 = format(hash_value_1, '032b')
binary_2 = format(hash_value_2, '032b')
binary_3 = format(hash_value_3, '032b')
binary_4 = format(hash_value_4, '032b')
t1 = return_trailing_zeroes(binary_1)
t2 = return_trailing_zeroes(binary_2)
t3 = return_trailing_zeroes(binary_3)
t4 = return_trailing_zeroes(binary_4)
if t1>h1_m: h1_m=t1
if t2>h2_m: h2_m=t2
if t3>h3_m: h3_m=t3
if t4>h4_m: h4_m=t4
fp.close()
avg_hash12 = (2**(h1_m) + 2**(h2_m))/ float(2)
avg_hash34 = (2**(h3_m) + 2**(h4_m))/ float(2)
distinct_elements = math.ceil(statistics.median([avg_hash12, avg_hash34]))
print(distinct_elements)