-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbloom.py
94 lines (74 loc) · 2.67 KB
/
bloom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""
This is the implementation of a Bloom Filter (https://en.wikipedia.org/wiki/Bloom_filter)
It is a lightweight and memory-efficient way of storing data - in this case which memes
have already been posted to the group.
There is a high probability you don't need to change anything in this file.
Method checkfull() checks the amount of ones in the filter (8192 is the default array size, initialized with zeros)
Method checkp() checks the probability of returning a false positive.
If the filter is filled in over 75% or the probability is unsatisfactory, flush() its memory.
insert() and find() do not require an explanation - any valid string can be inserted and queried.
getIndex() helps with the aforementioned functions do their work
getTable() and writeTable() are employed so that the Filter can be stored in a file
setTable() is for debugging only
DISCLAIMER: The filter is slightly biased towards field table[size-1] due to the workaround in line 76, but in this
scope it hardly makes any difference
"""
import hashlib
import math
import sys
size = 8192
exponent = 13
amount = 20
class Bloom:
def __init__(self):
self.table = [0 for i in range(size)] #2E13
def checkfull(self):
count = 0
for i in self.table:
if i == 1:
count += 1
return count
def checkp(self):
count = 0
x = 100000
for i in range(x):
if self.find(str(i)) == 1:
count += 1
n = count*100/x
print("Probability of false positive is:", n,"%", file=sys.stderr)
def insert(self, value):
for i in range(amount):
index = self.get_index(value)
self.table[index] = 1
value += "a"
def flush(self):
for i in range(len(self.table)):
self.table[i] = 0
def find(self, value):
for i in range(amount):
index = self.get_index(value)
if self.table[index] == 0:
return 0
value += "a"
return 1
def get_index(self, value):
m = hashlib.md5()
m.update(value.encode('utf-8'))
index = round(int(m.hexdigest(), 16) / math.pow(2, 128-exponent))
if index == size:
index = size-1
return index
def getTable(self, file):
n = file.readline()
for i in range(min(len(self.table), len(n))):
self.table[i] = int((n[i]))
def setTable(self, file, n=size): #for manual use only
s = ''
for i in range(n):
s += '0'
file.write(s)
def writeTable(self, file):
s = ''
for c in self.table:
s += str(c)
file.write(s)