-
Notifications
You must be signed in to change notification settings - Fork 0
/
lsh_optimal_params.py
50 lines (43 loc) · 1.49 KB
/
lsh_optimal_params.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
_integration_precision = 0.001
def _integration(f, a, b):
p = _integration_precision
area = 0.0
x = a
while x < b:
area += f(x+0.5*p)*p
x += p
return area, None
# Prefer scipy's adaptive quadrature when available; otherwise fall back to
# the midpoint-rule integrator above, which mimics quad's (value, error)
# return shape so callers can unpack both the same way.
try:
    from scipy.integrate import quad as integrate
except ImportError:
    # For when no scipy installed
    integrate = _integration
def _false_positive_probability(threshold, b, r):
    """Integrate the LSH collision probability over ``[0, threshold]``.

    With ``b`` bands of ``r`` rows, a pair with Jaccard similarity ``s``
    becomes a candidate with probability ``1 - (1 - s**r)**b``; the mass of
    that curve below the threshold is the false-positive probability.
    """
    def collision_prob(s):
        return 1 - (1 - s**float(r))**float(b)
    area, _ = integrate(collision_prob, 0.0, threshold)
    return area
def _false_negative_probability(threshold, b, r):
    """Integrate the LSH miss probability over ``[threshold, 1]``.

    A pair with Jaccard similarity ``s`` is missed with probability
    ``1 - (1 - (1 - s**r)**b)``; the mass of that curve above the threshold
    is the false-negative probability.
    """
    def miss_prob(s):
        return 1 - (1 - (1 - s**float(r))**float(b))
    area, _ = integrate(miss_prob, threshold, 1.0)
    return area
def _optimal_param(threshold, num_perm, false_positive_weight,
        false_negative_weight):
    '''
    Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum
    of probabilities of false positive and false negative.

    Returns the best (b, r) pair with b*r <= num_perm, or (0, 0) when
    num_perm < 1 leaves no candidates.
    '''
    def weighted_error(params):
        # Score a (bands, rows) pair by its weighted total error mass.
        b, r = params
        fp = _false_positive_probability(threshold, b, r)
        fn = _false_negative_probability(threshold, b, r)
        return fp * false_positive_weight + fn * false_negative_weight

    # All feasible (b, r) pairs, scanned with b outermost and r innermost;
    # min() keeps the first minimizer, so ties resolve to the smallest b,
    # then the smallest r.
    candidates = ((b, r)
                  for b in range(1, num_perm + 1)
                  for r in range(1, num_perm // b + 1))
    return min(candidates, key=weighted_error, default=(0, 0))
# Demo: print the optimal (b, r) for thresholds 0.10 .. 0.95 using 128
# permutations and equal false-positive / false-negative weights.
for _pct in range(10, 100, 5):
    t = _pct / 100.0
    print(f't={t}, opt={_optimal_param(t, 128, 0.5, 0.5)}')