-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathBloomFilter.java
259 lines (237 loc) · 10.6 KB
/
BloomFilter.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
/******************************************************************
*
* Vaughn Hartzell 001
*
* Note, additional comments provided throughout source code is
* for educational purposes.
*
********************************************************************/
import java.util.BitSet;
import java.util.Random;
import java.util.HashSet;
import java.util.Set;
import java.security.SecureRandom;
import java.lang.Math;
/**
* Bloom Filters
*
* A Bloom filter is an implementation of a set which allows a certain
* probability of 'false positives' when determining if a given object is
* a member of that set, in return for a reduction in the amount of memory
* required for the set. It effectively works as follows:
* 1) We allocate 'm' bits to represent the set data.
* 2) We provide a hash function, which, instead of a single hash code,
produces'k' hash codes and sets those bits.
* 3) To add an element to the set, we derive bit indexes from all 'k'
hash codes and set those bits.
* 4) To determine if an element is in the set, we again calculate the
* corresponding hash codes and bit indexes, and say it is likely
* present if and only if all corresponding bits are set.
*
* The margin of error (or false positive rate) thus comes from the fact
* that as we add more and more objects to the set, we increase the likelihood
* of "accidentally" setting a combination of bits that corresponds to an
* element that isn't actually in the set. However, through tuning the bloom
* filter setup based on the expected data, we mathematically have control
* over the desired false positive probability rate that we want to received
* based on probability theory.
*
* False Positive rate discussion:
*
* The Bloom filter performance changes as we change parameters discussed
* below with the class constructors. There are two key variables that impact
* the false positive rate:
* 1) number of bits per item
* 2) number of hash codes
*
* In other words, how many more bits are there in the filter than the
* maximum number of items we want to represent in the set, and hence the
* number of bits that we actually set for each element that we add to the
* set. The more bits we require to be marked as set to '1' in order to mark
* an element as 'present' - e.g., the more hash code per item - the lower the
* chance of false positives, because for a given element potentially in
* the set, there's less chance of some random combination of bits from other
* elements also accidentally marking that element as present when it isn't.
*
* But, for a given bit filter size, there is a 'point of no return', at
* which having more hash codes simply means that we fill up the bit set too
* quickly as we add elements -- and hence get more false positives -- than
* with fewer hash codes.
*
* Based on this discussion, you can find many Bloom Filter calculators
* available online to determine how to adjust the variables inorder to
* achieve the desired probability of false positive rates that you can
* tolerate and/or desire for your application, e.g.,:
* - https://toolslick.com/programming/data-structure/bloom-filter-calculator
* - https://www.engineersedge.com/calculators/bloom_filter_calculator_15596.htm
* - https://www.di-mgt.com.au/bloom-calculator.html
* - https://programming.guide/bloom-filter-calculator.html
*/
class BloomFilter {
private static final int MAX_HASHES = 8;
private static final long[] byteTable;
private static final long HSTART = 0xBB40E64DA205B064L;
private static final long HMULT = 7664345821815920749L;
/*
* Hash provision code provided below:
*
* The following methods implement a strong 64-bit hash function, which
* produces a good dispersal. In other words, given a random set of
* elements to hash, there is a high chance that our hash function will
* produce corresponding hash codes that are well dispersed over the
* possible range of hash codes. Hence:
* 1) whatever the size of the hash table, the number of collisions will
* be close to the number that we would "theoretically" expect;
* 2) for a given hash code width (i.e. the number of bits in the hash
* code), we can predict how likely it is for the above-mentioned goal
* to be met; e.g. given a typical random selection of element, for
* each element to be given a unique hash code.
*
* Additional discussion on hash implementation:
*
* Notice that Java's hash table implementation — and hence implementations
* of hashCode() — don't require the goals above to be met. Java maps and
* sets, rather than storing just the hash code, store the actual key object.
* This means that implementations of hashCode() generally only need to
* be "fairly good". It isn't the end of the world if two key objects have
* the same hash function, because the keys themselves are also compared in
* deciding if a match has been found.
*
* Below implements a 64-bit Linear Congruential Generator (LCG). It uses
* a table of 256 random values indexed by successive bytes in the data,
* and recommends a multiple suitable for an LCG with a modulus of 2^64,
* which is effectively what we have when we multiple using 64-bit long.
*
* The value of HMULT is found to be a good practice with 64-bit LCG. It
* has roughly half of its bits set and is 'virtually' prime (it is
* composed of three prime factors). The value of HSTART is arbitrary,
* essentially any value would do.
*
* Fore more information:
* Read about 'Linear Congruential Generators' (LCG) in the Literature.
*/
static {
byteTable = new long[256 * MAX_HASHES];
long h = 0x544B2FBACAAF1684L;
for (int i = 0; i < byteTable.length; i++) {
for (int j = 0; j < 31; j++)
h = (h >>> 7) ^ h; h = (h << 11) ^ h; h = (h >>> 10) ^ h;
byteTable[i] = h;
}
}
private long hashCode(String s, int hcNo) {
long h = HSTART;
final long hmult = HMULT;
final long[] ht = byteTable;
int startIx = 256 * hcNo;
for (int len = s.length(), i = 0; i < len; i++) {
char ch = s.charAt(i);
h = (h * hmult) ^ ht[startIx + (ch & 0xff)];
h = (h * hmult) ^ ht[startIx + ((ch >>> 8) & 0xff)];
}
return h;
}
private final BitSet data; // The hash bit map
private final int noHashes; // number of hashes
private final int hashMask; // hash mask
/*
* Constructors discussion:
*
* 1) The first constructor will take as the first parameter the
* base 2 logarithm of the number of bits, so passing an
* example of 16, gives you a bloom filter of size 2^16. The
* second parameter is the number of hash functions to use.
*
* 2) On the second constructor, consider that we can calculate
* bit indexes from hash codes simply by ANDing with the mask. In
* actual practice, we may not want to have to pass in the logarithm
* of the number of bits, so we also provide a constructor that
* in effect calculates the required value from a maximum number of
* items and a number of bits per item.
*/
public BloomFilter(int log2noBits, int noHashes) {
if (log2noBits < 1 || log2noBits > 31)
throw new IllegalArgumentException("Invalid number of bits");
if (noHashes < 1 || noHashes > MAX_HASHES)
throw new IllegalArgumentException("Invalid number of hashes");
this.data = new BitSet(1 << log2noBits);
this.noHashes = noHashes;
this.hashMask = (1 << log2noBits) - 1;
}
public BloomFilter(int noItems, int bitsPerItem, int noHashes) {
int bitsRequired = noItems * bitsPerItem;
if (bitsRequired >= Integer.MAX_VALUE) {
throw new IllegalArgumentException("Bloom filter would be too big");
}
int logBits = 4;
while ((1 << logBits) < bitsRequired)
logBits++;
if (noHashes < 1 || noHashes > MAX_HASHES)
throw new IllegalArgumentException("Invalid number of hashes");
this.data = new BitSet(1 << logBits);
this.noHashes = noHashes;
this.hashMask = (1 << logBits) - 1;
}
/*
* Method add
*
* The method will set the bits in the bloom filter map for each of the 'k'
* hash codes based on the passed String being added to the set.
*
* @param String - the value to add the to set
*/
public void add(String s) {
for (int n = 0; n < noHashes; n++) {
long hc = hashCode(s, n);
int bitNo = (int) (hc) & this.hashMask;
data.set(bitNo);
}
}
/*
* Method contains
*
* The method will check the bits in the bloom filter map for each
* of the 'k' hash codes based on the passed in parameter. It returns
* false if not in the set, else true if most probably in the set.
*
* @param boolean - false if not in set, else true for most probably in set
*/
public boolean contains(String s) {
// ADD YOUR CODE HERE - DO NOT FORGET TO ADD YOUR NAME AT TOP OF FILE
//
// HINT: the bitmap is the private class variable 'data', and it is
// of type BitSet (Java class BitSet). See Oracle documentation for
// this class on available methods. You can also see how method 'add'
// in this class uses the object.
for(int n=0;n<noHashes;n++){
long hc = hashCode(s, n);
int bitNo = (int) (hc) & this.hashMask;
if(!data.get(bitNo)){
return false;
}
}
return true;
}
/*********************************
*
* Method randomString
*
* This static method is used by the main routine for testing purposes.
* It generates random strings for entering into our Bloom filter hash map.
*
*********************************/
public static final String LETTERS =
"abcdefghijklmnopqrstuvexyABCDEFGHIJKLMNOPQRSTUVWYXZzéèêàôû";
public static String randomString(Random r) {
int wordLen;
do {
wordLen = 5 + 2 * (int) (r.nextGaussian() + 0.5d);
} while (wordLen < 1 || wordLen > 12);
StringBuilder sb = new StringBuilder(wordLen);
for (int i = 0; i < wordLen; i++) {
char ch = LETTERS.charAt(r.nextInt(LETTERS.length()));
sb.append(ch);
}
return new String(sb);
}
}