From 6ad389892afd63fe7afd71f34b5d1276579827b9 Mon Sep 17 00:00:00 2001 From: irfan sharif Date: Sun, 11 Sep 2016 01:02:03 -0400 Subject: [PATCH] Updated API to support more configuration Added configuration options to the API to support supplied hashing functions, configurable filter sizes, bucket sizes and fingerprint sizes. Additionally there is the option to specify the maximum number of kicks displacing fingerprints from their buckets. They are to be provided to `cfilter.New(opts ...option)` and are as follows: - cfilter.Size(uint) sets the number of buckets in the filter - cfilter.BucketSize(uint8) sets the size of each bucket - cfilter.FingerprintSize(uint8) sets the size of the fingerprint - cfilter.MaximumKicks(uint) sets the maximum number of bucket kicks - cfilter.HashFn(hash.Hash) sets the fingerprinting hashing function NOTE: this commit introduces a change to the API, in order to retrieve the number of items currently in the filter instead of `cf.Size()` you would now use `cf.Count()`. --- README.md | 4 +-- bucket.go | 4 +-- cfilter.go | 69 ++++++++++++++++++++++++++++--------------------- cfilter_test.go | 37 +++++++++++++++++++------- fingerprint.go | 6 ++--- options.go | 64 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 135 insertions(+), 49 deletions(-) create mode 100644 options.go diff --git a/README.md b/README.md index f722ff6..bac6871 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ A cuckoo filter supports following operations: * `Delete(item)`: delete the given item from the filter. Note that to use this method, it must be ensured that this item is in the filter (e.g., based on records on external storage); otherwise, a false item may be deleted. 
-* `Size()`: return the total number of items currently in the filter +* `Count()`: return the total number of items currently in the filter ## Example Usage ```go @@ -53,7 +53,7 @@ cf.Insert([]byte("buongiorno")) cf.Lookup([]byte("hola")) // returns 1 (given only 'buongiorno' was added) -cf.Size() +cf.Count() // tries deleting 'bonjour' from filter, may delete another element // this could occur when another byte slice with the same fingerprint diff --git a/bucket.go b/bucket.go index 5f72b05..e7feaeb 100644 --- a/bucket.go +++ b/bucket.go @@ -2,8 +2,6 @@ package cfilter import "math/rand" -const bSize = 4 - type bucket []fingerprint func (b bucket) insert(f fingerprint) bool { @@ -40,7 +38,7 @@ func (b bucket) remove(f fingerprint) bool { } func (b bucket) swap(f fingerprint) fingerprint { - i := rand.Intn(bSize - 1) + i := rand.Intn(len(b) - 1) b[i], f = f, b[i] return f diff --git a/cfilter.go b/cfilter.go index 3368f6b..9c0d79a 100644 --- a/cfilter.go +++ b/cfilter.go @@ -2,34 +2,43 @@ package cfilter import ( "hash" - "hash/fnv" "math/rand" ) -// The maximum number of times we kick down items/displace from their buckets const maxCuckooCount = 500 -// The number of buckets in the filter -const cfSize = (1 << 18) / bSize - // CFilter represents a Cuckoo Filter, a probabilistic data store -// for approximated set membership queries +// for approximated set membership queries. type CFilter struct { - size uint - hashfn hash.Hash64 - buckets []bucket + hashfn hash.Hash // Hash function used for fingerprinting + buckets []bucket // Buckets where fingerprints are stored + count uint + + bSize uint8 // Bucket size + fpSize uint8 // Fingerprint size + size uint // Number of buckets in the filter + kicks uint // Maximum number of times we kick down items from buckets } // New returns a new CFilter object. It's Insert, Lookup, Delete and // Size behave as their names suggest. 
-func New() *CFilter { +// Takes zero or more of the following option functions and applies them in +// order to the Filter: +// - cfilter.Size(uint) sets the number of buckets in the filter +// - cfilter.BucketSize(uint8) sets the size of each bucket +// - cfilter.FingerprintSize(uint8) sets the size of the fingerprint +// - cfilter.MaximumKicks(uint) sets the maximum number of bucket kicks +// - cfilter.HashFn(hash.Hash) sets the fingerprinting hashing function +func New(opts ...option) *CFilter { cf := new(CFilter) + for _, opt := range opts { + opt(cf) + } + configure(cf) - cf.size = 0 - cf.hashfn = fnv.New64() - cf.buckets = make([]bucket, cfSize, cfSize) + cf.buckets = make([]bucket, cf.size, cf.size) for i := range cf.buckets { - cf.buckets[i] = make([]fingerprint, bSize, bSize) + cf.buckets[i] = make([]fingerprint, cf.bSize, cf.bSize) } return cf @@ -38,22 +47,22 @@ func New() *CFilter { // Insert adds an element (in byte-array form) to the Cuckoo filter, // returns true if successful and false otherwise. func (cf *CFilter) Insert(item []byte) bool { - f := fprint(item, cf.hashfn) - j := hashfp(item) % cfSize - k := (j ^ hashfp(f)) % cfSize + f := fprint(item, cf.fpSize, cf.hashfn) + j := hashfp(item) % cf.size + k := (j ^ hashfp(f)) % cf.size if cf.buckets[j].insert(f) || cf.buckets[k].insert(f) { - cf.size++ + cf.count++ return true } i := [2]uint{j, k}[rand.Intn(2)] for n := 0; n < maxCuckooCount; n++ { f = cf.buckets[i].swap(f) - i ^= hashfp(f) % cfSize + i ^= hashfp(f) % cf.size if cf.buckets[i].insert(f) { - cf.size++ + cf.count++ return true } } @@ -64,9 +73,9 @@ func (cf *CFilter) Insert(item []byte) bool { // Lookup checks if an element (in byte-array form) exists in the Cuckoo // Filter, returns true if found and false otherwise. 
func (cf *CFilter) Lookup(item []byte) bool { - f := fprint(item, cf.hashfn) - j := hashfp(item) % cfSize - k := (j ^ hashfp(f)) % cfSize + f := fprint(item, cf.fpSize, cf.hashfn) + j := hashfp(item) % cf.size + k := (j ^ hashfp(f)) % cf.size return cf.buckets[j].lookup(f) || cf.buckets[k].lookup(f) } @@ -74,19 +83,19 @@ func (cf *CFilter) Lookup(item []byte) bool { // Delete removes an element (in byte-array form) from the Cuckoo Filter, // returns true if element existed prior and false otherwise. func (cf *CFilter) Delete(item []byte) bool { - f := fprint(item, cf.hashfn) - j := hashfp(item) % cfSize - k := (j ^ hashfp(f)) % cfSize + f := fprint(item, cf.fpSize, cf.hashfn) + j := hashfp(item) % cf.size + k := (j ^ hashfp(f)) % cf.size if cf.buckets[j].remove(f) || cf.buckets[k].remove(f) { - cf.size-- + cf.count-- return true } return false } -// Size returns the total number of elements added to the Cuckoo Filter. -func (cf *CFilter) Size() uint { - return cf.size +// Count returns the total number of elements currently in the Cuckoo Filter. 
+func (cf *CFilter) Count() uint { + return cf.count } diff --git a/cfilter_test.go b/cfilter_test.go index a820597..433fdb6 100644 --- a/cfilter_test.go +++ b/cfilter_test.go @@ -1,13 +1,16 @@ -package cfilter +package cfilter_test import ( "bufio" + "hash/fnv" "os" "testing" + + "github.com/irfansharif/cfilter" ) func TestMultipleInsertions(t *testing.T) { - cf := New() + cf := cfilter.New() fd, err := os.Open("/usr/share/dict/words") if err != nil { @@ -26,7 +29,7 @@ func TestMultipleInsertions(t *testing.T) { words = append(words, word) } - size := cf.Size() + size := cf.Count() if size != wordCount { t.Errorf("Expected word count = %d, not %d", wordCount, size) } @@ -35,19 +38,19 @@ func TestMultipleInsertions(t *testing.T) { cf.Delete(word) } - size = cf.Size() + size = cf.Count() if size != 0 { t.Errorf("Expected word count = 0, not %d", size) } } func TestBasicInsertion(t *testing.T) { - cf := New() + cf := cfilter.New() if !cf.Insert([]byte("buongiorno")) { t.Errorf("Wasn't able to insert very first word, 'buongiorno'") } - size := cf.Size() + size := cf.Count() if size != 1 { t.Errorf("Expected size after insertion to be 1, not %d", size) } @@ -64,22 +67,36 @@ func TestBasicInsertion(t *testing.T) { t.Errorf("Did not expect to find 'buongiorno' in filter after deletion") } - size = cf.Size() + size = cf.Count() if size != 0 { t.Errorf("Expected size after deletion to be 0, not %d", size) } } func TestInitialization(t *testing.T) { - cf := New() - size := cf.Size() + cf := cfilter.New() + size := cf.Count() if size != 0 { t.Errorf("Expected initial size to be 0, not %d", size) } } +func TestConfigurationOptions(t *testing.T) { + cf := cfilter.New( + cfilter.Size(1<<18), + cfilter.BucketSize(4), + cfilter.FingerprintSize(2), + cfilter.MaximumKicks(500), + cfilter.HashFn(fnv.New64()), + ) + size := cf.Count() + if size != 0 { + t.Errorf("Expected size to be 10, not %d", size) + } +} + func BenchmarkInsertionAndDeletion(b *testing.B) { - cf := New() + cf := 
cfilter.New() for n := 0; n < b.N; n++ { cf.Insert([]byte("buongiorno")) cf.Delete([]byte("buongiorno")) diff --git a/fingerprint.go b/fingerprint.go index 0e8145f..60ec58c 100644 --- a/fingerprint.go +++ b/fingerprint.go @@ -5,17 +5,15 @@ import ( "hash" ) -const fpSize = 2 - type fingerprint []byte -func fprint(item []byte, hashfn hash.Hash64) fingerprint { +func fprint(item []byte, fpSize uint8, hashfn hash.Hash) fingerprint { hashfn.Reset() hashfn.Write(item) h := hashfn.Sum(nil) fp := fingerprint{} - for i := 0; i < fpSize; i++ { + for i := uint8(0); i < fpSize; i++ { fp = append(fp, h[i]) } diff --git a/options.go b/options.go new file mode 100644 index 0000000..46b0246 --- /dev/null +++ b/options.go @@ -0,0 +1,64 @@ +package cfilter + +import ( + "hash" + "hash/fnv" +) + +type option func(*CFilter) + +// Size sets the number of buckets in the filter. +// Defaults to ((1 << 18) / BucketSize). +func Size(s uint) option { + return func(cf *CFilter) { + cf.size = s + } +} + +// BucketSize sets the size of each bucket in the filter. Defaults to 4. +func BucketSize(s uint8) option { + return func(cf *CFilter) { + cf.bSize = s + } +} + +// FingerprintSize sets the size of the fingerprint. Defaults to 2. +func FingerprintSize(s uint8) option { + return func(cf *CFilter) { + cf.fpSize = s + } +} + +// MaximumKicks sets the maximum number of times we kick down items/displace +// from their buckets. Defaults to 500. +func MaximumKicks(k uint) option { + return func(cf *CFilter) { + cf.kicks = k + } +} + +// HashFn sets the hashing function to be used for fingerprinting. Defaults to +// a 64-bit FNV-1 hash.Hash. 
+func HashFn(hashfn hash.Hash) option { + return func(cf *CFilter) { + cf.hashfn = hashfn + } +} + +func configure(cf *CFilter) { + if cf.hashfn == nil { + cf.hashfn = fnv.New64() + } + if cf.bSize == 0 { + cf.bSize = 4 + } + if cf.fpSize == 0 { + cf.fpSize = 2 + } + if cf.kicks == 0 { + cf.kicks = 500 + } + if cf.size == 0 { + cf.size = (1 << 18) / uint(cf.bSize) + } +}