Skip to content

Commit

Permalink
Updated API to support more configuration
Browse files Browse the repository at this point in the history
Added configuration options to the API to support supplied hashing
functions, configurable filter sizes, bucket sizes and fingerprint
sizes. Additionally there is the option to specify maximum number of
kicks displacing fingerprints from their buckets. They are to be
provided to `cfilter.New(opts ...option)` and are as follows:

      - cfilter.Size(uint) sets the number of buckets in the filter
      - cfilter.BucketSize(uint8) sets the size of each bucket
      - cfilter.FingerprintSize(uint8) sets the size of the fingerprint
      - cfilter.MaximumKicks(uint) sets the maximum number of bucket kicks
      - cfilter.HashFn(hash.Hash) sets the fingerprinting hashing function

NOTE: this commit introduces a change to the API: to retrieve
the number of items currently in the filter, use `cf.Count()` instead
of `cf.Size()`.
  • Loading branch information
irfansharif committed Sep 11, 2016
1 parent c3db6c3 commit 6ad3898
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 49 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ A cuckoo filter supports following operations:
* `Delete(item)`: delete the given item from the filter. Note that to use this
method, it must be ensured that this item is in the filter (e.g., based on
records on external storage); otherwise, a false item may be deleted.
* `Size()`: return the total number of items currently in the filter
* `Count()`: return the total number of items currently in the filter

## Example Usage
```go
Expand All @@ -53,7 +53,7 @@ cf.Insert([]byte("buongiorno"))
cf.Lookup([]byte("hola"))

// returns 1 (given only 'buongiorno' was added)
cf.Size()
cf.Count()

// tries deleting 'bonjour' from filter, may delete another element
// this could occur when another byte slice with the same fingerprint
Expand Down
4 changes: 1 addition & 3 deletions bucket.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ package cfilter

import "math/rand"

const bSize = 4

type bucket []fingerprint

func (b bucket) insert(f fingerprint) bool {
Expand Down Expand Up @@ -40,7 +38,7 @@ func (b bucket) remove(f fingerprint) bool {
}

func (b bucket) swap(f fingerprint) fingerprint {
i := rand.Intn(bSize - 1)
i := rand.Intn(len(b) - 1)
b[i], f = f, b[i]

return f
Expand Down
69 changes: 39 additions & 30 deletions cfilter.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,43 @@ package cfilter

import (
"hash"
"hash/fnv"
"math/rand"
)

// The maximum number of times we kick down items/displace from their buckets
const maxCuckooCount = 500

// The number of buckets in the filter
const cfSize = (1 << 18) / bSize

// CFilter represents a Cuckoo Filter, a probabilistic data store
// for approximated set membership queries
// for approximated set membership queries.
type CFilter struct {
size uint
hashfn hash.Hash64
buckets []bucket
hashfn hash.Hash // Hash function used for fingerprinting
buckets []bucket // Buckets where fingerprints are stored
count uint

bSize uint8 // Bucket size
fpSize uint8 // Fingerprint size
size uint // Number of buckets in the filter
kicks uint // Maximum number of times we kick down items from buckets
}

// New returns a new CFilter object. Its Insert, Lookup, Delete and
// Count methods behave as their names suggest.
func New() *CFilter {
// Takes zero or more of the following option functions and applies them in
// order to the Filter:
// - cfilter.Size(uint) sets the number of buckets in the filter
// - cfilter.BucketSize(uint8) sets the size of each bucket
// - cfilter.FingerprintSize(uint8) sets the size of the fingerprint
// - cfilter.MaximumKicks(uint) sets the maximum number of bucket kicks
// - cfilter.HashFn(hash.Hash) sets the fingerprinting hashing function
func New(opts ...option) *CFilter {
cf := new(CFilter)
for _, opt := range opts {
opt(cf)
}
configure(cf)

cf.size = 0
cf.hashfn = fnv.New64()
cf.buckets = make([]bucket, cfSize, cfSize)
cf.buckets = make([]bucket, cf.size, cf.size)
for i := range cf.buckets {
cf.buckets[i] = make([]fingerprint, bSize, bSize)
cf.buckets[i] = make([]fingerprint, cf.bSize, cf.bSize)
}

return cf
Expand All @@ -38,22 +47,22 @@ func New() *CFilter {
// Insert adds an element (in byte-array form) to the Cuckoo filter,
// returns true if successful and false otherwise.
func (cf *CFilter) Insert(item []byte) bool {
f := fprint(item, cf.hashfn)
j := hashfp(item) % cfSize
k := (j ^ hashfp(f)) % cfSize
f := fprint(item, cf.fpSize, cf.hashfn)
j := hashfp(item) % cf.size
k := (j ^ hashfp(f)) % cf.size

if cf.buckets[j].insert(f) || cf.buckets[k].insert(f) {
cf.size++
cf.count++
return true
}

i := [2]uint{j, k}[rand.Intn(2)]
for n := 0; n < maxCuckooCount; n++ {
f = cf.buckets[i].swap(f)
i ^= hashfp(f) % cfSize
i ^= hashfp(f) % cf.size

if cf.buckets[i].insert(f) {
cf.size++
cf.count++
return true
}
}
Expand All @@ -64,29 +73,29 @@ func (cf *CFilter) Insert(item []byte) bool {
// Lookup checks if an element (in byte-array form) exists in the Cuckoo
// Filter, returns true if found and false otherwise.
func (cf *CFilter) Lookup(item []byte) bool {
f := fprint(item, cf.hashfn)
j := hashfp(item) % cfSize
k := (j ^ hashfp(f)) % cfSize
f := fprint(item, cf.fpSize, cf.hashfn)
j := hashfp(item) % cf.size
k := (j ^ hashfp(f)) % cf.size

return cf.buckets[j].lookup(f) || cf.buckets[k].lookup(f)
}

// Delete removes an element (in byte-array form) from the Cuckoo Filter,
// returns true if element existed prior and false otherwise.
func (cf *CFilter) Delete(item []byte) bool {
f := fprint(item, cf.hashfn)
j := hashfp(item) % cfSize
k := (j ^ hashfp(f)) % cfSize
f := fprint(item, cf.fpSize, cf.hashfn)
j := hashfp(item) % cf.size
k := (j ^ hashfp(f)) % cf.size

if cf.buckets[j].remove(f) || cf.buckets[k].remove(f) {
cf.size--
cf.count--
return true
}

return false
}

// Size returns the total number of elements added to the Cuckoo Filter.
func (cf *CFilter) Size() uint {
return cf.size
// Count returns the total number of elements currently in the Cuckoo Filter.
func (cf *CFilter) Count() uint {
return cf.count
}
37 changes: 27 additions & 10 deletions cfilter_test.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
package cfilter
package cfilter_test

import (
"bufio"
"hash/fnv"
"os"
"testing"

"github.com/irfansharif/cfilter"
)

func TestMultipleInsertions(t *testing.T) {
cf := New()
cf := cfilter.New()

fd, err := os.Open("/usr/share/dict/words")
if err != nil {
Expand All @@ -26,7 +29,7 @@ func TestMultipleInsertions(t *testing.T) {
words = append(words, word)
}

size := cf.Size()
size := cf.Count()
if size != wordCount {
t.Errorf("Expected word count = %d, not %d", wordCount, size)
}
Expand All @@ -35,19 +38,19 @@ func TestMultipleInsertions(t *testing.T) {
cf.Delete(word)
}

size = cf.Size()
size = cf.Count()
if size != 0 {
t.Errorf("Expected word count = 0, not %d", size)
}
}

func TestBasicInsertion(t *testing.T) {
cf := New()
cf := cfilter.New()
if !cf.Insert([]byte("buongiorno")) {
t.Errorf("Wasn't able to insert very first word, 'buongiorno'")
}

size := cf.Size()
size := cf.Count()
if size != 1 {
t.Errorf("Expected size after insertion to be 1, not %d", size)
}
Expand All @@ -64,22 +67,36 @@ func TestBasicInsertion(t *testing.T) {
t.Errorf("Did not expect to find 'buongiorno' in filter after deletion")
}

size = cf.Size()
size = cf.Count()
if size != 0 {
t.Errorf("Expected size after deletion to be 0, not %d", size)
}
}

func TestInitialization(t *testing.T) {
cf := New()
size := cf.Size()
cf := cfilter.New()
size := cf.Count()
if size != 0 {
t.Errorf("Expected initial size to be 0, not %d", size)
}
}

// TestConfigurationOptions builds a filter with every configuration option
// supplied explicitly and checks that it starts out empty.
func TestConfigurationOptions(t *testing.T) {
	cf := cfilter.New(
		cfilter.Size(1<<18),
		cfilter.BucketSize(4),
		cfilter.FingerprintSize(2),
		cfilter.MaximumKicks(500),
		cfilter.HashFn(fnv.New64()),
	)

	// Nothing has been inserted yet, so the reported count must be zero; the
	// failure message states the value that is actually being asserted.
	if count := cf.Count(); count != 0 {
		t.Errorf("Expected size to be 0, not %d", count)
	}
}

func BenchmarkInsertionAndDeletion(b *testing.B) {
cf := New()
cf := cfilter.New()
for n := 0; n < b.N; n++ {
cf.Insert([]byte("buongiorno"))
cf.Delete([]byte("buongiorno"))
Expand Down
6 changes: 2 additions & 4 deletions fingerprint.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,15 @@ import (
"hash"
)

const fpSize = 2

type fingerprint []byte

func fprint(item []byte, hashfn hash.Hash64) fingerprint {
func fprint(item []byte, fpSize uint8, hashfn hash.Hash) fingerprint {
hashfn.Reset()
hashfn.Write(item)
h := hashfn.Sum(nil)

fp := fingerprint{}
for i := 0; i < fpSize; i++ {
for i := uint8(0); i < fpSize; i++ {
fp = append(fp, h[i])
}

Expand Down
64 changes: 64 additions & 0 deletions options.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package cfilter

import (
"hash"
"hash/fnv"
)

// option is a functional option: a function that mutates the configuration
// fields of a CFilter. New applies every supplied option in order and then
// calls configure to fill in defaults for anything left unset.
type option func(*CFilter)

// Size returns an option that fixes the number of buckets in the filter.
// If the option is omitted (or given 0), configure falls back to
// ((1 << 18) / BucketSize).
func Size(s uint) option {
	setSize := func(filter *CFilter) {
		filter.size = s
	}
	return setSize
}

// BucketSize returns an option that fixes how many fingerprint slots each
// bucket holds. If the option is omitted (or given 0), configure uses 4.
func BucketSize(s uint8) option {
	return option(func(filter *CFilter) {
		filter.bSize = s
	})
}

// FingerprintSize returns an option that fixes the fingerprint length, i.e.
// how many leading bytes of the hash digest are kept per item. If the option
// is omitted (or given 0), configure uses 2.
func FingerprintSize(s uint8) option {
	var apply option = func(filter *CFilter) {
		filter.fpSize = s
	}
	return apply
}

// MaximumKicks returns an option that fixes the upper bound on how many times
// fingerprints may be displaced from their buckets during an insertion. If
// the option is omitted (or given 0), configure uses 500.
func MaximumKicks(k uint) option {
	setKicks := func(filter *CFilter) {
		// NOTE(review): the Insert loop visible in this change still appears to
		// be bounded by maxCuckooCount; confirm that this field is consulted.
		filter.kicks = k
	}
	return setKicks
}

// HashFn returns an option that installs the hash used to derive item
// fingerprints. If the option is omitted (or given nil), configure installs a
// 64-bit FNV-1 hash.Hash.
func HashFn(hashfn hash.Hash) option {
	var apply option = func(filter *CFilter) {
		filter.hashfn = hashfn
	}
	return apply
}

// configure fills in defaults for every CFilter setting that no option
// supplied (i.e. is still at its zero value) and normalizes settings that
// would otherwise make the filter unusable:
//   - hashfn defaults to a 64-bit FNV-1 hash
//   - bSize defaults to 4
//   - fpSize defaults to 2 and is capped at the digest length of hashfn
//   - kicks defaults to 500
//   - size defaults to (1 << 18) / bSize
func configure(cf *CFilter) {
	if cf.hashfn == nil {
		cf.hashfn = fnv.New64()
	}
	if cf.bSize == 0 {
		cf.bSize = 4
	}
	if cf.fpSize == 0 {
		cf.fpSize = 2
	}
	// A fingerprint is built from the first fpSize bytes of hashfn.Sum(nil),
	// which is hashfn.Size() bytes long. Asking for more bytes than the digest
	// holds would index past its end, so cap the fingerprint at the digest
	// length. The cap only applies when fpSize exceeds the digest length, so
	// the uint8 conversion cannot overflow.
	if digestLen := cf.hashfn.Size(); digestLen > 0 && int(cf.fpSize) > digestLen {
		cf.fpSize = uint8(digestLen)
	}
	if cf.kicks == 0 {
		cf.kicks = 500
	}
	// Must run after bSize is settled: the default bucket count depends on it.
	if cf.size == 0 {
		cf.size = (1 << 18) / uint(cf.bSize)
	}
}

0 comments on commit 6ad3898

Please sign in to comment.