Skip to content

Commit

Permalink
Merge pull request #181 from hearchco/anon-and-hashed-queries
Browse files Browse the repository at this point in the history
fix(log,cache): anonymous and hashed queries
  • Loading branch information
aleksasiriski authored Feb 7, 2024
2 parents 4d8a918 + ef23f3c commit ea5ff1f
Show file tree
Hide file tree
Showing 26 changed files with 307 additions and 79 deletions.
18 changes: 18 additions & 0 deletions src/anonymize/hash.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package anonymize

import (
"crypto/sha256"
"encoding/base64"
)

// HashToSHA256B64 returns the SHA-256 digest of orig, encoded as a base64
// string using the URL-safe alphabet (with padding).
func HashToSHA256B64(orig string) string {
	// Sum256 produces the raw 32-byte digest in a single call.
	digest := sha256.Sum256([]byte(orig))

	// Turn the binary digest into its textual base64 form.
	return base64.URLEncoding.EncodeToString(digest[:])
}
24 changes: 24 additions & 0 deletions src/anonymize/hash_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package anonymize_test

import (
"testing"

"github.com/hearchco/hearchco/src/anonymize"
)

// TestHashToSHA256B64 verifies that known inputs produce their expected
// base64-encoded SHA-256 hashes.
func TestHashToSHA256B64(t *testing.T) {
	// Each case: input string, expected hash (binary sha256 encoded to base64).
	cases := []testPair{
		{"", "47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU="},
		{"banana death", "e8kN64XJ4Icr6Tl9VYrBRj50UJCPlyillODm3vVNk2g="},
		{"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", "LYwvbZeMohcStfbeNsnTH6jpak-l2P-LAYjfuefBcbs="},
		{"Ćao hrčko!! 🐹", "_Y3KWzrx2UkeTp8b--48L6OFgv51JWPlZArjoFOrmbw="},
	}

	for _, tc := range cases {
		got := anonymize.HashToSHA256B64(tc.orig)
		if got == tc.expected {
			continue
		}
		t.Errorf("HashToSHA256B64(%q) = %q, want %q", tc.orig, got, tc.expected)
	}
}
58 changes: 58 additions & 0 deletions src/anonymize/string.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package anonymize

import (
"math/rand"
"sort"
"strings"
"time"
)

// Deduplicate returns orig with repeated characters removed. Only the first
// occurrence of each rune is kept, and the kept runes stay in their original
// relative order.
func Deduplicate(orig string) string {
	// A Builder avoids re-copying the whole result on every appended rune,
	// which is what string concatenation in a loop would do.
	var dedup strings.Builder
	seen := make(map[rune]struct{})

	for _, char := range orig {
		if _, dup := seen[char]; dup {
			continue
		}
		seen[char] = struct{}{}
		dedup.WriteRune(char)
	}

	return dedup.String()
}

// SortString returns the characters of orig arranged in lexicographic order.
func SortString(orig string) string {
	// An empty separator makes Split cut after each UTF-8 sequence, giving
	// one single-character string per element.
	chars := strings.Split(orig, "")

	// Order the single-character strings in increasing order.
	sort.Sort(sort.StringSlice(chars))

	// Glue the ordered characters back together.
	var sorted strings.Builder
	sorted.Grow(len(orig))
	for _, ch := range chars {
		sorted.WriteString(ch)
	}
	return sorted.String()
}

// Shuffle returns the characters of orig in a pseudo-random order. It exists
// because Deduplicate retains the order of letters.
func Shuffle(orig string) string {
	runes := []rune(orig)

	// swap exchanges two positions of the rune slice in place.
	swap := func(a, b int) {
		runes[a], runes[b] = runes[b], runes[a]
	}

	// A fresh generator is seeded from the current wall-clock time on each call.
	// WARNING: in year 2262, this will break (the nanosecond timestamp no
	// longer fits in an int64).
	// NOTE(review): math/rand is not a cryptographically secure generator.
	seed := time.Now().UnixNano()
	rand.New(rand.NewSource(seed)).Shuffle(len(runes), swap)

	return string(runes)
}

// anonymize string
func String(orig string) string {
return Shuffle(Deduplicate(orig))
}

// Substring returns orig with every occurrence of ssToAnon replaced by the
// anonymized form of ssToAnon.
func Substring(orig string, ssToAnon string) string {
	// Anonymize once so that all occurrences receive the same replacement.
	anon := String(ssToAnon)

	// A negative count replaces every occurrence.
	return strings.Replace(orig, ssToAnon, anon, -1)
}
67 changes: 67 additions & 0 deletions src/anonymize/string_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package anonymize_test

import (
"testing"

"github.com/hearchco/hearchco/src/anonymize"
)

// TestDeduplicate verifies that Deduplicate keeps only the first occurrence
// of each character while preserving the order of the kept characters.
func TestDeduplicate(t *testing.T) {
	// original string, expected deduplicated string
	tests := []testPair{
		{"", ""},
		{"gmail", "gmail"},
		{"banana death", "ban deth"},
		{"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", "Lorem ipsudlta,cngbq.UvxDhfE"},
	}

	for _, test := range tests {
		deduplicated := anonymize.Deduplicate(test.orig)
		if deduplicated != test.expected {
			// Report the exported function name, matching the other tests in this package.
			t.Errorf("Deduplicate(%q) = %q, want %q", test.orig, deduplicated, test.expected)
		}
	}
}

// TestSortString verifies that SortString orders the characters of its
// input as expected.
func TestSortString(t *testing.T) {
	// Each case: input string, expected sorted string.
	cases := []testPair{
		{"", ""},
		{"gmail", "agilm"},
		{"banana death", " aaaabdehnnt"},
		{
			"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
			" ,,.Laaaaaaabccccddddddddeeeeeeeeeeeggiiiiiiiiiiilllllmmmmmmnnnnnoooooooooopppqrrrrrrsssssstttttttttuuuuuu",
		},
	}

	for _, tc := range cases {
		got := anonymize.SortString(tc.orig)
		if got == tc.expected {
			continue
		}
		t.Errorf("SortString(%q) = %q, want %q", tc.orig, got, tc.expected)
	}
}

// TestShuffle verifies that Shuffle only reorders characters. Because the
// shuffled order is random, the output is sorted before being compared with
// the expected sorted form of the input.
func TestShuffle(t *testing.T) {
	// Each case: input string, expected result after shuffling and sorting.
	cases := []testPair{
		{"", ""},
		{"gmail", "agilm"},
		{"banana death", " aaaabdehnnt"},
		{
			"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
			" ,,.Laaaaaaabccccddddddddeeeeeeeeeeeggiiiiiiiiiiilllllmmmmmmnnnnnoooooooooopppqrrrrrrsssssstttttttttuuuuuu",
		},
	}

	for _, tc := range cases {
		// Sorting cancels out the random order so the comparison is deterministic.
		got := anonymize.SortString(anonymize.Shuffle(tc.orig))
		if got == tc.expected {
			continue
		}
		t.Errorf("SortString(Shuffle(%q)) = %q, want %q", tc.orig, got, tc.expected)
	}
}
6 changes: 6 additions & 0 deletions src/anonymize/structs_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package anonymize_test

// testPair is a single table-driven test case: an input string and the
// output expected for it. Tests build it with positional literals, so the
// field order (orig first, expected second) must not change.
type testPair struct {
	orig, expected string
}
5 changes: 3 additions & 2 deletions src/cache/pebble/pebble.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (

"github.com/cockroachdb/pebble"
"github.com/fxamacker/cbor/v2"
"github.com/hearchco/hearchco/src/anonymize"
"github.com/hearchco/hearchco/src/cache"
"github.com/rs/zerolog/log"
)
Expand Down Expand Up @@ -47,7 +48,7 @@ func (db *DB) Set(k string, v cache.Value) error {

if val, err := cbor.Marshal(v); err != nil {
return fmt.Errorf("pebble.Set(): error marshaling value: %w", err)
} else if err := db.pdb.Set([]byte(k), val, pebble.NoSync); err != nil {
} else if err := db.pdb.Set([]byte(anonymize.HashToSHA256B64(k)), val, pebble.NoSync); err != nil {
return fmt.Errorf("pebble.Set(): error setting KV to pebble: %w", err)
} else {
cacheTimeSince := time.Since(cacheTimer)
Expand All @@ -60,7 +61,7 @@ func (db *DB) Set(k string, v cache.Value) error {
}

func (db *DB) Get(k string, o cache.Value) error {
v, c, err := db.pdb.Get([]byte(k))
v, c, err := db.pdb.Get([]byte(anonymize.HashToSHA256B64(k)))
val := []byte(v) // copy data before closing, casting needed for unmarshal

if err == pebble.ErrNotFound {
Expand Down
5 changes: 3 additions & 2 deletions src/cache/redis/redis.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"time"

"github.com/fxamacker/cbor/v2"
"github.com/hearchco/hearchco/src/anonymize"
"github.com/hearchco/hearchco/src/cache"
"github.com/hearchco/hearchco/src/config"
"github.com/redis/go-redis/v9"
Expand Down Expand Up @@ -52,7 +53,7 @@ func (db *DB) Set(k string, v cache.Value) error {

if val, err := cbor.Marshal(v); err != nil {
return fmt.Errorf("redis.Set(): error marshaling value: %w", err)
} else if err := db.rdb.Set(db.ctx, k, val, 0).Err(); err != nil {
} else if err := db.rdb.Set(db.ctx, anonymize.HashToSHA256B64(k), val, 0).Err(); err != nil {
return fmt.Errorf("redis.Set(): error setting KV to redis: %w", err)
} else {
cacheTimeSince := time.Since(cacheTimer)
Expand All @@ -65,7 +66,7 @@ func (db *DB) Set(k string, v cache.Value) error {
}

func (db *DB) Get(k string, o cache.Value) error {
v, err := db.rdb.Get(db.ctx, k).Result()
v, err := db.rdb.Get(db.ctx, anonymize.HashToSHA256B64(k)).Result()
val := []byte(v) // copy data before closing, casting needed for unmarshal

if err == redis.Nil {
Expand Down
13 changes: 9 additions & 4 deletions src/cli/climode.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"time"

"github.com/hearchco/hearchco/src/anonymize"
"github.com/hearchco/hearchco/src/bucket/result"
"github.com/hearchco/hearchco/src/cache"
"github.com/hearchco/hearchco/src/category"
Expand All @@ -29,7 +30,8 @@ func printResults(results []result.Result) {

func Run(flags Flags, db cache.DB, conf *config.Config) {
log.Info().
Str("query", flags.Query).
Str("queryAnon", anonymize.String(flags.Query)).
Str("queryHash", anonymize.HashToSHA256B64(flags.Query)).
Int("maxPages", flags.MaxPages).
Bool("visit", flags.Visit).
Msg("Started hearching")
Expand All @@ -54,7 +56,8 @@ func Run(flags Flags, db cache.DB, conf *config.Config) {
// Error in reading cache is not returned, just logged
log.Error().
Err(gerr).
Str("query", flags.Query).
Str("queryAnon", anonymize.String(flags.Query)).
Str("queryHash", anonymize.HashToSHA256B64(flags.Query)).
Msg("cli.Run(): failed accessing cache")
} else if results != nil {
foundInDB = true
Expand All @@ -64,7 +67,8 @@ func Run(flags Flags, db cache.DB, conf *config.Config) {

if foundInDB {
log.Debug().
Str("query", flags.Query).
Str("queryAnon", anonymize.String(flags.Query)).
Str("queryHash", anonymize.HashToSHA256B64(flags.Query)).
Msg("Found results in cache")
} else {
log.Debug().Msg("Nothing found in cache, doing a clean search")
Expand All @@ -75,7 +79,8 @@ func Run(flags Flags, db cache.DB, conf *config.Config) {
if serr != nil {
log.Error().
Err(serr).
Str("query", flags.Query).
Str("queryAnon", anonymize.String(flags.Query)).
Str("queryHash", anonymize.HashToSHA256B64(flags.Query)).
Msg("cli.Run(): error updating database with search results")
}
}
Expand Down
9 changes: 7 additions & 2 deletions src/engines/bing/bing.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"strings"

"github.com/gocolly/colly/v2"
"github.com/hearchco/hearchco/src/anonymize"
"github.com/hearchco/hearchco/src/bucket"
"github.com/hearchco/hearchco/src/config"
"github.com/hearchco/hearchco/src/engines"
Expand Down Expand Up @@ -74,13 +75,17 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi
colCtx := colly.NewContext()
colCtx.Put("page", strconv.Itoa(1))

sedefaults.DoGetRequest(Info.URL+query+localeParam, colCtx, col, Info.Name, &retError)
urll := Info.URL + query + localeParam
anonUrll := Info.URL + anonymize.String(query) + localeParam
sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError)

for i := 1; i < options.MaxPages; i++ {
colCtx = colly.NewContext()
colCtx.Put("page", strconv.Itoa(i+1))

sedefaults.DoGetRequest(Info.URL+query+"&first="+strconv.Itoa(i*10+1)+localeParam, colCtx, col, Info.Name, &retError)
urll := Info.URL + query + "&first=" + strconv.Itoa(i*10+1) + localeParam
anonUrll := Info.URL + anonymize.String(query) + "&first=" + strconv.Itoa(i*10+1) + localeParam
sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError)
}

col.Wait()
Expand Down
9 changes: 7 additions & 2 deletions src/engines/brave/brave.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"strings"

"github.com/gocolly/colly/v2"
"github.com/hearchco/hearchco/src/anonymize"
"github.com/hearchco/hearchco/src/bucket"
"github.com/hearchco/hearchco/src/config"
"github.com/hearchco/hearchco/src/engines"
Expand Down Expand Up @@ -68,13 +69,17 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi
colCtx := colly.NewContext()
colCtx.Put("page", strconv.Itoa(1))

sedefaults.DoGetRequest(Info.URL+query+"&source=web", colCtx, col, Info.Name, &retError)
urll := Info.URL + query + "&source=web"
anonUrll := Info.URL + anonymize.String(query) + "&source=web"
sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError)

for i := 1; i < options.MaxPages; i++ {
colCtx = colly.NewContext()
colCtx.Put("page", strconv.Itoa(i+1))

sedefaults.DoGetRequest(Info.URL+query+"&spellcheck=0&offset="+strconv.Itoa(i), colCtx, col, Info.Name, &retError)
urll := Info.URL + query + "&spellcheck=0&offset=" + strconv.Itoa(i)
anonUrll := Info.URL + anonymize.String(query) + "&spellcheck=0&offset=" + strconv.Itoa(i)
sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError)
}

col.Wait()
Expand Down
5 changes: 4 additions & 1 deletion src/engines/duckduckgo/duckduckgo.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (

"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly/v2"
"github.com/hearchco/hearchco/src/anonymize"
"github.com/hearchco/hearchco/src/bucket"
"github.com/hearchco/hearchco/src/config"
"github.com/hearchco/hearchco/src/engines"
Expand Down Expand Up @@ -77,7 +78,9 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi
colCtx := colly.NewContext()
colCtx.Put("page", strconv.Itoa(1))

sedefaults.DoGetRequest(Info.URL+"?q="+query, colCtx, col, Info.Name, &retError)
urll := Info.URL + "?q=" + query
anonUrll := Info.URL + "?q=" + anonymize.String(query)
sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError)

for i := 1; i < options.MaxPages; i++ {
colCtx = colly.NewContext()
Expand Down
5 changes: 3 additions & 2 deletions src/engines/etools/etools.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,15 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi
colCtx.Put("page", strconv.Itoa(1))

sedefaults.DoPostRequest(Info.URL, strings.NewReader("query="+query+"&country=web&language=all"+safeSearchParam), colCtx, col, Info.Name, &retError)
col.Wait() //wait so I can get the JSESSION cookie back
col.Wait() // wait so I can get the JSESSION cookie back

for i := 1; i < options.MaxPages; i++ {
pageStr := strconv.Itoa(i + 1)
colCtx = colly.NewContext()
colCtx.Put("page", pageStr)

sedefaults.DoGetRequest(pageURL+pageStr, colCtx, col, Info.Name, &retError)
// query not needed as it's saved in the session
sedefaults.DoGetRequest(pageURL+pageStr, pageURL+pageStr, colCtx, col, Info.Name, &retError)
}

col.Wait()
Expand Down
9 changes: 7 additions & 2 deletions src/engines/google/google.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"strings"

"github.com/gocolly/colly/v2"
"github.com/hearchco/hearchco/src/anonymize"
"github.com/hearchco/hearchco/src/bucket"
"github.com/hearchco/hearchco/src/config"
"github.com/hearchco/hearchco/src/engines"
Expand Down Expand Up @@ -53,13 +54,17 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi
colCtx := colly.NewContext()
colCtx.Put("page", strconv.Itoa(1))

sedefaults.DoGetRequest(Info.URL+query, colCtx, col, Info.Name, &retError)
urll := Info.URL + query
anonUrll := Info.URL + anonymize.String(query)
sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError)

for i := 1; i < options.MaxPages; i++ {
colCtx = colly.NewContext()
colCtx.Put("page", strconv.Itoa(i+1))

sedefaults.DoGetRequest(Info.URL+query+"&start="+strconv.Itoa(i*10), colCtx, col, Info.Name, &retError)
urll := Info.URL + query + "&start=" + strconv.Itoa(i*10)
anonUrll := Info.URL + anonymize.String(query) + "&start=" + strconv.Itoa(i*10)
sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError)
}

col.Wait()
Expand Down
Loading

0 comments on commit ea5ff1f

Please sign in to comment.