diff --git a/src/anonymize/hash.go b/src/anonymize/hash.go new file mode 100644 index 00000000..4c1838cf --- /dev/null +++ b/src/anonymize/hash.go @@ -0,0 +1,18 @@ +package anonymize + +import ( + "crypto/sha256" + "encoding/base64" +) + +func HashToSHA256B64(orig string) string { + // hash string with sha256 which returns binary + hasher := sha256.New() + hasher.Write([]byte(orig)) + hashedBinary := hasher.Sum(nil) + + // encode binary hash to base64 string + hashedString := base64.URLEncoding.EncodeToString(hashedBinary) + + return hashedString +} diff --git a/src/anonymize/hash_test.go b/src/anonymize/hash_test.go new file mode 100644 index 00000000..3e87fb0a --- /dev/null +++ b/src/anonymize/hash_test.go @@ -0,0 +1,24 @@ +package anonymize_test + +import ( + "testing" + + "github.com/hearchco/hearchco/src/anonymize" +) + +func TestHashToSHA256B64(t *testing.T) { + // original string, expected hash (sha256 returns binary and is encoded to base64) + tests := []testPair{ + {"", "47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU="}, + {"banana death", "e8kN64XJ4Icr6Tl9VYrBRj50UJCPlyillODm3vVNk2g="}, + {"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", "LYwvbZeMohcStfbeNsnTH6jpak-l2P-LAYjfuefBcbs="}, + {"Ćao hrčko!! 🐹", "_Y3KWzrx2UkeTp8b--48L6OFgv51JWPlZArjoFOrmbw="}, + } + + for _, test := range tests { + hash := anonymize.HashToSHA256B64(test.orig) + if hash != test.expected { + t.Errorf("HashToSHA256B64(%q) = %q, want %q", test.orig, hash, test.expected) + } + } +} diff --git a/src/anonymize/string.go b/src/anonymize/string.go new file mode 100644 index 00000000..1f2d6b6d --- /dev/null +++ b/src/anonymize/string.go @@ -0,0 +1,58 @@ +package anonymize + +import ( + "math/rand" + "sort" + "strings" + "time" +) + +// remove duplicate characters from string +func Deduplicate(orig string) string { + dedupStr := "" + encountered := make(map[rune]bool) + + for _, char := range orig { + if !encountered[char] { + encountered[char] = true + dedupStr += string(char) + } + } + + return dedupStr +} + +// sort string characters lexicographically +func SortString(orig string) string { + // Convert the string to a slice of characters + characters := strings.Split(orig, "") + + // Sort the slice + sort.Strings(characters) + + // Join the sorted slice back into a string + return strings.Join(characters, "") +} + +// shuffle string because deduplicate retains the order of letters +func Shuffle(orig string) string { + inRune := []rune(orig) + + // WARNING: in year 2262, this will break + rng := rand.New(rand.NewSource(time.Now().UnixNano())) + rng.Shuffle(len(inRune), func(i, j int) { + inRune[i], inRune[j] = inRune[j], inRune[i] + }) + + return string(inRune) +} + +// anonymize string +func String(orig string) string { + return Shuffle(Deduplicate(orig)) +} + +// anonymize substring of string +func Substring(orig string, ssToAnon string) string { + return strings.ReplaceAll(orig, ssToAnon, String(ssToAnon)) +} diff --git a/src/anonymize/string_test.go b/src/anonymize/string_test.go new file mode 100644 index 00000000..10e8df93 --- /dev/null +++ b/src/anonymize/string_test.go @@ -0,0 +1,67 @@ +package anonymize_test + +import ( + "testing" + + "github.com/hearchco/hearchco/src/anonymize" +) + +func TestDeduplicate(t *testing.T) { + // original string, expected deduplicated string + tests := []testPair{ + {"", ""}, + {"gmail", "gmail"}, + {"banana death", "ban deth"}, + {"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", "Lorem ipsudlta,cngbq.UvxDhfE"}, + } + + for _, test := range tests { + deduplicated := anonymize.Deduplicate(test.orig) + if deduplicated != test.expected { + t.Errorf("deduplicate(%q) = %q, want %q", test.orig, deduplicated, test.expected) + } + } +} + +func TestSortString(t *testing.T) { + // original string, sorted string + tests := []testPair{ + {"", ""}, + {"gmail", "agilm"}, + {"banana death", " aaaabdehnnt"}, + { + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", + " ,,.Laaaaaaabccccddddddddeeeeeeeeeeeggiiiiiiiiiiilllllmmmmmmnnnnnoooooooooopppqrrrrrrsssssstttttttttuuuuuu", + }, + } + + for _, test := range tests { + sorted := anonymize.SortString(test.orig) + + if sorted != test.expected { + t.Errorf("SortString(%q) = %q, want %q", test.orig, sorted, test.expected) + } + } +} + +func TestShuffle(t *testing.T) { + // original string, sorted string + tests := []testPair{ + {"", ""}, + {"gmail", "agilm"}, + {"banana death", " aaaabdehnnt"}, + { + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", + " ,,.Laaaaaaabccccddddddddeeeeeeeeeeeggiiiiiiiiiiilllllmmmmmmnnnnnoooooooooopppqrrrrrrsssssstttttttttuuuuuu", + }, + } + + for _, test := range tests { + shuffled := anonymize.Shuffle(test.orig) + shuffledSorted := anonymize.SortString(shuffled) + + if shuffledSorted != test.expected { + t.Errorf("SortString(Shuffle(%q)) = %q, want %q", test.orig, shuffledSorted, test.expected) + } + } +} diff --git a/src/anonymize/structs_test.go b/src/anonymize/structs_test.go new file mode 100644 index 00000000..97b20919 --- /dev/null +++ b/src/anonymize/structs_test.go @@ -0,0 +1,6 @@ +package anonymize_test + +type testPair struct { + orig string + expected string +} diff --git a/src/cache/pebble/pebble.go b/src/cache/pebble/pebble.go index 76d1e462..0229b42d 100644 --- a/src/cache/pebble/pebble.go +++ b/src/cache/pebble/pebble.go @@ -7,6 +7,7 @@ import ( "github.com/cockroachdb/pebble" "github.com/fxamacker/cbor/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/cache" "github.com/rs/zerolog/log" ) @@ -47,7 +48,7 @@ func (db *DB) Set(k string, v cache.Value) error { if val, err := cbor.Marshal(v); err != nil { return fmt.Errorf("pebble.Set(): error marshaling value: %w", err) - } else if err := db.pdb.Set([]byte(k), val, pebble.NoSync); err != nil { + } else if err := db.pdb.Set([]byte(anonymize.HashToSHA256B64(k)), val, pebble.NoSync); err != nil { return fmt.Errorf("pebble.Set(): error setting KV to pebble: %w", err) } else { cacheTimeSince := time.Since(cacheTimer) @@ -60,7 +61,7 @@ func (db *DB) Set(k string, v cache.Value) error { } func (db *DB) Get(k string, o cache.Value) error { - v, c, err := db.pdb.Get([]byte(k)) + v, c, err := db.pdb.Get([]byte(anonymize.HashToSHA256B64(k))) val := []byte(v) // copy data before closing, casting needed for unmarshal if err == pebble.ErrNotFound { diff --git a/src/cache/redis/redis.go b/src/cache/redis/redis.go index e79e45ce..a2ee0612 100644 --- a/src/cache/redis/redis.go +++ b/src/cache/redis/redis.go @@ -6,6 +6,7 @@ import ( "time" "github.com/fxamacker/cbor/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/cache" "github.com/hearchco/hearchco/src/config" "github.com/redis/go-redis/v9" @@ -52,7 +53,7 @@ func (db *DB) Set(k string, v cache.Value) error { if val, err := cbor.Marshal(v); err != nil { return fmt.Errorf("redis.Set(): error marshaling value: %w", err) - } else if err := db.rdb.Set(db.ctx, k, val, 0).Err(); err != nil { + } else if err := db.rdb.Set(db.ctx, anonymize.HashToSHA256B64(k), val, 0).Err(); err != nil { return fmt.Errorf("redis.Set(): error setting KV to redis: %w", err) } else { cacheTimeSince := time.Since(cacheTimer) @@ -65,7 +66,7 @@ func (db *DB) Set(k string, v cache.Value) error { } func (db *DB) Get(k string, o cache.Value) error { - v, err := db.rdb.Get(db.ctx, k).Result() + v, err := db.rdb.Get(db.ctx, anonymize.HashToSHA256B64(k)).Result() val := []byte(v) // copy data before closing, casting needed for unmarshal if err == redis.Nil { diff --git a/src/cli/climode.go b/src/cli/climode.go index 0b9f24d4..24b3c8f6 100644 --- a/src/cli/climode.go +++ b/src/cli/climode.go @@ -4,6 +4,7 @@ import ( "fmt" "time" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket/result" "github.com/hearchco/hearchco/src/cache" "github.com/hearchco/hearchco/src/category" @@ -29,7 +30,8 @@ func printResults(results []result.Result) { func Run(flags Flags, db cache.DB, conf *config.Config) { log.Info(). - Str("query", flags.Query). + Str("queryAnon", anonymize.String(flags.Query)). + Str("queryHash", anonymize.HashToSHA256B64(flags.Query)). Int("maxPages", flags.MaxPages). Bool("visit", flags.Visit). Msg("Started hearching") @@ -54,7 +56,8 @@ func Run(flags Flags, db cache.DB, conf *config.Config) { // Error in reading cache is not returned, just logged log.Error(). Err(gerr). - Str("query", flags.Query). + Str("queryAnon", anonymize.String(flags.Query)). + Str("queryHash", anonymize.HashToSHA256B64(flags.Query)). Msg("cli.Run(): failed accessing cache") } else if results != nil { foundInDB = true @@ -64,7 +67,8 @@ func Run(flags Flags, db cache.DB, conf *config.Config) { if foundInDB { log.Debug(). - Str("query", flags.Query). + Str("queryAnon", anonymize.String(flags.Query)). + Str("queryHash", anonymize.HashToSHA256B64(flags.Query)). Msg("Found results in cache") } else { log.Debug().Msg("Nothing found in cache, doing a clean search") @@ -75,7 +79,8 @@ func Run(flags Flags, db cache.DB, conf *config.Config) { if serr != nil { log.Error(). Err(serr). - Str("query", flags.Query). + Str("queryAnon", anonymize.String(flags.Query)). + Str("queryHash", anonymize.HashToSHA256B64(flags.Query)). Msg("cli.Run(): error updating database with search results") } } diff --git a/src/engines/bing/bing.go b/src/engines/bing/bing.go index 2cc3a850..811b0e62 100644 --- a/src/engines/bing/bing.go +++ b/src/engines/bing/bing.go @@ -8,6 +8,7 @@ import ( "strings" "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket" "github.com/hearchco/hearchco/src/config" "github.com/hearchco/hearchco/src/engines" @@ -74,13 +75,17 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi colCtx := colly.NewContext() colCtx.Put("page", strconv.Itoa(1)) - sedefaults.DoGetRequest(Info.URL+query+localeParam, colCtx, col, Info.Name, &retError) + urll := Info.URL + query + localeParam + anonUrll := Info.URL + anonymize.String(query) + localeParam + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) for i := 1; i < options.MaxPages; i++ { colCtx = colly.NewContext() colCtx.Put("page", strconv.Itoa(i+1)) - sedefaults.DoGetRequest(Info.URL+query+"&first="+strconv.Itoa(i*10+1)+localeParam, colCtx, col, Info.Name, &retError) + urll := Info.URL + query + "&first=" + strconv.Itoa(i*10+1) + localeParam + anonUrll := Info.URL + anonymize.String(query) + "&first=" + strconv.Itoa(i*10+1) + localeParam + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) } col.Wait() diff --git a/src/engines/brave/brave.go b/src/engines/brave/brave.go index 312a269d..37db5259 100644 --- a/src/engines/brave/brave.go +++ b/src/engines/brave/brave.go @@ -6,6 +6,7 @@ import ( "strings" "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket" "github.com/hearchco/hearchco/src/config" "github.com/hearchco/hearchco/src/engines" @@ -68,13 +69,17 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi colCtx := colly.NewContext() colCtx.Put("page", strconv.Itoa(1)) - sedefaults.DoGetRequest(Info.URL+query+"&source=web", colCtx, col, Info.Name, &retError) + urll := Info.URL + query + "&source=web" + anonUrll := Info.URL + anonymize.String(query) + "&source=web" + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) for i := 1; i < options.MaxPages; i++ { colCtx = colly.NewContext() colCtx.Put("page", strconv.Itoa(i+1)) - sedefaults.DoGetRequest(Info.URL+query+"&spellcheck=0&offset="+strconv.Itoa(i), colCtx, col, Info.Name, &retError) + urll := Info.URL + query + "&spellcheck=0&offset=" + strconv.Itoa(i) + anonUrll := Info.URL + anonymize.String(query) + "&spellcheck=0&offset=" + strconv.Itoa(i) + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) } col.Wait() diff --git a/src/engines/duckduckgo/duckduckgo.go b/src/engines/duckduckgo/duckduckgo.go index be59d48c..4fe21129 100644 --- a/src/engines/duckduckgo/duckduckgo.go +++ b/src/engines/duckduckgo/duckduckgo.go @@ -8,6 +8,7 @@ import ( "github.com/PuerkitoBio/goquery" "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket" "github.com/hearchco/hearchco/src/config" "github.com/hearchco/hearchco/src/engines" @@ -77,7 +78,9 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi colCtx := colly.NewContext() colCtx.Put("page", strconv.Itoa(1)) - sedefaults.DoGetRequest(Info.URL+"?q="+query, colCtx, col, Info.Name, &retError) + urll := Info.URL + "?q=" + query + anonUrll := Info.URL + "?q=" + anonymize.String(query) + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) for i := 1; i < options.MaxPages; i++ { colCtx = colly.NewContext() diff --git a/src/engines/etools/etools.go b/src/engines/etools/etools.go index 0a1df7fa..0ac72e73 100644 --- a/src/engines/etools/etools.go +++ b/src/engines/etools/etools.go @@ -75,14 +75,15 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi colCtx.Put("page", strconv.Itoa(1)) sedefaults.DoPostRequest(Info.URL, strings.NewReader("query="+query+"&country=web&language=all"+safeSearchParam), colCtx, col, Info.Name, &retError) - col.Wait() //wait so I can get the JSESSION cookie back + col.Wait() // wait so I can get the JSESSION cookie back for i := 1; i < options.MaxPages; i++ { pageStr := strconv.Itoa(i + 1) colCtx = colly.NewContext() colCtx.Put("page", pageStr) - sedefaults.DoGetRequest(pageURL+pageStr, colCtx, col, Info.Name, &retError) + // query not needed as its saved in the session + sedefaults.DoGetRequest(pageURL+pageStr, pageURL+pageStr, colCtx, col, Info.Name, &retError) } col.Wait() diff --git a/src/engines/google/google.go b/src/engines/google/google.go index 38ab7820..7d58d750 100644 --- a/src/engines/google/google.go +++ b/src/engines/google/google.go @@ -6,6 +6,7 @@ import ( "strings" "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket" "github.com/hearchco/hearchco/src/config" "github.com/hearchco/hearchco/src/engines" @@ -53,13 +54,17 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi colCtx := colly.NewContext() colCtx.Put("page", strconv.Itoa(1)) - sedefaults.DoGetRequest(Info.URL+query, colCtx, col, Info.Name, &retError) + urll := Info.URL + query + anonUrll := Info.URL + anonymize.String(query) + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) for i := 1; i < options.MaxPages; i++ { colCtx = colly.NewContext() colCtx.Put("page", strconv.Itoa(i+1)) - sedefaults.DoGetRequest(Info.URL+query+"&start="+strconv.Itoa(i*10), colCtx, col, Info.Name, &retError) + urll := Info.URL + query + "&start=" + strconv.Itoa(i*10) + anonUrll := Info.URL + anonymize.String(query) + "&start=" + strconv.Itoa(i*10) + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) } col.Wait() diff --git a/src/engines/googlescholar/googlescholar.go b/src/engines/googlescholar/googlescholar.go index 2133cb91..068e111f 100644 --- a/src/engines/googlescholar/googlescholar.go +++ b/src/engines/googlescholar/googlescholar.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket" "github.com/hearchco/hearchco/src/config" "github.com/hearchco/hearchco/src/engines" @@ -58,13 +59,17 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi colCtx := colly.NewContext() colCtx.Put("page", strconv.Itoa(1)) - sedefaults.DoGetRequest(Info.URL+query, colCtx, col, Info.Name, &retError) + urll := Info.URL + query + anonUrll := Info.URL + anonymize.String(query) + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) for i := 1; i < options.MaxPages; i++ { colCtx = colly.NewContext() colCtx.Put("page", strconv.Itoa(i+1)) - sedefaults.DoGetRequest(Info.URL+query+"&start="+strconv.Itoa(i*10), colCtx, col, Info.Name, &retError) + urll := Info.URL + query + "&start=" + strconv.Itoa(i*10) + anonUrll := Info.URL + anonymize.String(query) + "&start=" + strconv.Itoa(i*10) + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) } col.Wait() diff --git a/src/engines/mojeek/mojeek.go b/src/engines/mojeek/mojeek.go index 3442257e..d358903a 100644 --- a/src/engines/mojeek/mojeek.go +++ b/src/engines/mojeek/mojeek.go @@ -6,6 +6,7 @@ import ( "strings" "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket" "github.com/hearchco/hearchco/src/config" "github.com/hearchco/hearchco/src/engines" @@ -57,13 +58,17 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi colCtx := colly.NewContext() colCtx.Put("page", strconv.Itoa(1)) - sedefaults.DoGetRequest(Info.URL+query+localeParam+safeSearchParam, colCtx, col, Info.Name, &retError) + urll := Info.URL + query + localeParam + safeSearchParam + anonUrll := Info.URL + anonymize.String(query) + localeParam + safeSearchParam + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) for i := 1; i < options.MaxPages; i++ { colCtx = colly.NewContext() colCtx.Put("page", strconv.Itoa(i+1)) - sedefaults.DoGetRequest(Info.URL+query+"&s="+strconv.Itoa(i*10+1)+localeParam+safeSearchParam, colCtx, col, Info.Name, &retError) + urll := Info.URL + query + "&s=" + strconv.Itoa(i*10+1) + localeParam + safeSearchParam + anonUrll := Info.URL + anonymize.String(query) + "&s=" + strconv.Itoa(i*10+1) + localeParam + safeSearchParam + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) } col.Wait() diff --git a/src/engines/presearch/presearch.go b/src/engines/presearch/presearch.go index 6fbc4abb..8827b551 100644 --- a/src/engines/presearch/presearch.go +++ b/src/engines/presearch/presearch.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket" "github.com/hearchco/hearchco/src/config" "github.com/hearchco/hearchco/src/engines" @@ -97,14 +98,18 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi colCtx.Put("page", strconv.Itoa(1)) colCtx.Put("isAPI", "false") - sedefaults.DoGetRequest(Info.URL+query, colCtx, col, Info.Name, &retError) + urll := Info.URL + query + anonUrll := Info.URL + anonymize.String(query) + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) for i := 1; i < options.MaxPages; i++ { colCtx = colly.NewContext() colCtx.Put("page", strconv.Itoa(i+1)) colCtx.Put("isAPI", "false") - sedefaults.DoGetRequest(Info.URL+query+"&page="+strconv.Itoa(i+1), colCtx, col, Info.Name, &retError) + urll := Info.URL + query + "&page=" + strconv.Itoa(i+1) + anonUrll := Info.URL + anonymize.String(query) + "&page=" + strconv.Itoa(i+1) + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) } col.Wait() diff --git a/src/engines/qwant/qwant.go b/src/engines/qwant/qwant.go index 3bf2d763..90eec283 100644 --- a/src/engines/qwant/qwant.go +++ b/src/engines/qwant/qwant.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket" "github.com/hearchco/hearchco/src/config" "github.com/hearchco/hearchco/src/engines" @@ -76,9 +77,10 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi for i := 0; i < options.MaxPages; i++ { colCtx := colly.NewContext() colCtx.Put("page", strconv.Itoa(i+1)) - reqString := Info.URL + query + "&count=" + strconv.Itoa(nRequested) + localeParam + "&offset=" + strconv.Itoa(i*nRequested) + deviceParam + safeSearchParam - sedefaults.DoGetRequest(reqString, colCtx, col, Info.Name, &retError) + urll := Info.URL + query + "&count=" + strconv.Itoa(nRequested) + localeParam + "&offset=" + strconv.Itoa(i*nRequested) + deviceParam + safeSearchParam + anonUrll := Info.URL + anonymize.String(query) + "&count=" + strconv.Itoa(nRequested) + localeParam + "&offset=" + strconv.Itoa(i*nRequested) + deviceParam + safeSearchParam + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) } col.Wait() diff --git a/src/engines/startpage/startpage.go b/src/engines/startpage/startpage.go index 88a4c034..72a51574 100644 --- a/src/engines/startpage/startpage.go +++ b/src/engines/startpage/startpage.go @@ -6,6 +6,7 @@ import ( "strings" "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket" "github.com/hearchco/hearchco/src/config" "github.com/hearchco/hearchco/src/engines" @@ -75,13 +76,17 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi colCtx := colly.NewContext() colCtx.Put("page", strconv.Itoa(1)) - sedefaults.DoGetRequest(Info.URL+query+safeSearch, colCtx, col, Info.Name, &retError) + urll := Info.URL + query + safeSearch + anonUrll := Info.URL + anonymize.String(query) + safeSearch + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) for i := 1; i < options.MaxPages; i++ { colCtx = colly.NewContext() colCtx.Put("page", strconv.Itoa(i+1)) - sedefaults.DoGetRequest(Info.URL+query+"&page="+strconv.Itoa(i+1)+safeSearch, colCtx, col, Info.Name, &retError) + urll := Info.URL + query + "&page=" + strconv.Itoa(i+1) + safeSearch + anonUrll := Info.URL + anonymize.String(query) + "&page=" + strconv.Itoa(i+1) + safeSearch + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) } col.Wait() diff --git a/src/engines/swisscows/authenticator.go b/src/engines/swisscows/authenticator.go index 6276903f..9a6afbe0 100644 --- a/src/engines/swisscows/authenticator.go +++ b/src/engines/swisscows/authenticator.go @@ -6,6 +6,8 @@ import ( "strings" "time" "unicode" + + "github.com/hearchco/hearchco/src/anonymize" ) func generateNonce(length int) string { @@ -63,7 +65,7 @@ func generateSignature(params string, nonce string) (string, error) { var rot13Nonce string = rot13Switch(nonce) var data string = "/web/search" + params + rot13Nonce - var encData string = hashToSHA256B64(data) + var encData string = anonymize.HashToSHA256B64(data) encData = strings.ReplaceAll(encData, "=", "") encData = strings.ReplaceAll(encData, "+", "-") encData = strings.ReplaceAll(encData, "/", "_") diff --git a/src/engines/swisscows/hash.go b/src/engines/swisscows/hash.go deleted file mode 100644 index a19984eb..00000000 --- a/src/engines/swisscows/hash.go +++ /dev/null @@ -1,13 +0,0 @@ -package swisscows - -import ( - "crypto/sha256" - "encoding/base64" -) - -func hashToSHA256B64(input string) string { - hasher := sha256.New() - hasher.Write([]byte(input)) - sha := base64.URLEncoding.EncodeToString(hasher.Sum(nil)) - return sha -} diff --git a/src/engines/swisscows/swisscows.go b/src/engines/swisscows/swisscows.go index b319be5e..61944b88 100644 --- a/src/engines/swisscows/swisscows.go +++ b/src/engines/swisscows/swisscows.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket" "github.com/hearchco/hearchco/src/config" "github.com/hearchco/hearchco/src/engines" @@ -45,20 +46,17 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi return } - // log.Debug(). - // Str("query", qry). - // Str("nonce", nonce). - // Str("signature", sig). - // Msg("") - r.Headers.Set("X-Request-Nonce", nonce) r.Headers.Set("X-Request-Signature", sig) r.Headers.Set("Pragma", "no-cache") }) col.OnResponse(func(r *colly.Response) { + query := r.Request.URL.Query().Get("query") + urll := r.Request.URL.String() + anonUrll := anonymize.Substring(urll, query) log.Trace(). - Str("url", r.Request.URL.String()). + Str("url", anonUrll). Str("nonce", r.Request.Headers.Get("X-Request-Nonce")). Str("signature", r.Request.Headers.Get("X-Request-Signature")). Msg("swisscows.Search() -> col.OnResponse()") @@ -99,8 +97,9 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi //col.Request("OPTIONS", seAPIURL+"freshness=All&itemsCount="+strconv.Itoa(sResCount)+"&offset="+strconv.Itoa(i*10)+"&query="+query+localeURL, nil, colCtx, nil) //col.Wait() - reqURL := Info.URL + "freshness=All&itemsCount=" + strconv.Itoa(settings.RequestedResultsPerPage) + "&offset=" + strconv.Itoa(i*10) + "&query=" + query + localeParam - sedefaults.DoGetRequest(reqURL, colCtx, col, Info.Name, &retError) + urll := Info.URL + "freshness=All&itemsCount=" + strconv.Itoa(settings.RequestedResultsPerPage) + "&offset=" + strconv.Itoa(i*10) + "&query=" + query + localeParam + anonUrll := Info.URL + "freshness=All&itemsCount=" + strconv.Itoa(settings.RequestedResultsPerPage) + "&offset=" + strconv.Itoa(i*10) + "&query=" + anonymize.String(query) + localeParam + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) } col.Wait() diff --git a/src/engines/yahoo/yahoo.go b/src/engines/yahoo/yahoo.go index 8b177b4b..5d09753a 100644 --- a/src/engines/yahoo/yahoo.go +++ b/src/engines/yahoo/yahoo.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket" "github.com/hearchco/hearchco/src/config" "github.com/hearchco/hearchco/src/engines" @@ -63,13 +64,17 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi colCtx := colly.NewContext() colCtx.Put("page", strconv.Itoa(1)) - sedefaults.DoGetRequest(Info.URL+query, colCtx, col, Info.Name, &retError) + urll := Info.URL + query + anonUrll := Info.URL + anonymize.String(query) + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) for i := 1; i < options.MaxPages; i++ { colCtx = colly.NewContext() colCtx.Put("page", strconv.Itoa(i+1)) - sedefaults.DoGetRequest(Info.URL+query+"&b="+strconv.Itoa((i+1)*10), colCtx, col, Info.Name, &retError) + urll := Info.URL + query + "&b=" + strconv.Itoa((i+1)*10) + anonUrll := Info.URL + anonymize.String(query) + "&b=" + strconv.Itoa((i+1)*10) + sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) } col.Wait() diff --git a/src/engines/yep/yep.go b/src/engines/yep/yep.go index af48855a..78ade6e7 100644 --- a/src/engines/yep/yep.go +++ b/src/engines/yep/yep.go @@ -6,6 +6,7 @@ import ( "strings" "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket" "github.com/hearchco/hearchco/src/config" "github.com/hearchco/hearchco/src/engines" @@ -58,14 +59,20 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi nRequested := settings.RequestedResultsPerPage safeSearchParam := getSafeSearch(&options) - var apiURL string + var urll string if nRequested == Info.ResultsPerPage { - apiURL = Info.URL + "client=web" + localeParam + "&no_correct=false&q=" + query + safeSearchParam + "&type=web" + urll = Info.URL + "client=web" + localeParam + "&no_correct=false&q=" + query + safeSearchParam + "&type=web" } else { - apiURL = Info.URL + "client=web" + localeParam + "&limit=" + strconv.Itoa(nRequested) + "&no_correct=false&q=" + query + safeSearchParam + "&type=web" + urll = Info.URL + "client=web" + localeParam + "&limit=" + strconv.Itoa(nRequested) + "&no_correct=false&q=" + query + safeSearchParam + "&type=web" + } + var anonUrll string + if nRequested == Info.ResultsPerPage { + anonUrll = Info.URL + "client=web" + localeParam + "&no_correct=false&q=" + anonymize.String(query) + safeSearchParam + "&type=web" + } else { + anonUrll = Info.URL + "client=web" + localeParam + "&limit=" + strconv.Itoa(nRequested) + "&no_correct=false&q=" + anonymize.String(query) + safeSearchParam + "&type=web" } - sedefaults.DoGetRequest(apiURL, nil, col, Info.Name, &retError) + sedefaults.DoGetRequest(urll, anonUrll, nil, col, Info.Name, &retError) col.Wait() pagesCol.Wait() diff --git a/src/router/search.go b/src/router/search.go index e72afc42..480ce612 100644 --- a/src/router/search.go +++ b/src/router/search.go @@ -9,6 +9,7 @@ import ( "github.com/goccy/go-json" "github.com/rs/zerolog/log" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket/result" "github.com/hearchco/hearchco/src/cache" "github.com/hearchco/hearchco/src/category" @@ -101,7 +102,8 @@ func Search(c *gin.Context, conf *config.Config, db cache.DB) error { // Error in reading cache is not returned, just logged log.Error(). Err(gerr). - Str("query", query). + Str("queryAnon", anonymize.String(query)). + Str("queryHash", anonymize.HashToSHA256B64(query)). Msg("router.Search(): failed accessing cache") } else if results != nil { foundInDB = true @@ -111,7 +113,8 @@ func Search(c *gin.Context, conf *config.Config, db cache.DB) error { if foundInDB { log.Debug(). - Str("query", query). + Str("queryAnon", anonymize.String(query)). + Str("queryHash", anonymize.HashToSHA256B64(query)). Msg("Found results in cache") } else { log.Debug().Msg("Nothing found in cache, doing a clean search") @@ -133,7 +136,8 @@ func Search(c *gin.Context, conf *config.Config, db cache.DB) error { // Error in updating cache is not returned, just logged log.Error(). Err(serr). - Str("query", query). + Str("queryAnon", anonymize.String(query)). + Str("queryHash", anonymize.HashToSHA256B64(query)). Msg("router.Search(): error updating database with search results") } } diff --git a/src/search/search.go b/src/search/search.go index 436b50a6..26478c68 100644 --- a/src/search/search.go +++ b/src/search/search.go @@ -7,6 +7,7 @@ import ( "strings" "time" + "github.com/hearchco/hearchco/src/anonymize" "github.com/hearchco/hearchco/src/bucket" "github.com/hearchco/hearchco/src/bucket/result" "github.com/hearchco/hearchco/src/category" @@ -24,11 +25,12 @@ func PerformSearch(query string, options engines.Options, conf *config.Config) [ ResultMap: make(map[string]*result.Result), } - timings, toRun := procBang(&query, &options, conf) + query, timings, toRun := procBang(query, &options, conf) query = url.QueryEscape(query) log.Debug(). - Str("query", query). + Str("queryAnon", anonymize.String(query)). + Str("queryHash", anonymize.HashToSHA256B64(query)). Msg("Searching") resTimer := time.Now() @@ -83,29 +85,31 @@ func runEngines(engs []engines.Name, timings config.Timings, settings map[engine } } -func procBang(query *string, options *engines.Options, conf *config.Config) (config.Timings, []engines.Name) { - useSpec, specEng := procSpecificEngine(*query, options, conf) - goodCat := procCategory(*query, options) - if !goodCat && !useSpec && (*query)[0] == '!' { +func procBang(query string, options *engines.Options, conf *config.Config) (string, config.Timings, []engines.Name) { + useSpec, specEng := procSpecificEngine(query, options, conf) + goodCat := procCategory(query, options) + if !goodCat && !useSpec && query[0] == '!' { // options.category is set to GENERAL log.Debug(). - Str("query", *query). + Str("queryAnon", anonymize.String(query)). + Str("queryHash", anonymize.HashToSHA256B64(query)). Msg("search.procBang(): invalid bang (not category or engine shortcut)") } - trimBang(query) + query = trimBang(query) if useSpec { - return conf.Categories[category.GENERAL].Timings, []engines.Name{specEng} + return query, conf.Categories[category.GENERAL].Timings, []engines.Name{specEng} } else { - return conf.Categories[options.Category].Timings, conf.Categories[options.Category].Engines + return query, conf.Categories[options.Category].Timings, conf.Categories[options.Category].Engines } } -func trimBang(query *string) { - if (*query)[0] == '!' { - *query = strings.SplitN(*query, " ", 2)[1] +func trimBang(query string) string { + if (query)[0] == '!' { + return strings.SplitN(query, " ", 2)[1] } + return query } func procSpecificEngine(query string, options *engines.Options, conf *config.Config) (bool, engines.Name) { diff --git a/src/sedefaults/sedefaults.go b/src/sedefaults/sedefaults.go index bc29ebb6..cd3374ec 100644 --- a/src/sedefaults/sedefaults.go +++ b/src/sedefaults/sedefaults.go @@ -89,20 +89,19 @@ func ColRequest(seName engines.Name, col *colly.Collector, ctx context.Context) func ColError(seName engines.Name, col *colly.Collector) { col.OnError(func(r *colly.Response, err error) { - urll := r.Request.URL.String() if engines.IsTimeoutError(err) { log.Trace(). - Err(err). + // Err(err). // timeout error produces Get "url" error with the query Str("engine", seName.String()). - Str("url", urll). + // Str("url", urll). // can't reliably anonymize it (because it's engine dependent and query isn't passed to this function) Msg("sedefaults.ColError() -> col.OnError(): request timeout error for url") } else { log.Error(). Err(err). Str("engine", seName.String()). - Str("url", urll). + // Str("url", urll). // can't reliably anonymize it (because it's engine dependent and query isn't passed to this function) Int("statusCode", r.StatusCode). - Str("response", string(r.Body)). + Str("response", string(r.Body)). // query can be present, depending on the response from the engine (Google has the query in 3 places) Msg("sedefaults.ColError() -> col.OnError(): request error for url") dumpPath := fmt.Sprintf("%v%v_col.log.html", config.LogDumpLocation, seName.String()) @@ -209,10 +208,10 @@ func InitializeCollectors(colPtr **colly.Collector, pagesColPtr **colly.Collecto } } -func DoGetRequest(urll string, colCtx *colly.Context, collector *colly.Collector, packageName engines.Name, retError *error) { +func DoGetRequest(urll string, anonurll string, colCtx *colly.Context, collector *colly.Collector, packageName engines.Name, retError *error) { log.Trace(). Str("engine", packageName.String()). - Str("url", urll). + Str("url", anonurll). Msg("GET") err := collector.Request("GET", urll, nil, colCtx, nil) if err != nil {