Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

zoekt: implement mode which has same behaviour as attribution search #613

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions bits.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,9 @@ func (n ngram) String() string {
}

type runeNgramOff struct {
ngram ngram
byteSize uint32 // size of ngram
byteOff uint32
runeOff uint32
ngram ngram
// index is the position of this ngram in the slice returned by splitNGrams,
// as assigned when the slice was built (before any later re-sorting).
index uint32
}

func splitNGrams(str []byte) []runeNgramOff {
Expand All @@ -120,9 +119,7 @@ func splitNGrams(str []byte) []runeNgramOff {
result := make([]runeNgramOff, 0, len(str))
var i uint32

chars := -1
for len(str) > 0 {
chars++
r, sz := utf8.DecodeRune(str)
str = str[sz:]
runeGram[0] = runeGram[1]
Expand All @@ -139,10 +136,8 @@ func splitNGrams(str []byte) []runeNgramOff {

ng := runesToNGram(runeGram)
result = append(result, runeNgramOff{
ngram: ng,
byteSize: i - off[0],
byteOff: off[0],
runeOff: uint32(chars),
ngram: ng,
index: uint32(len(result)),
})
}
return result
Expand Down
11 changes: 6 additions & 5 deletions btree.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,12 +208,13 @@ func (n *innerNode) insert(ng ngram, opts btreeOpts) {

// See btree.find
func (n *innerNode) find(ng ngram) (int, int) {
for i, k := range n.keys {
if ng < k {
return n.children[i].find(ng)
}
i := sort.Search(len(n.keys), func(i int) bool {
return ng < n.keys[i]
})
if i >= len(n.children) {
i = len(n.children) - 1
}
return n.children[len(n.children)-1].find(ng)
return n.children[i].find(ng)
}

// See btree.find
Expand Down
1 change: 1 addition & 0 deletions cmd/zoekt/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ go_library(
"//:zoekt",
"//query",
"//shards",
"@com_github_felixge_fgprof//:fgprof",
],
)

Expand Down
83 changes: 72 additions & 11 deletions cmd/zoekt/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@ import (
"context"
"flag"
"fmt"
"io"
"log"
"os"
"path/filepath"
"runtime/pprof"
"time"

"github.com/felixge/fgprof"
"github.com/sourcegraph/zoekt"
"github.com/sourcegraph/zoekt/query"
"github.com/sourcegraph/zoekt/shards"
Expand Down Expand Up @@ -81,10 +83,12 @@ func main() {
index := flag.String("index_dir",
filepath.Join(os.Getenv("HOME"), ".zoekt"), "search for index files in `directory`")
cpuProfile := flag.String("cpu_profile", "", "write cpu profile to `file`")
fullProfile := flag.String("full_profile", "", "write full profile to `file`")
profileTime := flag.Duration("profile_time", time.Second, "run this long to gather stats.")
verbose := flag.Bool("v", false, "print some background data")
withRepo := flag.Bool("r", false, "print the repo before the file name")
list := flag.Bool("l", false, "print matching filenames only")
exact := flag.Bool("exact_stdin", false, "look for exact matches on STDIN")

flag.Usage = func() {
name := os.Args[0]
Expand All @@ -95,12 +99,39 @@ func main() {
}
flag.Parse()

if len(flag.Args()) == 0 {
var pat string
var q query.Q
var sOpts zoekt.SearchOptions
if *exact {
needle, err := io.ReadAll(os.Stdin)
if err != nil {
log.Fatal(err)
}
pat = string(needle)
q = &query.Substring{
Pattern: pat,
CaseSensitive: true,
Content: true,
}
sOpts = zoekt.SearchOptions{
ShardMaxMatchCount: 10_000,
ShardRepoMaxMatchCount: 1,
TotalMaxMatchCount: 100_000,
MaxWallTime: 20 * time.Second,
MaxDocDisplayCount: 5,
}
} else if len(flag.Args()) == 0 {
fmt.Fprintf(os.Stderr, "Pattern is missing.\n")
flag.Usage()
os.Exit(2)
} else {
var err error
pat = flag.Arg(0)
q, err = query.Parse(pat)
if err != nil {
log.Fatal(err)
}
}
pat := flag.Arg(0)

var searcher zoekt.Searcher
var err error
Expand All @@ -114,16 +145,11 @@ func main() {
log.Fatal(err)
}

query, err := query.Parse(pat)
if err != nil {
log.Fatal(err)
}
if *verbose {
log.Println("query:", query)
log.Println("query:", q)
}

var sOpts zoekt.SearchOptions
sres, err := searcher.Search(context.Background(), query, &sOpts)
sres, err := searcher.Search(context.Background(), q, &sOpts)
if *cpuProfile != "" {
// If profiling, do it another time so we measure with
// warm caches.
Expand All @@ -140,15 +166,50 @@ func main() {
if err := pprof.StartCPUProfile(f); err != nil {
log.Fatal(err)
}
count := 0
for {
sres, _ = searcher.Search(context.Background(), query, &sOpts)
if time.Since(t) > *profileTime {
sres, _ = searcher.Search(context.Background(), q, &sOpts)
count++
if elapsed := time.Since(t); elapsed > *profileTime {
if *verbose {
log.Printf("ran %d times in %v (%f searches/s)", count, elapsed, float64(count)/elapsed.Seconds())
}
break
}
}
pprof.StopCPUProfile()
}

if *fullProfile != "" {
// If profiling, do it another time so we measure with
// warm caches.
f, err := os.Create(*fullProfile)
if err != nil {
log.Fatal(err)
}
defer f.Close()
if *verbose {
log.Println("Displaying matches...")
}

t := time.Now()
stopProfile := fgprof.Start(f, fgprof.FormatPprof)
count := 0
for {
sres, _ = searcher.Search(context.Background(), q, &sOpts)
count++
if elapsed := time.Since(t); elapsed > *profileTime {
if *verbose {
log.Printf("ran %d times in %v (%f searches/s)", count, elapsed, float64(count)/elapsed.Seconds())
}
break
}
}
if err := stopProfile(); err != nil {
log.Fatal(err)
}
}

if err != nil {
log.Fatal(err)
}
Expand Down
22 changes: 22 additions & 0 deletions deps.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,13 @@ def go_dependencies():
sum = "h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=",
version = "v2.2.0",
)
go_repository(
name = "com_github_chzyer_logex",
build_file_proto_mode = "disable_global",
importpath = "github.com/chzyer/logex",
sum = "h1:Swpa1K6QvQznwJRcfTfQJmTE72DqScAa40E+fbHEXEE=",
version = "v1.1.10",
)

go_repository(
name = "com_github_chzyer_readline",
Expand All @@ -178,6 +185,13 @@ def go_dependencies():
sum = "h1:upd/6fQk4src78LMRzh5vItIt361/o4uq553V8B5sGI=",
version = "v1.5.1",
)
go_repository(
name = "com_github_chzyer_test",
build_file_proto_mode = "disable_global",
importpath = "github.com/chzyer/test",
sum = "h1:q763qf9huN11kDQavWsoZXJNW3xEE4JJyHa5Q25/sd8=",
version = "v0.0.0-20180213035817-a1ea475d72b1",
)

go_repository(
name = "com_github_client9_misspell",
Expand Down Expand Up @@ -346,6 +360,14 @@ def go_dependencies():
sum = "h1:Q7juDM0QtcnhCpeyLGQKyg4TOIghuNXrkL32pHAUMxo=",
version = "v1.1.0",
)
go_repository(
name = "com_github_felixge_fgprof",
build_file_proto_mode = "disable_global",
importpath = "github.com/felixge/fgprof",
sum = "h1:VvyZxILNuCiUCSXtPtYmmtGvb65nqXh2QFWc0Wpf2/g=",
version = "v0.9.3",
)

go_repository(
name = "com_github_flosch_pongo2_v4",
build_file_proto_mode = "disable_global",
Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ require (
github.com/andygrunwald/go-gerrit v0.0.0-20230628115649-c44fe2fbf2ca
github.com/bmatcuk/doublestar v1.3.4
github.com/edsrzf/mmap-go v1.1.0
github.com/felixge/fgprof v0.9.3
github.com/fsnotify/fsnotify v1.6.0
github.com/gfleury/go-bitbucket-v1 v0.0.0-20230626192437-8d7be5866751
github.com/go-enry/go-enry/v2 v2.8.4
Expand Down Expand Up @@ -45,6 +46,7 @@ require (
go.opentelemetry.io/otel/trace v1.16.0
go.uber.org/atomic v1.11.0
go.uber.org/automaxprocs v1.5.2
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1
golang.org/x/net v0.11.0
golang.org/x/oauth2 v0.9.0
golang.org/x/sync v0.3.0
Expand Down
10 changes: 10 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cloudflare/circl v1.3.3 h1:fE/Qz0QdIGqeWfnwq0RE0R7MI51s0M2E4Ga9kq5AEMs=
github.com/cloudflare/circl v1.3.3/go.mod h1:5XYMA4rFBvNIrhs50XuiBJ15vF2pZn4nnUKZrLbUZFA=
Expand Down Expand Up @@ -81,6 +84,8 @@ github.com/envoyproxy/protoc-gen-validate v0.10.1 h1:c0g45+xCJhdgFGw7a5QAfdS4byA
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs=
github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw=
github.com/felixge/fgprof v0.9.3 h1:VvyZxILNuCiUCSXtPtYmmtGvb65nqXh2QFWc0Wpf2/g=
github.com/felixge/fgprof v0.9.3/go.mod h1:RdbpDgzqYVh/T9fPELJyV7EYJuHB55UTEULNun8eiPw=
github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k=
github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY=
github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
Expand Down Expand Up @@ -155,6 +160,7 @@ github.com/google/go-github/v27 v27.0.6/go.mod h1:/0Gr8pJ55COkmv+S/yPKCczSkUPIM/
github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck=
github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8=
github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU=
github.com/google/pprof v0.0.0-20211214055906-6f57359322fd/go.mod h1:KgnwoLYCZ8IQu3XUZ8Nc/bM9CCZFOyjUNOSygVozoDg=
github.com/google/pprof v0.0.0-20230602150820-91b7bce49751 h1:hR7/MlvK23p6+lIw9SN1TigNLn9ZnF3W4SYRKq2gAHs=
github.com/google/pprof v0.0.0-20230602150820-91b7bce49751/go.mod h1:Jh3hGz2jkYak8qXPD19ryItVnUgpgeqzdkY/D0EaeuA=
github.com/google/s2a-go v0.1.4 h1:1kZ/sQM3srePvKs3tXAvQzo66XfcReoqFpIpIccE7Oc=
Expand All @@ -180,6 +186,7 @@ github.com/hashicorp/go-hclog v0.16.2 h1:K4ev2ib4LdQETX5cSZBG0DVLk1jwGqSPXBjdah3
github.com/hashicorp/go-hclog v0.16.2/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ=
github.com/hashicorp/go-retryablehttp v0.7.4 h1:ZQgVdpTdAL7WpMIwLzCfbalOcSUdkDZnpUv3/+BxzFA=
github.com/hashicorp/go-retryablehttp v0.7.4/go.mod h1:Jy/gPYAdjqffZ/yFGCFV2doI5wjtH1ewM9u8iYVjtX8=
github.com/ianlancetaylor/demangle v0.0.0-20210905161508-09a460cdf81d/go.mod h1:aYm2/VgdVmcIU8iMfdMvDMsRAQjcfZSKFby6HOFvi/w=
github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4=
github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY=
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOlocH6Fxy8MmwDt+yVQYULKfN0RoTN8A=
Expand Down Expand Up @@ -361,6 +368,8 @@ golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL
golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY=
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 h1:MGwJjxBy0HJshjDNfLsYO8xppfqWlA5ZT9OhtUUhTNw=
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc=
golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
Expand Down Expand Up @@ -427,6 +436,7 @@ golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
Expand Down
35 changes: 23 additions & 12 deletions indexdata.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"unicode/utf8"

"github.com/sourcegraph/zoekt/query"
"golang.org/x/exp/slices"
)

// indexData holds the pattern-independent data that we have to have
Expand Down Expand Up @@ -388,6 +389,11 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult

// Find the 2 least common ngrams from the string.
ngramOffs := splitNGrams([]byte(query.Pattern))
// PERF: Sort by ngram so that consecutive lookups are more likely to land in
// the same btree bucket; accessing a different bucket can cause disk IO.
slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool {
return a.ngram < b.ngram
})
frequencies := make([]uint32, 0, len(ngramOffs))
ngramLookups := 0
for _, o := range ngramOffs {
Expand Down Expand Up @@ -415,18 +421,22 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult

frequencies = append(frequencies, freq)
}
firstI := firstMinarg(frequencies)
frequencies[firstI] = maxUInt32
lastI := lastMinarg(frequencies)
if firstI > lastI {
lastI, firstI = firstI, lastI

var first, last runeNgramOff
{
firstI := firstMinarg(frequencies)
frequencies[firstI] = maxUInt32
lastI := lastMinarg(frequencies)
first = ngramOffs[firstI]
last = ngramOffs[lastI]
if first.index > last.index {
last, first = first, last
}
}

firstNG := ngramOffs[firstI].ngram
lastNG := ngramOffs[lastI].ngram
iter := &ngramDocIterator{
leftPad: firstI,
rightPad: uint32(utf8.RuneCountInString(str)) - firstI,
leftPad: first.index,
rightPad: uint32(utf8.RuneCountInString(str)) - first.index,
ngramLookups: ngramLookups,
}
if query.FileName {
Expand All @@ -435,15 +445,16 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
iter.ends = d.fileEndRunes
}

if firstI != lastI {
i, err := d.newDistanceTrigramIter(firstNG, lastNG, lastI-firstI, query.CaseSensitive, query.FileName)
if first != last {
runeDist := last.index - first.index
i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
if err != nil {
return nil, err
}

iter.iter = i
} else {
hitIter, err := d.trigramHitIterator(lastNG, query.CaseSensitive, query.FileName)
hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
if err != nil {
return nil, err
}
Expand Down
Loading