diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 65d9d10..cbc80e9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,7 +20,7 @@ jobs:
       - name: setup golang
         uses: actions/setup-go@v4
         with:
-          go-version: ^1.20
+          go-version: ^1.21
           check-latest: true
           cache: true
       - name: golangci-lint
@@ -40,7 +40,7 @@ jobs:
       - name: setup golang
         uses: actions/setup-go@v4
         with:
-          go-version: ^1.20
+          go-version: ^1.21
           check-latest: true
           cache: true
       - name: test-coverage
@@ -62,7 +62,7 @@ jobs:
       - name: setup golang
         uses: actions/setup-go@v4
         with:
-          go-version: ^1.20
+          go-version: ^1.21
           check-latest: true
           cache: true
       - name: init codeql
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index a41d5fc..226a09a 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -15,7 +15,7 @@ jobs:
      - name: set up golang
        uses: actions/setup-go@v4
        with:
-         go-version: ^1.20
+         go-version: ^1.21
          check-latest: true
          cache: true
      - name: build
diff --git a/.golangci.yml b/.golangci.yml
index 26f77c6..4ce875d 100644
--- a/.golangci.yml
+++ b/.golangci.yml
@@ -1,6 +1,6 @@
 run:
   allow-parallel-runners: true
-  go: '1.20'
+  go: '1.21'
 
 output:
   format: 'colored-line-number'
diff --git a/.goreleaser.yml b/.goreleaser.yml
index 700e7e3..31f957a 100644
--- a/.goreleaser.yml
+++ b/.goreleaser.yml
@@ -51,8 +51,8 @@ nfpms:
     license: MIT
     vendor: Crawley
     formats:
-     - deb
-     - rpm
+      - deb
+      - rpm
 
 changelog:
   filters:
diff --git a/cmd/crawley/main.go b/cmd/crawley/main.go
index bd67f69..7940316 100644
--- a/cmd/crawley/main.go
+++ b/cmd/crawley/main.go
@@ -21,7 +21,6 @@ import (
 
 const (
     appName = "Crawley"
-    appHelp = "the unix-way web crawler"
     appSite = "https://github.com/s0rg/crawley"
     defaultDelay = 150 * time.Millisecond
     defaultTimeout = 5 * time.Second
@@ -63,20 +62,9 @@ func version() string {
 func usage() {
     var sb strings.Builder
 
-    const twoCR = "\n\n"
-
-    sb.WriteString(appName)
-    sb.WriteString(" - ")
-    sb.WriteString(appHelp)
-    sb.WriteString(", usage:")
-    sb.WriteString(twoCR)
-
-    sb.WriteString(filepath.Base(os.Args[0]))
-    sb.WriteString(" [flags] url")
-    sb.WriteString(twoCR)
-
-    sb.WriteString("possible flags with default values:")
-    sb.WriteString(twoCR)
+    fmt.Fprintf(&sb, "%s - the unix-way web crawler, usage:\n\n", appName)
+    fmt.Fprintf(&sb, "%s [flags] url\n\n", filepath.Base(os.Args[0]))
+    fmt.Fprint(&sb, "possible flags with default values:\n\n")
 
     _, _ = os.Stderr.WriteString(sb.String())
 
diff --git a/go.mod b/go.mod
index 224e1ad..6c12bd2 100644
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,6 @@
 module github.com/s0rg/crawley
 
-go 1.20
+go 1.21
 
 require (
     github.com/s0rg/compflag v1.1.0
diff --git a/pkg/client/http.go b/pkg/client/http.go
index 6a6bac8..54ec78a 100644
--- a/pkg/client/http.go
+++ b/pkg/client/http.go
@@ -6,11 +6,8 @@ import (
     "io"
     "net"
     "net/http"
-    "time"
 )
 
-const transportTimeout = 10 * time.Second
-
 // HTTP holds pre-configured http.Client.
 type HTTP struct {
     c *http.Client
@@ -24,13 +21,13 @@ func New(cfg *Config) (h *HTTP) {
     transport := &http.Transport{
         Proxy: http.ProxyFromEnvironment,
         Dial: (&net.Dialer{
-            Timeout: transportTimeout,
+            Timeout: cfg.Timeout,
         }).Dial,
         TLSClientConfig: &tls.Config{
             InsecureSkipVerify: cfg.SkipSSL,
         },
-        IdleConnTimeout:     transportTimeout,
-        TLSHandshakeTimeout: transportTimeout,
+        IdleConnTimeout:     cfg.Timeout,
+        TLSHandshakeTimeout: cfg.Timeout,
         MaxConnsPerHost:     cfg.Workers,
         MaxIdleConns:        cfg.Workers,
         MaxIdleConnsPerHost: cfg.Workers,
diff --git a/pkg/crawler/config.go b/pkg/crawler/config.go
index 2c827d6..c08037f 100644
--- a/pkg/crawler/config.go
+++ b/pkg/crawler/config.go
@@ -9,6 +9,7 @@ import (
 )
 
 const (
+    minDepth   = -1
     minWorkers = 1
     maxWorkers = 64
     minDelay   = time.Duration(0)
@@ -30,44 +31,27 @@ type config struct {
 }
 
 func (c *config) validate() {
-    switch {
-    case c.Client.Workers < minWorkers:
-        c.Client.Workers = minWorkers
-    case c.Client.Workers > maxWorkers:
-        c.Client.Workers = maxWorkers
-    }
-
-    switch {
-    case c.Client.Timeout < minTimeout:
-        c.Client.Timeout = minTimeout
-    case c.Client.Timeout > maxTimeout:
-        c.Client.Timeout = maxTimeout
-    }
-
-    if c.Delay < minDelay {
-        c.Delay = minDelay
-    }
-
-    if c.Depth < 0 {
-        c.Depth = -1
-    }
+    c.Client.Workers = min(maxWorkers, max(minWorkers, c.Client.Workers))
+    c.Client.Timeout = min(maxTimeout, max(minTimeout, c.Client.Timeout))
+    c.Delay = max(minDelay, c.Delay)
+    c.Depth = max(minDepth, c.Depth)
 }
 
 func (c *config) String() (rv string) {
     var sb strings.Builder
 
-    _, _ = sb.WriteString(fmt.Sprintf("workers: %d depth: %d timeout: %s", c.Client.Workers, c.Depth, c.Client.Timeout))
+    fmt.Fprintf(&sb, "workers: %d depth: %d timeout: %s", c.Client.Workers, c.Depth, c.Client.Timeout)
 
     if c.Brute {
-        _, _ = sb.WriteString(" brute: on")
+        sb.WriteString(" brute: on")
     }
 
     if c.ScanJS {
-        _, _ = sb.WriteString(" js: on")
+        sb.WriteString(" js: on")
     }
 
     if c.Delay > 0 {
-        _, _ = sb.WriteString(fmt.Sprintf(" delay: %s", c.Delay))
+        fmt.Fprintf(&sb, " delay: %s", c.Delay)
     }
 
     return sb.String()
diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index d4f76f0..a297613 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -8,6 +8,7 @@ import (
     "log"
     "net/http"
     "net/url"
+    "slices"
     "strings"
     "sync"
     "time"
@@ -76,7 +77,7 @@ func New(opts ...Option) (c *Crawler) {
 }
 
 // Run starts crawling process for given base uri.
-func (c *Crawler) Run(uri string, fn func(string)) (err error) {
+func (c *Crawler) Run(uri string, urlcb func(string)) (err error) {
     var base *url.URL
 
     if base, err = url.Parse(uri); err != nil {
@@ -106,7 +107,7 @@ func (c *Crawler) Run(uri string, fn func(string)) (err error) {
 
     go func() {
         for s := range c.handleCh {
-            fn(s)
+            urlcb(s)
         }
 
         c.wg.Done()
@@ -195,7 +196,7 @@ func (c *Crawler) close() {
     close(c.crawlCh)
     c.wg.Wait() // wait for crawlers
 
-    c.wg.Add(1)
+    c.wg.Add(1) // for handler's Done()
     close(c.handleCh)
     c.wg.Wait() // wait for handler
 
@@ -265,13 +266,9 @@ func (c *Crawler) isIgnored(v string) (yes bool) {
         return
     }
 
-    for _, s := range c.cfg.Ignored {
-        if strings.Contains(v, s) {
-            return true
-        }
-    }
-
-    return false
+    return slices.ContainsFunc(c.cfg.Ignored, func(s string) bool {
+        return strings.Contains(v, s)
+    })
 }
 
 func (c *Crawler) linkHandler(a atom.Atom, s string) {
diff --git a/pkg/crawler/crawler_test.go b/pkg/crawler/crawler_test.go
index 7e37931..73778d7 100644
--- a/pkg/crawler/crawler_test.go
+++ b/pkg/crawler/crawler_test.go
@@ -105,7 +105,6 @@ func TestCrawlerOK(t *testing.T) {
     }
 
     if len(results) != 3 {
-        t.Log(results)
         t.Error("results: less than expected")
     }
 
@@ -250,7 +249,6 @@ sitemap: http://other.host/sitemap.xml`
     }
 
     if len(resA) != 5 {
-        t.Log(resA)
         t.Fatal("unexpected len for A")
     }
 
diff --git a/pkg/links/clean_test.go b/pkg/links/clean_test.go
index 4ab6371..5e41430 100644
--- a/pkg/links/clean_test.go
+++ b/pkg/links/clean_test.go
@@ -2,7 +2,6 @@ package links
 
 import (
     "net/url"
-    "reflect"
     "testing"
 )
 
@@ -47,7 +46,7 @@ func TestClean(t *testing.T) {
     }
 
     if gotOk {
-        if !reflect.DeepEqual(gotU, tc.wantU) {
+        if gotU != tc.wantU {
             t.Errorf("clean() gotU = %v, want %v", gotU, tc.wantU)
         }
     }
diff --git a/pkg/links/html_test.go b/pkg/links/html_test.go
index 41c5e1b..a1a63fe 100644
--- a/pkg/links/html_test.go
+++ b/pkg/links/html_test.go
@@ -3,7 +3,7 @@ package links
 import (
     "bytes"
     "net/url"
-    "reflect"
+    "slices"
     "strings"
     "testing"
 
@@ -49,7 +49,7 @@ func TestExtractTag(t *testing.T) {
 
             gotU := extractTag(tc.args.b, tc.args.t, tc.args.k)
 
-            if !reflect.DeepEqual(gotU, tc.wantU) {
+            if gotU != tc.wantU {
                 t.Errorf("extractTag() gotU = %v, want %v", gotU, tc.wantU)
             }
         })
@@ -205,7 +205,7 @@ func TestExtractToken(t *testing.T) {
                 t.Errorf("extractToken() key gotU = %v, want %v", key, tc.keyWant)
             }
 
-            if !reflect.DeepEqual(res, tc.wantURL) {
+            if res != tc.wantURL {
                 t.Errorf("extractToken() link gotU = %v, want %v", res, tc.wantURL)
             }
         })
@@ -297,7 +297,7 @@ func TestExtractURLS(t *testing.T) {
             })
 
             if tc.hasLink {
-                if !reflect.DeepEqual(res, tc.lnk) {
+                if res != tc.lnk {
                     t.Errorf("extractToken() link gotU = %v, want %v", res, tc.lnk)
                 }
             }
@@ -326,7 +326,7 @@ loremipsumhTTp://foo fdfdfs HttPs://bar
         t.Error("unexpected len")
     }
 
-    if !reflect.DeepEqual(res, want) {
+    if slices.Compare(res, want) != 0 {
         t.Error("unexpected result")
     }
 }
diff --git a/pkg/links/js.go b/pkg/links/js.go
index feb2308..52ce84c 100644
--- a/pkg/links/js.go
+++ b/pkg/links/js.go
@@ -40,7 +40,7 @@ func cleanResult(s []byte) (rv string, ok bool) {
     rv = string(bytes.Trim(s, codeCleanChars))
 
     if strings.HasPrefix(rv, mimeAppPrefix) || strings.HasPrefix(rv, mimeTxtPrefix) {
-        return "", false
+        return
     }
 
     return rv, true
diff --git a/pkg/links/sitemap_test.go b/pkg/links/sitemap_test.go
index eec3acb..917f6f9 100644
--- a/pkg/links/sitemap_test.go
+++ b/pkg/links/sitemap_test.go
@@ -117,7 +117,6 @@ func TestExtractSitemapURLError(t *testing.T) {
     })
 
     if len(l) != 0 {
-        t.Log(l)
         t.Error("unexpected results count")
     }
 }
diff --git a/pkg/robots/parser.go b/pkg/robots/parser.go
index aebb5c8..4df9bad 100644
--- a/pkg/robots/parser.go
+++ b/pkg/robots/parser.go
@@ -64,12 +64,11 @@ func extractToken(b []byte) (k tokenKind, v string) {
         return
     }
 
-    var val []byte
-    if val = bytes.TrimSpace(b[pos+1:]); len(val) == 0 {
-        return
+    if val := bytes.TrimSpace(b[pos+1:]); len(val) > 0 {
+        return kind, string(val)
     }
 
-    return kind, string(val)
+    return
 }
 
 func parseRobots(r io.Reader, ua string, t *TXT) (err error) {