code grooming, migration to go1.21 (#75)
s0rg authored Aug 16, 2023
1 parent 9c52cc6 commit c0c9ae7
Showing 15 changed files with 40 additions and 79 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
@@ -20,7 +20,7 @@ jobs:
- name: setup golang
uses: actions/setup-go@v4
with:
- go-version: ^1.20
+ go-version: ^1.21
check-latest: true
cache: true
- name: golangci-lint
@@ -40,7 +40,7 @@ jobs:
- name: setup golang
uses: actions/setup-go@v4
with:
- go-version: ^1.20
+ go-version: ^1.21
check-latest: true
cache: true
- name: test-coverage
@@ -62,7 +62,7 @@ jobs:
- name: setup golang
uses: actions/setup-go@v4
with:
- go-version: ^1.20
+ go-version: ^1.21
check-latest: true
cache: true
- name: init codeql
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
@@ -15,7 +15,7 @@ jobs:
- name: set up golang
uses: actions/setup-go@v4
with:
- go-version: ^1.20
+ go-version: ^1.21
check-latest: true
cache: true
- name: build
2 changes: 1 addition & 1 deletion .golangci.yml
@@ -1,6 +1,6 @@
run:
allow-parallel-runners: true
- go: '1.20'
+ go: '1.21'

output:
format: 'colored-line-number'
4 changes: 2 additions & 2 deletions .goreleaser.yml
@@ -51,8 +51,8 @@ nfpms:
license: MIT
vendor: Crawley
formats:
- - deb
- - rpm
+ - deb
+ - rpm

changelog:
filters:
18 changes: 3 additions & 15 deletions cmd/crawley/main.go
@@ -21,7 +21,6 @@ import (

const (
appName = "Crawley"
- appHelp = "the unix-way web crawler"
appSite = "https://github.com/s0rg/crawley"
defaultDelay = 150 * time.Millisecond
defaultTimeout = 5 * time.Second
@@ -63,20 +62,9 @@ func version() string {
func usage() {
var sb strings.Builder

- const twoCR = "\n\n"
-
- sb.WriteString(appName)
- sb.WriteString(" - ")
- sb.WriteString(appHelp)
- sb.WriteString(", usage:")
- sb.WriteString(twoCR)
-
- sb.WriteString(filepath.Base(os.Args[0]))
- sb.WriteString(" [flags] url")
- sb.WriteString(twoCR)
-
- sb.WriteString("possible flags with default values:")
- sb.WriteString(twoCR)
+ fmt.Fprintf(&sb, "%s - the unix-way web crawler, usage:\n\n", appName)
+ fmt.Fprintf(&sb, "%s [flags] url\n\n", filepath.Base(os.Args[0]))
+ fmt.Fprint(&sb, "possible flags with default values:\n\n")

_, _ = os.Stderr.WriteString(sb.String())

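The usage() rewrite leans on the fact that *strings.Builder implements io.Writer, so a single fmt.Fprintf replaces a whole chain of WriteString calls (which is also why the appHelp constant could be inlined into the format string). A minimal standalone sketch of the pattern, with illustrative names:

    package main

    import (
        "fmt"
        "os"
        "strings"
    )

    func main() {
        var sb strings.Builder

        // *strings.Builder satisfies io.Writer, so Fprintf can target it directly;
        // Builder's Write never returns an error, making the result safe to ignore.
        fmt.Fprintf(&sb, "%s - example crawler, usage:\n\n", "demo")
        fmt.Fprintf(&sb, "%s [flags] url\n\n", "demo")
        fmt.Fprint(&sb, "possible flags with default values:\n\n")

        os.Stderr.WriteString(sb.String())
    }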
2 changes: 1 addition & 1 deletion go.mod
@@ -1,6 +1,6 @@
module github.com/s0rg/crawley

- go 1.20
+ go 1.21

require (
github.com/s0rg/compflag v1.1.0
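As of Go 1.21, the go directive in go.mod is interpreted as the minimum toolchain version the module requires rather than a hint: Go 1.21+ toolchains can automatically fetch a newer release through the toolchain mechanism when the directive demands one, so this bump is a hard floor for anyone building the module.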
9 changes: 3 additions & 6 deletions pkg/client/http.go
@@ -6,11 +6,8 @@ import (
"io"
"net"
"net/http"
"time"
)

- const transportTimeout = 10 * time.Second
-
// HTTP holds pre-configured http.Client.
type HTTP struct {
c *http.Client
@@ -24,13 +21,13 @@ func New(cfg *Config) (h *HTTP) {
transport := &http.Transport{
Proxy: http.ProxyFromEnvironment,
Dial: (&net.Dialer{
- Timeout: transportTimeout,
+ Timeout: cfg.Timeout,
}).Dial,
TLSClientConfig: &tls.Config{
InsecureSkipVerify: cfg.SkipSSL,
},
- IdleConnTimeout: transportTimeout,
- TLSHandshakeTimeout: transportTimeout,
+ IdleConnTimeout: cfg.Timeout,
+ TLSHandshakeTimeout: cfg.Timeout,
MaxConnsPerHost: cfg.Workers,
MaxIdleConns: cfg.Workers,
MaxIdleConnsPerHost: cfg.Workers,
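With the fixed 10-second transportTimeout constant gone, every transport-level timeout (dial, idle connection, TLS handshake) now follows the user-supplied cfg.Timeout. A hedged sketch of the resulting constructor, with the Config shape assumed from the fields visible in this diff:

    package client

    import (
        "crypto/tls"
        "net"
        "net/http"
        "time"
    )

    // Config is assumed here; only the fields the transport uses are shown.
    type Config struct {
        Timeout time.Duration // now drives dial, idle and TLS handshake timeouts
        SkipSSL bool
        Workers int
    }

    func newTransport(cfg *Config) *http.Transport {
        return &http.Transport{
            Proxy: http.ProxyFromEnvironment,
            Dial: (&net.Dialer{
                Timeout: cfg.Timeout, // dial timeout follows the flag value
            }).Dial,
            TLSClientConfig:     &tls.Config{InsecureSkipVerify: cfg.SkipSSL},
            IdleConnTimeout:     cfg.Timeout,
            TLSHandshakeTimeout: cfg.Timeout,
            MaxConnsPerHost:     cfg.Workers,
            MaxIdleConns:        cfg.Workers,
            MaxIdleConnsPerHost: cfg.Workers,
        }
    }

One flag value now bounds every phase of a request's lifecycle, at the cost of no longer being able to tune the phases independently.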
34 changes: 9 additions & 25 deletions pkg/crawler/config.go
@@ -9,6 +9,7 @@ import (
)

const (
+ minDepth = -1
minWorkers = 1
maxWorkers = 64
minDelay = time.Duration(0)
@@ -30,44 +31,27 @@ type config struct {
}

func (c *config) validate() {
- switch {
- case c.Client.Workers < minWorkers:
- c.Client.Workers = minWorkers
- case c.Client.Workers > maxWorkers:
- c.Client.Workers = maxWorkers
- }
-
- switch {
- case c.Client.Timeout < minTimeout:
- c.Client.Timeout = minTimeout
- case c.Client.Timeout > maxTimeout:
- c.Client.Timeout = maxTimeout
- }
-
- if c.Delay < minDelay {
- c.Delay = minDelay
- }
-
- if c.Depth < 0 {
- c.Depth = -1
- }
+ c.Client.Workers = min(maxWorkers, max(minWorkers, c.Client.Workers))
+ c.Client.Timeout = min(maxTimeout, max(minTimeout, c.Client.Timeout))
+ c.Delay = max(minDelay, c.Delay)
+ c.Depth = max(minDepth, c.Depth)
}

func (c *config) String() (rv string) {
var sb strings.Builder

_, _ = sb.WriteString(fmt.Sprintf("workers: %d depth: %d timeout: %s", c.Client.Workers, c.Depth, c.Client.Timeout))
fmt.Fprintf(&sb, "workers: %d depth: %d timeout: %s", c.Client.Workers, c.Depth, c.Client.Timeout)

if c.Brute {
_, _ = sb.WriteString(" brute: on")
sb.WriteString(" brute: on")
}

if c.ScanJS {
_, _ = sb.WriteString(" js: on")
sb.WriteString(" js: on")
}

if c.Delay > 0 {
_, _ = sb.WriteString(fmt.Sprintf(" delay: %s", c.Delay))
fmt.Fprintf(&sb, " delay: %s", c.Delay)
}

return sb.String()
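The collapsed validate() relies on the generic min and max builtins introduced in Go 1.21, which accept any ordered type, including time.Duration. The clamp idiom in isolation (constants mirror the ones in the diff, sample values invented):

    package main

    import (
        "fmt"
        "time"
    )

    const (
        minWorkers = 1
        maxWorkers = 64
    )

    // clampWorkers raises v to at least minWorkers, then caps it at maxWorkers:
    // min(hi, max(lo, v)) is the one-line clamp.
    func clampWorkers(v int) int {
        return min(maxWorkers, max(minWorkers, v))
    }

    func main() {
        fmt.Println(clampWorkers(0), clampWorkers(9), clampWorkers(500)) // 1 9 64

        // The builtins are generic over ordered types, so durations work too.
        fmt.Println(max(time.Duration(0), -time.Second)) // 0s
    }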
17 changes: 7 additions & 10 deletions pkg/crawler/crawler.go
@@ -8,6 +8,7 @@ import (
"log"
"net/http"
"net/url"
"slices"
"strings"
"sync"
"time"
@@ -76,7 +77,7 @@ func New(opts ...Option) (c *Crawler) {
}

// Run starts crawling process for given base uri.
- func (c *Crawler) Run(uri string, fn func(string)) (err error) {
+ func (c *Crawler) Run(uri string, urlcb func(string)) (err error) {
var base *url.URL

if base, err = url.Parse(uri); err != nil {
@@ -106,7 +107,7 @@ func (c *Crawler) Run(uri string, fn func(string)) (err error) {

go func() {
for s := range c.handleCh {
- fn(s)
+ urlcb(s)
}

c.wg.Done()
@@ -195,7 +196,7 @@ func (c *Crawler) close() {
close(c.crawlCh)
c.wg.Wait() // wait for crawlers

- c.wg.Add(1)
+ c.wg.Add(1) // for handler`s Done()
close(c.handleCh)
c.wg.Wait() // wait for handler

@@ -265,13 +266,9 @@ func (c *Crawler) isIgnored(v string) (yes bool) {
return
}

- for _, s := range c.cfg.Ignored {
- if strings.Contains(v, s) {
- return true
- }
- }
-
- return false
+ return slices.ContainsFunc(c.cfg.Ignored, func(s string) bool {
+ return strings.Contains(v, s)
+ })
}

func (c *Crawler) linkHandler(a atom.Atom, s string) {
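slices.ContainsFunc, part of the slices package promoted into the standard library in Go 1.21, reports whether any element satisfies a predicate, which is exactly the shape of the loop it replaces in isIgnored. A self-contained sketch with invented sample data:

    package main

    import (
        "fmt"
        "slices"
        "strings"
    )

    func main() {
        ignored := []string{"logout", "delete"}

        // Same shape as isIgnored: true if any ignored substring occurs in v.
        isIgnored := func(v string) bool {
            return slices.ContainsFunc(ignored, func(s string) bool {
                return strings.Contains(v, s)
            })
        }

        fmt.Println(isIgnored("https://example.com/logout")) // true
        fmt.Println(isIgnored("https://example.com/about"))  // false
    }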
2 changes: 0 additions & 2 deletions pkg/crawler/crawler_test.go
@@ -105,7 +105,6 @@ func TestCrawlerOK(t *testing.T) {
}

if len(results) != 3 {
- t.Log(results)
t.Error("results: less than expected")
}

@@ -250,7 +249,6 @@ sitemap: http://other.host/sitemap.xml
}

if len(resA) != 5 {
- t.Log(resA)
t.Fatal("unexpected len for A")
}

3 changes: 1 addition & 2 deletions pkg/links/clean_test.go
@@ -2,7 +2,6 @@ package links

import (
"net/url"
"reflect"
"testing"
)

@@ -47,7 +46,7 @@ func TestClean(t *testing.T) {
}

if gotOk {
- if !reflect.DeepEqual(gotU, tc.wantU) {
+ if gotU != tc.wantU {
t.Errorf("clean() gotU = %v, want %v", gotU, tc.wantU)
}
}
10 changes: 5 additions & 5 deletions pkg/links/html_test.go
@@ -3,7 +3,7 @@ package links
import (
"bytes"
"net/url"
"reflect"
"slices"
"strings"
"testing"

@@ -49,7 +49,7 @@ func TestExtractTag(t *testing.T) {

gotU := extractTag(tc.args.b, tc.args.t, tc.args.k)

- if !reflect.DeepEqual(gotU, tc.wantU) {
+ if gotU != tc.wantU {
t.Errorf("extractTag() gotU = %v, want %v", gotU, tc.wantU)
}
})
@@ -205,7 +205,7 @@ func TestExtractToken(t *testing.T) {
t.Errorf("extractToken() key gotU = %v, want %v", key, tc.keyWant)
}

- if !reflect.DeepEqual(res, tc.wantURL) {
+ if res != tc.wantURL {
t.Errorf("extractToken() link gotU = %v, want %v", res, tc.wantURL)
}
})
@@ -297,7 +297,7 @@ func TestExtractURLS(t *testing.T) {
})

if tc.hasLink {
- if !reflect.DeepEqual(res, tc.lnk) {
+ if res != tc.lnk {
t.Errorf("extractToken() link gotU = %v, want %v", res, tc.lnk)
}
}
@@ -326,7 +326,7 @@ loremipsumhTTp://foo fdfdfs HttPs://bar
t.Error("unexpected len")
}

- if !reflect.DeepEqual(res, want) {
+ if slices.Compare(res, want) != 0 {
t.Error("unexpected result")
}
}
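For comparable element types, plain == and != replace reflect.DeepEqual on single values, and slices.Compare (also standard library as of Go 1.21) handles the slice case, returning 0 when both slices are element-wise equal. A short sketch; slices.Equal would be the more direct spelling when only equality matters:

    package main

    import (
        "fmt"
        "slices"
    )

    func main() {
        got := []string{"hTTp://foo", "HttPs://bar"}
        want := []string{"hTTp://foo", "HttPs://bar"}

        // 0 means element-wise equal; negative/positive give lexical order.
        fmt.Println(slices.Compare(got, want) == 0) // true
        fmt.Println(slices.Equal(got, want))        // true: equality-only variant
    }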
2 changes: 1 addition & 1 deletion pkg/links/js.go
@@ -40,7 +40,7 @@ func cleanResult(s []byte) (rv string, ok bool) {
rv = string(bytes.Trim(s, codeCleanChars))

if strings.HasPrefix(rv, mimeAppPrefix) || strings.HasPrefix(rv, mimeTxtPrefix) {
return "", false
return
}

return rv, true
1 change: 0 additions & 1 deletion pkg/links/sitemap_test.go
@@ -117,7 +117,6 @@ func TestExtractSitemapURLError(t *testing.T) {
})

if len(l) != 0 {
- t.Log(l)
t.Error("unexpected results count")
}
}
7 changes: 3 additions & 4 deletions pkg/robots/parser.go
@@ -64,12 +64,11 @@ func extractToken(b []byte) (k tokenKind, v string) {
return
}

- var val []byte
- if val = bytes.TrimSpace(b[pos+1:]); len(val) == 0 {
- return
+ if val := bytes.TrimSpace(b[pos+1:]); len(val) > 0 {
+ return kind, string(val)
}

- return kind, string(val)
+ return
}

func parseRobots(r io.Reader, ua string, t *TXT) (err error) {
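Folding the assignment into the if statement's init clause scopes val to the branch that uses it, and the naked returns fall back to the named result's zero value. The idiom in isolation, shown on a hypothetical key/value splitter rather than the robots.txt parser itself:

    package main

    import (
        "bytes"
        "fmt"
    )

    func value(line []byte) (v string) {
        pos := bytes.IndexByte(line, ':')
        if pos < 0 {
            return // zero value: ""
        }

        // val is visible only inside this if; the happy path returns it,
        // every other path falls through to the zero-value return.
        if val := bytes.TrimSpace(line[pos+1:]); len(val) > 0 {
            return string(val)
        }

        return
    }

    func main() {
        fmt.Printf("%q\n", value([]byte("sitemap: /map.xml"))) // "/map.xml"
        fmt.Printf("%q\n", value([]byte("sitemap:   ")))       // ""
    }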
