Feature/custom headers and cookies (#20)
* user-specified headers and cookies
* actions update
* go version up
s0rg authored Jun 10, 2022
1 parent 453c6b1 commit a007314
Showing 20 changed files with 520 additions and 63 deletions.
4 changes: 2 additions & 2 deletions .github/dependabot.yml
@@ -4,8 +4,8 @@ updates:
   - package-ecosystem: gomod
     directory: /
     schedule:
-      interval: daily
+      interval: monthly
   - package-ecosystem: "github-actions"
     directory: "/"
     schedule:
-      interval: daily
+      interval: monthly
20 changes: 10 additions & 10 deletions .github/workflows/ci.yml
@@ -16,20 +16,20 @@ jobs:
       name: ci
     steps:
       - name: checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       - name: golangci-lint
-        uses: golangci/golangci-lint-action@v2
+        uses: golangci/golangci-lint-action@v3
   test:
     runs-on: ubuntu-latest
     environment:
       name: ci
     steps:
       - name: checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       - name: setup golang
-        uses: actions/setup-go@v2
+        uses: actions/setup-go@v3
         with:
-          go-version: ^1.17
+          go-version: ^1.18
       - name: test-coverage
         uses: paambaati/codeclimate-action@…
         env:
@@ -44,14 +44,14 @@ jobs:
       name: ci
     steps:
       - name: checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       - name: setup golang
-        uses: actions/setup-go@v2
+        uses: actions/setup-go@v3
         with:
-          go-version: ^1.17
+          go-version: ^1.18
       - name: init codeql
-        uses: github/codeql-action/init@v1
+        uses: github/codeql-action/init@v2
         with:
           language: 'go'
       - name: run analysis
-        uses: github/codeql-action/analyze@v1
+        uses: github/codeql-action/analyze@v2
8 changes: 4 additions & 4 deletions .github/workflows/release.yml
@@ -11,13 +11,13 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       - name: set up golang
-        uses: actions/setup-go@v2
+        uses: actions/setup-go@v3
        with:
-          go-version: ^1.17
+          go-version: ^1.18
       - name: build
-        uses: goreleaser/goreleaser-action@v2
+        uses: goreleaser/goreleaser-action@v3
         with:
           version: latest
           args: release -f .goreleaser.yml --rm-dist
4 changes: 4 additions & 0 deletions .golangci.yml
@@ -11,11 +11,14 @@ linters:
   disable:
     - exhaustivestruct
     - gochecknoglobals
+    - nonamedreturns
     - testpackage
     - exhaustive
+    - exhaustruct
     - varnamelen
     - forbidigo
+    - typecheck
     - gofumpt
     - gci
     # deprecated :(
     - interfacer
@@ -42,3 +45,4 @@ issues:
         - cyclop
         - dupl
         - goerr113
+        - errcheck
6 changes: 6 additions & 0 deletions README.md
@@ -24,6 +24,8 @@ Crawls web pages and prints any link it can find.
 - `brute` mode - scan html comments for urls (this can lead to bogus results)
 - make use of `HTTP_PROXY` / `HTTPS_PROXY` environment values
 - directory-only scan mode (aka `fast-scan`)
+- user-defined cookies, in curl-compatible format (e.g. `-cookie "ONE=1; TWO=2" -cookie "EXT=3" -cookie @cookie-file`)
+- user-defined headers, same as curl: `-header "ONE: 1" -header "TWO: 2" -header @headers-file`

# installation

@@ -43,12 +45,16 @@ possible flags:
 -brute
 	scan html comments
+-cookie value
+	extra cookies for request, can be used multiple times, accept files with '@'-prefix
 -delay duration
 	per-request delay (0 - disable) (default 150ms)
 -depth int
 	scan depth (-1 - unlimited)
 -dirs string
 	policy for non-resource urls: show / hide / only (default "show")
+-header value
+	extra headers for request, can be used multiple times, accept files with '@'-prefix
 -headless
 	disable pre-flight HEAD requests
 -help
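The two new flags compose the same way their curl counterparts do. A hypothetical invocation combining both (the URL, token, and file names below are illustrative, not taken from the commit):

	crawley -header "Authorization: Bearer TOKEN" -header @headers.txt -cookie "SESSION=abc123" -cookie @cookies.txt http://example.com/
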
79 changes: 57 additions & 22 deletions cmd/crawley/main.go
@@ -12,6 +12,7 @@ import (
"time"

"github.com/s0rg/crawley/pkg/crawler"
"github.com/s0rg/crawley/pkg/values"
)

const (
@@ -24,6 +25,8 @@ var (
 	gitHash    string
 	gitVersion string
 	buildDate  string
+	extCookies values.List
+	extHeaders values.List
 	fVersion   = flag.Bool("version", false, "show version")
 	fBrute     = flag.Bool("brute", false, "scan html comments")
 	fSkipSSL   = flag.Bool("skip-ssl", false, "skip ssl verification")
@@ -57,21 +60,7 @@ func crawl(uri string, opts ...crawler.Option) error {
 	return nil
 }
 
-func main() {
-	flag.Parse()
-
-	if *fVersion {
-		fmt.Printf("%s %s-%s build at: %s site: %s\n", appName, gitVersion, gitHash, buildDate, appSite)
-
-		return
-	}
-
-	if flag.NArg() != 1 {
-		flag.Usage()
-
-		return
-	}
-
+func options() (rv []crawler.Option) {
 	robots, err := crawler.ParseRobotsPolicy(*fRobotsPolicy)
 	if err != nil {
 		log.Fatal("robots policy:", err)
@@ -82,12 +71,24 @@ func main() {
log.Fatal("dirs policy:", err)
}

if *fSilent {
log.SetOutput(io.Discard)
workdir, err := os.Getwd()
if err != nil {
log.Fatal("work dir:", err)
}

if err := crawl(
flag.Arg(0),
fs := os.DirFS(workdir)

headers, err := extHeaders.Load(fs)
if err != nil {
log.Fatal("headers:", err)
}

cookies, err := extCookies.Load(fs)
if err != nil {
log.Fatal("cookies:", err)
}

return []crawler.Option{
crawler.WithUserAgent(*fUA),
crawler.WithDelay(*fDelay),
crawler.WithMaxCrawlDepth(*fDepth),
@@ -97,9 +98,43 @@
 		crawler.WithDirsPolicy(dirs),
 		crawler.WithRobotsPolicy(robots),
 		crawler.WithoutHeads(*fNoHeads),
-	); err != nil {
-		// forcing back stderr in case of errors, otherwise if 'silent' is on -
-		// no one will know what happened.
+		crawler.WithExtraHeaders(headers),
+		crawler.WithExtraCookies(cookies),
+	}
+}
+
+func main() {
+	flag.Var(
+		&extHeaders,
+		"header",
+		"extra headers for request, can be used multiple times, accept files with '@'-prefix",
+	)
+	flag.Var(
+		&extCookies,
+		"cookie",
+		"extra cookies for request, can be used multiple times, accept files with '@'-prefix",
+	)
+	flag.Parse()
+
+	if *fVersion {
+		fmt.Printf("%s %s-%s build at: %s site: %s\n", appName, gitVersion, gitHash, buildDate, appSite)
+
+		return
+	}
+
+	if flag.NArg() != 1 {
+		flag.Usage()
+
+		return
+	}
+
+	if *fSilent {
+		log.SetOutput(io.Discard)
+	}
+
+	if err := crawl(flag.Arg(0), options()...); err != nil {
+		// forcing back stderr in case of errors, otherwise
+		// if 'silent' is on - no one will know what happened.
 		log.SetOutput(os.Stderr)
 		log.Fatal("crawler:", err)
 	}
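The values.List type used above comes from pkg/values, which is not among the files rendered in this view. Below is a minimal sketch of such a repeatable flag.Value with '@'-file expansion, inferred from its use in main.go rather than copied from the actual package:

package values

import (
	"io/fs"
	"strings"
)

// List collects repeated flag occurrences; it satisfies flag.Value,
// so it can be registered via flag.Var (hypothetical reconstruction).
type List struct {
	vals []string
}

func (l *List) String() string { return strings.Join(l.vals, ",") }

func (l *List) Set(v string) error {
	l.vals = append(l.vals, v)

	return nil
}

// Load expands the collected items: entries prefixed with '@' are read
// from fsys (one value per non-empty line), all others pass through.
func (l *List) Load(fsys fs.FS) ([]string, error) {
	rv := make([]string, 0, len(l.vals))

	for _, v := range l.vals {
		if !strings.HasPrefix(v, "@") {
			rv = append(rv, v)

			continue
		}

		data, err := fs.ReadFile(fsys, strings.TrimPrefix(v, "@"))
		if err != nil {
			return nil, err
		}

		for _, line := range strings.Split(string(data), "\n") {
			if line = strings.TrimSpace(line); line != "" {
				rv = append(rv, line)
			}
		}
	}

	return rv, nil
}

This would explain why main.go resolves the working directory and passes os.DirFS into Load: '@'-files are then read relative to where crawley was started.
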
4 changes: 2 additions & 2 deletions go.mod
@@ -1,5 +1,5 @@
 module github.com/s0rg/crawley
 
-go 1.17
+go 1.18
 
-require golang.org/x/net v0.0.0-20211216030914-fe4d6282115f
+require golang.org/x/net v0.0.0-20220607020251-c690dde0001d
9 changes: 2 additions & 7 deletions go.sum
@@ -1,7 +1,2 @@
-golang.org/x/net v0.0.0-20211216030914-fe4d6282115f h1:hEYJvxw1lSnWIl8X9ofsYMklzaDs90JI2az5YMd4fPM=
-golang.org/x/net v0.0.0-20211216030914-fe4d6282115f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
-golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/net v0.0.0-20220607020251-c690dde0001d h1:4SFsTMi4UahlKoloni7L4eYzhFRifURQLw+yv0QDCx8=
+golang.org/x/net v0.0.0-20220607020251-c690dde0001d/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
45 changes: 45 additions & 0 deletions pkg/client/cookie.go
@@ -0,0 +1,45 @@
package client

import (
	"net/http"
	"strings"
)

const (
	keyvalParts     = 2
	keyvalSeparator = "="
	valuesSeparator = ";"
)

func prepareCookies(raw []string) (rv []*http.Cookie) {
	for _, r := range raw {
		for _, p := range strings.Split(r, valuesSeparator) {
			if val, ok := parseOne(p); ok {
				rv = append(rv, val)
			}
		}
	}

	return rv
}

func parseOne(raw string) (rv *http.Cookie, ok bool) {
	pair := strings.SplitN(raw, keyvalSeparator, keyvalParts)
	if len(pair) != keyvalParts {
		return // no '=' present, i.e. a bare name - skip it
	}

	var name, value string

	if name = strings.TrimSpace(pair[0]); name == "" {
		return
	}

	if value = strings.TrimSpace(pair[1]); value == "" {
		return
	}

	rv = &http.Cookie{
		Name:  name,
		Value: value,
	}

	return rv, true
}
36 changes: 36 additions & 0 deletions pkg/client/cookie_test.go
@@ -0,0 +1,36 @@
package client

import (
	"net/http"
	"reflect"
	"testing"
)

func Test_prepareCookies(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name string
		args []string
		want []*http.Cookie
	}{
		{"1",
			[]string{"NAME1=VALUE1; NAME2=VALUE2", "NAME3=VALUE3"},
			[]*http.Cookie{
				{Name: "NAME1", Value: "VALUE1"},
				{Name: "NAME2", Value: "VALUE2"},
				{Name: "NAME3", Value: "VALUE3"},
			}},
		{"2",
			[]string{"", "NAME=", "=VALUE", ";;", "===", " VALID = COOKIE "},
			[]*http.Cookie{
				{Name: "VALID", Value: "COOKIE"},
			}},
	}

	for _, tt := range tests {
		if got := prepareCookies(tt.args); !reflect.DeepEqual(got, tt.want) {
			t.Errorf("prepareCookies() = %v, want %v", got, tt.want)
		}
	}
}
38 changes: 38 additions & 0 deletions pkg/client/header.go
@@ -0,0 +1,38 @@
package client

import "strings"

const (
	headerParts     = 2
	headerSeparator = ":"
)

type header struct {
	Key string
	Val string
}

func prepareHeaders(raw []string) (rv []*header) {
	rv = make([]*header, 0, len(raw))

	var (
		pair     []string
		key, val string
	)

	for _, h := range raw {
		pair = strings.SplitN(h, headerSeparator, headerParts)
		if len(pair) != headerParts {
			continue // no ':' present - not a valid header
		}

		if key = strings.TrimSpace(pair[0]); key == "" {
			continue
		}

		if val = strings.TrimSpace(pair[1]); val == "" {
			continue
		}

		rv = append(rv, &header{Key: key, Val: val})
	}

	return rv
}
(the remaining changed files of this commit are not rendered in this view)
