Feature/release 1.4 (#29)
* tags filter
* generic set
* code grooming
* deps up
* lint workflow fix for 1.19
s0rg authored Oct 3, 2022
1 parent c400d04 commit 5de5e8d
Showing 24 changed files with 312 additions and 160 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
@@ -17,6 +17,12 @@ jobs:
    steps:
      - name: checkout
        uses: actions/checkout@v3
      - name: setup golang
        uses: actions/setup-go@v3
        with:
          go-version: ^1.19
          check-latest: true
          cache: true
      - name: golangci-lint
        uses: golangci/golangci-lint-action@v3
  test:
19 changes: 11 additions & 8 deletions README.md
@@ -1,24 +1,24 @@
[![Build](https://github.com/s0rg/crawley/workflows/ci/badge.svg)](https://github.com/s0rg/crawley/actions?query=workflow%3Aci)
[![License](https://img.shields.io/badge/license-MIT%20License-blue.svg)](https://github.com/s0rg/crawley/blob/main/LICENSE)
[![Go Version](https://img.shields.io/github/go-mod/go-version/s0rg/crawley)](go.mod)
[![Release](https://img.shields.io/github/v/release/s0rg/crawley)](https://github.com/s0rg/crawley/releases/latest)
[![Mentioned in Awesome Go](https://awesome.re/mentioned-badge.svg)](https://github.com/avelino/awesome-go)
![Downloads](https://img.shields.io/github/downloads/s0rg/crawley/total.svg)

[![CI](https://github.com/s0rg/crawley/workflows/ci/badge.svg)](https://github.com/s0rg/crawley/actions?query=workflow%3Aci)
[![Go Report Card](https://goreportcard.com/badge/github.com/s0rg/crawley)](https://goreportcard.com/report/github.com/s0rg/crawley)
[![Maintainability](https://api.codeclimate.com/v1/badges/6542cd90a6c665e4202e/maintainability)](https://codeclimate.com/github/s0rg/crawley/maintainability)
[![Test Coverage](https://api.codeclimate.com/v1/badges/e1c002df2b4571e01537/test_coverage)](https://codeclimate.com/github/s0rg/crawley/test_coverage)
[![libraries.io](https://img.shields.io/librariesio/github/s0rg/crawley)](https://libraries.io/github/s0rg/crawley)
![Issues](https://img.shields.io/github/issues/s0rg/crawley)

[![License](https://img.shields.io/badge/license-MIT%20License-blue.svg)](https://github.com/s0rg/crawley/blob/main/LICENSE)
[![Go Version](https://img.shields.io/github/go-mod/go-version/s0rg/crawley)](go.mod)
[![Release](https://img.shields.io/github/v/release/s0rg/crawley)](https://github.com/s0rg/crawley/releases/latest)
![Downloads](https://img.shields.io/github/downloads/s0rg/crawley/total.svg)
[![Mentioned in Awesome Go](https://awesome.re/mentioned-badge.svg)](https://github.com/avelino/awesome-go)

# crawley

Crawls web pages and prints any link it can find.

# features

- fast html SAX-parser (powered by `golang.org/x/net/html`)
- small (<1300 SLOC), idiomatic, 100% test covered codebase
- small (~1300 SLOC), idiomatic, 100% test covered codebase
- grabs most useful resource urls (pics, videos, audios, forms, etc...)
- found urls are streamed to stdout and guaranteed to be unique (with fragments omitted)
- scan depth (limited by starting host and path, by default - 0) can be configured
@@ -28,6 +28,7 @@ Crawls web pages and prints any link it can find.
- directory-only scan mode (aka `fast-scan`)
- user-defined cookies, in curl-compatible format (i.e. `-cookie "ONE=1; TWO=2" -cookie "ITS=ME" -cookie @cookie-file`)
- user-defined headers, same as curl: `-header "ONE: 1" -header "TWO: 2" -header @headers-file`
- tag filter - allows specifying which tags to crawl (single: `-tag a -tag form`, multiple: `-tag a,form`, or mixed; see the usage sketch below)

# installation

@@ -67,6 +68,8 @@ possible flags:
        suppress info and error messages in stderr
  -skip-ssl
        skip ssl verification
  -tag value
        tags filter, single or comma-separated tag names allowed
  -user-agent string
        user-agent string
  -version
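
A quick usage sketch for the new filter, mixing a comma-separated list with a repeated flag (hypothetical target URL):

```
crawley -tag a,form -tag iframe http://example.com/
```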
4 changes: 2 additions & 2 deletions SECURITY.md
@@ -4,8 +4,8 @@

| Version | Supported |
| ------- | ------------------ |
| 1.3.x | :white_check_mark: |
| < 1.3 | :x: |
| 1.4.x | :white_check_mark: |
| < 1.4 | :x: |

## Reporting a Vulnerability

59 changes: 43 additions & 16 deletions cmd/crawley/main.go
@@ -9,6 +9,7 @@ import (
    "log"
    "os"
    "runtime"
    "strings"
    "time"

    "github.com/s0rg/crawley/pkg/crawler"
@@ -22,10 +23,15 @@ const (
)

var (
    GitHash string
    GitTag string
    BuildDate string
    defaultUA = "Mozilla/5.0 (compatible; Win64; x64) Mr." + appName + "/" + GitTag + "-" + GitHash
    GitHash   string
    GitTag    string
    BuildDate string
    defaultUA = "Mozilla/5.0 (compatible; Win64; x64) Mr." + appName + "/" + GitTag + "-" + GitHash

    extCookies values.List
    extHeaders values.List
    tags       []string

    fVersion = flag.Bool("version", false, "show version")
    fBrute = flag.Bool("brute", false, "scan html comments")
    fSkipSSL = flag.Bool("skip-ssl", false, "skip ssl verification")
@@ -37,10 +43,19 @@ var (
    fUA = flag.String("user-agent", defaultUA, "user-agent string")
    fRobotsPolicy = flag.String("robots", "ignore", "policy for robots.txt: ignore / crawl / respect")
    fDirsPolicy = flag.String("dirs", "show", "policy for non-resource urls: show / hide / only")
    extCookies values.List
    extHeaders values.List
)

func version() string {
    return fmt.Sprintf("%s %s-%s build at: %s with %s site: %s",
        appName,
        GitTag,
        GitHash,
        BuildDate,
        runtime.Version(),
        appSite,
    )
}

func puts(s string) {
    _, _ = os.Stdout.WriteString(s + "\n")
}
@@ -123,6 +138,7 @@ func initOptions() (rv []crawler.Option, err error) {
        crawler.WithoutHeads(*fNoHeads),
        crawler.WithExtraHeaders(headers),
        crawler.WithExtraCookies(cookies),
        crawler.WithTagsFilter(tags),
    }

    return rv, nil
@@ -139,17 +155,28 @@ func main() {
        "cookie",
        "extra cookies for request, can be used multiple times, accept files with '@'-prefix",
    )

    // accumulates into the package-level tags slice read by initOptions
    flag.Func(
        "tag",
        "tags filter, single or comma-separated tag names allowed",
        func(val string) error {
            switch {
            case strings.ContainsRune(val, ','):
                tags = append(tags, strings.Split(val, ",")...)
            default:
                tags = append(tags, val)
            }

            return nil
        },
    )

    flag.Parse()

    if *fVersion {
        fmt.Printf("%s %s-%s build at: %s with %s site: %s\n",
            appName,
            GitTag,
            GitHash,
            BuildDate,
            runtime.Version(),
            appSite,
        )
        fmt.Println(version())

        return
    }
@@ -162,7 +189,7 @@

    opts, err := initOptions()
    if err != nil {
        log.Fatal("options:", err)
        log.Fatal("[-] options:", err)
    }

    if *fSilent {
@@ -173,6 +200,6 @@
        // forcing back stderr in case of errors, otherwise
        // if 'silent' is on - no one will know what happened.
        log.SetOutput(os.Stderr)
        log.Fatal("crawler:", err)
        log.Fatal("[-] crawler:", err)
    }
}
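
To make the `-tag` accumulation above concrete: repeated flags and comma-separated values all land in one flat slice. A standalone sketch of that logic (hypothetical input values, not part of the commit):

```go
package main

import (
    "fmt"
    "strings"
)

func main() {
    var tags []string

    // simulates: -tag a,form -tag iframe
    for _, val := range []string{"a,form", "iframe"} {
        switch {
        case strings.ContainsRune(val, ','):
            tags = append(tags, strings.Split(val, ",")...)
        default:
            tags = append(tags, val)
        }
    }

    fmt.Println(tags) // prints: [a form iframe]
}
```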
2 changes: 1 addition & 1 deletion go.mod
@@ -2,4 +2,4 @@ module github.com/s0rg/crawley

go 1.19

require golang.org/x/net v0.0.0-20220923203811-8be639271d50
require golang.org/x/net v0.0.0-20221002022538-bcab6841153b
4 changes: 2 additions & 2 deletions go.sum
@@ -1,2 +1,2 @@
golang.org/x/net v0.0.0-20220923203811-8be639271d50 h1:vKyz8L3zkd+xrMeIaBsQ/MNVPVFSffdaU3ZyYlBGFnI=
golang.org/x/net v0.0.0-20220923203811-8be639271d50/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
golang.org/x/net v0.0.0-20221002022538-bcab6841153b h1:6e93nYa3hNqAvLr0pD4PN1fFS+gKzp2zAXqrnTCstqU=
golang.org/x/net v0.0.0-20221002022538-bcab6841153b/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
23 changes: 12 additions & 11 deletions pkg/crawler/config.go
@@ -13,17 +13,18 @@
)

type config struct {
    Headers []string
    Cookies []string
    UserAgent string
    Delay time.Duration
    Workers int
    Depth int
    Robots RobotsPolicy
    Dirs DirsPolicy
    SkipSSL bool
    Brute bool
    NoHEAD bool
    Headers    []string
    Cookies    []string
    AlowedTags []string
    UserAgent  string
    Delay      time.Duration
    Workers    int
    Depth      int
    Robots     RobotsPolicy
    Dirs       DirsPolicy
    SkipSSL    bool
    Brute      bool
    NoHEAD     bool
}

func (c *config) validate() {
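
`WithTagsFilter`, referenced from main.go and the tests, is not part of this excerpt; a minimal sketch, assuming the same functional-options pattern as the other crawler options with `type Option func(*config)` (the real signature may differ):

```go
// WithTagsFilter sets the tag names the crawler is allowed to inspect.
// Sketch only - the actual option lives in code not shown in this diff.
func WithTagsFilter(tags []string) Option {
    return func(c *config) {
        c.AlowedTags = tags
    }
}
```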
5 changes: 5 additions & 0 deletions pkg/crawler/config_test.go
@@ -66,6 +66,7 @@ func TestOptions(t *testing.T) {
        WithoutHeads(fbool),
        WithExtraHeaders(extHeaders),
        WithExtraCookies(extCookies),
        WithTagsFilter([]string{"a", "form"}),
    }

    c := &config{}
@@ -119,6 +120,10 @@ func TestOptions(t *testing.T) {
    if !reflect.DeepEqual(c.Cookies, extCookies) {
        t.Error("bad extra cookies")
    }

    if len(c.AlowedTags) != 2 {
        t.Error("unexpected filter size")
    }
}

func TestString(t *testing.T) {
26 changes: 9 additions & 17 deletions pkg/crawler/crawl.go → pkg/crawler/crawler.go
@@ -6,7 +6,6 @@ import (
    "fmt"
    "io"
    "log"
    "mime"
    "net/http"
    "net/url"
    "strings"
@@ -32,8 +31,6 @@ const (

    crawlTimeout = 5 * time.Second
    robotsTimeout = 3 * time.Second
    contentType = "Content-Type"
    contentHTML = "text/html"
)

type taskFlag byte
@@ -60,6 +57,7 @@ type Crawler struct {
    crawlCh  chan *url.URL
    resultCh chan crawlResult
    robots   *robots.TXT
    filter   links.TokenFilter
}

// New creates Crawler instance.
@@ -72,7 +70,10 @@ func New(opts ...Option) (c *Crawler) {

    cfg.validate()

    return &Crawler{cfg: cfg}
    return &Crawler{
        cfg:    cfg,
        filter: prepareFilter(cfg.AlowedTags),
    }
}

// Run starts crawling process for given base uri.
@@ -90,8 +91,8 @@ func (c *Crawler) Run(uri string, fn func(string)) (err error) {

    defer c.close()

    seen := make(set.U64)
    seen.Add(urlHash(uri))
    seen := make(set.URI)
    seen.Add(uri)

    web := client.New(
        c.cfg.UserAgent,
@@ -120,7 +121,7 @@ func (c *Crawler) Run(uri string, fn func(string)) (err error) {
        switch {
        case t.Flag == TaskDone:
            w--
        case seen.Add(urlHash(t.URI)):
        case seen.Add(t.URI):
            if t.Flag == TaskCrawl && c.crawl(base, &t) {
                w++
            }
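
The commit message's "generic set" surfaces here as `set.URI` (and as `set.Set[string]` in the tests below), replacing the old `set.U64` plus `urlHash` combination. A minimal sketch of such a type, assuming a map-backed implementation (the real `pkg/set` may differ):

```go
package set

// Set holds unique comparable values.
type Set[T comparable] map[T]struct{}

// Add inserts v, reporting true if it was not already present -
// the property the crawl loop above relies on to dedupe URLs.
func (s Set[T]) Add(v T) (ok bool) {
    if _, found := s[v]; found {
        return false
    }

    s[v] = struct{}{}

    return true
}

// URI is an alias for a set of URI strings.
type URI = Set[string]
```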
@@ -261,15 +262,6 @@ func (c *Crawler) linkHandler(a atom.Atom, s string) {
    c.resultCh <- t
}

func isHTML(v string) (yes bool) {
    typ, _, err := mime.ParseMediaType(v)
    if err != nil {
        return
    }

    return typ == contentHTML
}

func (c *Crawler) fetch(
    ctx context.Context,
    web crawlClient,
@@ -289,7 +281,7 @@

    switch {
    case isHTML(hdrs.Get(contentType)):
        links.ExtractHTML(base, body, c.cfg.Brute, c.linkHandler)
        links.ExtractHTML(base, body, c.cfg.Brute, c.filter, c.linkHandler)
    case isSitemap(uri):
        links.ExtractSitemap(base, body, c.sitemapHandler)
    }
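
Neither `links.TokenFilter` nor `prepareFilter` appears in this excerpt; a plausible sketch, assuming the filter is a simple allow-list predicate over tag names and that an empty list means "crawl everything" (the real implementation may differ):

```go
// In pkg/links (sketch): TokenFilter reports whether a tag with the
// given name should be inspected for URLs.
type TokenFilter func(name string) bool

// In pkg/crawler (sketch): prepareFilter builds the filter from the
// allow-list collected via -tag.
func prepareFilter(allowed []string) links.TokenFilter {
    if len(allowed) == 0 {
        // no -tag flags given: allow every tag
        return func(string) bool { return true }
    }

    names := make(map[string]struct{}, len(allowed))
    for _, n := range allowed {
        names[n] = struct{}{}
    }

    return func(name string) bool {
        _, ok := names[name]
        return ok
    }
}
```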
35 changes: 33 additions & 2 deletions pkg/crawler/crawl_test.go → pkg/crawler/crawler_test.go
@@ -200,7 +200,7 @@ sitemap: http://other.host/sitemap.xml`

    // case A

    resA := make(set.String)
    resA := make(set.Set[string])

    handlerA := func(s string) {
        resA.Add(s)
@@ -232,7 +232,7 @@ sitemap: http://other.host/sitemap.xml`

    // case B

    resB := make(set.String)
    resB := make(set.Set[string])

    handlerB := func(s string) {
        resB.Add(s)
@@ -668,3 +668,34 @@ sitemap: %s/sitemap.xml`, ts.URL)
        t.Error("empty sitemap result")
    }
}

func TestFilterTags(t *testing.T) {
    t.Parallel()

    const bodyHTML = `<html><a href="link">ok</a><img src="bad"/><iframe src="ok"/></html>`

    ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        if r.Method == http.MethodGet {
            w.Header().Add(contentType, contentHTML)
            _, _ = io.WriteString(w, bodyHTML)
        }
    }))

    defer ts.Close()

    c := New(
        WithoutHeads(true),
        WithMaxCrawlDepth(1),
        WithTagsFilter([]string{"a", "iframe"}),
    )

    handler := func(s string) {
        if strings.Contains(s, "bad") {
            t.Fail()
        }
    }

    if err := c.Run(ts.URL, handler); err != nil {
        t.Errorf("run: %v", err)
    }
}