Skip to content

Commit

Permalink
code grooming, missing test, reduce code smells (#30)
Browse files Browse the repository at this point in the history
  • Loading branch information
s0rg authored Oct 3, 2022
1 parent 5de5e8d commit b47e9c5
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 16 deletions.
7 changes: 6 additions & 1 deletion pkg/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,12 @@ func (c *Crawler) fetch(

switch {
case isHTML(hdrs.Get(contentType)):
links.ExtractHTML(base, body, c.cfg.Brute, c.filter, c.linkHandler)
links.ExtractHTML(body, links.ExtractArgs{
Base: base,
Brute: c.cfg.Brute,
Filter: c.filter,
Handler: c.linkHandler,
})
case isSitemap(uri):
links.ExtractSitemap(base, body, c.sitemapHandler)
}
Expand Down
31 changes: 31 additions & 0 deletions pkg/crawler/util_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package crawler

import (
"testing"

"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)

// TestPrepareFilter builds a filter from the tag names "a", "form", "foo" and
// "div", then verifies that tokens carrying the A, Form and Div atoms pass it
// while a token carrying the Video atom (not in the list) is rejected.
func TestPrepareFilter(t *testing.T) {
	t.Parallel()

	filter := prepareFilter([]string{"a", "form", "foo", "div"})

	// Every atom listed here corresponds to a name given to prepareFilter.
	for _, allowed := range []atom.Atom{atom.A, atom.Form, atom.Div} {
		tok := html.Token{DataAtom: allowed}
		if !filter(tok) {
			t.Fatalf("not allowed: %v", tok)
		}
	}

	// Video was never passed to prepareFilter, so it must not get through.
	rejected := html.Token{DataAtom: atom.Video}
	if filter(rejected) {
		t.Fatalf("allowed: %v", rejected)
	}
}
23 changes: 12 additions & 11 deletions pkg/links/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,17 @@ const (
// HTMLHandler is the callback ExtractHTML runs for every link it finds.
// It receives an atom and the link string (presumably the atom of the tag
// the link came from — confirm against extractToken / extractComment).
type HTMLHandler func(atom.Atom, string)
// TokenFilter reports whether a token should be processed; ExtractHTML only
// extracts links from start / self-closing tag tokens for which it returns true.
type TokenFilter func(html.Token) bool

// ExtractArgs bundles the options accepted by ExtractHTML.
type ExtractArgs struct {
// Base is the URL that found links are rebased to (if needed).
Base *url.URL
// Brute, when true, makes ExtractHTML also scan HTML comment tokens for links.
Brute bool
// Filter decides which start / self-closing tag tokens are processed.
// ExtractHTML calls it unconditionally, so it must not be nil
// (AllowALL accepts every token).
Filter TokenFilter
// Handler is handed to the token and comment extractors to receive found links.
Handler HTMLHandler
}

// AllowALL is a TokenFilter that accepts every token unconditionally.
func AllowALL(_ html.Token) bool {
	return true
}

// ExtractHTML runs `a.Handler` for every link found inside the HTML read from `r`, rebasing them to `a.Base` (if needed).
func ExtractHTML(
b *url.URL,
r io.Reader,
brute bool,
allowed TokenFilter,
handler HTMLHandler,
) {
func ExtractHTML(r io.Reader, a ExtractArgs) {
var (
tkns = html.NewTokenizer(r)
key = keySRC
Expand All @@ -46,12 +47,12 @@ func ExtractHTML(
case html.ErrorToken:
return
case html.StartTagToken, html.SelfClosingTagToken:
if t = tkns.Token(); allowed(t) {
extractToken(b, t, &key, handler)
if t = tkns.Token(); a.Filter(t) {
extractToken(a.Base, t, &key, a.Handler)
}
case html.CommentToken:
if brute {
extractComment(tkns.Token().Data, handler)
if a.Brute {
extractComment(tkns.Token().Data, a.Handler)
}
}
}
Expand Down
18 changes: 14 additions & 4 deletions pkg/links/html_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,13 @@ func TestExtractURLS(t *testing.T) {

var res string

ExtractHTML(testBase, io.NopCloser(buf), true, AllowALL, func(_ atom.Atom, s string) {
res = s
ExtractHTML(io.NopCloser(buf), ExtractArgs{
Base: testBase,
Brute: true,
Filter: AllowALL,
Handler: func(_ atom.Atom, s string) {
res = s
},
})

if tc.hasLink {
Expand Down Expand Up @@ -247,8 +252,13 @@ func TestExtractAllowed(t *testing.T) {
return tkn.DataAtom == atom.A
}

ExtractHTML(testBase, io.NopCloser(buf), true, filter, func(_ atom.Atom, s string) {
res = append(res, s)
ExtractHTML(io.NopCloser(buf), ExtractArgs{
Base: testBase,
Brute: true,
Filter: filter,
Handler: func(_ atom.Atom, s string) {
res = append(res, s)
},
})

if len(res) != 1 {
Expand Down

0 comments on commit b47e9c5

Please sign in to comment.