Skip to content

Commit

Permalink
Fix space handling in URLs (#103)
Browse files Browse the repository at this point in the history
* Add tests

* Fix link finder with encoded spaces

* Bump version
  • Loading branch information
raviqqe authored Sep 23, 2020
1 parent c6520dc commit cb73d98
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 16 deletions.
2 changes: 1 addition & 1 deletion configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package main
import "time"

const (
version = "1.5.5"
version = "1.5.6"
defaultBufferSize = 4096
defaultConcurrency = 512
defaultMaxRedirections = 64
Expand Down
13 changes: 1 addition & 12 deletions link_finder.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"net/url"
"regexp"
"strings"
"unicode"

"github.com/yhat/scrape"
"golang.org/x/net/html"
Expand Down Expand Up @@ -44,7 +43,7 @@ func (f linkFinder) Find(n *html.Node, base *url.URL) map[string]error {
return ok
}) {
for _, a := range atomToAttributes[n.DataAtom] {
s := normalizeURL(scrape.Attr(n, a))
s := strings.TrimSpace(scrape.Attr(n, a))

if s == "" || f.isLinkExcluded(s) {
continue
Expand Down Expand Up @@ -72,13 +71,3 @@ func (f linkFinder) isLinkExcluded(u string) bool {

return false
}

// normalizeURL cleans up a raw URL attribute value before it is parsed.
//
// Leading and trailing Unicode whitespace is trimmed, and tabs and line
// breaks are dropped wherever they occur, so that a URL wrapped across
// several lines in the markup is joined back together. Interior spaces are
// kept as they are: they are a meaningful part of the URL (equivalent to
// "%20"), and deleting them would silently turn "/a b" into the different
// resource "/ab".
func normalizeURL(s string) string {
	s = strings.TrimFunc(s, unicode.IsSpace)

	return strings.Map(func(r rune) rune {
		switch r {
		case '\t', '\n', '\r':
			// A negative result tells strings.Map to drop the rune.
			return -1
		}

		return r
	}, s)
}
53 changes: 50 additions & 3 deletions link_finder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ func TestLinkFinderFindLinks(t *testing.T) {
}{
{``, 0},
{`<a href="/" />`, 1},
{`<a href="/f
o o" />`, 1},
{`<a href="/foo" />`, 1},
// TODO: Test <frame> tag.
{`<iframe src="/iframe"></iframe>`, 1},
{`<img src="/foo.jpg" />`, 1},
Expand Down Expand Up @@ -52,7 +51,55 @@ func TestLinkFinderFindLinks(t *testing.T) {
}
}

func TestLinkFinderScrapePageError(t *testing.T) {
// TestLinkFinderFindLinkWithoutEncodedSpaces parses a page whose only anchor
// points at a URL containing a percent-encoded space ("%20") and checks that
// the finder reports that URL, still encoded, with a nil error.
func TestLinkFinderFindLinkWithoutEncodedSpaces(t *testing.T) {
	base, err := url.Parse("http://foo.com")
	assert.Nil(t, err)

	page := htmlWithBody(`<a href="http://foo.com/a%20b" />`)
	root, err := html.Parse(strings.NewReader(page))
	assert.Nil(t, err)

	links := newLinkFinder(nil).Find(root, base)

	// The link must be present in the result and carry no error.
	linkErr, found := links["http://foo.com/a%20b"]
	assert.True(t, found)
	assert.Nil(t, linkErr)
}

// TestLinkFinderFindLinkWithoutSpacesNotEncoded parses a page whose only
// anchor points at a URL containing a literal, unencoded space and checks
// that the finder reports it under the percent-encoded form ("%20") with a
// nil error.
func TestLinkFinderFindLinkWithoutSpacesNotEncoded(t *testing.T) {
	base, err := url.Parse("http://foo.com")
	assert.Nil(t, err)

	page := htmlWithBody(`<a href="http://foo.com/a b" />`)
	root, err := html.Parse(strings.NewReader(page))
	assert.Nil(t, err)

	links := newLinkFinder(nil).Find(root, base)

	// The raw space in the attribute is expected to surface as "%20".
	linkErr, found := links["http://foo.com/a%20b"]
	assert.True(t, found)
	assert.Nil(t, linkErr)
}

// TestLinkFinderFindLinkWithLeadingAndTrailingSpaces parses a page whose only
// anchor has a space on each side of its href value and checks that the
// finder reports the URL without the surrounding spaces and with a nil error.
func TestLinkFinderFindLinkWithLeadingAndTrailingSpaces(t *testing.T) {
	base, err := url.Parse("http://foo.com")
	assert.Nil(t, err)

	page := htmlWithBody(`<a href=" http://foo.com " />`)
	root, err := html.Parse(strings.NewReader(page))
	assert.Nil(t, err)

	links := newLinkFinder(nil).Find(root, base)

	// The surrounding spaces must not be part of the reported link.
	linkErr, found := links["http://foo.com"]
	assert.True(t, found)
	assert.Nil(t, linkErr)
}

func TestLinkFinderFailWithInvalidURL(t *testing.T) {
b, err := url.Parse("http://foo.com")
assert.Nil(t, err)

Expand Down

0 comments on commit cb73d98

Please sign in to comment.