Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(crawler): add canceled context to Visit func #27

Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 52 additions & 42 deletions pkg/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ func NewCrawler(opt Option) Crawler {

func (c *Crawler) Crawl(ctx context.Context) error {
log.Println("Crawl maven repository and save indexes")
ctx, ctxCancelFunc := context.WithCancel(ctx)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
ctx, ctxCancelFunc := context.WithCancel(ctx)
ctx, cancel := context.WithCancel(ctx)

nit: I think cancel is used conventionally.
https://go.dev/doc/database/cancel-operations

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done - 8f9c98b

defer ctxCancelFunc()

errCh := make(chan error)
defer close(errCh)

Expand Down Expand Up @@ -94,7 +97,7 @@ func (c *Crawler) Crawl(ctx context.Context) error {
go func(url string) {
defer c.limit.Release(1)
defer c.wg.Done()
if err := c.Visit(url); err != nil {
if err := c.Visit(ctx, url); err != nil {
errCh <- xerrors.Errorf("visit error: %w", err)
}
}(url)
Expand All @@ -108,6 +111,7 @@ loop:
case <-crawlDone:
break loop
case err := <-errCh:
ctxCancelFunc() // Stop all running Visit functions to avoid writing to closed c.urlCh.
close(c.urlCh)
return err

Expand All @@ -117,56 +121,62 @@ loop:
return nil
}

func (c *Crawler) Visit(url string) error {
resp, err := c.http.Get(url)
if err != nil {
return xerrors.Errorf("http get error (%s): %w", url, err)
}
defer resp.Body.Close()

d, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return xerrors.Errorf("can't create new goquery doc: %w", err)
}

var children []string
var foundMetadata bool
d.Find("a").Each(func(i int, selection *goquery.Selection) {
link := selection.Text()
if link == "maven-metadata.xml" {
foundMetadata = true
return
} else if link == "../" || !strings.HasSuffix(link, "/") {
// only `../` and dirs have `/` suffix. We don't need to check other files.
return
func (c *Crawler) Visit(ctx context.Context, url string) error {
select {
// Context can be canceled if we receive an error from another Visit function.
case <-ctx.Done():
return nil
default:
resp, err := c.http.Get(url)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is a good opportunity to make the HTTP request context-aware.

Suggested change
resp, err := c.http.Get(url)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return xerrors.Errorf("unable to create new HTTP request: %w", err)
}
if resp, err = client.Do(req); err != nil {

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point!
Created this in 262f22d.

if err != nil {
return xerrors.Errorf("http get error (%s): %w", url, err)
}
defer resp.Body.Close()

children = append(children, link)
})

if foundMetadata {
meta, err := c.parseMetadata(url + "maven-metadata.xml")
d, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return xerrors.Errorf("metadata parse error: %w", err)
return xerrors.Errorf("can't create new goquery doc: %w", err)
}
if meta != nil {
if err = c.crawlSHA1(url, meta); err != nil {
return err

var children []string
var foundMetadata bool
d.Find("a").Each(func(i int, selection *goquery.Selection) {
link := selection.Text()
if link == "maven-metadata.xml" {
foundMetadata = true
return
} else if link == "../" || !strings.HasSuffix(link, "/") {
// only `../` and dirs have `/` suffix. We don't need to check other files.
return
}
// Return here since there is no need to crawl dirs anymore.
return nil
}
}

c.wg.Add(len(children))
children = append(children, link)
})

go func() {
for _, child := range children {
c.urlCh <- url + child
if foundMetadata {
meta, err := c.parseMetadata(url + "maven-metadata.xml")
if err != nil {
return xerrors.Errorf("metadata parse error: %w", err)
}
if meta != nil {
if err = c.crawlSHA1(url, meta); err != nil {
return err
}
// Return here since there is no need to crawl dirs anymore.
return nil
}
}
}()

return nil
c.wg.Add(len(children))

go func() {
for _, child := range children {
c.urlCh <- url + child
}
}()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if just checking context here?

Suggested change
go func() {
for _, child := range children {
c.urlCh <- url + child
}
}()
go func() {
for _, child := range children {
select {
// Context can be canceled if we receive an error from another Visit function.
case <-ctx.Done():
return
default:
c.urlCh <- url + child
}
}
}()

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about this too.
I was worried that we needed to stop the function immediately after the context is canceled.
But after making HTTP requests context-sensitive (262f22d), we can move it into this loop.

Did this in c4a4202.


return nil
}
}

func (c *Crawler) crawlSHA1(baseURL string, meta *Metadata) error {
Expand Down
Loading