Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(crawler): add canceled context to Visit func #27

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 36 additions & 14 deletions pkg/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ func NewCrawler(opt Option) Crawler {

func (c *Crawler) Crawl(ctx context.Context) error {
log.Println("Crawl maven repository and save indexes")
ctx, cancel := context.WithCancel(ctx)
defer cancel()

errCh := make(chan error)
defer close(errCh)

Expand Down Expand Up @@ -94,7 +97,7 @@ func (c *Crawler) Crawl(ctx context.Context) error {
go func(url string) {
defer c.limit.Release(1)
defer c.wg.Done()
if err := c.Visit(url); err != nil {
if err := c.Visit(ctx, url); err != nil {
errCh <- xerrors.Errorf("visit error: %w", err)
}
}(url)
Expand All @@ -108,6 +111,7 @@ loop:
case <-crawlDone:
break loop
case err := <-errCh:
cancel() // Stop all running Visit functions to avoid writing to closed c.urlCh.
close(c.urlCh)
return err

Expand All @@ -117,8 +121,12 @@ loop:
return nil
}

func (c *Crawler) Visit(url string) error {
resp, err := c.http.Get(url)
func (c *Crawler) Visit(ctx context.Context, url string) error {
req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return xerrors.Errorf("unable to new HTTP request: %w", err)
}
resp, err := c.http.Do(req)
if err != nil {
return xerrors.Errorf("http get error (%s): %w", url, err)
}
Expand All @@ -145,12 +153,12 @@ func (c *Crawler) Visit(url string) error {
})

if foundMetadata {
meta, err := c.parseMetadata(url + "maven-metadata.xml")
meta, err := c.parseMetadata(ctx, url+"maven-metadata.xml")
if err != nil {
return xerrors.Errorf("metadata parse error: %w", err)
}
if meta != nil {
if err = c.crawlSHA1(url, meta); err != nil {
if err = c.crawlSHA1(ctx, url, meta); err != nil {
return err
}
// Return here since there is no need to crawl dirs anymore.
Expand All @@ -162,22 +170,28 @@ func (c *Crawler) Visit(url string) error {

go func() {
for _, child := range children {
c.urlCh <- url + child
select {
// Context can be canceled if we receive an error from another Visit function.
case <-ctx.Done():
return
default:
c.urlCh <- url + child
}
}
}()

return nil
}

func (c *Crawler) crawlSHA1(baseURL string, meta *Metadata) error {
func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata) error {
var versions []Version
for _, version := range meta.Versioning.Versions {
// some metadata may contain characters that require escaping
// for example <version>1.0.7?</version>:
// https://repo.maven.apache.org/maven2/io/github/visal-99/b24paysdk/maven-metadata.xml
version = url.QueryEscape(version)
sha1FileName := fmt.Sprintf("/%s-%s.jar.sha1", url.QueryEscape(meta.ArtifactID), version)
sha1, err := c.fetchSHA1(baseURL + version + sha1FileName)
sha1, err := c.fetchSHA1(ctx, baseURL+version+sha1FileName)
if err != nil {
return err
}
Expand Down Expand Up @@ -207,10 +221,14 @@ func (c *Crawler) crawlSHA1(baseURL string, meta *Metadata) error {
return nil
}

func (c *Crawler) parseMetadata(url string) (*Metadata, error) {
resp, err := c.http.Get(url)
func (c *Crawler) parseMetadata(ctx context.Context, url string) (*Metadata, error) {
req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, xerrors.Errorf("can't get url: %w", err)
return nil, xerrors.Errorf("unable to new HTTP request: %w", err)
}
resp, err := c.http.Do(req)
if err != nil {
return nil, xerrors.Errorf("http get error (%s): %w", url, err)
}
defer resp.Body.Close()

Expand All @@ -231,10 +249,14 @@ func (c *Crawler) parseMetadata(url string) (*Metadata, error) {
return &meta, nil
}

func (c *Crawler) fetchSHA1(url string) ([]byte, error) {
resp, err := c.http.Get(url)
func (c *Crawler) fetchSHA1(ctx context.Context, url string) ([]byte, error) {
req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, xerrors.Errorf("unable to new HTTP request: %w", err)
}
resp, err := c.http.Do(req)
if err != nil {
return nil, xerrors.Errorf("can't get sha1 from %s: %w", url, err)
return nil, xerrors.Errorf("http get error (%s): %w", url, err)
}
defer resp.Body.Close()

Expand Down
Loading