Merge pull request #2 from velicanercan/main
Fix panic issue and prevent visiting already crawled URLs
riza authored Mar 22, 2024
2 parents 2917b79 + d123fcd commit c4354d2
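
The panic fix is the new length guard in parseIndex: for an anchor whose text is empty, the old expression s.Text()[len(s.Text())-1:] slices out of range and panics. A minimal standalone sketch of the same guard pattern, using a hypothetical helper rather than the repository's code:

package main

import "fmt"

// endsWithSlash only slices the last character when the string is non-empty,
// mirroring the guard added to parseIndex; an empty string can no longer
// trigger a "slice bounds out of range" panic.
func endsWithSlash(text string) bool {
    return len(text) > 0 && text[len(text)-1:] == "/"
}

func main() {
    for _, t := range []string{"notes.txt", "img/", ""} {
        fmt.Printf("%q looks like a directory entry: %v\n", t, endsWithSlash(t))
    }
}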
Showing 1 changed file with 14 additions and 7 deletions.
main.go

@@ -3,12 +3,13 @@ package main
 import (
     "flag"
     "fmt"
-    "github.com/PuerkitoBio/goquery"
     "io"
     "log"
     "net/http"
     "os"
     "strings"
+
+    "github.com/PuerkitoBio/goquery"
 )

 const (
@@ -67,6 +68,11 @@ func parseOptions() *Options {
         os.Exit(1)
     }

+    // Add https:// if not present, to avoid errors
+    if !strings.HasPrefix(options.URL, "http") {
+        options.URL = "https://" + options.URL
+    }
+
     if extensions != "" {
         options.Extensions = strings.Split(extensions, ",")
     }
@@ -79,6 +85,7 @@ func crawl(url string, extensions []string, prefix string) error {
     if err != nil {
         return fmt.Errorf("error getting index: %w", err)
     }
+    defer body.Close()

     urls, err := parseIndex(url, prefix, extensions, body)
     if err != nil {
@@ -126,19 +133,19 @@ func parseIndex(url, prefix string, extensions []string, body io.Reader) ([]string, error) {
     urls := make([]string, 0)

     doc.Find("a").Each(func(i int, s *goquery.Selection) {
-        //skip parent directory
-        if strings.Contains(s.Text(), " Parent Directory") {
-            return
-        }
-
         if url[len(url)-1:] != "/" {
             url += "/"
         }

+        //skip parent directory
         href, _ := s.Attr("href")
+        if href == "../" { // parent directory
+            fmt.Println(prefix + "└── " + url + href)
+            return
+        }

         //handle files
-        if s.Text()[len(s.Text())-1:] != "/" {
+        if len(s.Text()) > 0 && s.Text()[len(s.Text())-1:] != "/" {
             //filter by extensions
             if len(extensions) > 0 {
                 pos := strings.LastIndex(url+href, ".")
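
The other half of the commit title, preventing visits to already crawled URLs, appears to come from the new href == "../" branch: the parent-directory link is printed as part of the tree but never followed, so the crawler cannot climb back into a directory it has already listed. A rough standalone sketch of that per-link decision, with a hypothetical classifyLink helper that is not part of the repository:

package main

import "fmt"

// classifyLink sketches the decision the updated parseIndex makes for each
// anchor: a "../" href is shown but never followed, non-empty text that does
// not end in "/" is treated as a file, and anything else is left to the
// directory-handling path.
func classifyLink(text, href string) string {
    if href == "../" {
        return "print only, do not follow (parent directory)"
    }
    if len(text) > 0 && text[len(text)-1:] != "/" {
        return "file"
    }
    return "directory or other link"
}

func main() {
    links := [][2]string{
        {"Parent Directory", "../"},
        {"notes.txt", "notes.txt"},
        {"img/", "img/"},
        {"", ""}, // empty anchor text: previously a panic, now handled safely
    }
    for _, l := range links {
        fmt.Printf("%-20q -> %s\n", l[0], classifyLink(l[0], l[1]))
    }
}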
