From 83fdb557201a7d982c106dcd5e8d58ec8953ec90 Mon Sep 17 00:00:00 2001 From: jojo Date: Thu, 18 Jan 2024 10:26:45 -0300 Subject: [PATCH] :sparkles: scrape v0 --- Makefile | 6 ++++ cmd/newsletter/main.go | 33 +------------------ mongodb/newsletter.go | 1 - scrape.go | 49 +++++++++++++++++----------- scrape_test.go | 72 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 110 insertions(+), 51 deletions(-) create mode 100644 scrape_test.go diff --git a/Makefile b/Makefile index c55cc84..f3a4d1f 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,12 @@ test: go test ./... -timeout 10s -race; \ fi +## Show the tests coverage +.PHONY: coverage +coverage: + go test -coverprofile=c.out + go tool cover -html=c.out + .PHONY: integration-test integration-test: go test -timeout 5s -tags=integration ./... -v diff --git a/cmd/newsletter/main.go b/cmd/newsletter/main.go index 86d3ded..1eb1b74 100644 --- a/cmd/newsletter/main.go +++ b/cmd/newsletter/main.go @@ -2,40 +2,9 @@ package main import ( - "context" "fmt" - "time" - - "go.mongodb.org/mongo-driver/mongo" - "go.mongodb.org/mongo-driver/mongo/options" ) -// var references = []string{ -// "https://research.swtch.com/feed.atom", -// "https://apenwarr.ca/sitemap.txt", -// } - func main() { - - ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) - defer cancel() - - client, err := mongo.Connect(ctx, options.Client().ApplyURI("mongodb://mongodb:27017")) - if err != nil { - panic(err) - } - err = client.Ping(ctx, nil) - if err != nil { - panic(err) - } - - fmt.Println("Connected to MongoDB!") - - collection := client.Database("newsletter").Collection("newsletter") - res, err := collection.InsertOne(ctx, map[string]string{"name": "pi", "value": "3.14159"}) - if err != nil { - panic(err) - } - id := res.InsertedID - fmt.Println(id) + fmt.Println("Hello, World!") } diff --git a/mongodb/newsletter.go b/mongodb/newsletter.go index ed73e08..28c2eda 100644 --- a/mongodb/newsletter.go +++ b/mongodb/newsletter.go 
@@ -55,7 +55,6 @@ func (m *NLStorage) SaveSite(ctx context.Context, sites []Site) error { database := m.client.Database(m.DBName) collection := database.Collection("sites") - //parse sites to []interface{} to use InsertMany var docs []interface{} for _, site := range sites { docs = append(docs, site) diff --git a/scrape.go b/scrape.go index 6ac8137..11818ca 100644 --- a/scrape.go +++ b/scrape.go @@ -3,32 +3,45 @@ package newsletter import ( "bytes" + "fmt" + "log/slog" "net/http" ) -// GetReferences returns the content of the references -func GetReferences(references []string) ([]string, error) { - refContent := make([]string, len(references)) +// GetReferences returns the content of a url as a string +func GetReferences(ref string) (string, error) { + resp, err := http.Get(ref) + if err != nil { + return "", err + } + + defer func() { + _ = resp.Body.Close() + }() - for _, ref := range references { - resp, err := http.Get(ref) + var bodyString string + if resp.StatusCode == 200 { + buf := new(bytes.Buffer) + _, err := buf.ReadFrom(resp.Body) if err != nil { - return nil, err + return "", err } + bodyString = buf.String() + } else { + slog.Warn(fmt.Sprintf("%s returned status code %d", ref, resp.StatusCode)) + return "", nil + } - defer func() { - _ = resp.Body.Close() - }() + return bodyString, nil +} - if resp.StatusCode == 200 { - buf := new(bytes.Buffer) - _, err := buf.ReadFrom(resp.Body) - if err != nil { - return nil, err - } - bodyString := buf.String() - refContent = append(refContent, bodyString[:100]) +// Worker processes jobs as part of a worker pool and sends the results through a channel +func Worker(jobs <-chan string, result chan<- string, f func(string) (string, error)) { + for j := range jobs { + content, err := f(j) + if err != nil { + fmt.Printf("error getting reference %s: %v", j, err) } + result <- content } - return refContent, nil } diff --git a/scrape_test.go b/scrape_test.go new file mode 100644 index 0000000..ec78b23 --- /dev/null +++ 
b/scrape_test.go @@ -0,0 +1,72 @@ +package newsletter + +import ( + "fmt" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestWorker(t *testing.T) { + const numJobs = 3 + jobs := make(chan string, numJobs) + result := make(chan string, numJobs) + + f := func(s string) (string, error) { + time.Sleep(100 * time.Millisecond) + return fmt.Sprintf("job %s done", s), nil + } + + go Worker(jobs, result, f) + + jobs <- "job1" + jobs <- "job2" + jobs <- "job3" + + close(jobs) + + for i := 0; i < numJobs; i++ { + r := <-result + if r != fmt.Sprintf("job job%d done", i+1) { + t.Errorf("expected job%d done, got %s", i+1, r) + } + } +} + +func TestGetReferences(t *testing.T) { + wantBody := "Hello, World!" + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + + _, _ = w.Write([]byte(wantBody)) + })) + + defer server.Close() + + got, err := GetReferences(server.URL) + if err != nil { + t.Errorf("error getting reference: %v", err) + } + + if got != wantBody { + t.Errorf("expected %s, got %s", wantBody, got) + } +} + +func TestGetReferences_Status500(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + + defer server.Close() + + got, err := GetReferences(server.URL) + if err != nil { + t.Errorf("error getting reference: %v", err) + } + + if got != "" { + t.Errorf("expected empty body, got %s", got) + } +}