Skip to content

Commit

Permalink
✨ scrape v0
Browse files Browse the repository at this point in the history
  • Loading branch information
perebaj committed Jan 18, 2024
1 parent 11219a8 commit 83fdb55
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 51 deletions.
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ test:
go test ./... -timeout 10s -race; \
fi

## Show the tests coverage
.PHONY: coverage
coverage:
go test -coverprofile=c.out
go tool cover -html=c.out

.PHONY: integration-test
integration-test:
go test -timeout 5s -tags=integration ./... -v
Expand Down
33 changes: 1 addition & 32 deletions cmd/newsletter/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,9 @@
package main

import (
"context"
"fmt"
"time"

"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
)

// var references = []string{
// "https://research.swtch.com/feed.atom",
// "https://apenwarr.ca/sitemap.txt",
// }

func main() {

ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()

client, err := mongo.Connect(ctx, options.Client().ApplyURI("mongodb://mongodb:27017"))
if err != nil {
panic(err)
}
err = client.Ping(ctx, nil)
if err != nil {
panic(err)
}

fmt.Println("Connected to MongoDB!")

collection := client.Database("newsletter").Collection("newsletter")
res, err := collection.InsertOne(ctx, map[string]string{"name": "pi", "value": "3.14159"})
if err != nil {
panic(err)
}
id := res.InsertedID
fmt.Println(id)
fmt.Println("Hello, World!")
}
1 change: 0 additions & 1 deletion mongodb/newsletter.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ func (m *NLStorage) SaveSite(ctx context.Context, sites []Site) error {
database := m.client.Database(m.DBName)
collection := database.Collection("sites")

//parse sites to []interface{} to use InsertMany
var docs []interface{}
for _, site := range sites {
docs = append(docs, site)
Expand Down
49 changes: 31 additions & 18 deletions scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,45 @@ package newsletter

import (
"bytes"
"fmt"
"log/slog"
"net/http"
)

// GetReferences returns the content of the references
func GetReferences(references []string) ([]string, error) {
refContent := make([]string, len(references))
// GetReferences returns the content of a url as a string
func GetReferences(ref string) (string, error) {
resp, err := http.Get(ref)
if err != nil {
return "", err
}

defer func() {
_ = resp.Body.Close()
}()

for _, ref := range references {
resp, err := http.Get(ref)
var bodyString string
if resp.StatusCode == 200 {
buf := new(bytes.Buffer)
_, err := buf.ReadFrom(resp.Body)
if err != nil {
return nil, err
return "", err
}
bodyString = buf.String()
} else {
slog.Warn(fmt.Sprintf("%s returned status code %d", ref, resp.StatusCode))
return "", nil
}

defer func() {
_ = resp.Body.Close()
}()
return bodyString, nil
}

if resp.StatusCode == 200 {
buf := new(bytes.Buffer)
_, err := buf.ReadFrom(resp.Body)
if err != nil {
return nil, err
}
bodyString := buf.String()
refContent = append(refContent, bodyString[:100])
// Worker receives jobs from the jobs channel, applies f to each one, and sends the results through the result channel
func Worker(jobs <-chan string, result chan<- string, f func(string) (string, error)) {
for j := range jobs {
content, err := f(j)
if err != nil {
fmt.Printf("error getting reference %s: %v", j, err)
}
result <- content
}
return refContent, nil
}
72 changes: 72 additions & 0 deletions scrape_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package newsletter

import (
"fmt"
"net/http"
"net/http/httptest"
"testing"
"time"
)

// TestWorker checks that a single Worker goroutine applies f to every queued
// job and delivers the results on the result channel in submission order.
func TestWorker(t *testing.T) {
	const numJobs = 3
	jobs := make(chan string, numJobs)
	result := make(chan string, numJobs)

	// f stands in for the real fetch function; the short sleep simulates work.
	f := func(s string) (string, error) {
		time.Sleep(10 * time.Millisecond)
		return fmt.Sprintf("job %s done", s), nil
	}

	go Worker(jobs, result, f)

	// Both channels are buffered to numJobs, so queuing never blocks here.
	for i := 1; i <= numJobs; i++ {
		jobs <- fmt.Sprintf("job%d", i)
	}

	// No more jobs will be sent.
	close(jobs)

	for i := 1; i <= numJobs; i++ {
		// Build the expected value once so the comparison and the failure
		// message can never disagree.
		want := fmt.Sprintf("job job%d done", i)
		if got := <-result; got != want {
			t.Errorf("expected %q, got %q", want, got)
		}
	}
}

// TestGetReferences verifies that GetReferences returns the response body
// unchanged when the server replies with 200 OK.
func TestGetReferences(t *testing.T) {
	const wantBody = "Hello, World!"

	// Handler that always answers 200 with the expected payload.
	okHandler := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte(wantBody))
	}
	srv := httptest.NewServer(http.HandlerFunc(okHandler))
	defer srv.Close()

	body, err := GetReferences(srv.URL)
	if err != nil {
		t.Errorf("error getting reference: %v", err)
	}
	if body != wantBody {
		t.Errorf("expected %s, got %s", wantBody, body)
	}
}

// TestGetReferences_Status500 verifies that a 500 response is not reported as
// an error and yields an empty string.
func TestGetReferences_Status500(t *testing.T) {
	// Handler that always answers 500 with no body.
	failing := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
	}
	srv := httptest.NewServer(http.HandlerFunc(failing))
	defer srv.Close()

	body, err := GetReferences(srv.URL)
	if err != nil {
		t.Errorf("error getting reference: %v", err)
	}
	if body != "" {
		t.Errorf("expected empty body, got %s", body)
	}
}

0 comments on commit 83fdb55

Please sign in to comment.