From 6d8cc42d841ed585f759fa88fe85ff5c47f7737e Mon Sep 17 00:00:00 2001
From: jojo
Date: Sat, 20 Jan 2024 23:11:37 -0300
Subject: [PATCH] :sparkles: scrape comparison

---
 Makefile                   |  7 ++-
 mongodb/newsletter.go      | 59 +++++++++++++----------
 mongodb/newsletter_test.go | 98 +++++++++++++++++++++++---------------
 scrape.go                  | 68 +++++++++++++++++++-------
 scrape_test.go             | 55 +++++++++++++++++----
 5 files changed, 197 insertions(+), 90 deletions(-)

diff --git a/Makefile b/Makefile
index afff236..6f9ce45 100644
--- a/Makefile
+++ b/Makefile
@@ -24,9 +24,14 @@ coverage:
 	go test -coverprofile=c.out
 	go tool cover -html=c.out
 
+## Run all tests, including the integration tests (requires Docker to be up and running). Usage: `make integration-test`, or `make integration-test testcase="TestFunctionName"` to run a single test in isolation
 .PHONY: integration-test
 integration-test:
-	go test -timeout 5s -tags=integration ./... -v
+	if [ -n "$(testcase)" ]; then \
+		go test ./... -timeout 5s -tags integration -v -run="^$(testcase)$$" ; \
+	else \
+		go test ./... -timeout 5s -tags integration; \
+	fi
 
 ## builds the service
 .PHONY: service
diff --git a/mongodb/newsletter.go b/mongodb/newsletter.go
index ef05e39..c2b93d4 100644
--- a/mongodb/newsletter.go
+++ b/mongodb/newsletter.go
@@ -6,7 +6,6 @@ import (
 	"time"
 
 	"go.mongodb.org/mongo-driver/bson"
-	"go.mongodb.org/mongo-driver/mongo/options"
 )
 
 // Newsletter is the struct that gather what websites to scrape for an user email
@@ -22,12 +21,13 @@ type Engineer struct {
 	URL string `bson:"url"`
 }
 
-// Site is the struct that gather the scraped content of a website
-type Site struct {
-	UserEmail      string    `bson:"user_email"`
+// Page is the struct that gathers the scraped content of a website
+type Page struct {
 	URL            string    `bson:"url"`
 	Content        string    `bson:"content"`
 	ScrapeDatetime time.Time `bson:"scrape_date"`
+	HashMD5        [16]byte  `bson:"hash_md5"`
+	IsMostRecent   bool      `bson:"is_most_recent"`
 }
 
 // SaveNewsletter saves a newsletter in the database
@@ -59,7 +59,7 @@
 
 	resp, err := collection.Distinct(ctx, "url", bson.M{})
 	if err != nil {
-		return nil, fmt.Errorf("error getting engineers: %w", err)
+		return nil, fmt.Errorf("error getting engineers: %v", err)
 	}
 
 	return resp, nil
@@ -82,13 +82,13 @@
 	return newsletters, nil
 }
 
-// SaveSite saves a site in the database
-func (m *NLStorage) SaveSite(ctx context.Context, sites []Site) error {
+// SavePage saves pages in the database
+func (m *NLStorage) SavePage(ctx context.Context, pages []Page) error {
 	database := m.client.Database(m.DBName)
-	collection := database.Collection("sites")
+	collection := database.Collection("pages")
 
 	var docs []interface{}
-	for _, site := range sites {
+	for _, site := range pages {
 		docs = append(docs, site)
 	}
 	_, err := collection.InsertMany(ctx, docs)
@@ -98,25 +98,36 @@
 	return nil
 }
 
-// Sites returns given an user email and a URL, the last scraped content of that URL
-func (m *NLStorage) Sites(usrEmail, URL string) ([]Site, error) {
+// Page returns the most recently scraped content of a given URL
+func (m *NLStorage) Page(ctx context.Context, url string) ([]Page, error) {
+	var page []Page
 	database := m.client.Database(m.DBName)
-	collection := database.Collection("sites")
-	max := int64(2)
-
-	filter := bson.M{"user_email": usrEmail, "url": URL}
-	sort := bson.D{{Key: "scrape_date", Value: -1}}
-	opts := options.Find().SetSort(sort)
-	opts.Limit = &max
+	collection := database.Collection("pages")
+
+	pipeline := []bson.M{
+		{
+			"$match": bson.M{
+				"url": url,
+			},
+		},
+		{
+			"$sort": bson.M{
+				"scrape_date": -1,
+			},
+		},
+		{
+			"$limit": 1,
+		},
+	}
 
-	cursor, err := collection.Find(context.Background(), filter, opts)
+	cursor, err := collection.Aggregate(ctx, pipeline)
 	if err != nil {
-		return nil, err
+		return page, fmt.Errorf("error getting page: %v", err)
 	}
-	var sites []Site
-	if err = cursor.All(context.Background(), &sites); err != nil {
-		return nil, err
+	if err = cursor.All(ctx, &page); err != nil {
+		return page, fmt.Errorf("error decoding page: %v", err)
 	}
-	return sites, nil
+
+	return page, nil
 }
diff --git a/mongodb/newsletter_test.go b/mongodb/newsletter_test.go
index a8baf15..a8c3745 100644
--- a/mongodb/newsletter_test.go
+++ b/mongodb/newsletter_test.go
@@ -5,6 +5,7 @@ package mongodb
 import (
 	"context"
+	"crypto/md5"
 	"fmt"
 	"os"
 	"reflect"
@@ -90,80 +91,98 @@
 	t.Cleanup(teardown(ctx, client, DBName))
 }
 
-func TestNLStorageSaveSite(t *testing.T) {
+func TestNLStorageSavePage(t *testing.T) {
 	ctx := context.Background()
 	client, DBName := setup(ctx, t)
 
 	database := client.Database(DBName)
-	collection := database.Collection("sites")
+	collection := database.Collection("pages")
 
-	want := []Site{
-		{UserEmail: "j@gmail.com", URL: "https://www.google.com", Content: "HTML", ScrapeDatetime: time.Date(2023, time.August, 14, 15, 30, 0, 0, time.UTC)},
-		{UserEmail: "j@gmail.com", URL: "https://www.google.com", Content: "HTML", ScrapeDatetime: time.Date(2023, time.August, 14, 15, 30, 0, 0, time.UTC)},
-		{UserEmail: "jj@gmail.com", URL: "https://www.jj.com", Content: "HTML", ScrapeDatetime: time.Date(2023, time.August, 14, 15, 30, 0, 0, time.UTC)},
+	want := []Page{
+		{IsMostRecent: true, URL: "https://www.google.com", Content: "HTML", HashMD5: md5.Sum([]byte("HTML")), ScrapeDatetime: time.Date(2023, time.August, 13, 15, 30, 0, 0, time.UTC)},
 	}
 
-	NLStorage := NewNLStorage(client, DBName)
-	err := NLStorage.SaveSite(ctx, want)
-
+	storage := NewNLStorage(client, DBName)
+	err := storage.SavePage(ctx, want)
 	if err != nil {
-		t.Fatal("error saving site", err)
+		t.Fatal("error saving page", err)
 	}
 
-	var got []Site
+	var got []Page
 	cursor, err := collection.Find(context.Background(), bson.M{})
 	if err != nil {
-		t.Fatal("error finding site", err)
+		t.Fatal("error finding page", err)
 	}
 
 	if err := cursor.All(ctx, &got); err != nil {
-		t.Fatal("error decoding site", err)
+		t.Fatal("error decoding page", err)
 	}
 
-	if len(got) == 3 {
-		if !reflect.DeepEqual(got, want) {
-			t.Fatalf("got %v, want %v", got, want)
+	if len(got) == 1 {
+		if !reflect.DeepEqual(got[0], want[0]) {
+			t.Fatalf("got %v, want %v", got[0], want[0])
 		}
 	} else {
-		t.Fatal("expected 2 sites, got", len(got))
-	}
+		t.Fatal("expected 1 page, got", len(got))
+	}
 
 	t.Cleanup(teardown(ctx, client, DBName))
 }
 
-func TestNLStorageSites(t *testing.T) {
+func TestNLStoragePage(t *testing.T) {
 	ctx := context.Background()
 	client, DBName := setup(ctx, t)
 
-	want := []Site{
-		{UserEmail: "j@gmail.com", URL: "https://www.google.com", Content: "HTML", ScrapeDatetime: time.Date(2023, time.August, 13, 15, 30, 0, 0, time.UTC)},
-		{UserEmail: "j@gmail.com", URL: "https://www.google.com", Content: "HTML", ScrapeDatetime: time.Date(2023, time.August, 12, 15, 30, 0, 0, time.UTC)},
-		{UserEmail: "j@gmail.com", URL: "https://www.google.com", Content: "HTML", ScrapeDatetime: time.Date(2023, time.August, 11, 15, 30, 0, 0, time.UTC)},
+	want := []Page{
+		{URL: "https://www.google.com", Content: "HTML", ScrapeDatetime: time.Date(2023, time.August, 13, 15, 30, 0, 0, time.UTC)},
+		{URL: "https://www.google.com", Content: "HTML", ScrapeDatetime: time.Date(2023, time.August, 11, 15, 30, 0, 0, time.UTC)},
+		{URL: "https://www.google.com", Content: "HTML", ScrapeDatetime: time.Date(2023, time.August, 12, 15, 30, 0, 0, time.UTC)},
 	}
 
-	NLStorage := NewNLStorage(client, DBName)
-	err := NLStorage.SaveSite(ctx, want)
+	storage := NewNLStorage(client, DBName)
+	err := storage.SavePage(ctx, want)
 	if err != nil {
-		t.Fatal("error saving site", err)
+		t.Fatal("error saving page", err)
 	}
 
-	got, err := NLStorage.Sites("j@gmail.com", "https://www.google.com")
+	got, err := storage.Page(ctx, "https://www.google.com")
 	if err != nil {
-		t.Fatal("error getting site", err)
+		t.Fatal("error getting page", err)
 	}
 
-	if len(got) == 2 {
-		assert(t, got[0].UserEmail, want[0].UserEmail)
-		assert(t, got[0].URL, want[0].URL)
-		assert(t, got[0].Content, want[0].Content)
-		assert(t, got[0].ScrapeDatetime, want[0].ScrapeDatetime)
-
-		assert(t, got[1].UserEmail, want[1].UserEmail)
-		assert(t, got[1].URL, want[1].URL)
-		assert(t, got[1].Content, want[1].Content)
-		assert(t, got[1].ScrapeDatetime, want[1].ScrapeDatetime)
+	if len(got) == 1 {
+		if !reflect.DeepEqual(got[0], want[0]) {
+			t.Fatalf("got %v, want %v", got[0], want[0])
+		}
 	} else {
-		t.Fatal("expected 2 sites, got", len(got))
+		t.Fatal("expected 1 page, got", len(got))
 	}
 
 	t.Cleanup(teardown(ctx, client, DBName))
diff --git a/scrape.go b/scrape.go
index 1b4b765..a4be077 100644
--- a/scrape.go
+++ b/scrape.go
@@ -4,6 +4,7 @@ package newsletter
 import (
 	"bytes"
 	"context"
+	"crypto/md5"
 	"fmt"
 	"log/slog"
 	"net/http"
@@ -15,22 +16,24 @@ import (
 	"github.com/perebaj/newsletter/mongodb"
 )
 
-// PageContent is the struct that gather important information of a website
-type PageContent struct {
-	Content string
-	URL     string
+// Page is the struct that gathers important information of a website
+type Page struct {
+	Content        string
+	URL            string
+	ScrapeDateTime time.Time
 }
 
 // Storage is the interface that wraps the basic methods to save and get data from the database
 type Storage interface {
-	SaveSite(ctx context.Context, site []mongodb.Site) error
+	SavePage(ctx context.Context, site []mongodb.Page) error
 	DistinctEngineerURLs(ctx context.Context) ([]interface{}, error)
+	Page(ctx context.Context, url string) ([]mongodb.Page, error)
 }
 
 // Crawler contains the necessary information to run the crawler
 type Crawler struct {
 	URLch    chan string
-	resultCh chan PageContent
+	resultCh chan Page
 	signalCh chan os.Signal
 	MaxJobs  int
 	wg       *sync.WaitGroup
@@ -42,7 +45,7 @@
 func NewCrawler(maxJobs int, s time.Duration, signalCh chan os.Signal) *Crawler {
 	return &Crawler{
 		URLch:    make(chan string),
-		resultCh: make(chan PageContent),
+		resultCh: make(chan Page),
 		signalCh: signalCh,
 		wg:       &sync.WaitGroup{},
 		MaxJobs:  maxJobs,
@@ -80,23 +83,52 @@
 	}()
 
 	go func() {
-		for v := range c.resultCh {
+		for r := range c.resultCh {
 			slog.Debug("saving fetched sites response")
-			err := s.SaveSite(ctx, []mongodb.Site{
-				{
-					URL:            v.URL,
-					Content:        v.Content,
-					ScrapeDatetime: time.Now().UTC(),
-				},
-			})
+
+			lastScrapedPage, err := s.Page(ctx, r.URL)
 			if err != nil {
-				slog.Error("error saving site result", "error", err)
+				slog.Error("error getting page", "error", err)
+				c.signalCh <- syscall.SIGTERM
+			}
+
+			newPage := pageComparation(lastScrapedPage, r)
+
+			err = s.SavePage(ctx, newPage)
+			if err != nil {
+				slog.Error("error saving page", "error", err)
 				c.signalCh <- syscall.SIGTERM
 			}
 		}
 	}()
 }
 
+// pageComparation checks whether the content of a website has changed and sets the IsMostRecent flag to true if it has changed (or has never been scraped before) and false otherwise.
+func pageComparation(lastScrapedPage []mongodb.Page, recentScrapedPage Page) []mongodb.Page {
+	hashMD5 := md5.Sum([]byte(recentScrapedPage.Content))
+	newPage := []mongodb.Page{
+		{
+			URL:            recentScrapedPage.URL,
+			Content:        recentScrapedPage.Content,
+			ScrapeDatetime: recentScrapedPage.ScrapeDateTime,
+			HashMD5:        hashMD5,
+		},
+	}
+
+	// If the page does not exist, it is the first time that the page is being scraped,
+	// so it is considered the most recent version.
+	if len(lastScrapedPage) == 0 {
+		newPage[0].IsMostRecent = true
+	} else {
+		if lastScrapedPage[0].HashMD5 != hashMD5 {
+			newPage[0].IsMostRecent = true
+		} else {
+			newPage[0].IsMostRecent = false
+		}
+	}
+	return newPage
+}
+
 // Worker use a worker pool to process jobs and send the restuls through a channel
 func (c *Crawler) Worker(f func(string) (string, error)) {
 	defer c.wg.Done()
@@ -105,7 +137,7 @@
 		if err != nil {
 			slog.Error(fmt.Sprintf("error getting reference: %s", url), "error", err)
 		}
-		c.resultCh <- PageContent{Content: content, URL: url}
+		c.resultCh <- Page{Content: content, URL: url, ScrapeDateTime: time.Now().UTC()}
 	}
 }
diff --git a/scrape_test.go b/scrape_test.go
index fee07b7..16278d4 100644
--- a/scrape_test.go
+++ b/scrape_test.go
@@ -2,6 +2,7 @@ package newsletter
 
 import (
 	"context"
+	"crypto/md5"
 	"net/http"
 	"net/http/httptest"
 	"os"
@@ -13,6 +14,46 @@
 
 const fakeURL = "http://fakeurl.test"
 
+func TestPageComparation(t *testing.T) {
+	recentScrapedPage := Page{
+		Content:        "Hello, World!",
+		URL:            fakeURL,
+		ScrapeDateTime: time.Now().UTC(),
+	}
+
+	lastScrapedPage := []mongodb.Page{
+		{
+			Content:        "Hello, World!",
+			URL:            fakeURL,
+			ScrapeDatetime: time.Now().UTC().Add(-time.Duration(1) * time.Hour),
+			HashMD5:        md5.Sum([]byte("Hello, World!")),
+		},
+	}
+
+	newPage := pageComparation(lastScrapedPage, recentScrapedPage)
+
+	if newPage[0].IsMostRecent {
+		t.Errorf("expected false, got %v", newPage[0].IsMostRecent)
+	}
+
+	lastScrapedPage[0].Content = "Hello, World! 2"
+	lastScrapedPage[0].HashMD5 = md5.Sum([]byte("Hello, World! 2"))
+
+	newPage = pageComparation(lastScrapedPage, recentScrapedPage)
+
+	if !newPage[0].IsMostRecent {
+		t.Errorf("expected true, got %v", newPage[0].IsMostRecent)
+	}
+
+	lastScrapedPage = []mongodb.Page{}
+
+	newPage = pageComparation(lastScrapedPage, recentScrapedPage)
+
+	if !newPage[0].IsMostRecent {
+		t.Errorf("expected true, got %v", newPage[0].IsMostRecent)
+	}
+}
+
 // Even not verifying the result, this test is useful to check if the crawler is running properly, since it is
 // using Mocks for the Storage and the Fetch function.
 func TestCrawlerRun(t *testing.T) {
@@ -75,23 +116,21 @@ func TestGetReferences_Status500(t *testing.T) {
 	}
 }
 
-// TODO: Move the StorageMock to a separate file, preferable in the same package(mongodb)
-type StorageMock interface {
-	SaveSite(ctx context.Context, site []mongodb.Site) error
-	DistinctEngineerURLs(ctx context.Context) ([]interface{}, error)
-}
-
 type StorageMockImpl struct {
 }
 
-func NewStorageMock() StorageMock {
+func NewStorageMock() StorageMockImpl {
 	return StorageMockImpl{}
 }
 
-func (s StorageMockImpl) SaveSite(_ context.Context, _ []mongodb.Site) error {
+func (s StorageMockImpl) SavePage(_ context.Context, _ []mongodb.Page) error {
 	return nil
 }
 
 func (s StorageMockImpl) DistinctEngineerURLs(_ context.Context) ([]interface{}, error) {
 	return []interface{}{fakeURL}, nil
 }
+
+func (s StorageMockImpl) Page(_ context.Context, _ string) ([]mongodb.Page, error) {
+	return []mongodb.Page{}, nil
+}