From 03452b88b72da341f39338f9d22f55a2350a5e58 Mon Sep 17 00:00:00 2001 From: pops64 Date: Wed, 28 Aug 2024 05:16:25 -0400 Subject: [PATCH] Fix for the 7k siteURL page scrape for WetVR Scraper (#1821) * Fix for the 7k siteURL page scrape Switched to a counter based method that stops when a p tag with "Latest" no longer present on the siteURL page. This is the end of valid sceneURLs. Also had to add in a method to prevent multiple firings of the OnHTML callback. This method repeats with 13 siteURL visits every time which is the current number of valid siteURLs for WetVR * Format --- pkg/scrape/wetvr.go | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/pkg/scrape/wetvr.go b/pkg/scrape/wetvr.go index 4de99ce98..6abccfa7c 100644 --- a/pkg/scrape/wetvr.go +++ b/pkg/scrape/wetvr.go @@ -2,6 +2,7 @@ package scrape import ( "encoding/json" + "fmt" "strings" "sync" "time" @@ -21,6 +22,7 @@ func WetVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- sceneCollector := createCollector("wetvr.com") siteCollector := createCollector("wetvr.com") + pageCnt := 1 sceneCollector.OnHTML(`div#trailer_player`, func(e *colly.HTMLElement) { sc := models.ScrapedScene{} @@ -89,10 +91,17 @@ func WetVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- out <- sc }) - siteCollector.OnHTML(`ul a.page-link`, func(e *colly.HTMLElement) { + //This p with Latest only exists on pages container sceneURLs. If the index page doesn't have this we have reached the end of sceneURLs + // Stop search for more siteURLs + siteCollector.OnHTML(`div:has(p:contains("Latest"))`, func(e *colly.HTMLElement) { + // Check to make sure we aren't getting multiple firings from pages we have already incremented the count on if !limitScraping { - pageURL := e.Request.AbsoluteURL(e.Attr("href")) - siteCollector.Visit(pageURL) + if e.Request.URL.String() == `https://wetvr.com/?page=`+fmt.Sprint(pageCnt) { + pageCnt += 1 + + pageURL := e.Request.AbsoluteURL(`https://wetvr.com/?page=` + fmt.Sprint(pageCnt)) + siteCollector.Visit(pageURL) + } } }) @@ -133,8 +142,9 @@ func WetVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- ctx.Put("scene-date", "") } sceneCollector.Request("GET", singleSceneURL, nil, ctx, nil) - } else { - siteCollector.Visit("https://wetvr.com/") + } else if pageCnt == 1 { + // Only visit page 1 on start up of scraper + siteCollector.Visit("https://wetvr.com/?page=1") } if updateSite {