Skip to content

Commit

Permalink
Fix for the 7k siteURL page scrape for WetVR Scraper (xbapps#1821)
Browse files Browse the repository at this point in the history
* Fix for the 7k siteURL page scrape

Switched to a counter-based method that stops when a p tag containing "Latest" is no longer present on the siteURL page, which marks the end of valid sceneURLs. Also had to add a check to prevent multiple firings of the OnHTML callback. This method reliably results in 13 siteURL visits every time, which is the current number of valid siteURLs for WetVR.

* Format
  • Loading branch information
pops64 authored Aug 28, 2024
1 parent 4d214be commit 03452b8
Showing 1 changed file with 15 additions and 5 deletions.
20 changes: 15 additions & 5 deletions pkg/scrape/wetvr.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package scrape

import (
"encoding/json"
"fmt"
"strings"
"sync"
"time"
Expand All @@ -21,6 +22,7 @@ func WetVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<-

sceneCollector := createCollector("wetvr.com")
siteCollector := createCollector("wetvr.com")
pageCnt := 1

sceneCollector.OnHTML(`div#trailer_player`, func(e *colly.HTMLElement) {
sc := models.ScrapedScene{}
Expand Down Expand Up @@ -89,10 +91,17 @@ func WetVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<-
out <- sc
})

siteCollector.OnHTML(`ul a.page-link`, func(e *colly.HTMLElement) {
// This p with "Latest" only exists on pages containing sceneURLs. If the index page doesn't have it, we have reached the end of sceneURLs.
// Stop searching for more siteURLs.
siteCollector.OnHTML(`div:has(p:contains("Latest"))`, func(e *colly.HTMLElement) {
// Check to make sure we aren't getting multiple firings from pages we have already incremented the count on
if !limitScraping {
pageURL := e.Request.AbsoluteURL(e.Attr("href"))
siteCollector.Visit(pageURL)
if e.Request.URL.String() == `https://wetvr.com/?page=`+fmt.Sprint(pageCnt) {
pageCnt += 1

pageURL := e.Request.AbsoluteURL(`https://wetvr.com/?page=` + fmt.Sprint(pageCnt))
siteCollector.Visit(pageURL)
}
}
})

Expand Down Expand Up @@ -133,8 +142,9 @@ func WetVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<-
ctx.Put("scene-date", "")
}
sceneCollector.Request("GET", singleSceneURL, nil, ctx, nil)
} else {
siteCollector.Visit("https://wetvr.com/")
} else if pageCnt == 1 {
// Only visit page 1 on start up of scraper
siteCollector.Visit("https://wetvr.com/?page=1")
}

if updateSite {
Expand Down

0 comments on commit 03452b8

Please sign in to comment.