forked from xbapps/xbvr
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix broken WankzVR scraper & three studios added. (#266)
MilfVR - Minor change to have pagination order by recent. SLR - RealHotVR studio added. - Synopsis minor fix WankzVR - Fix broken scraper due to website layout changes. - Change to regular cover instead of the wide banner. ZexyVR/WankitNowVR - New scraper file added doing both sites.
- Loading branch information
Showing 4 changed files with 220 additions and 24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
package scrape | ||
|
||
import ( | ||
"regexp" | ||
"strconv" | ||
"strings" | ||
"sync" | ||
|
||
"github.com/gocolly/colly" | ||
"github.com/mozillazg/go-slugify" | ||
"github.com/nleeper/goment" | ||
"github.com/thoas/go-funk" | ||
"github.com/xbapps/xbvr/pkg/models" | ||
) | ||
|
||
func TwoWebMediaSite(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene, scraperID string, siteID string, URL string) error { | ||
defer wg.Done() | ||
logScrapeStart(scraperID, siteID) | ||
|
||
sceneCollector := createCollector("wankitnowvr.com", "zexyvr.com") | ||
siteCollector := createCollector("wankitnowvr.com", "zexyvr.com") | ||
|
||
// Regex preparation | ||
reDateDuration := regexp.MustCompile(`Released\son\s(.*)\n+\s+Duration\s+:\s+(\d+):\d+`) | ||
reCastTags := regexp.MustCompile(`(?:zexyvr|wankitnowvr)\.com\/(models|videos)\/+`) | ||
reTagCat := regexp.MustCompile(`(.*)\s+\((.*)\)`) | ||
reFilename := regexp.MustCompile(`videos\/(?U:([a-z\d\-]+))(?:(?:-|_)preview)?(_\d{4}.*\.mp4)`) | ||
|
||
sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) { | ||
sc := models.ScrapedScene{} | ||
sc.SceneType = "VR" | ||
sc.Studio = "2WebMedia" | ||
sc.Site = siteID | ||
sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0] | ||
|
||
// SiteID, Scene ID - get from URL | ||
tmp := strings.Split(sc.HomepageURL, "/") | ||
sc.SiteID = tmp[len(tmp)-1] | ||
sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID | ||
|
||
// Cover / ID | ||
e.ForEach(`deo-video`, func(id int, e *colly.HTMLElement) { | ||
sc.Covers = append(sc.Covers, strings.Split(e.Attr("cover-image"), "?")[0]+"?h=900") | ||
}) | ||
// Note: not all scenes have a deo-video element, only a regular img cover instead | ||
if len(sc.Covers) == 0 { | ||
e.ForEach(`div.container.pt-5 > div > div > img`, func(id int, e *colly.HTMLElement) { | ||
sc.Covers = append(sc.Covers, strings.Split(e.Attr("src"), "?")[0]+"?h=900") | ||
}) | ||
} | ||
|
||
// Gallery | ||
// Note: Limiting gallery to 900px in height as some are huge by default | ||
e.ForEach(`div.gallery > div`, func(id int, e *colly.HTMLElement) { | ||
if id > 0 { | ||
sc.Gallery = append(sc.Gallery, strings.Split(e.ChildAttr("div.view > a", "href"), "?")[0]+"?h=900") | ||
} | ||
}) | ||
|
||
// Title | ||
e.ForEach(`div.container.pt-5 h2`, func(id int, e *colly.HTMLElement) { | ||
sc.Title = strings.TrimSpace(e.Text) | ||
}) | ||
|
||
// Synopsis | ||
e.ForEach(`div.container.pt-5 h2 + p`, func(id int, e *colly.HTMLElement) { | ||
sc.Synopsis = strings.TrimSpace(e.Text) | ||
}) | ||
|
||
//Note: Date/Duration info is currently all inside the same div element... | ||
e.ForEach(`div.container.pt-5 p.text-muted`, func(id int, e *colly.HTMLElement) { | ||
tmpDateDurationParts := reDateDuration.FindStringSubmatch(e.Text) | ||
|
||
// Date | ||
if len(tmpDateDurationParts[1]) > 0 { | ||
tmpDate, _ := goment.New(tmpDateDurationParts[1], "MMM DD, YYYY") | ||
sc.Released = tmpDate.Format("YYYY-MM-DD") | ||
} | ||
|
||
// Duration | ||
if len(tmpDateDurationParts[2]) > 0 { | ||
tmpDuration, err := strconv.Atoi(tmpDateDurationParts[2]) | ||
if err == nil { | ||
sc.Duration = tmpDuration | ||
} | ||
} | ||
}) | ||
|
||
// Cast & Tags | ||
// Note: Cast/Tags links are currently all inside the same div element... | ||
e.ForEach(`div.container.pt-5 p.text-muted > a`, func(id int, e *colly.HTMLElement) { | ||
tmpURLParts := reCastTags.FindStringSubmatch(e.Attr("href")) | ||
if len(tmpURLParts[1]) > 0 { | ||
if tmpURLParts[1] == "models" { | ||
// Cast | ||
sc.Cast = append(sc.Cast, strings.TrimSpace(e.Text)) | ||
} else if tmpURLParts[1] == "videos" { | ||
// Tags | ||
tmpTagParts := reTagCat.FindStringSubmatch(e.Text) | ||
// Format is "tag (tag-category)" and we're removing the category part but some tags need fixing | ||
switch strings.ToLower(tmpTagParts[2]) { | ||
case "breasts": | ||
//only has tags like "enhanced/natural/small/medium/large/huge" | ||
if strings.ToLower(tmpTagParts[1]) == "large" { | ||
tmpTagParts[1] = "big tits" | ||
} else { | ||
tmpTagParts[1] = tmpTagParts[1] + " tits" | ||
} | ||
case "eyes": | ||
//some are like "gray eyes" while others are like "blue", but tag must include "eyes" | ||
if !strings.Contains(strings.ToLower(tmpTagParts[1]), "eyes") { | ||
tmpTagParts[1] = tmpTagParts[1] + " eyes" | ||
} | ||
case "lingerie": | ||
//only has the lingerie color so just use "lingerie" instead | ||
tmpTagParts[1] = "lingerie" | ||
case "nationality": | ||
//only change "english" to "british" | ||
if strings.ToLower(tmpTagParts[1]) == "english" { | ||
tmpTagParts[1] = "british" | ||
} | ||
//all other tags are fine to use as is. | ||
} | ||
|
||
if tmpTagParts[1] != "" { | ||
sc.Tags = append(sc.Tags, strings.TrimSpace(strings.ToLower(tmpTagParts[1]))) | ||
} | ||
} | ||
} | ||
}) | ||
|
||
// Filenames | ||
// Best guess, using trailer filenames and removing the preview-part of it. | ||
e.ForEach(`deo-video source`, func(id int, e *colly.HTMLElement) { | ||
tmpFilename := reFilename.FindStringSubmatch(e.Attr("src")) | ||
if len(tmpFilename) > 1 { | ||
sc.Filenames = append(sc.Filenames, tmpFilename[1]+tmpFilename[2]) | ||
} | ||
}) | ||
// Note: not all scenes have a deo-video element, in which case do a best guess from URL instead: | ||
if len(sc.Filenames) == 0 { | ||
tmpURLParts := strings.Split(e.Request.URL.Path, "/") | ||
if len(tmpURLParts) > 1 { | ||
baseStart := strings.Replace(tmpURLParts[2], "+", "_", -1) | ||
filenames := []string{"_1920", "_2160", "_2880", "_3840", "_5760"} | ||
baseEnd := "_180x180_3dh_180_sbs.mp4" | ||
for i := range filenames { | ||
filenames[i] = baseStart + filenames[i] + baseEnd | ||
} | ||
sc.Filenames = filenames | ||
} | ||
} | ||
|
||
out <- sc | ||
}) | ||
|
||
siteCollector.OnHTML(`ul.pagination a.page-link`, func(e *colly.HTMLElement) { | ||
pageURL := e.Request.AbsoluteURL(e.Attr("href")) | ||
siteCollector.Visit(pageURL) | ||
}) | ||
|
||
siteCollector.OnHTML(`div.container div.card > a`, func(e *colly.HTMLElement) { | ||
sceneURL := e.Request.AbsoluteURL(e.Attr("href")) | ||
|
||
// If scene exist in database, there's no need to scrape | ||
if !funk.ContainsString(knownScenes, sceneURL) { | ||
sceneCollector.Visit(sceneURL) | ||
} | ||
}) | ||
|
||
siteCollector.Visit(URL) | ||
|
||
if updateSite { | ||
updateSiteLastUpdate(scraperID) | ||
} | ||
logScrapeFinished(scraperID, siteID) | ||
return nil | ||
} | ||
|
||
// WankitNowVR scrapes https://wankitnowvr.com by delegating to the shared
// 2WebMedia scraper with the WankitNowVR-specific IDs and listing URL.
func WankitNowVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene) error {
	return TwoWebMediaSite(wg, updateSite, knownScenes, out, "wankitnowvr", "WankitNowVR", "https://wankitnowvr.com/videos/")
}
|
||
// ZexyVR scrapes https://zexyvr.com by delegating to the shared 2WebMedia
// scraper with the ZexyVR-specific IDs and listing URL.
func ZexyVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene) error {
	return TwoWebMediaSite(wg, updateSite, knownScenes, out, "zexyvr", "ZexyVR", "https://zexyvr.com/videos/")
}
|
||
// init registers both 2WebMedia sites (WankitNowVR and ZexyVR) with the
// scraper registry so the application discovers them at startup.
// Arguments are: scraper ID, display name, avatar/logo URL, scrape func.
func init() {
	registerScraper("wankitnowvr", "WankitNowVR", "https://mcdn.vrporn.com/files/20190103150250/wankitnow-profile.jpg", WankitNowVR)
	registerScraper("zexyvr", "ZexyVR", "https://mcdn.vrporn.com/files/20190103151557/zexyvr-profile.jpg", ZexyVR)
}