Skip to content

Commit

Permalink
Fix broken WankzVR scraper & add three new studios. (#266)
Browse files Browse the repository at this point in the history
MilfVR
- Minor change to have pagination order by recent.

SLR
- RealHotVR studio added.
- Synopsis minor fix

WankzVR
- Fix broken scraper due to website layout changes.
- Change to regular cover instead of the wide banner.

ZexyVR/WankitNowVR
- New scraper file added doing both sites.
  • Loading branch information
Aerowen authored Apr 21, 2020
1 parent cc31e5b commit 6c72cb1
Show file tree
Hide file tree
Showing 4 changed files with 220 additions and 24 deletions.
2 changes: 1 addition & 1 deletion pkg/scrape/milfvr.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ func MilfVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
}
})

siteCollector.Visit("https://www.milfvr.com/videos")
siteCollector.Visit("https://www.milfvr.com/videos?o=d")

if updateSite {
updateSiteLastUpdate(scraperID)
Expand Down
12 changes: 10 additions & 2 deletions pkg/scrape/slrstudios.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,10 @@ func SexLikeReal(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
})

// Synopsis
e.ForEach(`div#tabs-about div:last-child div:nth-child(2)`, func(id int, e *colly.HTMLElement) {
sc.Synopsis = strings.TrimSpace(e.Text)
e.ForEach(`div#tabs-about div.u-mb--four`, func(id int, e *colly.HTMLElement) {
if !strings.Contains(e.Text, "Released:") {
sc.Synopsis = strings.TrimSpace(e.Text)
}
})

// Skipping some very generic and useless tags
Expand Down Expand Up @@ -230,6 +232,11 @@ func StripzVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out cha
return SexLikeReal(wg, updateSite, knownScenes, out, "stripzvr", "StripzVR", "N1ck Inc.")
}

// RealHotVR.com doesn't have complete scene index, pagination stops after two pages
func RealHotVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene) error {
return SexLikeReal(wg, updateSite, knownScenes, out, "realhotvr", "RealHotVR", "RealHotVR")
}

func init() {
registerScraper("slr-originals", "SLR Originals", "https://www.sexlikereal.com/s/refactor/images/favicons/android-icon-192x192.png", SLROriginals)
registerScraper("istripper", "iStripper (SLR)", "https://www.istripper.com/favicons/istripper/apple-icon-120x120.png", iStripper)
Expand All @@ -243,4 +250,5 @@ func init() {
registerScraper("pervrt", "perVRt/Terrible (SLR)", "https://mcdn.vrporn.com/files/20181218151630/pervrt-logo.jpg", perVRt)
registerScraper("leninacrowne", "LeninaCrowne (SLR)", "https://mcdn.vrporn.com/files/20190711135807/terrible_logo-e1562878668857_400x400_acf_cropped.jpg", LeninaCrowne)
registerScraper("stripzvr", "StripzVR (SLR)", "https://www.stripzvr.com/wp-content/uploads/2018/09/cropped-favicon-192x192.jpg", StripzVR)
registerScraper("realhotvr", "RealHotVR (SLR)", "https://g8iek4luc8.ent-cdn.com/templates/realhotvr/images/favicon.jpg", RealHotVR)
}
39 changes: 18 additions & 21 deletions pkg/scrape/wankzvr.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,18 @@ func WankzVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan
sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID

// Title
e.ForEach(`header h1`, func(id int, e *colly.HTMLElement) {
e.ForEach(`h1.detail__title`, func(id int, e *colly.HTMLElement) {
sc.Title = e.Text
})

// Date
e.ForEach(`div.date`, func(id int, e *colly.HTMLElement) {
e.ForEach(`div.detail__date_time span.detail__date`, func(id int, e *colly.HTMLElement) {
tmpDate, _ := goment.New(e.Text, "DD MMMM, YYYY")
sc.Released = tmpDate.Format("YYYY-MM-DD")
})

// Duration
e.ForEach(`div.duration`, func(id int, e *colly.HTMLElement) {
e.ForEach(`div.detail__date_time span.time`, func(id int, e *colly.HTMLElement) {
if id == 1 {
tmpDuration, err := strconv.Atoi(strings.TrimSpace(strings.Replace(e.Text, "minutes", "", -1)))
if err == nil {
Expand All @@ -61,53 +61,50 @@ func WankzVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan
sc.Filenames = append(sc.Filenames, "wankzvr-"+base+"180_180x180_3dh_LR.mp4")

// Cover URLs
e.ForEach(`div.swiper-slide img`, func(id int, e *colly.HTMLElement) {
if id == 0 {
sc.Covers = append(sc.Covers, e.Request.AbsoluteURL(e.Attr("src")))
}
})
for _, x := range []string{"cover", "hero"} {
tmpCover := "https://cdns-i.wankzvr.com/" + sc.SiteID[0:1] + "/" + sc.SiteID[0:4] + "/" + sc.SiteID + "/" + x + "/large.jpg"
sc.Covers = append(sc.Covers, tmpCover)
}

// Gallery
e.ForEach(`div.swiper-slide img.lazyload`, func(id int, e *colly.HTMLElement) {
if id > 0 {
sc.Gallery = append(sc.Gallery, e.Request.AbsoluteURL(e.Attr("data-src")))
}
})
for _, x := range []string{"1", "2", "3", "4", "5", "6"} {
tmpGallery := "https://cdns-i.wankzvr.com/" + sc.SiteID[0:1] + "/" + sc.SiteID[0:4] + "/" + sc.SiteID + "/thumbs/1024_" + x + ".jpg"
sc.Gallery = append(sc.Gallery, tmpGallery)
}

// Synopsis
e.ForEach(`p.description`, func(id int, e *colly.HTMLElement) {
sc.Synopsis = strings.TrimSpace(strings.Replace(e.Text, " Read more", "", -1))
e.ForEach(`div.detail__txt`, func(id int, e *colly.HTMLElement) {
sc.Synopsis = strings.TrimSpace(e.Text + e.ChildText("span.more__body"))
})

// Tags
e.ForEach(`div.tags a`, func(id int, e *colly.HTMLElement) {
e.ForEach(`div.tag-list__body a.tag`, func(id int, e *colly.HTMLElement) {
sc.Tags = append(sc.Tags, e.Text)
})

// Cast
e.ForEach(`header h4 a`, func(id int, e *colly.HTMLElement) {
e.ForEach(`div.detail__models a`, func(id int, e *colly.HTMLElement) {
sc.Cast = append(sc.Cast, strings.TrimSpace(e.Text))
})

out <- sc
})

siteCollector.OnHTML(`nav.pager a`, func(e *colly.HTMLElement) {
siteCollector.OnHTML(`ul.pagenav__list a.pagenav__link`, func(e *colly.HTMLElement) {
pageURL := e.Request.AbsoluteURL(e.Attr("href"))
siteCollector.Visit(pageURL)
})

siteCollector.OnHTML(`div.contentContainer article a`, func(e *colly.HTMLElement) {
siteCollector.OnHTML(`ul.cards-list a.card__video`, func(e *colly.HTMLElement) {
sceneURL := e.Request.AbsoluteURL(e.Attr("href"))
sceneURL = strings.Replace(sceneURL, "/preview", "", -1)

// If scene exist in database, there's no need to scrape
if !funk.ContainsString(knownScenes, sceneURL) && !strings.Contains(sceneURL, "/join") {
sceneCollector.Visit(sceneURL)
}
})

siteCollector.Visit("https://www.wankzvr.com/videos")
siteCollector.Visit("https://www.wankzvr.com/videos?o=d")

if updateSite {
updateSiteLastUpdate(scraperID)
Expand Down
191 changes: 191 additions & 0 deletions pkg/scrape/zexywankitnow.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
package scrape

import (
"regexp"
"strconv"
"strings"
"sync"

"github.com/gocolly/colly"
"github.com/mozillazg/go-slugify"
"github.com/nleeper/goment"
"github.com/thoas/go-funk"
"github.com/xbapps/xbvr/pkg/models"
)

// TwoWebMediaSite is the shared scraper for the 2WebMedia network sites
// (ZexyVR and WankitNowVR). It crawls the paginated video index starting at
// URL, visits every scene page not already listed in knownScenes, and sends
// each scraped scene on the out channel.
//
// scraperID and siteID identify the scraper for logging and last-update
// bookkeeping; updateSite controls whether the site's last-update timestamp
// is refreshed after the run. Always returns nil.
func TwoWebMediaSite(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene, scraperID string, siteID string, URL string) error {
	defer wg.Done()
	logScrapeStart(scraperID, siteID)

	sceneCollector := createCollector("wankitnowvr.com", "zexyvr.com")
	siteCollector := createCollector("wankitnowvr.com", "zexyvr.com")

	// Regex preparation (compiled once per run, reused for every scene page)
	reDateDuration := regexp.MustCompile(`Released\son\s(.*)\n+\s+Duration\s+:\s+(\d+):\d+`)
	reCastTags := regexp.MustCompile(`(?:zexyvr|wankitnowvr)\.com\/(models|videos)\/+`)
	reTagCat := regexp.MustCompile(`(.*)\s+\((.*)\)`)
	reFilename := regexp.MustCompile(`videos\/(?U:([a-z\d\-]+))(?:(?:-|_)preview)?(_\d{4}.*\.mp4)`)

	sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
		sc := models.ScrapedScene{}
		sc.SceneType = "VR"
		sc.Studio = "2WebMedia"
		sc.Site = siteID
		sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0]

		// SiteID, Scene ID - get from URL
		tmp := strings.Split(sc.HomepageURL, "/")
		sc.SiteID = tmp[len(tmp)-1]
		sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID

		// Cover / ID
		e.ForEach(`deo-video`, func(id int, e *colly.HTMLElement) {
			sc.Covers = append(sc.Covers, strings.Split(e.Attr("cover-image"), "?")[0]+"?h=900")
		})
		// Note: not all scenes have a deo-video element, only a regular img cover instead
		if len(sc.Covers) == 0 {
			e.ForEach(`div.container.pt-5 > div > div > img`, func(id int, e *colly.HTMLElement) {
				sc.Covers = append(sc.Covers, strings.Split(e.Attr("src"), "?")[0]+"?h=900")
			})
		}

		// Gallery
		// Note: Limiting gallery to 900px in height as some are huge by default
		e.ForEach(`div.gallery > div`, func(id int, e *colly.HTMLElement) {
			if id > 0 {
				sc.Gallery = append(sc.Gallery, strings.Split(e.ChildAttr("div.view > a", "href"), "?")[0]+"?h=900")
			}
		})

		// Title
		e.ForEach(`div.container.pt-5 h2`, func(id int, e *colly.HTMLElement) {
			sc.Title = strings.TrimSpace(e.Text)
		})

		// Synopsis
		e.ForEach(`div.container.pt-5 h2 + p`, func(id int, e *colly.HTMLElement) {
			sc.Synopsis = strings.TrimSpace(e.Text)
		})

		// Note: Date/Duration info is currently all inside the same div element...
		e.ForEach(`div.container.pt-5 p.text-muted`, func(id int, e *colly.HTMLElement) {
			tmpDateDurationParts := reDateDuration.FindStringSubmatch(e.Text)
			// FindStringSubmatch returns nil when the text doesn't match;
			// guard before indexing so a site layout change can't panic the scraper.
			if len(tmpDateDurationParts) < 3 {
				return
			}

			// Date
			if len(tmpDateDurationParts[1]) > 0 {
				tmpDate, _ := goment.New(tmpDateDurationParts[1], "MMM DD, YYYY")
				sc.Released = tmpDate.Format("YYYY-MM-DD")
			}

			// Duration
			if len(tmpDateDurationParts[2]) > 0 {
				tmpDuration, err := strconv.Atoi(tmpDateDurationParts[2])
				if err == nil {
					sc.Duration = tmpDuration
				}
			}
		})

		// Cast & Tags
		// Note: Cast/Tags links are currently all inside the same div element...
		e.ForEach(`div.container.pt-5 p.text-muted > a`, func(id int, e *colly.HTMLElement) {
			tmpURLParts := reCastTags.FindStringSubmatch(e.Attr("href"))
			// Nil when the link is neither a model nor a video link; skip it.
			if len(tmpURLParts) < 2 {
				return
			}
			if len(tmpURLParts[1]) > 0 {
				if tmpURLParts[1] == "models" {
					// Cast
					sc.Cast = append(sc.Cast, strings.TrimSpace(e.Text))
				} else if tmpURLParts[1] == "videos" {
					// Tags
					tmpTagParts := reTagCat.FindStringSubmatch(e.Text)
					// Skip link text that doesn't follow the "tag (category)" format.
					if len(tmpTagParts) < 3 {
						return
					}
					// Format is "tag (tag-category)" and we're removing the category part but some tags need fixing
					switch strings.ToLower(tmpTagParts[2]) {
					case "breasts":
						//only has tags like "enhanced/natural/small/medium/large/huge"
						if strings.ToLower(tmpTagParts[1]) == "large" {
							tmpTagParts[1] = "big tits"
						} else {
							tmpTagParts[1] = tmpTagParts[1] + " tits"
						}
					case "eyes":
						//some are like "gray eyes" while others are like "blue", but tag must include "eyes"
						if !strings.Contains(strings.ToLower(tmpTagParts[1]), "eyes") {
							tmpTagParts[1] = tmpTagParts[1] + " eyes"
						}
					case "lingerie":
						//only has the lingerie color so just use "lingerie" instead
						tmpTagParts[1] = "lingerie"
					case "nationality":
						//only change "english" to "british"
						if strings.ToLower(tmpTagParts[1]) == "english" {
							tmpTagParts[1] = "british"
						}
						//all other tags are fine to use as is.
					}

					if tmpTagParts[1] != "" {
						sc.Tags = append(sc.Tags, strings.TrimSpace(strings.ToLower(tmpTagParts[1])))
					}
				}
			}
		})

		// Filenames
		// Best guess, using trailer filenames and removing the preview-part of it.
		e.ForEach(`deo-video source`, func(id int, e *colly.HTMLElement) {
			tmpFilename := reFilename.FindStringSubmatch(e.Attr("src"))
			if len(tmpFilename) > 1 {
				sc.Filenames = append(sc.Filenames, tmpFilename[1]+tmpFilename[2])
			}
		})
		// Note: not all scenes have a deo-video element, in which case do a best guess from URL instead:
		if len(sc.Filenames) == 0 {
			tmpURLParts := strings.Split(e.Request.URL.Path, "/")
			// Index 2 is read below, so require at least three parts
			// ("/videos/<slug>" splits into ["", "videos", "<slug>"]);
			// the previous "> 1" check could panic on a two-part path.
			if len(tmpURLParts) > 2 {
				baseStart := strings.Replace(tmpURLParts[2], "+", "_", -1)
				filenames := []string{"_1920", "_2160", "_2880", "_3840", "_5760"}
				baseEnd := "_180x180_3dh_180_sbs.mp4"
				for i := range filenames {
					filenames[i] = baseStart + filenames[i] + baseEnd
				}
				sc.Filenames = filenames
			}
		}

		out <- sc
	})

	siteCollector.OnHTML(`ul.pagination a.page-link`, func(e *colly.HTMLElement) {
		pageURL := e.Request.AbsoluteURL(e.Attr("href"))
		siteCollector.Visit(pageURL)
	})

	siteCollector.OnHTML(`div.container div.card > a`, func(e *colly.HTMLElement) {
		sceneURL := e.Request.AbsoluteURL(e.Attr("href"))

		// If scene exist in database, there's no need to scrape
		if !funk.ContainsString(knownScenes, sceneURL) {
			sceneCollector.Visit(sceneURL)
		}
	})

	siteCollector.Visit(URL)

	if updateSite {
		updateSiteLastUpdate(scraperID)
	}
	logScrapeFinished(scraperID, siteID)
	return nil
}

// WankitNowVR scrapes wankitnowvr.com by delegating to the shared
// 2WebMedia scraper with this site's ID and video-index URL.
func WankitNowVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene) error {
	return TwoWebMediaSite(wg, updateSite, knownScenes, out, "wankitnowvr", "WankitNowVR", "https://wankitnowvr.com/videos/")
}

// ZexyVR scrapes zexyvr.com by delegating to the shared
// 2WebMedia scraper with this site's ID and video-index URL.
func ZexyVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene) error {
	return TwoWebMediaSite(wg, updateSite, knownScenes, out, "zexyvr", "ZexyVR", "https://zexyvr.com/videos/")
}

// init registers both 2WebMedia scrapers (WankitNowVR, ZexyVR) with the
// package-wide scraper registry, using each site's avatar image URL.
func init() {
	registerScraper("wankitnowvr", "WankitNowVR", "https://mcdn.vrporn.com/files/20190103150250/wankitnow-profile.jpg", WankitNowVR)
	registerScraper("zexyvr", "ZexyVR", "https://mcdn.vrporn.com/files/20190103151557/zexyvr-profile.jpg", ZexyVR)
}

0 comments on commit 6c72cb1

Please sign in to comment.