Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix broken WankzVR scraper & add three studios. #266

Merged
merged 1 commit into from
Apr 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/scrape/milfvr.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ func MilfVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
}
})

siteCollector.Visit("https://www.milfvr.com/videos")
siteCollector.Visit("https://www.milfvr.com/videos?o=d")

if updateSite {
updateSiteLastUpdate(scraperID)
Expand Down
12 changes: 10 additions & 2 deletions pkg/scrape/slrstudios.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,10 @@ func SexLikeReal(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
})

// Synopsis
e.ForEach(`div#tabs-about div:last-child div:nth-child(2)`, func(id int, e *colly.HTMLElement) {
sc.Synopsis = strings.TrimSpace(e.Text)
e.ForEach(`div#tabs-about div.u-mb--four`, func(id int, e *colly.HTMLElement) {
if !strings.Contains(e.Text, "Released:") {
sc.Synopsis = strings.TrimSpace(e.Text)
}
})

// Skipping some very generic and useless tags
Expand Down Expand Up @@ -230,6 +232,11 @@ func StripzVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out cha
return SexLikeReal(wg, updateSite, knownScenes, out, "stripzvr", "StripzVR", "N1ck Inc.")
}

// RealHotVR.com doesn't have complete scene index, pagination stops after two pages
func RealHotVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene) error {
return SexLikeReal(wg, updateSite, knownScenes, out, "realhotvr", "RealHotVR", "RealHotVR")
}

func init() {
registerScraper("slr-originals", "SLR Originals", "https://www.sexlikereal.com/s/refactor/images/favicons/android-icon-192x192.png", SLROriginals)
registerScraper("istripper", "iStripper (SLR)", "https://www.istripper.com/favicons/istripper/apple-icon-120x120.png", iStripper)
Expand All @@ -243,4 +250,5 @@ func init() {
registerScraper("pervrt", "perVRt/Terrible (SLR)", "https://mcdn.vrporn.com/files/20181218151630/pervrt-logo.jpg", perVRt)
registerScraper("leninacrowne", "LeninaCrowne (SLR)", "https://mcdn.vrporn.com/files/20190711135807/terrible_logo-e1562878668857_400x400_acf_cropped.jpg", LeninaCrowne)
registerScraper("stripzvr", "StripzVR (SLR)", "https://www.stripzvr.com/wp-content/uploads/2018/09/cropped-favicon-192x192.jpg", StripzVR)
registerScraper("realhotvr", "RealHotVR (SLR)", "https://g8iek4luc8.ent-cdn.com/templates/realhotvr/images/favicon.jpg", RealHotVR)
}
39 changes: 18 additions & 21 deletions pkg/scrape/wankzvr.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,18 @@ func WankzVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan
sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID

// Title
e.ForEach(`header h1`, func(id int, e *colly.HTMLElement) {
e.ForEach(`h1.detail__title`, func(id int, e *colly.HTMLElement) {
sc.Title = e.Text
})

// Date
e.ForEach(`div.date`, func(id int, e *colly.HTMLElement) {
e.ForEach(`div.detail__date_time span.detail__date`, func(id int, e *colly.HTMLElement) {
tmpDate, _ := goment.New(e.Text, "DD MMMM, YYYY")
sc.Released = tmpDate.Format("YYYY-MM-DD")
})

// Duration
e.ForEach(`div.duration`, func(id int, e *colly.HTMLElement) {
e.ForEach(`div.detail__date_time span.time`, func(id int, e *colly.HTMLElement) {
if id == 1 {
tmpDuration, err := strconv.Atoi(strings.TrimSpace(strings.Replace(e.Text, "minutes", "", -1)))
if err == nil {
Expand All @@ -61,53 +61,50 @@ func WankzVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan
sc.Filenames = append(sc.Filenames, "wankzvr-"+base+"180_180x180_3dh_LR.mp4")

// Cover URLs
e.ForEach(`div.swiper-slide img`, func(id int, e *colly.HTMLElement) {
if id == 0 {
sc.Covers = append(sc.Covers, e.Request.AbsoluteURL(e.Attr("src")))
}
})
for _, x := range []string{"cover", "hero"} {
tmpCover := "https://cdns-i.wankzvr.com/" + sc.SiteID[0:1] + "/" + sc.SiteID[0:4] + "/" + sc.SiteID + "/" + x + "/large.jpg"
sc.Covers = append(sc.Covers, tmpCover)
}

// Gallery
e.ForEach(`div.swiper-slide img.lazyload`, func(id int, e *colly.HTMLElement) {
if id > 0 {
sc.Gallery = append(sc.Gallery, e.Request.AbsoluteURL(e.Attr("data-src")))
}
})
for _, x := range []string{"1", "2", "3", "4", "5", "6"} {
tmpGallery := "https://cdns-i.wankzvr.com/" + sc.SiteID[0:1] + "/" + sc.SiteID[0:4] + "/" + sc.SiteID + "/thumbs/1024_" + x + ".jpg"
sc.Gallery = append(sc.Gallery, tmpGallery)
}

// Synopsis
e.ForEach(`p.description`, func(id int, e *colly.HTMLElement) {
sc.Synopsis = strings.TrimSpace(strings.Replace(e.Text, " Read more", "", -1))
e.ForEach(`div.detail__txt`, func(id int, e *colly.HTMLElement) {
sc.Synopsis = strings.TrimSpace(e.Text + e.ChildText("span.more__body"))
})

// Tags
e.ForEach(`div.tags a`, func(id int, e *colly.HTMLElement) {
e.ForEach(`div.tag-list__body a.tag`, func(id int, e *colly.HTMLElement) {
sc.Tags = append(sc.Tags, e.Text)
})

// Cast
e.ForEach(`header h4 a`, func(id int, e *colly.HTMLElement) {
e.ForEach(`div.detail__models a`, func(id int, e *colly.HTMLElement) {
sc.Cast = append(sc.Cast, strings.TrimSpace(e.Text))
})

out <- sc
})

siteCollector.OnHTML(`nav.pager a`, func(e *colly.HTMLElement) {
siteCollector.OnHTML(`ul.pagenav__list a.pagenav__link`, func(e *colly.HTMLElement) {
pageURL := e.Request.AbsoluteURL(e.Attr("href"))
siteCollector.Visit(pageURL)
})

siteCollector.OnHTML(`div.contentContainer article a`, func(e *colly.HTMLElement) {
siteCollector.OnHTML(`ul.cards-list a.card__video`, func(e *colly.HTMLElement) {
sceneURL := e.Request.AbsoluteURL(e.Attr("href"))
sceneURL = strings.Replace(sceneURL, "/preview", "", -1)

// If scene exist in database, there's no need to scrape
if !funk.ContainsString(knownScenes, sceneURL) && !strings.Contains(sceneURL, "/join") {
sceneCollector.Visit(sceneURL)
}
})

siteCollector.Visit("https://www.wankzvr.com/videos")
siteCollector.Visit("https://www.wankzvr.com/videos?o=d")

if updateSite {
updateSiteLastUpdate(scraperID)
Expand Down
191 changes: 191 additions & 0 deletions pkg/scrape/zexywankitnow.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
package scrape

import (
"regexp"
"strconv"
"strings"
"sync"

"github.com/gocolly/colly"
"github.com/mozillazg/go-slugify"
"github.com/nleeper/goment"
"github.com/thoas/go-funk"
"github.com/xbapps/xbvr/pkg/models"
)

// TwoWebMediaSite is the shared scraper for the 2WebMedia network sites
// (zexyvr.com and wankitnowvr.com). It crawls the paginated video index at
// URL, visits every scene page not already listed in knownScenes, and emits
// one models.ScrapedScene per new scene on the out channel.
//
// scraperID and siteID identify the concrete site for logging and for the
// updateSiteLastUpdate bookkeeping (run only when updateSite is true).
// The WaitGroup is decremented on return. Always returns nil.
func TwoWebMediaSite(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene, scraperID string, siteID string, URL string) error {
	defer wg.Done()
	logScrapeStart(scraperID, siteID)

	sceneCollector := createCollector("wankitnowvr.com", "zexyvr.com")
	siteCollector := createCollector("wankitnowvr.com", "zexyvr.com")

	// Regex preparation (compiled once per scrape run)
	reDateDuration := regexp.MustCompile(`Released\son\s(.*)\n+\s+Duration\s+:\s+(\d+):\d+`)
	reCastTags := regexp.MustCompile(`(?:zexyvr|wankitnowvr)\.com\/(models|videos)\/+`)
	reTagCat := regexp.MustCompile(`(.*)\s+\((.*)\)`)
	reFilename := regexp.MustCompile(`videos\/(?U:([a-z\d\-]+))(?:(?:-|_)preview)?(_\d{4}.*\.mp4)`)

	sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
		sc := models.ScrapedScene{}
		sc.SceneType = "VR"
		sc.Studio = "2WebMedia"
		sc.Site = siteID
		sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0]

		// SiteID, Scene ID - get from URL (last path segment is the scene slug)
		tmp := strings.Split(sc.HomepageURL, "/")
		sc.SiteID = tmp[len(tmp)-1]
		sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID

		// Cover / ID
		e.ForEach(`deo-video`, func(id int, e *colly.HTMLElement) {
			sc.Covers = append(sc.Covers, strings.Split(e.Attr("cover-image"), "?")[0]+"?h=900")
		})
		// Note: not all scenes have a deo-video element, only a regular img cover instead
		if len(sc.Covers) == 0 {
			e.ForEach(`div.container.pt-5 > div > div > img`, func(id int, e *colly.HTMLElement) {
				sc.Covers = append(sc.Covers, strings.Split(e.Attr("src"), "?")[0]+"?h=900")
			})
		}

		// Gallery
		// Note: Limiting gallery to 900px in height as some are huge by default
		e.ForEach(`div.gallery > div`, func(id int, e *colly.HTMLElement) {
			if id > 0 {
				sc.Gallery = append(sc.Gallery, strings.Split(e.ChildAttr("div.view > a", "href"), "?")[0]+"?h=900")
			}
		})

		// Title
		e.ForEach(`div.container.pt-5 h2`, func(id int, e *colly.HTMLElement) {
			sc.Title = strings.TrimSpace(e.Text)
		})

		// Synopsis
		e.ForEach(`div.container.pt-5 h2 + p`, func(id int, e *colly.HTMLElement) {
			sc.Synopsis = strings.TrimSpace(e.Text)
		})

		// Note: Date/Duration info is currently all inside the same div element...
		e.ForEach(`div.container.pt-5 p.text-muted`, func(id int, e *colly.HTMLElement) {
			tmpDateDurationParts := reDateDuration.FindStringSubmatch(e.Text)
			// FindStringSubmatch returns nil when the element doesn't carry the
			// "Released on ... Duration : ..." text; indexing it unchecked panics.
			if len(tmpDateDurationParts) < 3 {
				return
			}

			// Date
			if len(tmpDateDurationParts[1]) > 0 {
				tmpDate, _ := goment.New(tmpDateDurationParts[1], "MMM DD, YYYY")
				sc.Released = tmpDate.Format("YYYY-MM-DD")
			}

			// Duration (only the minutes part of "mm:ss" is captured)
			if len(tmpDateDurationParts[2]) > 0 {
				tmpDuration, err := strconv.Atoi(tmpDateDurationParts[2])
				if err == nil {
					sc.Duration = tmpDuration
				}
			}
		})

		// Cast & Tags
		// Note: Cast/Tags links are currently all inside the same div element...
		e.ForEach(`div.container.pt-5 p.text-muted > a`, func(id int, e *colly.HTMLElement) {
			tmpURLParts := reCastTags.FindStringSubmatch(e.Attr("href"))
			// Skip links that are neither model nor video links (nil on no match).
			if len(tmpURLParts) < 2 {
				return
			}
			if tmpURLParts[1] == "models" {
				// Cast
				sc.Cast = append(sc.Cast, strings.TrimSpace(e.Text))
			} else if tmpURLParts[1] == "videos" {
				// Tags
				tmpTagParts := reTagCat.FindStringSubmatch(e.Text)
				// Tag links are expected to read "tag (tag-category)"; skip anything else.
				if len(tmpTagParts) < 3 {
					return
				}
				// Format is "tag (tag-category)" and we're removing the category part but some tags need fixing
				switch strings.ToLower(tmpTagParts[2]) {
				case "breasts":
					// only has tags like "enhanced/natural/small/medium/large/huge"
					if strings.ToLower(tmpTagParts[1]) == "large" {
						tmpTagParts[1] = "big tits"
					} else {
						tmpTagParts[1] = tmpTagParts[1] + " tits"
					}
				case "eyes":
					// some are like "gray eyes" while others are like "blue", but tag must include "eyes"
					if !strings.Contains(strings.ToLower(tmpTagParts[1]), "eyes") {
						tmpTagParts[1] = tmpTagParts[1] + " eyes"
					}
				case "lingerie":
					// only has the lingerie color so just use "lingerie" instead
					tmpTagParts[1] = "lingerie"
				case "nationality":
					// only change "english" to "british"
					if strings.ToLower(tmpTagParts[1]) == "english" {
						tmpTagParts[1] = "british"
					}
					// all other tags are fine to use as is.
				}

				if tmpTagParts[1] != "" {
					sc.Tags = append(sc.Tags, strings.TrimSpace(strings.ToLower(tmpTagParts[1])))
				}
			}
		})

		// Filenames
		// Best guess, using trailer filenames and removing the preview-part of it.
		e.ForEach(`deo-video source`, func(id int, e *colly.HTMLElement) {
			tmpFilename := reFilename.FindStringSubmatch(e.Attr("src"))
			// A successful match yields [full, base, suffix]; require all three.
			if len(tmpFilename) > 2 {
				sc.Filenames = append(sc.Filenames, tmpFilename[1]+tmpFilename[2])
			}
		})
		// Note: not all scenes have a deo-video element, in which case do a best guess from URL instead:
		if len(sc.Filenames) == 0 {
			tmpURLParts := strings.Split(e.Request.URL.Path, "/")
			// Scene paths look like "/videos/<slug>", so indexing element 2 needs
			// at least 3 parts (the previous "> 1" check could panic on short paths).
			if len(tmpURLParts) > 2 {
				baseStart := strings.Replace(tmpURLParts[2], "+", "_", -1)
				filenames := []string{"_1920", "_2160", "_2880", "_3840", "_5760"}
				baseEnd := "_180x180_3dh_180_sbs.mp4"
				for i := range filenames {
					filenames[i] = baseStart + filenames[i] + baseEnd
				}
				sc.Filenames = filenames
			}
		}

		out <- sc
	})

	// Follow index pagination.
	siteCollector.OnHTML(`ul.pagination a.page-link`, func(e *colly.HTMLElement) {
		pageURL := e.Request.AbsoluteURL(e.Attr("href"))
		siteCollector.Visit(pageURL)
	})

	// Queue each scene card found on the index pages.
	siteCollector.OnHTML(`div.container div.card > a`, func(e *colly.HTMLElement) {
		sceneURL := e.Request.AbsoluteURL(e.Attr("href"))

		// If scene exist in database, there's no need to scrape
		if !funk.ContainsString(knownScenes, sceneURL) {
			sceneCollector.Visit(sceneURL)
		}
	})

	siteCollector.Visit(URL)

	if updateSite {
		updateSiteLastUpdate(scraperID)
	}
	logScrapeFinished(scraperID, siteID)
	return nil
}

// WankitNowVR runs the shared 2WebMedia scraper against wankitnowvr.com.
func WankitNowVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene) error {
	const (
		scraperID = "wankitnowvr"
		siteID    = "WankitNowVR"
		indexURL  = "https://wankitnowvr.com/videos/"
	)
	return TwoWebMediaSite(wg, updateSite, knownScenes, out, scraperID, siteID, indexURL)
}

// ZexyVR runs the shared 2WebMedia scraper against zexyvr.com.
func ZexyVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene) error {
	const (
		scraperID = "zexyvr"
		siteID    = "ZexyVR"
		indexURL  = "https://zexyvr.com/videos/"
	)
	return TwoWebMediaSite(wg, updateSite, knownScenes, out, scraperID, siteID, indexURL)
}

// init registers the two 2WebMedia site scrapers at package load time.
// Each call supplies the scraper ID, display name, an icon URL
// (presumably the site logo shown in the UI — confirm against registerScraper),
// and the scraper entry function.
func init() {
registerScraper("wankitnowvr", "WankitNowVR", "https://mcdn.vrporn.com/files/20190103150250/wankitnow-profile.jpg", WankitNowVR)
registerScraper("zexyvr", "ZexyVR", "https://mcdn.vrporn.com/files/20190103151557/zexyvr-profile.jpg", ZexyVR)
}