VRConk scraper update and scene migration (#621)
* vrconk refactor

VRConk is now using VRBangers web code.

* vrconk-renum-scenes

VRConk is now using VRBangers web code.

Scene IDs are updated (the new ID format is sketched below). Several scenes are no longer on the website.

Scenes 1-24: the unlisted scenes added by PR #307 no longer exist on the website.

The following scenes (31, 34, 55, 57, 82, 83, 85, 87-91, 93, 102, 105, 112, 131, 166, 167) are either from other sites (e.g. "Twisted Stepsisters" is a RealJamVR scene), renamed (e.g. "Cocaine and Blowjob" is now "Head Before Bed", and "Addicted For VRConk" is now "Addicted to VRConk"), or simply missing from the site (e.g. "Pain and Gain XXX" with Vina Sky).

Scenes that received updated scene IDs appear to have lost their "dupes" from a prior PR. The data is still duplicated in the DB, but it is not reflected in the xbvr interface. Scenes that were not renumbered could still show as duped (e.g. "Fuck It Like It's Hot").
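
For reference, the new scene IDs follow the VRBangers content-API convention that the migration and rewritten scraper below rely on: the slugified site name plus everything after the first 15 characters of the API's item id. A minimal sketch of that derivation, assuming a hypothetical id value (real ids come from the content.vrconk.com API):

// Sketch only: how a new VRConk scene ID is built from the content-API
// response. The apiID value here is illustrative, not a real scene.
package main

import (
	"fmt"

	"github.com/mozillazg/go-slugify"
)

func main() {
	site := "VRCONK"
	apiID := "611b4e0ca5c543514947061" // hypothetical "data.item.id" from the API
	sceneID := slugify.Slugify(site) + "-" + apiID[15:]
	fmt.Println(sceneID) // vrconk-14947061
}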

* Update migrations.go

linter fix

Co-authored-by: crwxaj <[email protected]>
theRealKLH and crwxaj authored Jan 10, 2022
1 parent 5b8cc8e commit 66fde9d
Showing 3 changed files with 191 additions and 63 deletions.
83 changes: 83 additions & 0 deletions pkg/migrations/migrations.go
@@ -595,6 +595,89 @@ func Migrate() {
}
}

return nil
},
},
{
// VRConk is now using VRBangers code. renumbering scenes
ID: "0030-fix-vrconk-ids",
Migrate: func(tx *gorm.DB) error {
// old slug -> new slug
slugMapping := map[string]string{
"vrconk-scene": "vrconk-scene-0",
}

// site -> slug -> id
re := regexp.MustCompile(`(.*)-\d+$`)
newScenes := map[string]map[string]string{}
newSceneId := func(site string, slug string) (string, error) {
mapping, ok := newScenes[site]
if !ok {
mapping = map[string]string{}
queryParams := "page=1&type=videos&sort=latest&show_custom_video=1&bonus-video=1&limit=1000"
url := fmt.Sprintf("https://content.%s.com/api/content/v1/videos?%s", strings.ToLower(site), queryParams)
r, err := resty.R().SetHeader("User-Agent", scrape.UserAgent).Get(url)
if err != nil {
return "", err
}
items := gjson.Get(r.String(), "data.items")
if !items.Exists() {
return "", fmt.Errorf("invalid response from %s API: no scenes found", site)
}
for _, scene := range items.Array() {
id, slug := scene.Get("id").String(), scene.Get("slug").String()
if id == "" || slug == "" {
return "", fmt.Errorf("invalid response from %s API: no id or slug found", site)
}
mapping[slug] = slugify.Slugify(site) + "-" + id[15:]
}
newScenes[site] = mapping
}
return mapping[slug], nil
}

var scenes []models.Scene
err := tx.Where("studio = ?", "VRCONK").Find(&scenes).Error
if err != nil {
return err
}
for _, scene := range scenes {
trimmedURL := strings.TrimRight(scene.SceneURL, "/")
dir, base := path.Split(trimmedURL)
matches := re.FindStringSubmatch(base)
slug, ok := slugMapping[matches[1]]
if !ok {
slug = slugify.Slugify(matches[1])
}

sceneID, err := newSceneId(scene.Site, slug)
if err != nil {
return err
}
if sceneID == "" {
common.Log.Warnf("Could not update scene %s", scene.SceneID)
continue
}

// update all actions referring to this scene by its scene_id
err = tx.Model(&models.Action{}).Where("scene_id = ?", scene.SceneID).Update("scene_id", sceneID).Error
if err != nil {
return err
}

// update the scene itself
// with trailing slash for consistency with scraped data, to avoid these scenes being re-scraped
scene.SceneURL = dir + slug + "/"
scene.SceneID = sceneID
err = tx.Save(&scene).Error
if err != nil {
return err
}
}

// since scenes have new IDs, we need to re-index them
tasks.SearchIndex()

return nil
},
},
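
As a usage illustration of the per-scene URL handling in the migration above (the old URL below is made up, and the content-API lookup is stubbed out):

// Sketch only: how the migration derives the lookup slug and rewritten URL
// from a pre-migration VRConk scene URL. The URL is hypothetical.
package main

import (
	"fmt"
	"path"
	"regexp"
	"strings"

	"github.com/mozillazg/go-slugify"
)

func main() {
	re := regexp.MustCompile(`(.*)-\d+$`)
	oldURL := "https://vrconk.com/busty-ballerina-152/" // hypothetical old-style URL
	trimmedURL := strings.TrimRight(oldURL, "/")
	dir, base := path.Split(trimmedURL)    // dir: "https://vrconk.com/", base: "busty-ballerina-152"
	matches := re.FindStringSubmatch(base) // matches[1]: "busty-ballerina"
	slug := slugify.Slugify(matches[1])
	// The migration looks this slug up in the content-API mapping to obtain the
	// new scene_id; here only the rewritten URL (trailing slash kept) is shown.
	fmt.Println(slug, dir+slug+"/") // busty-ballerina https://vrconk.com/busty-ballerina/
}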
21 changes: 13 additions & 8 deletions pkg/models/model_tag.go
@@ -42,7 +42,8 @@ func ConvertTag(t string) string {

if funk.Contains([]string{"180", "60fps", "60 fps", "5k", "5k+", "big dick", "big cocks",
"axaxqxrrysrwqua", "girl-boy", "virtual reality", "sex", "new",
"virtual reality porn", "vr porn", "180 vr porn", "xxxsex vr",
"virtual reality porn", "vr porn", "8k-vr-porn", "7k-vr-porn", "6k-vr-porn", "5k-vr-porn",
"4k-vr-porn", "180 vr porn", "xxxsex vr",
"xxx vr porn", "VRconk", "sex onbed", "pornstars", "vr", "vrp",
"bg", "coming soon", "vr 1080p porn",
}, t) {
@@ -77,7 +78,7 @@ func ConvertTag(t string) string {
return "threesome fmm"
}

if funk.Contains([]string{"big boobs", "big tits porn"}, t) {
if funk.Contains([]string{"busty", "big boobs", "big tits porn", "big-tits"}, t) {
return "big tits"
}

@@ -181,11 +182,11 @@ func ConvertTag(t string) string {
return "voyeur"
}

if funk.Contains([]string{"small boobs", "small natural tits"}, t) {
if funk.Contains([]string{"small boobs", "small natural tits", "small-tits"}, t) {
return "small tits"
}

if funk.Contains([]string{"natural boobs"}, t) {
if funk.Contains([]string{"natural boobs", "natural-tits"}, t) {
return "natural tits"
}

@@ -201,14 +202,18 @@ func ConvertTag(t string) string {
return "pussy licking"
}

if funk.Contains([]string{"pussy cumshot"}, t) {
if funk.Contains([]string{"pussy cumshot", "cum-on-pussy"}, t) {
return "cum on pussy"
}

if funk.Contains([]string{"tits cumshoot"}, t) {
if funk.Contains([]string{"tits cumshoot", "tits cumshot"}, t) {
return "cum on tits"
}

if funk.Contains([]string{"body-cumshot"}, t) {
return "cum on body"
}

if funk.Contains([]string{"hairy", "hairy bush"}, t) {
return "hairy pussy"
}
@@ -249,8 +254,8 @@ func ConvertTag(t string) string {
return "cum on ass"
}

if funk.Contains([]string{"busty"}, t) {
return "big tits"
if funk.Contains([]string{"big-ass"}, t) {
return "big ass"
}

if funk.Contains([]string{"mature mother"}, t) {
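
The ConvertTag additions above fold the hyphenated slugs returned by the content API into xbvr's canonical tag names. A quick illustration of the intended mapping, assuming it is run inside the xbvr module (expected outputs are taken from the branches shown in the hunks above):

// Illustration only: expected normalisation of a few newly handled API slugs.
package main

import (
	"fmt"

	"github.com/xbapps/xbvr/pkg/models"
)

func main() {
	for _, t := range []string{"big-tits", "small-tits", "cum-on-pussy", "body-cumshot", "big-ass"} {
		// Expected: "big tits", "small tits", "cum on pussy", "cum on body", "big ass"
		fmt.Println(t, "->", models.ConvertTag(t))
	}
}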
150 changes: 95 additions & 55 deletions pkg/scrape/vrconk.go
@@ -1,16 +1,17 @@
package scrape

import (
"strconv"
"math"
"strings"
"sync"

"github.com/gocolly/colly"
"github.com/mozillazg/go-slugify"
"github.com/nleeper/goment"
"github.com/thoas/go-funk"
"github.com/tidwall/gjson"
"github.com/xbapps/xbvr/pkg/models"
"mvdan.cc/xurls/v2"
"gopkg.in/resty.v1"
)

func VRCONK(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene) error {
@@ -29,83 +30,122 @@ func VRCONK(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
sc.Site = siteID
sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0]

// Scene ID - get from URL
tmp := strings.Split(sc.HomepageURL, "/")
s := strings.Split(tmp[len(tmp)-1], "-")
sc.SiteID = s[len(s)-1]
content_id := strings.Split(strings.Replace(sc.HomepageURL, "//", "/", -1), "/")[3]

//https://content.vrconk.com
contentURL := strings.Replace("https://vrconk.com", "//", "//content.", 1)

r, _ := resty.R().
SetHeader("User-Agent", UserAgent).
Get("https://content." + sc.Site + ".com/api/content/v1/videos/" + content_id)

JsonMetadata := r.String()

//if not valid scene...
if gjson.Get(JsonMetadata, "status.message").String() != "Ok" {
return
}

//Scene ID - last 8 chars of the "id" from the API response
sc.SiteID = strings.TrimSpace(gjson.Get(JsonMetadata, "data.item.id").String()[15:])

//Scene ID - use PlayaId for scene-id instead of "random" using id
// playaId := gjson.Get(JsonMetadata, "data.item.playaId").Int()
// sc.SiteID = strconv.Itoa(int(playaId))
sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID

rxRelaxed := xurls.Relaxed()
sc.Title = strings.TrimSpace(e.ChildText(`div.item-tr-inner-col h1`))
sc.Covers = append(sc.Covers, rxRelaxed.FindString(e.ChildAttr(`div.splash-screen`, "style")))
// Title
sc.Title = strings.TrimSpace(gjson.Get(JsonMetadata, "data.item.title").String())

e.ForEach(`.gallery-block figure > a`, func(id int, e *colly.HTMLElement) {
sc.Gallery = append(sc.Gallery, e.Request.AbsoluteURL(e.Attr("href")))
})
// Filenames - VRCONK_Ballerina_8K_180x180_3dh.mp4
baseName := sc.Site + "_" + strings.TrimSpace(gjson.Get(JsonMetadata, "data.item.videoSettings.videoShortName").String()) + "_"
filenames := []string{"8K_180x180_3dh", "6K_180x180_3dh", "5K_180x180_3dh", "4K_180x180_3dh", "HD_180x180_3dh", "HQ_180x180_3dh", "PSVRHQ_180x180_3dh", "UHD_180x180_3dh", "PSVRHQ_180_sbs", "PSVR_mono", "HQ_mono360", "HD_mono360", "PSVRHQ_ou", "UHD_3dv", "HD_3dv", "HQ_3dv"}

e.ForEach(`.stats-list li`, func(id int, e *colly.HTMLElement) {
// <li><span class="icon i-clock"></span><span class="sub-label">40:54</span></li>
c := e.ChildAttr(`span`, "class")
if strings.Contains(c, "i-clock") {
tmpDuration, err := strconv.Atoi(strings.Split(e.ChildText(`.sub-label`), ":")[0])
if err == nil {
sc.Duration = tmpDuration / 60
}
}
for i := range filenames {
filenames[i] = baseName + filenames[i] + ".mp4"
}

if strings.Contains(c, "i-calendar") {
tmpDate, _ := goment.New(e.ChildText(`.sub-label`))
sc.Released = tmpDate.Format("YYYY-MM-DD")
}
})
sc.Filenames = filenames

// Tags and Cast
unfilteredTags := []string{}
e.ForEach(`.tags-block`, func(id int, e *colly.HTMLElement) {
c := e.ChildText(`.sub-label`)
if strings.Contains(c, "Categories:") || strings.Contains(c, "Tags:") {
e.ForEach(`a`, func(id int, ce *colly.HTMLElement) {
unfilteredTags = append(unfilteredTags, strings.TrimSpace(ce.Text))
})
}
// Date & Duration
tmpDate, _ := goment.Unix(gjson.Get(JsonMetadata, "data.item.publishedAt").Int())
sc.Released = tmpDate.Format("YYYY-MM-DD")
tmpDuration := gjson.Get(JsonMetadata, "data.item.videoSettings.duration").Float()
sc.Duration = int(math.Floor((tmpDuration / 60) + 0.5))

if strings.Contains(c, "Models:") {
e.ForEach(`a`, func(id int, ce *colly.HTMLElement) {
sc.Cast = append(sc.Cast, strings.TrimSpace(ce.Text))
})
// Cover URLs
e.ForEach(`meta[property="og:image"]`, func(id int, e *colly.HTMLElement) {
tmpCover := strings.Split(e.Request.AbsoluteURL(e.Attr("content")), "?")[0]
if tmpCover != "https://vrconk.com/wp-content/uploads/2020/03/VR-Conk-Logo.jpg" {
sc.Covers = append(sc.Covers, tmpCover)
}

})

sc.Tags = funk.FilterString(unfilteredTags, func(t string) bool {
return !funk.ContainsString(sc.Cast, t)
})
sc.Covers = append(sc.Covers, e.ChildAttr(`section.banner picture img`, "src"))
sc.Covers = append(sc.Covers, e.ChildAttr(`section.base-content__bg img[class="object-fit-cover base-border overflow-hidden hero-img"]`, "src"))

// Gallery - https://content.vrconk.com/uploads/2021/08/611b4e0ca5c54351494706_XL.jpg
gallerytmp := gjson.Get(JsonMetadata, "data.item.galleryImages.#.previews.#(sizeAlias==XL).permalink")
for _, v := range gallerytmp.Array() {
sc.Gallery = append(sc.Gallery, contentURL+v.Str)
}

// Synopsis
e.ForEach(`.d-desc`, func(id int, e *colly.HTMLElement) {
sc.Synopsis = strings.TrimSpace(e.Text)
})
sc.Synopsis = strings.TrimSpace(strings.Replace(e.ChildText(`div.video-item__description div.short-text`), `arrow_drop_up`, ``, -1))
//sc.Synopsis = strings.TrimSpace(gjson.Get(JsonMetadata, "data.item.description").String())

// Tags
tagstmp := gjson.Get(JsonMetadata, "data.item.categories.#.slug")
for _, v := range tagstmp.Array() {
sc.Tags = append(sc.Tags, v.Str)
}

// Positions - 1:"Sitting",2:"Missionary",3:"Standing",4:"Lying",5:"On the knees",6:"Close-up"
var position string
positions := gjson.Get(JsonMetadata, "data.item.videoTechBar.positions")
for _, i := range positions.Array() {
switch i.Int() {
case 1:
position = "sitting"
case 2:
position = "missionary"
case 3:
position = "standing"
case 4:
position = "lying"
case 5:
position = "on the knees"
case 6:
position = "close-up"
}
sc.Tags = append(sc.Tags, strings.TrimSpace(position))
}

// Cast
casttmp := gjson.Get(JsonMetadata, "data.item.models.#.title")
for _, v := range casttmp.Array() {
sc.Cast = append(sc.Cast, strings.TrimSpace(v.Str))
}

out <- sc
})

siteCollector.OnHTML(`a[data-mb="shuffle-thumbs"]`, func(e *colly.HTMLElement) {
sceneURL := e.Request.AbsoluteURL(e.Attr("href"))
siteCollector.OnHTML(`div.video-item-info-title a`, func(e *colly.HTMLElement) {
url := strings.Split(e.Attr("href"), "?")[0]
sceneURL := e.Request.AbsoluteURL(url)

if !funk.ContainsString(knownScenes, sceneURL) && !strings.Contains(sceneURL, "/signup") {
if !funk.ContainsString(knownScenes, sceneURL) {
sceneCollector.Visit(sceneURL)
}
})

siteCollector.OnHTML(`nav.pagination a`, func(e *colly.HTMLElement) {
siteCollector.OnHTML(`.pagination-next a`, func(e *colly.HTMLElement) {
pageURL := e.Request.AbsoluteURL(e.Attr("href"))
if !strings.Contains(pageURL, "/user/join") {
siteCollector.Visit(pageURL)
}

siteCollector.Visit(pageURL)
})

siteCollector.Visit("https://vrconk.com/videos")
siteCollector.Visit("https://vrconk.com/videos/?sort=latest&bonus-video=1")

// Edge-cases: Some early scenes are unlisted in both scenes and model index
// #1-10 + 15 by FantAsia, #11-14, 19, 23 by Miss K. #22, 25 by Emi.
@@ -117,7 +157,7 @@ func VRCONK(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
"fun-with-real-vr-amateur-19", "juicy-holes-22", "rabbit-fuck-23", "amateur-chick-in-the-kitchen-25"}

for _, scene := range unlistedscenes {
sceneURL := "https://vrconk.com/" + scene
sceneURL := "https://vrconk.com/videos/" + scene
if !funk.ContainsString(knownScenes, sceneURL) {
sceneCollector.Visit(sceneURL)
}
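For orientation, a standalone sketch of the per-scene metadata fetch the rewritten scraper performs against the VRBangers-style content API (the slug below is hypothetical, and the User-Agent string is a placeholder for scrape.UserAgent):

// Sketch only: fetch one video's metadata from the content API, as the
// scraper above does. Slug and User-Agent values are placeholders.
package main

import (
	"fmt"

	"github.com/tidwall/gjson"
	"gopkg.in/resty.v1"
)

func main() {
	contentID := "busty-ballerina" // hypothetical slug taken from a scene URL
	url := "https://content.vrconk.com/api/content/v1/videos/" + contentID
	r, err := resty.R().SetHeader("User-Agent", "xbvr-sketch").Get(url)
	if err != nil {
		panic(err)
	}
	meta := r.String()
	if gjson.Get(meta, "status.message").String() != "Ok" {
		fmt.Println("not a valid scene")
		return
	}
	fmt.Println(gjson.Get(meta, "data.item.title").String())
}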