Skip to content

Commit

Permalink
scraper: Fix for SexbabesVR scraper (#1847)
Browse files Browse the repository at this point in the history
* Fix For SexbabesVR Scraper

The scene ID in the webpage now seems to be 614 for all scenes, causing all scenes to be rescraped and new scenes never to be added.

This pulls the poster URL, which appears to have a unique identifier in the second-to-last directory.

Also updated the cover URL to pull the image used for the thumbnail on the index page, as the latest scene has an SBS image for the cover whereas the thumbnail contains a more useful image.

All appears functional

* Remove Debug Prompts

* Fix for the blank Synopsis

There are three separate variations on how they have this information posted, depending on the age of the scene. A random sampling over all scenes shows that the synopsis is successfully being scraped.

* Add Migration Code

It ran once; I am unsure how to properly test it, though.

* Fix Logic

* Improve Migration Code

Added some error handling in case the website is unreachable.

Added logic to ensure we only check scenes originating from SexBabesVR. Only scenes starting at 600 are checked, as this is where the reported divergence between the numbering of the two sceneID sources occurred, and only scenes whose ID diverges are updated.
  • Loading branch information
pops64 authored Oct 11, 2024
1 parent b0512d9 commit 322e62e
Show file tree
Hide file tree
Showing 2 changed files with 115 additions and 9 deletions.
86 changes: 86 additions & 0 deletions pkg/migrations/migrations.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@ import (
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"time"

"github.com/go-resty/resty/v2"
"github.com/gocolly/colly/v2"
"github.com/jinzhu/gorm"
"github.com/markphelps/optional"
"github.com/mozillazg/go-slugify"
Expand Down Expand Up @@ -1986,6 +1988,90 @@ func Migrate() {
return tx.Model(&models.Tag{}).Exec("delete from tags where `count` = 0").Error
},
},
{
	// The SexBabesVR scraper had to switch to a different sceneID source, which
	// shifted the sceneIDs. This migration re-derives the ID of every affected
	// scene from its page and rewrites the scene and the actions referring to it.
	ID: "0080-fix-SexBabesVR-ids",
	Migrate: func(tx *gorm.DB) error {
		// newSceneID visits url and builds "<slugified site>-<id>", where <id> is
		// the second-to-last path segment of the dl8-video poster URL.
		// It returns "" with a nil error when the page yields no usable poster,
		// and a non-nil error when the collector reports a failed request.
		newSceneID := func(site string, url string) (string, error) {
			sceneID := ""
			var fetchErr error

			sceneCollector := colly.NewCollector(
				colly.AllowedDomains("sexbabesvr.com"),
			)

			sceneCollector.OnError(func(r *colly.Response, err error) {
				common.Log.Errorf("Error visiting %s %s", r.Request.URL, err)
				// Keep the error so the migration can fail with it instead of
				// silently reporting success.
				fetchErr = err
			})

			sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
				// Scene ID
				e.ForEach(`dl8-video`, func(id int, e *colly.HTMLElement) {
					poster := e.Attr("poster")
					if poster == "" {
						// No poster means there is nothing to derive an ID from.
						return
					}
					parts := strings.Split(e.Request.AbsoluteURL(poster), "/")
					if len(parts) < 2 {
						return
					}
					sceneID = slugify.Slugify(site) + "-" + parts[len(parts)-2]
				})
			})

			// Errors that never reach OnError (presumably things like a
			// disallowed domain - confirm against colly) are logged only; the
			// caller then sees an empty ID and skips the scene.
			if err := sceneCollector.Visit(url); err != nil && fetchErr == nil {
				common.Log.Warnf("Could not visit %s: %s", url, err)
			}

			return sceneID, fetchErr
		}

		var scenes []models.Scene
		if err := tx.Where("studio = ?", "SexBabesVR").Find(&scenes).Error; err != nil {
			return err
		}

		for _, scene := range scenes {
			// Need both the site prefix (string) and the numeric scene ID (integer) for the checks below.
			parts := strings.Split(scene.SceneID, "-")

			// Only update scenes originating on SexBabesVR; sc.SiteID is not accurate in terms of alt sites.
			if len(parts) < 2 || parts[0] != "sexbabesvr" {
				continue
			}

			// Scene 600 is where the scene IDs start to diverge after changing the scene ID source for SexBabesVR.
			// IDs that are not numeric are skipped.
			sceneNum, err := strconv.Atoi(parts[1])
			if err != nil || sceneNum < 600 {
				continue
			}

			common.Log.Infoln("Checking sceneid:", scene.SceneID)
			sceneID, err := newSceneID(scene.Site, scene.SceneURL)
			if err != nil {
				// Abort with the real error; returning nil here would end the
				// migration early while reporting success.
				return err
			}

			if sceneID == "" {
				common.Log.Warnf("Could not update scene %s", scene.SceneID)
				continue
			}

			if scene.SceneID == sceneID {
				continue
			}

			// update all actions referring to this scene by its scene_id
			err = tx.Model(&models.Action{}).Where("scene_id = ?", scene.SceneID).Update("scene_id", sceneID).Error
			if err != nil {
				return err
			}

			// update the scene itself
			common.Log.Infoln("Updating sceneid:", scene.SceneID, "to", sceneID)
			scene.SceneID = sceneID
			if err = tx.Save(&scene).Error; err != nil {
				return err
			}
		}

		// since scenes have new IDs, we need to re-index them
		tasks.SearchIndex()

		return nil
	},
},
})

if err := m.Migrate(); err != nil {
Expand Down
38 changes: 29 additions & 9 deletions pkg/scrape/sexbabesvr.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,18 @@ func SexBabesVR(wg *models.ScrapeWG, updateSite bool, knownScenes []string, out
sc.Site = siteID
sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0]

// Scene ID -
// Scene ID
e.ForEach(`dl8-video`, func(id int, e *colly.HTMLElement) {
sc.SiteID = e.Attr("data-scene")
posterURL := e.Request.AbsoluteURL(e.Attr("poster"))
tmp := strings.Split(posterURL, "/")
sc.SiteID = tmp[len(tmp)-2]
sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID
sc.Covers = append(sc.Covers, strings.Replace(e.Attr("poster"), "/videoDetail2x", "", -1))
})

// Cover Url
coverURL := e.Request.Ctx.GetAny("coverURL").(string)
sc.Covers = append(sc.Covers, coverURL)

// Title
e.ForEach(`div.video-detail__description--container h1`, func(id int, e *colly.HTMLElement) {
sc.Title = strings.TrimSpace(e.Text)
Expand All @@ -48,10 +53,22 @@ func SexBabesVR(wg *models.ScrapeWG, updateSite bool, knownScenes []string, out
})

// Synopsis
e.ForEach(`div.video-detail>div.container>p`, func(id int, e *colly.HTMLElement) {
// Handle blank <p></p> surrounding the synopsis
if strings.TrimSpace(e.Text) != "" {
sc.Synopsis = strings.TrimSpace(e.Text)
e.ForEach(`div.list-of-categories__p`, func(id int, e *colly.HTMLElement) {
synopsis := e.Text

if synopsis == "" {
synopsis = e.ChildText(`p.ql-align-justify`)

if synopsis == "" {
e.ForEach(`div`, func(id int, e *colly.HTMLElement) {
synopsis = synopsis + " " + strings.TrimSpace(e.Text)
})

}
}

if strings.TrimSpace(synopsis) != "" {
sc.Synopsis = strings.TrimSpace(synopsis)
}
})

Expand Down Expand Up @@ -104,10 +121,13 @@ func SexBabesVR(wg *models.ScrapeWG, updateSite bool, knownScenes []string, out
})

siteCollector.OnHTML(`div.videos__content`, func(e *colly.HTMLElement) {
e.ForEach(`a.video-container__description--title`, func(cnt int, e *colly.HTMLElement) {
e.ForEach(`a.video-container__image`, func(cnt int, e *colly.HTMLElement) {
sceneURL := e.Request.AbsoluteURL(e.Attr("href"))
if !funk.ContainsString(knownScenes, sceneURL) {
sceneCollector.Visit(sceneURL)
coverURL := e.ChildAttr("a.video-container__image img", "data-src")
ctx := colly.NewContext()
ctx.Put("coverURL", coverURL)
sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
}
})
})
Expand Down

0 comments on commit 322e62e

Please sign in to comment.