From 322e62e6c715143b47b74648c2028085cf5c6bfd Mon Sep 17 00:00:00 2001 From: pops64 Date: Fri, 11 Oct 2024 11:03:41 -0400 Subject: [PATCH] scraper: Fix for SexbabesVR scraper (#1847) * Fix For SexbabesVR Scraper The scene id in the webpage now seems to be 614 for all scenes, causing all scenes to be rescraped and new scenes never to be added. This pulls the poster url, which appears to have a unique identifier in the 2nd to last directory. Also updated the cover URL to pull the image used for the thumbnail on the index page, as the latest scene has an SBS image for the cover whereas the thumbnail contains a more useful image. All appears functional. * Remove Debug Prompts * Fix for the blank Synopsis There are three separate variations on how they have this information posted depending on the age of the scene. A random sampling over all scenes shows that the synopsis is successfully being scraped. * Add Migration Code It ran once; I am unsure of how to properly test it, though. * Fix Logic * Improve Migration Code Added some error handling in case the website is unreachable. Added logic to ensure we only check scenes originating from SexBabesVR. Check only scenes starting at 600 as this is where the reported divergence between the sceneID sources' numbering occurred. 
And only update scenes that diverge in id --- pkg/migrations/migrations.go | 86 ++++++++++++++++++++++++++++++++++++ pkg/scrape/sexbabesvr.go | 38 ++++++++++++---- 2 files changed, 115 insertions(+), 9 deletions(-) diff --git a/pkg/migrations/migrations.go b/pkg/migrations/migrations.go index 8b8419b11..c2b0ce823 100644 --- a/pkg/migrations/migrations.go +++ b/pkg/migrations/migrations.go @@ -9,10 +9,12 @@ import ( "path/filepath" "regexp" "runtime" + "strconv" "strings" "time" "github.com/go-resty/resty/v2" + "github.com/gocolly/colly/v2" "github.com/jinzhu/gorm" "github.com/markphelps/optional" "github.com/mozillazg/go-slugify" @@ -1986,6 +1988,90 @@ func Migrate() { return tx.Model(&models.Tag{}).Exec("delete from tags where `count` = 0").Error }, }, + { + // Had to switch to a differnt sceneID source causing a shift in sceneIDs + ID: "0080-fix-SexBabesVR-ids", + Migrate: func(tx *gorm.DB) error { + newSceneId := func(site string, url string) (string, int) { + sceneID := "" + statusCode := 200 + + sceneCollector := colly.NewCollector( + colly.AllowedDomains("sexbabesvr.com"), + ) + + sceneCollector.OnError(func(r *colly.Response, err error) { + common.Log.Errorf("Error visiting %s %s", r.Request.URL, err) + statusCode = r.StatusCode + }) + + sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) { + + // Scene ID + e.ForEach(`dl8-video`, func(id int, e *colly.HTMLElement) { + posterURL := e.Request.AbsoluteURL(e.Attr("poster")) + tmp := strings.Split(posterURL, "/") + sceneID = slugify.Slugify(site) + "-" + tmp[len(tmp)-2] + }) + }) + + sceneCollector.Visit(url) + + return sceneID, statusCode + } + + var scenes []models.Scene + err := tx.Where("studio = ?", "SexBabesVR").Find(&scenes).Error + if err != nil { + return err + } + for _, scene := range scenes { + + // Need both the siteID string and the sceneID has interger for logic + tmp := strings.Split(scene.SceneID, "-") + sceneIDint, _ := strconv.Atoi(tmp[1]) + + // Check to make we only are updating scenes 
orginating on SexbabsVR and only starting at scene 600, sc.SiteID is is not accurate in terms of alt sites + // Scene 600 is where the scene IDs start to merge when changing our scene ID source for SexBabesVR + if tmp[0] == "sexbabesvr" && sceneIDint >= 600 { + + common.Log.Infoln("Checking sceneid:", scene.SceneID) + sceneID, statusCode := newSceneId(scene.Site, scene.SceneURL) + + if statusCode != 200 { + return err + } + + if sceneID == "" { + common.Log.Warnf("Could not update scene %s", scene.SceneID) + continue + } + + if scene.SceneID != sceneID { + // update all actions referring to this scene by its scene_id + err = tx.Model(&models.Action{}).Where("scene_id = ?", scene.SceneID).Update("scene_id", sceneID).Error + if err != nil { + return err + } + + // update the scene itself + common.Log.Infoln("Updating sceneid:", scene.SceneID, "to", sceneID) + scene.SceneID = sceneID + err = tx.Save(&scene).Error + if err != nil { + return err + } + } + + } + } + + // since scenes have new IDs, we need to re-index them + tasks.SearchIndex() + + return nil + }, + }, }) if err := m.Migrate(); err != nil { diff --git a/pkg/scrape/sexbabesvr.go b/pkg/scrape/sexbabesvr.go index 16bb81e22..6579c8a13 100644 --- a/pkg/scrape/sexbabesvr.go +++ b/pkg/scrape/sexbabesvr.go @@ -30,13 +30,18 @@ func SexBabesVR(wg *models.ScrapeWG, updateSite bool, knownScenes []string, out sc.Site = siteID sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0] - // Scene ID - + // Scene ID e.ForEach(`dl8-video`, func(id int, e *colly.HTMLElement) { - sc.SiteID = e.Attr("data-scene") + posterURL := e.Request.AbsoluteURL(e.Attr("poster")) + tmp := strings.Split(posterURL, "/") + sc.SiteID = tmp[len(tmp)-2] sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID - sc.Covers = append(sc.Covers, strings.Replace(e.Attr("poster"), "/videoDetail2x", "", -1)) }) + // Cover Url + coverURL := e.Request.Ctx.GetAny("coverURL").(string) + sc.Covers = append(sc.Covers, coverURL) + // Title 
e.ForEach(`div.video-detail__description--container h1`, func(id int, e *colly.HTMLElement) { sc.Title = strings.TrimSpace(e.Text) @@ -48,10 +53,22 @@ func SexBabesVR(wg *models.ScrapeWG, updateSite bool, knownScenes []string, out }) // Synopsis - e.ForEach(`div.video-detail>div.container>p`, func(id int, e *colly.HTMLElement) { - // Handle blank

surrounding the synopsis - if strings.TrimSpace(e.Text) != "" { - sc.Synopsis = strings.TrimSpace(e.Text) + e.ForEach(`div.list-of-categories__p`, func(id int, e *colly.HTMLElement) { + synopsis := e.Text + + if synopsis == "" { + synopsis = e.ChildText(`p.ql-align-justify`) + + if synopsis == "" { + e.ForEach(`div`, func(id int, e *colly.HTMLElement) { + synopsis = synopsis + " " + strings.TrimSpace(e.Text) + }) + + } + } + + if strings.TrimSpace(synopsis) != "" { + sc.Synopsis = strings.TrimSpace(synopsis) } }) @@ -104,10 +121,13 @@ func SexBabesVR(wg *models.ScrapeWG, updateSite bool, knownScenes []string, out }) siteCollector.OnHTML(`div.videos__content`, func(e *colly.HTMLElement) { - e.ForEach(`a.video-container__description--title`, func(cnt int, e *colly.HTMLElement) { + e.ForEach(`a.video-container__image`, func(cnt int, e *colly.HTMLElement) { sceneURL := e.Request.AbsoluteURL(e.Attr("href")) if !funk.ContainsString(knownScenes, sceneURL) { - sceneCollector.Visit(sceneURL) + coverURL := e.ChildAttr("a.video-container__image img", "data-src") + ctx := colly.NewContext() + ctx.Put("coverURL", coverURL) + sceneCollector.Request("GET", sceneURL, nil, ctx, nil) } }) })