From 98b72a034cf7528dedcc8c965b928cf59313ac74 Mon Sep 17 00:00:00 2001
From: tarrawhitefan <149216089+tarrawhitefan@users.noreply.github.com>
Date: Sun, 5 Nov 2023 15:29:11 +0100
Subject: [PATCH] scraper: Validate cover image for VirtualReal websites
 (#1458)

Some of the covers provided by the VirtualReal websites use a pixel
format that is not compatible with Go's `image/jpeg` decoder, nor with
some VR players such as HereSphere. It should be reasonably safe to
assume that if an image can't be decoded by the Go decoders, it may not
work in some players either.

This patch validates the cover images against the Go decoders and falls
back to a gallery image as the cover if no valid cover image is found.

This is the only scraper where I have hit this issue so far, but more
scrapers may need the same changes.
---
 pkg/scrape/virtualrealporn.go | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/pkg/scrape/virtualrealporn.go b/pkg/scrape/virtualrealporn.go
index 36be99d25..cef44c682 100644
--- a/pkg/scrape/virtualrealporn.go
+++ b/pkg/scrape/virtualrealporn.go
@@ -1,9 +1,11 @@
 package scrape
 
 import (
+	"bytes"
 	"encoding/json"
 	"fmt"
 	"html"
+	"image"
 	"strconv"
 	"strings"
 	"sync"
@@ -20,11 +22,18 @@ func VirtualRealPornSite(wg *sync.WaitGroup, updateSite bool, knownScenes []stri
 	logScrapeStart(scraperID, siteID)
 
 	page := 1
+	imageCollector := createCollector("virtualrealporn.com", "virtualrealtrans.com", "virtualrealgay.com", "virtualrealpassion.com", "virtualrealamateurporn.com")
 	sceneCollector := createCollector("virtualrealporn.com", "virtualrealtrans.com", "virtualrealgay.com", "virtualrealpassion.com", "virtualrealamateurporn.com")
 	siteCollector := createCollector("virtualrealporn.com", "virtualrealtrans.com", "virtualrealgay.com", "virtualrealpassion.com", "virtualrealamateurporn.com")
 	castCollector := createCollector("virtualrealporn.com", "virtualrealtrans.com", "virtualrealgay.com", "virtualrealpassion.com", "virtualrealamateurporn.com")
 	castCollector.AllowURLRevisit = true
 
+	imageCollector.OnResponse(func(r *colly.Response) {
+		if _, _, err := image.Decode(bytes.NewReader(r.Body)); err == nil {
+			r.Ctx.Put("valid", "1")
+		}
+	})
+
 	sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
 		sc := models.ScrapedScene{}
 		sc.ScraperID = scraperID
@@ -54,14 +63,30 @@ func VirtualRealPornSite(wg *sync.WaitGroup, updateSite bool, knownScenes []stri
 
 		// Cover URLs
 		e.ForEach(`meta[property="og:image"]`, func(id int, e *colly.HTMLElement) {
-			if id == 0 {
-				sc.Covers = append(sc.Covers, strings.Split(e.Request.AbsoluteURL(e.Attr("content")), "?")[0])
+			if len(sc.Covers) == 0 {
+				u := strings.Split(e.Request.AbsoluteURL(e.Attr("content")), "?")[0]
+				ctx := colly.NewContext()
+				if err := imageCollector.Request("GET", u, nil, ctx, nil); err == nil {
+					if ctx.Get("valid") != "" {
+						sc.Covers = append(sc.Covers, u)
+					}
+				}
 			}
 		})
 
 		// Gallery
		e.ForEach(`figure[itemprop="associatedMedia"] a`, func(id int, e *colly.HTMLElement) {
-			sc.Gallery = append(sc.Gallery, e.Request.AbsoluteURL(strings.Split(e.Attr("href"), "?")[0]))
+			if len(sc.Covers) == 0 {
+				u := e.Request.AbsoluteURL(strings.Split(e.Attr("href"), "?")[0])
+				ctx := colly.NewContext()
+				if err := imageCollector.Request("GET", u, nil, ctx, nil); err == nil {
+					if ctx.Get("valid") != "" {
+						sc.Covers = append(sc.Covers, u)
+					}
+				}
+			} else {
+				sc.Gallery = append(sc.Gallery, e.Request.AbsoluteURL(strings.Split(e.Attr("href"), "?")[0]))
+			}
 		})
 
 		// Tags
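
For reference, the validation idea can be sketched standalone with plain net/http and Go's standard image package. The helper names isDecodableImage and pickCover below are illustrative only and are not part of the patch; note that image.Decode only recognizes formats whose decoders have been registered, usually via blank imports.

// Minimal sketch of the cover-validation idea, independent of the scraper.
// isDecodableImage and pickCover are hypothetical helpers, not patch code.
package main

import (
	"bytes"
	"fmt"
	"image"
	"io"
	"net/http"

	// image.Decode only understands formats whose decoders are registered,
	// typically through blank imports like these.
	_ "image/jpeg"
	_ "image/png"
)

// isDecodableImage reports whether the image at url can be decoded by the
// registered Go image decoders.
func isDecodableImage(url string) bool {
	resp, err := http.Get(url)
	if err != nil {
		return false
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return false
	}
	_, _, err = image.Decode(bytes.NewReader(body))
	return err == nil
}

// pickCover returns the cover candidate if it decodes, otherwise the first
// gallery image that does; the boolean is false if nothing validates.
func pickCover(cover string, gallery []string) (string, bool) {
	if isDecodableImage(cover) {
		return cover, true
	}
	for _, u := range gallery {
		if isDecodableImage(u) {
			return u, true
		}
	}
	return "", false
}

func main() {
	// Placeholder URLs for illustration only.
	cover, ok := pickCover("https://example.com/cover.jpg", []string{"https://example.com/gallery1.jpg"})
	fmt.Println(cover, ok)
}

The patch performs the same check through a dedicated colly collector instead, presumably so the validation request goes through the same collector setup (createCollector) as the rest of the scraper, passing the result back via the per-request colly Context.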