Skip to content

Commit

Permalink
scraper: Validate cover image for VirtualReal websites (#1458)
Browse files Browse the repository at this point in the history
Some of the covers provided by the VirtualReal websites are using a
pixel format that is not compatible with golang's `image/jpeg` decoder
and also with some VR players like HereSphere. It should be reasonably safe
to assume that if an image can't be decoded by the golang decoders, it may
not work in some players.

This patch validates the cover images against the golang decoders, and
falls back to using a gallery image as the cover if no valid cover image
is found.

This is the only scraper where I have seen this issue so far, but more
scrapers may need the same changes.
  • Loading branch information
tarrawhitefan authored Nov 5, 2023
1 parent b368ba5 commit 98b72a0
Showing 1 changed file with 28 additions and 3 deletions.
31 changes: 28 additions & 3 deletions pkg/scrape/virtualrealporn.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package scrape

import (
"bytes"
"encoding/json"
"fmt"
"html"
"image"
"strconv"
"strings"
"sync"
Expand All @@ -20,11 +22,18 @@ func VirtualRealPornSite(wg *sync.WaitGroup, updateSite bool, knownScenes []stri
logScrapeStart(scraperID, siteID)
page := 1

imageCollector := createCollector("virtualrealporn.com", "virtualrealtrans.com", "virtualrealgay.com", "virtualrealpassion.com", "virtualrealamateurporn.com")
sceneCollector := createCollector("virtualrealporn.com", "virtualrealtrans.com", "virtualrealgay.com", "virtualrealpassion.com", "virtualrealamateurporn.com")
siteCollector := createCollector("virtualrealporn.com", "virtualrealtrans.com", "virtualrealgay.com", "virtualrealpassion.com", "virtualrealamateurporn.com")
castCollector := createCollector("virtualrealporn.com", "virtualrealtrans.com", "virtualrealgay.com", "virtualrealpassion.com", "virtualrealamateurporn.com")
castCollector.AllowURLRevisit = true

imageCollector.OnResponse(func(r *colly.Response) {
if _, _, err := image.Decode(bytes.NewReader(r.Body)); err == nil {
r.Ctx.Put("valid", "1")
}
})

sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
sc := models.ScrapedScene{}
sc.ScraperID = scraperID
Expand Down Expand Up @@ -54,14 +63,30 @@ func VirtualRealPornSite(wg *sync.WaitGroup, updateSite bool, knownScenes []stri

// Cover URLs
e.ForEach(`meta[property="og:image"]`, func(id int, e *colly.HTMLElement) {
if id == 0 {
sc.Covers = append(sc.Covers, strings.Split(e.Request.AbsoluteURL(e.Attr("content")), "?")[0])
if len(sc.Covers) == 0 {
u := strings.Split(e.Request.AbsoluteURL(e.Attr("content")), "?")[0]
ctx := colly.NewContext()
if err := imageCollector.Request("GET", u, nil, ctx, nil); err == nil {
if ctx.Get("valid") != "" {
sc.Covers = append(sc.Covers, u)
}
}
}
})

// Gallery
e.ForEach(`figure[itemprop="associatedMedia"] a`, func(id int, e *colly.HTMLElement) {
sc.Gallery = append(sc.Gallery, e.Request.AbsoluteURL(strings.Split(e.Attr("href"), "?")[0]))
if len(sc.Covers) == 0 {
u := e.Request.AbsoluteURL(strings.Split(e.Attr("href"), "?")[0])
ctx := colly.NewContext()
if err := imageCollector.Request("GET", u, nil, ctx, nil); err == nil {
if ctx.Get("valid") != "" {
sc.Covers = append(sc.Covers, u)
}
}
} else {
sc.Gallery = append(sc.Gallery, e.Request.AbsoluteURL(strings.Split(e.Attr("href"), "?")[0]))
}
})

// Tags
Expand Down

0 comments on commit 98b72a0

Please sign in to comment.