-
Notifications
You must be signed in to change notification settings - Fork 133
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
scraper: Add Up Close VR Scraper (#1853)
* WIP Full Site Scrape Only Currently scrapes the whole site. Needs logic to prevent rescrapes of scenes already processed. Needs logic to handle single scene scrapes. This is a completely different body request then whole site. The JSON is in the same format for single scenes so whole site logic can be reused for extracting data. Tags need filtering as original series and adult time original are redundant tags. Code needs to be cleaned and formatted * Final Works. Tested both single scene and full site. All data available is retrieved. There is a bug when scraping single scene sites that the pop up doesn't show to save it. Unsure if it is bug in my XBVR or something in my code. * Remove Junk lines * More clean up * Code Clean Up
- Loading branch information
Showing
1 changed file
with
158 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
package scrape | ||
|
||
import ( | ||
"regexp" | ||
"strconv" | ||
"strings" | ||
"sync" | ||
|
||
"github.com/go-resty/resty/v2" | ||
"github.com/gocolly/colly/v2" | ||
"github.com/mozillazg/go-slugify" | ||
"github.com/nleeper/goment" | ||
"github.com/thoas/go-funk" | ||
"github.com/tidwall/gjson" | ||
"github.com/xbapps/xbvr/pkg/models" | ||
) | ||
|
||
func UpCloseVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene, singleSceneURL string, singeScrapeAdditionalInfo string, limitScraping bool) error { | ||
// this scraper is non-standard in that it gathers info via an api rather than scraping html pages | ||
defer wg.Done() | ||
scraperID := "upclosevr" | ||
siteID := "UpCloseVR" | ||
logScrapeStart(scraperID, siteID) | ||
|
||
siteCollector := createCollector("www.upclosevr.com") | ||
|
||
siteCollector.OnHTML(`script`, func(e *colly.HTMLElement) { | ||
apiKeyRegex := regexp.MustCompile(`"apiKey":"(.+)"}},"site`) | ||
applicationIDRegex := regexp.MustCompile(`"applicationID":"(.+)","apiKey`) | ||
apiKey := apiKeyRegex.FindStringSubmatch(e.Text) | ||
applicationID := applicationIDRegex.FindStringSubmatch(e.Text) | ||
|
||
if len(apiKey) > 0 && len(applicationID) > 0 { | ||
pageTotal := 1 | ||
client := resty.New() | ||
|
||
for page := 0; page < pageTotal; page++ { | ||
|
||
var payloadStr string | ||
if singleSceneURL != "" { | ||
tmp := strings.Split(singleSceneURL, "/") | ||
sceneID := tmp[len(tmp)-1] | ||
payloadStr = `{"requests":[{"indexName":"all_scenes","params":"clickAnalytics=true&facetFilters=%5B%5B%22availableOnSite%3Aupclosevr%22%5D%2C%5B%22clip_id%3A` + sceneID + `%22%5D%5D&facets=%5B%5D&hitsPerPage=1&tagFilters="}]}` | ||
} else { | ||
payloadStr = `{"requests":[{"indexName":"all_scenes_latest_desc","params":"analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Aupclosevr%22%2C%22context%3Avideos%22%2C%22device%3Adesktop%22%5D&clickAnalytics=true&facetingAfterDistinct=true&facets=%5B%22categories.name%22%5D&filters=(upcoming%3A'0')%20AND%20availableOnSite%3Aupclosevr&highlightPostTag=__%2Fais-highlight__&highlightPreTag=__ais-highlight__&hitsPerPage=60&maxValuesPerFacet=1000&page=` + strconv.Itoa(page) + `&query=&tagFilters="}]}` | ||
} | ||
|
||
var payload = strings.NewReader(payloadStr) | ||
resp, err := client.R(). | ||
SetHeader("Origin", "https://www.upclosevr.com"). | ||
SetHeader("Referer", "https://www.upclosevr.com/"). | ||
SetHeader("User-Agent", UserAgent). | ||
SetHeader("x-algolia-api-key", apiKey[1]). | ||
SetHeader("x-algolia-application-id", applicationID[1]). | ||
SetBody(payload). | ||
Post("https://tsmkfa364q-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(4.22.1)%3B%20Browser%3B%20instantsearch.js%20(4.64.3)%3B%20react%20(18.2.0)%3B%20react-instantsearch%20(7.5.5)%3B%20react-instantsearch-core%20(7.5.5)%3B%20JS%20Helper%20(3.16.2)") | ||
|
||
if err != nil { | ||
log.Errorln("UpCloseVR encourtned an error on the API Call", err) | ||
return | ||
} | ||
|
||
// Convert the resp into a json string for gjson usability | ||
jsonString := resp.String() | ||
|
||
// Check to see if there are multiple pages of results | ||
if pageTotal == 1 && singleSceneURL == "" && !limitScraping { | ||
pageTotal = int(gjson.Get(jsonString, "results.0.nbPages").Int()) | ||
} | ||
|
||
// Make sure we are getting valid response. If the hits array is zero something went wrong | ||
if len(gjson.Get(jsonString, "results.0.hits").Array()) == 0 { | ||
log.Errorln("No Results found for UpCloseVR message:", gjson.Get(jsonString, "message").String(), "response code:", gjson.Get(jsonString, "status").String()) | ||
} | ||
|
||
// iterate over each hit result | ||
for i, _ := range gjson.Get(jsonString, "results.0.hits").Array() { | ||
queryStr := `results.0.hits.` + strconv.Itoa(i) | ||
|
||
// Check to make sure we don't update scenes we have already collected | ||
sceneID := gjson.Get(jsonString, queryStr+`.clip_id`).String() | ||
sceneURL := `https://www.upclosevr.com/en/video/upclosevr/` + gjson.Get(jsonString, queryStr+`.url_title`).String() + `/` + sceneID | ||
if !funk.ContainsString(knownScenes, sceneURL) || singleSceneURL != "" { | ||
|
||
sc := models.ScrapedScene{} | ||
|
||
sc.ScraperID = scraperID | ||
sc.SceneType = "VR" | ||
sc.Studio = siteID | ||
sc.Site = siteID | ||
sc.SiteID = sceneID | ||
sc.HomepageURL = sceneURL | ||
|
||
// Scene ID | ||
sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID | ||
|
||
// Date | ||
tmpDate, _ := goment.New(gjson.Get(jsonString, queryStr+`.release_date`).String(), "YYYY-MM-DD") | ||
sc.Released = tmpDate.Format("YYYY-MM-DD") | ||
|
||
// Cover | ||
sc.Covers = append(sc.Covers, `https://transform.gammacdn.com/movies/`+gjson.Get(jsonString, queryStr+`.pictures.1920x1080`).String()) | ||
|
||
// Synopsis | ||
sc.Synopsis = strings.TrimSpace(strings.Replace(gjson.Get(jsonString, queryStr+`.description`).String(), "</br></br>", " ", -1)) | ||
|
||
// Title | ||
sc.Title = strings.TrimSpace(gjson.Get(jsonString, queryStr+`.title`).String()) | ||
log.Infoln(`Scraping ` + sc.Title) | ||
|
||
// Cast - Females Only can be update to include males if wanted | ||
sc.ActorDetails = make(map[string]models.ActorDetails) | ||
for i, name := range gjson.Get(jsonString, queryStr+`.female_actors.#.name`).Array() { | ||
sc.Cast = append(sc.Cast, name.String()) | ||
|
||
actorQuery := queryStr + `.female_actors.` + strconv.Itoa(i) | ||
|
||
sc.ActorDetails[name.String()] = models.ActorDetails{ | ||
Source: scraperID + " scrape", | ||
ProfileUrl: `https://www.upclosevr.com/en/pornstar/view/` + gjson.Get(jsonString, actorQuery+`.url_name`).String() + `/` + gjson.Get(jsonString, actorQuery+`.actor_id`).String(), | ||
} | ||
} | ||
|
||
// Junk Tags we don't want to add to scene data | ||
skiptags := map[string]bool{ | ||
"Original Series": true, | ||
"Adult Time Original": true, | ||
} | ||
|
||
// Tags | ||
for _, name := range gjson.Get(jsonString, queryStr+`.categories.#.name`).Array() { | ||
if !skiptags[name.String()] { | ||
sc.Tags = append(sc.Tags, name.String()) | ||
} | ||
} | ||
|
||
// Duration is in total seconds | ||
sc.Duration = int(gjson.Get(jsonString, queryStr+`.length`).Int()) / 60 | ||
|
||
out <- sc | ||
} | ||
} | ||
} | ||
} | ||
}) | ||
|
||
siteCollector.Visit("https://www.upclosevr.com/en/videos") | ||
|
||
if updateSite { | ||
updateSiteLastUpdate(scraperID) | ||
} | ||
logScrapeFinished(scraperID, siteID) | ||
return nil | ||
} | ||
|
||
func init() { | ||
registerScraper("upclosevr", "Up Close VR", "https://static01-cms-fame.gammacdn.com/upclosevr/m/3ixx4xg65im880g8/UpClose-VR_Favicon_114x114.png", "upclosevr.com", UpCloseVR) | ||
} |