Create Pixhost.to crawler (#436)
* Create Pixhost.to crawler

* Ruff changes
jbsparrow authored Jan 5, 2025
1 parent 85698ee commit c9a9e19
Showing 3 changed files with 83 additions and 0 deletions.
1 change: 1 addition & 0 deletions cyberdrop_dl/managers/client_manager.py
@@ -29,6 +29,7 @@
DOWNLOAD_ERROR_ETAGS = {
    "d835884373f4d6c8f24742ceabe74946": "Imgur image has been removed",
    "65b7753c-528a": "SC Scrape Image",
    "5c4fb843-ece": "PixHost Removed Image",
}

DDOS_GUARD_CHALLENGE_TITLES = ["Just a moment...", "DDoS-Guard"]
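
For context, these ETags identify the placeholder files some hosts serve in place of removed images; the new entry lets downloads of PixHost's placeholder be flagged as errors. A minimal sketch of how such a check might look, assuming an aiohttp-style response object; the function name, `resp`, and the error type are illustrative, not the project's actual download path:

    # Hypothetical sketch: reject downloads whose ETag matches a known placeholder.
    def check_etag(resp) -> None:
        etag = resp.headers.get("ETag", "").strip('"')
        if etag in DOWNLOAD_ERROR_ETAGS:
            raise ValueError(DOWNLOAD_ERROR_ETAGS[etag])  # e.g. "PixHost Removed Image"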
1 change: 1 addition & 0 deletions cyberdrop_dl/scraper/__init__.py
@@ -30,6 +30,7 @@
from cyberdrop_dl.scraper.crawlers.omegascans_crawler import OmegaScansCrawler
from cyberdrop_dl.scraper.crawlers.pimpandhost_crawler import PimpAndHostCrawler
from cyberdrop_dl.scraper.crawlers.pixeldrain_crawler import PixelDrainCrawler
from cyberdrop_dl.scraper.crawlers.pixhost_crawler import PixHostCrawler
from cyberdrop_dl.scraper.crawlers.postimg_crawler import PostImgCrawler
from cyberdrop_dl.scraper.crawlers.realbooru_crawler import RealBooruCrawler
from cyberdrop_dl.scraper.crawlers.reddit_crawler import RedditCrawler
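
PixHostCrawler is imported here alongside the other crawlers; this package-level import list is what makes each crawler visible to the scraper. A minimal sketch of the kind of registry such an import list enables (the list, function, and loop are hypothetical, not the project's actual wiring):

    # Hypothetical sketch: build one crawler instance per supported host.
    CRAWLERS = [PixelDrainCrawler, PixHostCrawler, PostImgCrawler]  # ...and the rest

    def build_crawlers(manager: Manager) -> dict[str, Crawler]:
        return {c.primary_base_domain.host: c(manager) for c in CRAWLERS}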
81 changes: 81 additions & 0 deletions cyberdrop_dl/scraper/crawlers/pixhost_crawler.py
@@ -0,0 +1,81 @@
from __future__ import annotations

import contextlib
from typing import TYPE_CHECKING

from aiolimiter import AsyncLimiter
from yarl import URL

from cyberdrop_dl.clients.errors import MaxChildrenError
from cyberdrop_dl.scraper.crawler import Crawler
from cyberdrop_dl.utils.data_enums_classes.url_objects import FILE_HOST_ALBUM, ScrapeItem
from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_filename_and_ext

if TYPE_CHECKING:
    from bs4 import BeautifulSoup

    from cyberdrop_dl.managers.manager import Manager


class PixHostCrawler(Crawler):
    primary_base_domain = URL("https://pixhost.to/")

    def __init__(self, manager: Manager) -> None:
        super().__init__(manager, "pixhost", "PixHost")
        self.request_limiter = AsyncLimiter(10, 1)  # at most 10 requests per second

    """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""

    async def fetch(self, scrape_item: ScrapeItem) -> None:
        """Determines where to send the scrape item based on the URL."""
        task_id = self.scraping_progress.add_task(scrape_item.url)
        url_parts = scrape_item.url.parts

        if "gallery" in url_parts:
            await self.gallery(scrape_item)
        elif "show" in url_parts:
            await self.image(scrape_item)

        self.scraping_progress.remove_task(task_id)

    @error_handling_wrapper
    async def gallery(self, scrape_item: ScrapeItem) -> None:
        """Scrapes a gallery."""
        scrape_item.type = FILE_HOST_ALBUM
        scrape_item.children = scrape_item.children_limit = 0

        with contextlib.suppress(IndexError, TypeError):
            scrape_item.children_limit = (
                self.manager.config_manager.settings_data.download_options.maximum_number_of_children[scrape_item.type]
            )
        async with self.request_limiter:
            soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)

        title = soup.select_one("a[class=link] h2").text
        scrape_item.add_to_parent_title(f"{title} (PixHost)")

        images = soup.select("div[class=images] a img")
        for image in images:
            link = image.get("src")
            if not link:
                continue
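            # Rewrite the thumbnail URL to the full-size image URL
            # (t*.pixhost.to/thumbs/... -> img*.pixhost.to/images/...).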
            link = link.replace("https://t", "https://img").replace("/thumbs/", "/images/")
            link = URL(link)
            filename, ext = get_filename_and_ext(link.name)
            await self.handle_file(link, scrape_item, filename, ext)
            scrape_item.children += 1
            if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit:
                raise MaxChildrenError(origin=scrape_item)

    @error_handling_wrapper
    async def image(self, scrape_item: ScrapeItem) -> None:
        """Scrapes an image."""
        if await self.check_complete_from_referer(scrape_item):
            return

        async with self.request_limiter:
            soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)

        link = URL(soup.select_one("img[class=image-img]").get("src"))
        filename, ext = get_filename_and_ext(link.name)
        await self.handle_file(link, scrape_item, filename, ext)
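
A quick illustration of the routing in fetch() and the thumbnail rewrite in gallery() (the example URLs are hypothetical, shaped only by the string replacements in the code above):

    # https://pixhost.to/gallery/AbCdE         -> gallery()  ("gallery" in url.parts)
    # https://pixhost.to/show/123/456_img.jpg  -> image()    ("show" in url.parts)
    thumb = "https://t77.pixhost.to/thumbs/123/456_img.jpg"
    full = thumb.replace("https://t", "https://img").replace("/thumbs/", "/images/")
    assert full == "https://img77.pixhost.to/images/123/456_img.jpg"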
