diff --git a/cyberdrop_dl/managers/client_manager.py b/cyberdrop_dl/managers/client_manager.py
index 1532e181..d3258c9b 100644
--- a/cyberdrop_dl/managers/client_manager.py
+++ b/cyberdrop_dl/managers/client_manager.py
@@ -29,6 +29,7 @@
 DOWNLOAD_ERROR_ETAGS = {
     "d835884373f4d6c8f24742ceabe74946": "Imgur image has been removed",
     "65b7753c-528a": "SC Scrape Image",
+    "5c4fb843-ece": "PixHost Removed Image",
 }
 
 DDOS_GUARD_CHALLENGE_TITLES = ["Just a moment...", "DDoS-Guard"]
diff --git a/cyberdrop_dl/scraper/__init__.py b/cyberdrop_dl/scraper/__init__.py
index 2174e852..3f9bf262 100644
--- a/cyberdrop_dl/scraper/__init__.py
+++ b/cyberdrop_dl/scraper/__init__.py
@@ -30,6 +30,7 @@
 from cyberdrop_dl.scraper.crawlers.omegascans_crawler import OmegaScansCrawler
 from cyberdrop_dl.scraper.crawlers.pimpandhost_crawler import PimpAndHostCrawler
 from cyberdrop_dl.scraper.crawlers.pixeldrain_crawler import PixelDrainCrawler
+from cyberdrop_dl.scraper.crawlers.pixhost_crawler import PixHostCrawler
 from cyberdrop_dl.scraper.crawlers.postimg_crawler import PostImgCrawler
 from cyberdrop_dl.scraper.crawlers.realbooru_crawler import RealBooruCrawler
 from cyberdrop_dl.scraper.crawlers.reddit_crawler import RedditCrawler
diff --git a/cyberdrop_dl/scraper/crawlers/pixhost_crawler.py b/cyberdrop_dl/scraper/crawlers/pixhost_crawler.py
new file mode 100644
index 00000000..3b71858c
--- /dev/null
+++ b/cyberdrop_dl/scraper/crawlers/pixhost_crawler.py
@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+import contextlib
+from typing import TYPE_CHECKING
+
+from aiolimiter import AsyncLimiter
+from yarl import URL
+
+from cyberdrop_dl.clients.errors import MaxChildrenError
+from cyberdrop_dl.scraper.crawler import Crawler
+from cyberdrop_dl.utils.data_enums_classes.url_objects import FILE_HOST_ALBUM, ScrapeItem
+from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_filename_and_ext
+
+if TYPE_CHECKING:
+    from bs4 import BeautifulSoup
+
+    from cyberdrop_dl.managers.manager import Manager
+
+
+class PixHostCrawler(Crawler):
+    primary_base_domain = URL("https://pixhost.to/")
+
+    def __init__(self, manager: Manager) -> None:
+        super().__init__(manager, "pixhost", "PixHost")
+        self.request_limiter = AsyncLimiter(10, 1)
+
+    """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""
+
+    async def fetch(self, scrape_item: ScrapeItem) -> None:
+        """Determines where to send the scrape item based on the url."""
+        task_id = self.scraping_progress.add_task(scrape_item.url)
+        url_parts = scrape_item.url.parts
+
+        if "gallery" in url_parts:
+            await self.gallery(scrape_item)
+        elif "show" in url_parts:
+            await self.image(scrape_item)
+
+        self.scraping_progress.remove_task(task_id)
+
+    @error_handling_wrapper
+    async def gallery(self, scrape_item: ScrapeItem) -> None:
+        """Scrapes a gallery."""
+        scrape_item.type = FILE_HOST_ALBUM
+        scrape_item.children = scrape_item.children_limit = 0
+
+        with contextlib.suppress(IndexError, TypeError):
+            scrape_item.children_limit = (
+                self.manager.config_manager.settings_data.download_options.maximum_number_of_children[scrape_item.type]
+            )
+        async with self.request_limiter:
+            soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)
+
+        title = soup.select_one("a[class=link] h2").text
+        scrape_item.add_to_parent_title(f"{title} (PixHost)")
+
+        images = soup.select("div[class=images] a img")
+        for image in images:
+            link = image.get("src")
+            if not link:
+                continue
+            link = link.replace("https://t", "https://img").replace("/thumbs/", "/images/")
+            link = URL(link)
+            filename, ext = get_filename_and_ext(link.name)
+            await self.handle_file(link, scrape_item, filename, ext)
+            scrape_item.children += 1
+            if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit:
+                raise MaxChildrenError(origin=scrape_item)
+
+    @error_handling_wrapper
+    async def image(self, scrape_item: ScrapeItem) -> None:
+        """Scrapes an image."""
+        if await self.check_complete_from_referer(scrape_item):
+            return
+
+        async with self.request_limiter:
+            soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)
+
+        link = URL(soup.select_one("img[class=image-img]").get("src"))
+        filename, ext = get_filename_and_ext(link.name)
+        await self.handle_file(link, scrape_item, filename, ext)
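Note on the gallery scrape: full-size URLs are derived directly from each
thumbnail src via the two replace() calls, so no per-image page fetch is
needed. A minimal standalone sketch of that rewrite (the example URL is
hypothetical; the host/path layout is assumed from the replace calls, not
verified against PixHost):

    from yarl import URL

    def thumb_to_image(src: str) -> URL:
        # Thumbnail host prefix "t" becomes "img"; the "/thumbs/" path
        # segment becomes "/images/". Mirrors PixHostCrawler.gallery().
        return URL(src.replace("https://t", "https://img").replace("/thumbs/", "/images/"))

    src = "https://t77.pixhost.to/thumbs/123/45678_example.jpg"  # hypothetical example
    print(thumb_to_image(src))  # https://img77.pixhost.to/images/123/45678_example.jpg

If a gallery links a removed image, the derived URL resolves to PixHost's
removed-image placeholder; the new "5c4fb843-ece" entry in
DOWNLOAD_ERROR_ETAGS lets the downloader recognize that placeholder by its
ETag and surface "PixHost Removed Image" as a download error instead of
saving the placeholder file.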