* Create Pixhost.to crawler
* Ruff changes
Showing 3 changed files with 83 additions and 0 deletions.
@@ -0,0 +1,81 @@
from __future__ import annotations

import contextlib
from typing import TYPE_CHECKING

from aiolimiter import AsyncLimiter
from yarl import URL

from cyberdrop_dl.clients.errors import MaxChildrenError
from cyberdrop_dl.scraper.crawler import Crawler
from cyberdrop_dl.utils.data_enums_classes.url_objects import FILE_HOST_ALBUM, ScrapeItem
from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_filename_and_ext

if TYPE_CHECKING:
    from bs4 import BeautifulSoup

    from cyberdrop_dl.managers.manager import Manager

class PixHostCrawler(Crawler):
    primary_base_domain = URL("https://pixhost.to/")

    def __init__(self, manager: Manager) -> None:
        super().__init__(manager, "pixhost", "PixHost")
        self.request_limiter = AsyncLimiter(10, 1)

    """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""

    async def fetch(self, scrape_item: ScrapeItem) -> None:
        """Determines where to send the scrape item based on the URL."""
        task_id = self.scraping_progress.add_task(scrape_item.url)
        url_parts = scrape_item.url.parts

        if "gallery" in url_parts:
            await self.gallery(scrape_item)
        elif "show" in url_parts:
            await self.image(scrape_item)

        self.scraping_progress.remove_task(task_id)

    @error_handling_wrapper
    async def gallery(self, scrape_item: ScrapeItem) -> None:
        """Scrapes a gallery."""
        scrape_item.type = FILE_HOST_ALBUM
        scrape_item.children = scrape_item.children_limit = 0

        # The configured per-album child limit may be absent; leave 0 (no limit) in that case.
        with contextlib.suppress(IndexError, TypeError):
            scrape_item.children_limit = (
                self.manager.config_manager.settings_data.download_options.maximum_number_of_children[scrape_item.type]
            )
        async with self.request_limiter:
            soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)

        title = soup.select_one("a[class=link] h2").text
        scrape_item.add_to_parent_title(f"{title} (PixHost)")

        images = soup.select("div[class=images] a img")
        for image in images:
            link = image.get("src")
            if not link:
                continue
            # Rewrite the thumbnail URL into the full-size image URL.
            link = link.replace("https://t", "https://img").replace("/thumbs/", "/images/")
            link = URL(link)
            filename, ext = get_filename_and_ext(link.name)
            await self.handle_file(link, scrape_item, filename, ext)
            scrape_item.children += 1
            if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit:
                raise MaxChildrenError(origin=scrape_item)

    @error_handling_wrapper
    async def image(self, scrape_item: ScrapeItem) -> None:
        """Scrapes an image."""
        if await self.check_complete_from_referer(scrape_item):
            return

        async with self.request_limiter:
            soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)

        link = URL(soup.select_one("img[class=image-img]").get("src"))
        filename, ext = get_filename_and_ext(link.name)
        await self.handle_file(link, scrape_item, filename, ext)
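
For reference, a minimal standalone sketch (not part of the commit) of the routing fetch() performs: yarl's URL.parts splits the path into segments, so simple membership tests select the handler. The example paths below are hypothetical, for illustration only.

    from yarl import URL

    def route(url: URL) -> str:
        # URL.parts is a tuple of path segments, e.g. ('/', 'gallery', 'AbCdE').
        parts = url.parts
        if "gallery" in parts:
            return "gallery"
        if "show" in parts:
            return "image"
        return "unsupported"

    # Hypothetical example URLs:
    print(route(URL("https://pixhost.to/gallery/AbCdE")))           # gallery
    print(route(URL("https://pixhost.to/show/1234/example.jpg")))   # image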
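The gallery scraper derives full-size image URLs from thumbnail src attributes with two string replacements. Below is a self-contained sketch of that rewrite, using a hypothetical thumbnail URL (actual PixHost hostnames may differ):

    from yarl import URL

    def thumb_to_full(src: str) -> URL:
        # Same replacements gallery() applies: thumbnail hosts begin with "t",
        # full-size hosts with "img", and /thumbs/ becomes /images/.
        return URL(src.replace("https://t", "https://img").replace("/thumbs/", "/images/"))

    # Hypothetical thumbnail URL, for illustration only:
    print(thumb_to_full("https://t77.pixhost.to/thumbs/123/example.jpg"))
    # -> https://img77.pixhost.to/images/123/example.jpg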