diff --git a/cyberdrop_dl/clients/errors.py b/cyberdrop_dl/clients/errors.py
index 253d196eb..c4c51696b 100644
--- a/cyberdrop_dl/clients/errors.py
+++ b/cyberdrop_dl/clients/errors.py
@@ -6,16 +6,15 @@
 from yarl import URL
 
+from cyberdrop_dl.utils.constants import VALIDATION_ERROR_FOOTER
+
 if TYPE_CHECKING:
+    from requests import Response
     from yaml.constructor import ConstructorError
 
     from cyberdrop_dl.scraper.crawler import ScrapeItem
     from cyberdrop_dl.utils.data_enums_classes.url_objects import MediaItem
 
-VALIDATION_ERROR_FOOTER = """
-Read the documentation for guidance on how to resolve this error: https://script-ware.gitbook.io/cyberdrop-dl/reference/configuration-options
-Please note, this is not a bug. Do not open issues related to this"""
-
 
 class CDLBaseError(Exception):
     """Base exception for cyberdrop-dl errors."""
@@ -105,6 +104,39 @@ def __init__(self, origin: ScrapeItem | MediaItem | URL | None = None) -> None:
         super().__init__(ui_message, origin=origin)
 
 
+class MediaFireError(CDLBaseError):
+    def __init__(
+        self, status: str | int, message: str | None = None, origin: ScrapeItem | MediaItem | URL | None = None
+    ) -> None:
+        """This error will be thrown when a scrape fails."""
+        ui_message = f"{status} MediaFire Error"
+        super().__init__(ui_message, message=message, status=status, origin=origin)
+
+
+class RealDebridError(CDLBaseError):
+    """Base RealDebrid API error."""
+
+    def __init__(self, response: Response, error_codes: dict[int, str]) -> None:
+        url = URL(response.url)
+        self.path = url.path
+        try:
+            JSONResp: dict = response.json()
+            code = JSONResp.get("error_code")
+            if code == 16:
+                code = 7
+            error = error_codes.get(code, "Unknown error")
+
+        except AttributeError:
+            code = response.status_code
+            error = f"{code} - {HTTPStatus(code).phrase}"
+
+        error = error.capitalize()
+
+        """This error will be thrown when a scrape fails."""
+        ui_message = f"{code} RealDebrid Error"
+        super().__init__(ui_message, message=error, status=code, origin=url)
+
+
 class ScrapeError(CDLBaseError):
     def __init__(
         self, status: str | int, message: str | None = None, origin: ScrapeItem | MediaItem | URL | None = None
diff --git a/cyberdrop_dl/managers/client_manager.py b/cyberdrop_dl/managers/client_manager.py
index d3258c9b6..8453725f3 100644
--- a/cyberdrop_dl/managers/client_manager.py
+++ b/cyberdrop_dl/managers/client_manager.py
@@ -164,7 +164,7 @@ async def check_http_status(
         with contextlib.suppress(ContentTypeError):
             JSON_Resp: dict = await response.json()
             if "status" in JSON_Resp and "notFound" in JSON_Resp["status"]:
-                raise ScrapeError(HTTPStatus.NOT_FOUND, origin=origin)
+                raise ScrapeError(404, origin=origin)
             if "data" in JSON_Resp and "error" in JSON_Resp["data"]:
                 raise ScrapeError(JSON_Resp["status"], JSON_Resp["data"]["error"], origin=origin)
diff --git a/cyberdrop_dl/managers/real_debrid/api.py b/cyberdrop_dl/managers/real_debrid/api.py
index e46ef2009..47d1c1c4c 100644
--- a/cyberdrop_dl/managers/real_debrid/api.py
+++ b/cyberdrop_dl/managers/real_debrid/api.py
@@ -9,7 +9,8 @@
 from requests.exceptions import RequestException
 from yarl import URL
 
-from cyberdrop_dl.managers.real_debrid.errors import RealDebridError
+from cyberdrop_dl.clients.errors import RealDebridError
+from cyberdrop_dl.managers.real_debrid.errors import ERROR_CODES
 
 if TYPE_CHECKING:
     from collections.abc import Generator
@@ -84,7 +85,7 @@ def handle_response(response: Response) -> dict | str | None:
             response.raise_for_status()
             JSONResp: dict = response.json()
         except RequestException:
-            raise RealDebridError(response) from None
+            raise RealDebridError(response, ERROR_CODES) from None
         except AttributeError:
             return response.text
         else:
diff --git a/cyberdrop_dl/managers/real_debrid/errors.py b/cyberdrop_dl/managers/real_debrid/errors.py
index f861c1e3b..9349d1b5f 100644
--- a/cyberdrop_dl/managers/real_debrid/errors.py
+++ b/cyberdrop_dl/managers/real_debrid/errors.py
@@ -1,13 +1,3 @@
-from __future__ import annotations
-
-from http import HTTPStatus
-from typing import TYPE_CHECKING
-
-from yarl import URL
-
-if TYPE_CHECKING:
-    from requests import Response
-
 ERROR_CODES = {
     -1: "Internal error",
     1: "Missing parameter",
@@ -47,24 +37,3 @@
     35: "Infringing file",
     36: "Fair Usage Limit",
 }
-
-
-class RealDebridError(BaseException):
-    """Base RealDebrid API error."""
-
-    def __init__(self, response: Response) -> None:
-        self.path = URL(response.url).path
-        try:
-            JSONResp: dict = response.json()
-            self.code = JSONResp.get("error_code")
-            if self.code == 16:
-                self.code = 7
-            self.error = ERROR_CODES.get(self.code, "Unknown error")
-
-        except AttributeError:
-            self.code = response.status_code
-            self.error = f"{self.code} - {HTTPStatus(self.code).phrase}"
-
-        self.error = self.error.capitalize()
-        self.msg = f"{self.code}: {self.error} at {self.path}"
-        super().__init__(self.msg)
diff --git a/cyberdrop_dl/managers/realdebrid_manager.py b/cyberdrop_dl/managers/realdebrid_manager.py
index 53f99222a..505a8e4b5 100644
--- a/cyberdrop_dl/managers/realdebrid_manager.py
+++ b/cyberdrop_dl/managers/realdebrid_manager.py
@@ -6,8 +6,8 @@
 from re import Pattern
 from typing import TYPE_CHECKING
 
+from cyberdrop_dl.clients.errors import RealDebridError
 from cyberdrop_dl.managers.real_debrid.api import RealDebridApi
-from cyberdrop_dl.managers.real_debrid.errors import RealDebridError
 from cyberdrop_dl.utils.logger import log
 
 warnings.simplefilter(action="ignore", category=FutureWarning)
diff --git a/cyberdrop_dl/scraper/crawlers/bunkrr_crawler.py b/cyberdrop_dl/scraper/crawlers/bunkrr_crawler.py
index 77eda53ef..dbe2872cb 100644
--- a/cyberdrop_dl/scraper/crawlers/bunkrr_crawler.py
+++ b/cyberdrop_dl/scraper/crawlers/bunkrr_crawler.py
@@ -163,7 +163,7 @@ async def file(self, scrape_item: ScrapeItem) -> None:
         link = link_container.get(src_selector) if link_container else None
 
         if not link:
-            raise ScrapeError(422, f"Could not find source for: {scrape_item.url}", origin=scrape_item)
+            raise ScrapeError(422, "Couldn't find source", origin=scrape_item)
 
         link = URL(link)
         date = None
diff --git a/cyberdrop_dl/scraper/crawlers/chevereto_crawler.py b/cyberdrop_dl/scraper/crawlers/chevereto_crawler.py
index 5bed97517..30ec11c31 100644
--- a/cyberdrop_dl/scraper/crawlers/chevereto_crawler.py
+++ b/cyberdrop_dl/scraper/crawlers/chevereto_crawler.py
@@ -101,7 +101,7 @@ async def profile(self, scrape_item: ScrapeItem) -> None:
 
         title = self.create_title(soup.select_one(self.profile_title_selector).get("content"), None, None)
 
-        async for soup in self.web_pager(scrape_item.url):
+        async for soup in self.web_pager(scrape_item):
             links = soup.select(self.profile_item_selector)
             for link in links:
                 link = link.get("href")
@@ -113,16 +113,16 @@ async def profile(self, scrape_item: ScrapeItem) -> None:
                 new_scrape_item = self.create_scrape_item(
                     scrape_item,
                     link,
-                    title,
-                    True,
+                    new_title_part=title,
+                    part_of_album=True,
                     add_parent=scrape_item.url,
                 )
-                await self.fetch(new_scrape_item)
+                self.manager.task_group.create_task(self.run(new_scrape_item))
 
     @error_handling_wrapper
     async def album(self, scrape_item: ScrapeItem) -> None:
         """Scrapes an album."""
-        album_id = scrape_item.url.parts[2]
+        album_id = scrape_item.url.parts[2].rsplit(".")[-1]
         results = await self.get_album_results(album_id)
         scrape_item.album_id = album_id
         scrape_item.part_of_album = True
@@ -169,7 +169,7 @@ async def album(self, scrape_item: ScrapeItem) -> None:
                 new_scrape_item = self.create_scrape_item(scrape_item, sub_album_link, "", True)
                 self.manager.task_group.create_task(self.run(new_scrape_item))
 
-        async for soup in self.web_pager(scrape_item.url):
+        async for soup in self.web_pager(scrape_item):
             links = soup.select(self.album_img_selector)
             for link in links:
                 link = link.get("src")
@@ -179,9 +179,9 @@ async def album(self, scrape_item: ScrapeItem) -> None:
                 new_scrape_item = self.create_scrape_item(
                     scrape_item,
                     link,
-                    title,
-                    True,
-                    album_id,
+                    new_title_part=title,
+                    part_of_album=True,
+                    album_id=album_id,
                     add_parent=scrape_item.url,
                 )
                 if not self.check_album_results(link, results):
@@ -200,7 +200,7 @@ async def image(self, scrape_item: ScrapeItem) -> None:
             link = URL(soup.select_one("div[id=image-viewer] img").get("src"))
             link = link.with_name(link.name.replace(".md.", ".").replace(".th.", "."))
         except AttributeError:
-            raise ScrapeError(404, f"Could not find img source for {scrape_item.url}", origin=scrape_item) from None
+            raise ScrapeError(422, "Couldn't find img source", origin=scrape_item) from None
 
         desc_rows = soup.select("p[class*=description-meta]")
         date = None
@@ -227,12 +227,12 @@ async def handle_direct_link(self, scrape_item: ScrapeItem) -> None:
 
     """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""
 
-    async def web_pager(self, url: URL) -> AsyncGenerator[BeautifulSoup]:
+    async def web_pager(self, scrape_item: ScrapeItem) -> AsyncGenerator[BeautifulSoup]:
         """Generator of website pages."""
-        page_url = await self.get_sort_by_new_url(url)
+        page_url = await self.get_sort_by_new_url(scrape_item.url)
         while True:
             async with self.request_limiter:
-                soup: BeautifulSoup = await self.client.get_soup(self.domain, page_url)
+                soup: BeautifulSoup = await self.client.get_soup(self.domain, page_url, origin=scrape_item)
             next_page = soup.select_one(self.next_page_selector)
             yield soup
             if next_page:
diff --git a/cyberdrop_dl/scraper/crawlers/cyberdrop_crawler.py b/cyberdrop_dl/scraper/crawlers/cyberdrop_crawler.py
index cfb1467b7..165972ed8 100644
--- a/cyberdrop_dl/scraper/crawlers/cyberdrop_crawler.py
+++ b/cyberdrop_dl/scraper/crawlers/cyberdrop_crawler.py
@@ -66,8 +66,8 @@ async def album(self, scrape_item: ScrapeItem) -> None:
             title = self.create_title(soup.select_one("h1[id=title]").text, scrape_item.album_id, None)
         except AttributeError:
             raise ScrapeError(
-                404,
-                message="No album information found in response content",
+                422,
+                message="Unable to parse album information from response content",
                 origin=scrape_item,
             ) from None
diff --git a/cyberdrop_dl/scraper/crawlers/gofile_crawler.py b/cyberdrop_dl/scraper/crawlers/gofile_crawler.py
index b0fa3320e..266a42fb9 100644
--- a/cyberdrop_dl/scraper/crawlers/gofile_crawler.py
+++ b/cyberdrop_dl/scraper/crawlers/gofile_crawler.py
@@ -106,7 +106,7 @@ async def album(self, scrape_item: ScrapeItem) -> None:
     def check_json_response(self, json_resp: dict, scrape_item: ScrapeItem | None = None) -> None:
         """Parses and raises errors from json response."""
         if json_resp["status"] == "error-notFound":
-            raise ScrapeError(404, "Album not found", origin=scrape_item)
+            raise ScrapeError(404, origin=scrape_item)
 
         json_resp: dict = json_resp["data"]
         is_password_protected = json_resp.get("password")
@@ -150,7 +150,7 @@ async def get_account_token(self, scrape_item: ScrapeItem) -> None:
         async with self.request_limiter:
             json_resp = await self.client.post_data(self.domain, create_account_address, data={})
         if json_resp["status"] != "ok":
-            raise ScrapeError(403, "Couldn't generate GoFile token", origin=scrape_item)
+            raise ScrapeError(401, "Couldn't generate GoFile API token", origin=scrape_item)
         self.api_key = json_resp["data"]["token"]
         self.headers["Authorization"] = f"Bearer {self.api_key}"
@@ -170,6 +170,6 @@ async def get_website_token(self, scrape_item: ScrapeItem, update: bool = False)
         text = await self.client.get_text(self.domain, self.js_address, origin=scrape_item)
         match = re.search(WT_REGEX, str(text))
         if not match:
-            raise ScrapeError(403, "Couldn't generate GoFile websiteToken", origin=scrape_item)
+            raise ScrapeError(401, "Couldn't generate GoFile websiteToken", origin=scrape_item)
         self.website_token = match.group(1)
         self.manager.cache_manager.save("gofile_website_token", self.website_token)
diff --git a/cyberdrop_dl/scraper/crawlers/imgbox_crawler.py b/cyberdrop_dl/scraper/crawlers/imgbox_crawler.py
index a145958ca..48afff158 100644
--- a/cyberdrop_dl/scraper/crawlers/imgbox_crawler.py
+++ b/cyberdrop_dl/scraper/crawlers/imgbox_crawler.py
@@ -52,7 +52,7 @@ async def album(self, scrape_item: ScrapeItem) -> None:
             soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)
 
         if "The specified gallery could not be found" in soup.text:
-            raise ScrapeError(404, f"Gallery not found: {scrape_item.url}", origin=scrape_item)
+            raise ScrapeError(404, origin=scrape_item)
 
         scrape_item.album_id = scrape_item.url.parts[2]
         scrape_item.part_of_album = True
diff --git a/cyberdrop_dl/scraper/crawlers/imgur_crawler.py b/cyberdrop_dl/scraper/crawlers/imgur_crawler.py
index 612fae03c..7b7115b3f 100644
--- a/cyberdrop_dl/scraper/crawlers/imgur_crawler.py
+++ b/cyberdrop_dl/scraper/crawlers/imgur_crawler.py
@@ -48,7 +48,7 @@ async def album(self, scrape_item: ScrapeItem) -> None:
         if self.imgur_client_id == "":
             log("To scrape imgur content, you need to provide a client id", 30)
             raise LoginError(message="No Imgur Client ID provided")
-        await self.check_imgur_credits()
+        await self.check_imgur_credits(scrape_item)
 
         scrape_item.type = FILE_HOST_ALBUM
         scrape_item.children = scrape_item.children_limit = 0
@@ -101,7 +101,7 @@ async def image(self, scrape_item: ScrapeItem) -> None:
         if self.imgur_client_id == "":
             log("To scrape imgur content, you need to provide a client id", 30)
             raise LoginError(message="No Imgur Client ID provided")
-        await self.check_imgur_credits()
+        await self.check_imgur_credits(scrape_item)
 
         image_id = scrape_item.url.parts[-1]
         async with self.request_limiter:
@@ -129,9 +129,11 @@ async def handle_direct(self, scrape_item: ScrapeItem) -> None:
 
     """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""
 
-    async def check_imgur_credits(self) -> None:
+    async def check_imgur_credits(self, scrape_item: ScrapeItem | None = None) -> None:
         """Checks the remaining credits."""
-        credits_obj = await self.client.get_json(self.domain, self.imgur_api / "credits", headers_inc=self.headers)
+        credits_obj = await self.client.get_json(
+            self.domain, self.imgur_api / "credits", headers_inc=self.headers, origin=scrape_item
+        )
         self.imgur_client_remaining = credits_obj["data"]["ClientRemaining"]
         if self.imgur_client_remaining < 100:
-            raise ScrapeError(429, "Imgur API rate limit reached")
+            raise ScrapeError(429, "Imgur API rate limit reached", origin=scrape_item)
diff --git a/cyberdrop_dl/scraper/crawlers/mediafire_crawler.py b/cyberdrop_dl/scraper/crawlers/mediafire_crawler.py
index 387f602a3..756c3afca 100644
--- a/cyberdrop_dl/scraper/crawlers/mediafire_crawler.py
+++ b/cyberdrop_dl/scraper/crawlers/mediafire_crawler.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import calendar
-import contextlib
 import datetime
 from typing import TYPE_CHECKING
 
@@ -9,7 +8,7 @@
 from mediafire import MediaFireApi, api
 from yarl import URL
 
-from cyberdrop_dl.clients.errors import MaxChildrenError, ScrapeError
+from cyberdrop_dl.clients.errors import MediaFireError
 from cyberdrop_dl.scraper.crawler import Crawler
 from cyberdrop_dl.utils.data_enums_classes.url_objects import FILE_HOST_ALBUM, ScrapeItem
 from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_filename_and_ext
@@ -48,17 +47,10 @@ async def folder(self, scrape_item: ScrapeItem) -> None:
         try:
             folder_details: dict[str, dict] = self.api.folder_get_info(folder_key=folder_key)
         except api.MediaFireApiError as e:
-            raise ScrapeError(status=f"MF - {e.message}", origin=scrape_item) from None
+            raise MediaFireError(status=e.code, message=e.message, origin=scrape_item) from None
 
         title = self.create_title(folder_details["folder_info"]["name"], folder_key, None)
-        scrape_item.type = FILE_HOST_ALBUM
-        scrape_item.children = scrape_item.children_limit = 0
-
-        with contextlib.suppress(IndexError, TypeError):
-            scrape_item.children_limit = (
-                self.manager.config_manager.settings_data.download_options.maximum_number_of_children[scrape_item.type]
-            )
-
+        scrape_item.set_type(FILE_HOST_ALBUM, self.manager)
         scrape_item.album_id = folder_key
         scrape_item.part_of_album = True
@@ -73,7 +65,7 @@ async def folder(self, scrape_item: ScrapeItem) -> None:
                     chunk_size=chunk_size,
                 )
             except api.MediaFireApiError as e:
-                raise ScrapeError(status=f"MF - {e.message}", origin=scrape_item) from None
+                raise MediaFireError(status=e.code, message=e.message, origin=scrape_item) from None
 
             files = folder_contents["folder_content"]["files"]
@@ -90,9 +82,7 @@ async def folder(self, scrape_item: ScrapeItem) -> None:
                     add_parent=scrape_item.url,
                 )
                 self.manager.task_group.create_task(self.run(new_scrape_item))
-                scrape_item.children += 1
-                if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit:
-                    raise MaxChildrenError(origin=scrape_item)
+                scrape_item.add_children()
 
             if folder_contents["folder_content"]["more_chunks"] == "yes":
                 chunk += 1
diff --git a/cyberdrop_dl/scraper/crawlers/omegascans_crawler.py b/cyberdrop_dl/scraper/crawlers/omegascans_crawler.py
index eced33d23..7e582ff8a 100644
--- a/cyberdrop_dl/scraper/crawlers/omegascans_crawler.py
+++ b/cyberdrop_dl/scraper/crawlers/omegascans_crawler.py
@@ -11,7 +11,6 @@
 from cyberdrop_dl.clients.errors import MaxChildrenError, ScrapeError
 from cyberdrop_dl.scraper.crawler import Crawler
 from cyberdrop_dl.utils.data_enums_classes.url_objects import FILE_HOST_ALBUM, ScrapeItem
-from cyberdrop_dl.utils.logger import log
 from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_filename_and_ext
 
 if TYPE_CHECKING:
@@ -65,7 +64,7 @@ async def series(self, scrape_item: ScrapeItem) -> None:
                 break
 
         if not series_id:
-            raise ScrapeError(404, "series_id not found", origin=scrape_item)
+            raise ScrapeError(422, "Unable to parse series_id from html", origin=scrape_item)
 
         page_number = 1
         number_per_page = 30
@@ -101,7 +100,6 @@ async def chapter(self, scrape_item: ScrapeItem) -> None:
             soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)
 
         if "This chapter is premium" in soup.get_text():
-            log("Scrape Failed: This chapter is premium", 40)
             raise ScrapeError(401, "This chapter is premium", origin=scrape_item)
 
         title_parts = soup.select_one("title").get_text().split(" - ")
diff --git a/cyberdrop_dl/scraper/crawlers/reddit_crawler.py b/cyberdrop_dl/scraper/crawlers/reddit_crawler.py
index 20c19f418..e99b9f5cb 100644
--- a/cyberdrop_dl/scraper/crawlers/reddit_crawler.py
+++ b/cyberdrop_dl/scraper/crawlers/reddit_crawler.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import contextlib
+from dataclasses import dataclass
 from typing import TYPE_CHECKING, ClassVar
 
 import asyncpraw
@@ -23,10 +24,11 @@
     from cyberdrop_dl.managers.manager import Manager
 
 
+@dataclass
 class Post:
-    id: int = None
     title: str
     date: int
+    id: int = None
 
     @property
     def number(self):
@@ -208,9 +210,9 @@ async def media(self, scrape_item: ScrapeItem, reddit: asyncpraw.Reddit) -> None
         try:
             post = await reddit.submission(url=head["location"])
         except asyncprawcore.exceptions.Forbidden:
-            raise ScrapeError(403, "Forbidden", origin=scrape_item) from None
+            raise ScrapeError(403, origin=scrape_item) from None
         except asyncprawcore.exceptions.NotFound:
-            raise ScrapeError(404, "Not Found", origin=scrape_item) from None
+            raise ScrapeError(404, origin=scrape_item) from None
 
         await self.post(scrape_item, post, reddit)
         return
diff --git a/cyberdrop_dl/scraper/crawlers/saint_crawler.py b/cyberdrop_dl/scraper/crawlers/saint_crawler.py
index 34ef9d9d7..12dec2713 100644
--- a/cyberdrop_dl/scraper/crawlers/saint_crawler.py
+++ b/cyberdrop_dl/scraper/crawlers/saint_crawler.py
@@ -77,6 +77,6 @@ async def video(self, scrape_item: ScrapeItem) -> None:
         try:
             link = URL(soup.select_one("video[id=main-video] source").get("src"))
         except AttributeError:
-            raise ScrapeError(404, f"Could not find video source for {scrape_item.url}", origin=scrape_item) from None
+            raise ScrapeError(422, "Couldn't find video source", origin=scrape_item) from None
         filename, ext = get_filename_and_ext(link.name)
         await self.handle_file(link, scrape_item, filename, ext)
diff --git a/cyberdrop_dl/scraper/crawlers/tokyomotion_crawler.py b/cyberdrop_dl/scraper/crawlers/tokyomotion_crawler.py
index 8ed46b9a3..a3eded04b 100644
--- a/cyberdrop_dl/scraper/crawlers/tokyomotion_crawler.py
+++ b/cyberdrop_dl/scraper/crawlers/tokyomotion_crawler.py
@@ -97,8 +97,8 @@ async def video(self, scrape_item: ScrapeItem) -> None:
             link = URL(src)
         except AttributeError:
             if "This is a private" in soup.text:
-                raise ScrapeError(403, f"Private video: {scrape_item.url}", origin=scrape_item) from None
-            raise ScrapeError(404, f"Could not find video source for {scrape_item.url}", origin=scrape_item) from None
+                raise ScrapeError(401, "Private video", origin=scrape_item) from None
+            raise ScrapeError(422, "Couldn't find video source", origin=scrape_item) from None
 
         title = soup.select_one("title").text.rsplit(" - TOKYO Motion")[0].strip()
 
@@ -116,7 +116,7 @@ async def albums(self, scrape_item: ScrapeItem) -> None:
         if user_title not in scrape_item.parent_title.split("/"):
             scrape_item.add_to_parent_title(user_title)
 
-        async for soup in self.web_pager(scrape_item.url):
+        async for soup in self.web_pager(scrape_item):
             albums = soup.select(self.album_selector)
             for album in albums:
                 link = album.get("href")
@@ -153,9 +153,9 @@ async def album(self, scrape_item: ScrapeItem) -> None:
         if title not in scrape_item.parent_title.split("/"):
             scrape_item.add_to_parent_title(title)
 
-        async for soup in self.web_pager(scrape_item.url):
+        async for soup in self.web_pager(scrape_item):
             if "This is a private" in soup.text:
-                raise ScrapeError(403, f"Private album: {scrape_item.url}", origin=scrape_item)
+                raise ScrapeError(401, "Private album", origin=scrape_item)
             images = soup.select(self.image_div_selector)
             for image in images:
                 link = image.select_one(self.image_thumb_selector)
@@ -185,8 +185,8 @@ async def image(self, scrape_item: ScrapeItem) -> None:
             link = URL(src)
         except AttributeError:
             if "This is a private" in soup.text:
-                raise ScrapeError(403, f"Private Photo: {scrape_item.url}", origin=scrape_item) from None
-            raise ScrapeError(404, f"Could not find image source for {scrape_item.url}", origin=scrape_item) from None
+                raise ScrapeError(401, "Private Photo", origin=scrape_item) from None
+            raise ScrapeError(422, "Couldn't find image source", origin=scrape_item) from None
 
         filename, ext = get_filename_and_ext(link.name)
         await self.handle_file(link, scrape_item, filename, ext)
@@ -225,7 +225,7 @@ async def search(self, scrape_item: ScrapeItem) -> None:
             selector = self.album_selector
             scraper = self.album
 
-        async for soup in self.web_pager(scrape_item.url):
+        async for soup in self.web_pager(scrape_item):
            results = soup.select(self.search_div_selector)
            for result in results:
                link = result.select_one(selector)
@@ -258,9 +258,9 @@ async def playlist(self, scrape_item: ScrapeItem) -> None:
         if title not in scrape_item.parent_title.split("/"):
             scrape_item.add_to_parent_title(title)
 
-        async for soup in self.web_pager(scrape_item.url):
+        async for soup in self.web_pager(scrape_item):
             if "This is a private" in soup.text:
-                raise ScrapeError(403, f"Private playlist: {scrape_item.url}", origin=scrape_item)
+                raise ScrapeError(401, "Private playlist", origin=scrape_item)
             videos = soup.select(self.video_div_selector)
             for video in videos:
                 link = video.select_one(self.video_selector)
@@ -275,12 +275,12 @@ async def playlist(self, scrape_item: ScrapeItem) -> None:
             new_scrape_item = self.create_scrape_item(scrape_item, link, "", add_parent=scrape_item.url)
             await self.video(new_scrape_item)
 
-    async def web_pager(self, url: URL) -> AsyncGenerator[BeautifulSoup]:
+    async def web_pager(self, scrape_item: ScrapeItem) -> AsyncGenerator[BeautifulSoup]:
         """Generator of website pages."""
-        page_url = url
+        page_url = scrape_item.url
         while True:
             async with self.request_limiter:
-                soup: BeautifulSoup = await self.client.get_soup(self.domain, page_url)
+                soup: BeautifulSoup = await self.client.get_soup(self.domain, page_url, origin=scrape_item)
             next_page = soup.select_one(self.next_page_selector)
             yield soup
             if next_page:
diff --git a/cyberdrop_dl/scraper/crawlers/xxxbunker_crawler.py b/cyberdrop_dl/scraper/crawlers/xxxbunker_crawler.py
index 0d27b3c34..a0a02144a 100644
--- a/cyberdrop_dl/scraper/crawlers/xxxbunker_crawler.py
+++ b/cyberdrop_dl/scraper/crawlers/xxxbunker_crawler.py
@@ -109,15 +109,15 @@ async def video(self, scrape_item: ScrapeItem) -> None:
 
         except (AttributeError, TypeError):
             if ajax_soup and "You must be registered to download this video" in ajax_soup.text:
-                raise ScrapeError(403, f"Invalid PHPSESSID: {scrape_item.url}", origin=scrape_item) from None
+                raise ScrapeError(403, "Invalid cookies, PHPSESSID", origin=scrape_item) from None
             if "TRAFFIC VERIFICATION" in soup.text:
                 await asyncio.sleep(self.wait_time)
                 self.wait_time = min(self.wait_time + 10, MAX_WAIT)
                 self.rate_limit = max(self.rate_limit * 0.8, MIN_RATE_LIMIT)
                 self.request_limiter = AsyncLimiter(self.rate_limit, 60)
-                raise ScrapeError(429, f"Too many request: {scrape_item.url}", origin=scrape_item) from None
-            raise ScrapeError(404, f"Could not find video source for {scrape_item.url}", origin=scrape_item) from None
+                raise ScrapeError(429, origin=scrape_item) from None
+            raise ScrapeError(422, "Couldn't find video source", origin=scrape_item) from None
 
         # NOTE: hardcoding the extension to prevent quering the final server URL
         # final server URL is always different so it can not be saved to db.
@@ -145,11 +145,11 @@ async def playlist(self, scrape_item: ScrapeItem) -> None:
 
         # Not a valid URL
         else:
-            raise ScrapeError(400, f"Unsupported URL format: {scrape_item.url}", origin=scrape_item)
+            raise ScrapeError(400, "Unsupported URL format", origin=scrape_item)
 
         scrape_item.part_of_album = True
 
-        async for soup in self.web_pager(scrape_item.url):
+        async for soup in self.web_pager(scrape_item):
             videos = soup.select("a[data-anim='4']")
             for video in videos:
                 link = video.get("href")
@@ -163,9 +163,9 @@ async def playlist(self, scrape_item: ScrapeItem) -> None:
             new_scrape_item = self.create_scrape_item(scrape_item, link, title, add_parent=scrape_item.url)
             await self.video(new_scrape_item)
 
-    async def web_pager(self, url: URL) -> AsyncGenerator[BeautifulSoup]:
+    async def web_pager(self, scrape_item: ScrapeItem) -> AsyncGenerator[BeautifulSoup]:
         """Generator of website pages."""
-        page_url = url
+        page_url = scrape_item.url
         rate_limited = True
         while True:
             attempt = 1
@@ -189,7 +189,7 @@ async def web_pager(self, url: URL) -> AsyncGenerator[BeautifulSoup]:
                 await asyncio.sleep(self.wait_time)
 
         if rate_limited:
-            raise ScrapeError(429, f"Too many request: {url}")
+            raise ScrapeError(429, origin=scrape_item)
 
         next_page = soup.select_one("div.page-list")
         next_page = next_page.find("a", string="Next") if next_page else None
diff --git a/cyberdrop_dl/scraper/scraper.py b/cyberdrop_dl/scraper/scraper.py
index 2950d2bae..75d1b2193 100644
--- a/cyberdrop_dl/scraper/scraper.py
+++ b/cyberdrop_dl/scraper/scraper.py
@@ -154,8 +154,9 @@ async def load_links(self) -> None:
         for title in links:
             for url in links[title]:
                 item = self.create_item_from_link(url)
-                item.add_to_parent_title(title)
-                item.part_of_album = True
+                if title:
+                    item.add_to_parent_title(title)
+                    item.part_of_album = True
                 if self.filter_items(item):
                     items.append(item)
         for item in items:
diff --git a/cyberdrop_dl/utils/constants.py b/cyberdrop_dl/utils/constants.py
index ef41e6d43..e010e25ca 100644
--- a/cyberdrop_dl/utils/constants.py
+++ b/cyberdrop_dl/utils/constants.py
@@ -21,6 +21,8 @@
     "tracebacks_extra_lines": 2,
     "locals_max_length": 20,
 }
+VALIDATION_ERROR_FOOTER = """Please read the documentation for guidance on how to resolve this error: https://script-ware.gitbook.io/cyberdrop-dl/reference/configuration-options
+This is not a bug. Do not open issues related to this"""
 
 # regex
diff --git a/cyberdrop_dl/utils/utilities.py b/cyberdrop_dl/utils/utilities.py
index cd2067d73..6f79d711b 100644
--- a/cyberdrop_dl/utils/utilities.py
+++ b/cyberdrop_dl/utils/utilities.py
@@ -17,7 +17,6 @@
 from yarl import URL
 
 from cyberdrop_dl.clients.errors import CDLBaseError, NoExtensionError
-from cyberdrop_dl.managers.real_debrid.errors import RealDebridError
 from cyberdrop_dl.utils import constants
 from cyberdrop_dl.utils.logger import log, log_debug, log_spacer, log_with_color
 
@@ -44,9 +43,6 @@ async def wrapper(self: Crawler | Downloader, *args, **kwargs):
             log_message_short = e_ui_failure = e.ui_message
             log_message = f"{e.ui_message} - {e.message}" if e.ui_message != e.message else e.message
             origin = e.origin
-        except RealDebridError as e:
-            log_message_short = log_message = f"RealDebridError - {e.error}"
-            e_ui_failure = f"RD - {e.error}"
         except TimeoutError:
             log_message_short = log_message = e_ui_failure = "Timeout"
         except ClientConnectorError as e:
diff --git a/cyberdrop_dl/utils/yaml.py b/cyberdrop_dl/utils/yaml.py
index 71cdaf17f..37e56f03b 100644
--- a/cyberdrop_dl/utils/yaml.py
+++ b/cyberdrop_dl/utils/yaml.py
@@ -12,6 +12,7 @@
 from yarl import URL
 
 from cyberdrop_dl.clients.errors import InvalidYamlError
+from cyberdrop_dl.utils.constants import VALIDATION_ERROR_FOOTER
 
 
 class TimedeltaSerializer(BaseModel):
@@ -39,9 +40,6 @@ def _save_timedelta(dumper: yaml.Dumper, value: timedelta):
 yaml.add_representer(timedelta, _save_timedelta)
 yaml.add_representer(URL, _save_as_str)
 
-VALIDATION_ERROR_FOOTER = """Please read the documentation for guidance on how to resolve this error: https://script-ware.gitbook.io/cyberdrop-dl/reference/configuration-options
-Please note, this is not a bug. Do not open issues related to this"""
-
 
 def save(file: Path, data: BaseModel | dict) -> None:
     """Saves a dict to a yaml file."""