Skip to content

Commit

Permalink
fix: general crawler fixes (jbsparrow#428)
Browse files Browse the repository at this point in the history
* fix: use a dataclass for reddit posts

Should fix jbsparrow#426

* refactor: pass `scrape_item` as origin for `web_pager` (chevereto)

* fix: "Loose Files" not being created (all crawlers)

* refactor: add custom MediaFireError (mediafire)

* fix: scrape error codes

* refactor: make `RealDebridError` inherit from `CDLBaseError`

* refactor: move `RealDebridError` to `clients.errors`

* refactor: move `VALIDATION_ERROR_FOOTER` to `constants`

* fix: undo crawler semaphore

Moved to PR jbsparrow#425
  • Loading branch information
NTFSvolume authored and datawhores committed Jan 14, 2025
1 parent 0acf8c9 commit 160902f
Show file tree
Hide file tree
Showing 21 changed files with 107 additions and 116 deletions.
40 changes: 36 additions & 4 deletions cyberdrop_dl/clients/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,15 @@

from yarl import URL

from cyberdrop_dl.utils.constants import VALIDATION_ERROR_FOOTER

if TYPE_CHECKING:
from requests import Response
from yaml.constructor import ConstructorError

from cyberdrop_dl.scraper.crawler import ScrapeItem
from cyberdrop_dl.utils.data_enums_classes.url_objects import MediaItem

VALIDATION_ERROR_FOOTER = """
Read the documentation for guidance on how to resolve this error: https://script-ware.gitbook.io/cyberdrop-dl/reference/configuration-options
Please note, this is not a bug. Do not open issues related to this"""


class CDLBaseError(Exception):
"""Base exception for cyberdrop-dl errors."""
Expand Down Expand Up @@ -105,6 +104,39 @@ def __init__(self, origin: ScrapeItem | MediaItem | URL | None = None) -> None:
super().__init__(ui_message, origin=origin)


class MediaFireError(CDLBaseError):
    """Error raised when the MediaFire API reports a failure."""

    def __init__(
        self,
        status: str | int,
        message: str | None = None,
        origin: ScrapeItem | MediaItem | URL | None = None,
    ) -> None:
        """Build the error with a UI message of the form ``"<status> MediaFire Error"``.

        Args:
            status: Error code to show in the UI message and forward to the base class.
            message: Optional detail text forwarded to the base class.
            origin: Item or URL the failure is attributed to, forwarded to the base class.
        """
        super().__init__(f"{status} MediaFire Error", message=message, status=status, origin=origin)


class RealDebridError(CDLBaseError):
    """Base RealDebrid API error.

    The error code and description are taken from the JSON body of the
    response when it has one; otherwise the HTTP status code and its standard
    phrase are used instead.

    Attributes:
        path: Path component of the URL that produced the failing response.
    """

    def __init__(self, response: Response, error_codes: dict[int, str]) -> None:
        """Build the error from a failed RealDebrid API response.

        Args:
            response: The failed HTTP response.
            error_codes: Mapping of RealDebrid ``error_code`` values to descriptions.
        """
        url = URL(response.url)
        self.path = url.path
        try:
            json_resp: dict = response.json()
            code = json_resp.get("error_code")
            # Code 16 is reported under code 7.
            # NOTE(review): the rationale is not visible here - confirm against the error table.
            if code == 16:
                code = 7
            error = error_codes.get(code, "Unknown error")
        except (AttributeError, ValueError):
            # ValueError: the body is not valid JSON (the decode errors raised by
            # `response.json()` derive from ValueError).
            # AttributeError: the body decoded to something without `.get`.
            code = response.status_code
            try:
                phrase = HTTPStatus(code).phrase
            except ValueError:
                # Non-standard status codes are unknown to `HTTPStatus`.
                phrase = "Unknown error"
            error = f"{code} - {phrase}"

        ui_message = f"{code} RealDebrid Error"
        super().__init__(ui_message, message=error.capitalize(), status=code, origin=url)


class ScrapeError(CDLBaseError):
def __init__(
self, status: str | int, message: str | None = None, origin: ScrapeItem | MediaItem | URL | None = None
Expand Down
2 changes: 1 addition & 1 deletion cyberdrop_dl/managers/client_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ async def check_http_status(
with contextlib.suppress(ContentTypeError):
JSON_Resp: dict = await response.json()
if "status" in JSON_Resp and "notFound" in JSON_Resp["status"]:
raise ScrapeError(HTTPStatus.NOT_FOUND, origin=origin)
raise ScrapeError(404, origin=origin)
if "data" in JSON_Resp and "error" in JSON_Resp["data"]:
raise ScrapeError(JSON_Resp["status"], JSON_Resp["data"]["error"], origin=origin)

Expand Down
5 changes: 3 additions & 2 deletions cyberdrop_dl/managers/real_debrid/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from requests.exceptions import RequestException
from yarl import URL

from cyberdrop_dl.managers.real_debrid.errors import RealDebridError
from cyberdrop_dl.clients.errors import RealDebridError
from cyberdrop_dl.managers.real_debrid.errors import ERROR_CODES

if TYPE_CHECKING:
from collections.abc import Generator
Expand Down Expand Up @@ -84,7 +85,7 @@ def handle_response(response: Response) -> dict | str | None:
response.raise_for_status()
JSONResp: dict = response.json()
except RequestException:
raise RealDebridError(response) from None
raise RealDebridError(response, ERROR_CODES) from None
except AttributeError:
return response.text
else:
Expand Down
31 changes: 0 additions & 31 deletions cyberdrop_dl/managers/real_debrid/errors.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,3 @@
from __future__ import annotations

from http import HTTPStatus
from typing import TYPE_CHECKING

from yarl import URL

if TYPE_CHECKING:
from requests import Response

ERROR_CODES = {
-1: "Internal error",
1: "Missing parameter",
Expand Down Expand Up @@ -47,24 +37,3 @@
35: "Infringing file",
36: "Fair Usage Limit",
}


class RealDebridError(Exception):
    """Base RealDebrid API error.

    Derives from ``Exception`` rather than ``BaseException`` so that ordinary
    ``except Exception`` handlers can catch it.

    Attributes:
        path: Path component of the URL that produced the failing response.
        code: RealDebrid ``error_code`` from the JSON body, or the HTTP status
            code when the body carries no usable JSON.
        error: Capitalized description of the failure.
        msg: Full message, ``"<code>: <error> at <path>"``.
    """

    def __init__(self, response: Response) -> None:
        """Build the error from a failed RealDebrid API response."""
        self.path = URL(response.url).path
        try:
            json_resp: dict = response.json()
            self.code = json_resp.get("error_code")
            # Code 16 is reported under code 7.
            # NOTE(review): the rationale is not visible here - confirm against ERROR_CODES.
            if self.code == 16:
                self.code = 7
            self.error = ERROR_CODES.get(self.code, "Unknown error")
        except (AttributeError, ValueError):
            # ValueError: the body is not valid JSON (the decode errors raised by
            # `response.json()` derive from ValueError).
            # AttributeError: the body decoded to something without `.get`.
            self.code = response.status_code
            try:
                phrase = HTTPStatus(self.code).phrase
            except ValueError:
                # Non-standard status codes are unknown to `HTTPStatus`.
                phrase = "Unknown error"
            self.error = f"{self.code} - {phrase}"

        self.error = self.error.capitalize()
        self.msg = f"{self.code}: {self.error} at {self.path}"
        super().__init__(self.msg)
2 changes: 1 addition & 1 deletion cyberdrop_dl/managers/realdebrid_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from re import Pattern
from typing import TYPE_CHECKING

from cyberdrop_dl.clients.errors import RealDebridError
from cyberdrop_dl.managers.real_debrid.api import RealDebridApi
from cyberdrop_dl.managers.real_debrid.errors import RealDebridError
from cyberdrop_dl.utils.logger import log

warnings.simplefilter(action="ignore", category=FutureWarning)
Expand Down
2 changes: 1 addition & 1 deletion cyberdrop_dl/scraper/crawlers/bunkrr_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ async def file(self, scrape_item: ScrapeItem) -> None:

link = link_container.get(src_selector) if link_container else None
if not link:
raise ScrapeError(422, f"Could not find source for: {scrape_item.url}", origin=scrape_item)
raise ScrapeError(422, "Couldn't find source", origin=scrape_item)

link = URL(link)
date = None
Expand Down
26 changes: 13 additions & 13 deletions cyberdrop_dl/scraper/crawlers/chevereto_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ async def profile(self, scrape_item: ScrapeItem) -> None:

title = self.create_title(soup.select_one(self.profile_title_selector).get("content"), None, None)

async for soup in self.web_pager(scrape_item.url):
async for soup in self.web_pager(scrape_item):
links = soup.select(self.profile_item_selector)
for link in links:
link = link.get("href")
Expand All @@ -113,16 +113,16 @@ async def profile(self, scrape_item: ScrapeItem) -> None:
new_scrape_item = self.create_scrape_item(
scrape_item,
link,
title,
True,
new_title_part=title,
part_of_album=True,
add_parent=scrape_item.url,
)
await self.fetch(new_scrape_item)
self.manager.task_group.create_task(self.run(new_scrape_item))

@error_handling_wrapper
async def album(self, scrape_item: ScrapeItem) -> None:
"""Scrapes an album."""
album_id = scrape_item.url.parts[2]
album_id = scrape_item.url.parts[2].rsplit(".")[-1]
results = await self.get_album_results(album_id)
scrape_item.album_id = album_id
scrape_item.part_of_album = True
Expand Down Expand Up @@ -169,7 +169,7 @@ async def album(self, scrape_item: ScrapeItem) -> None:
new_scrape_item = self.create_scrape_item(scrape_item, sub_album_link, "", True)
self.manager.task_group.create_task(self.run(new_scrape_item))

async for soup in self.web_pager(scrape_item.url):
async for soup in self.web_pager(scrape_item):
links = soup.select(self.album_img_selector)
for link in links:
link = link.get("src")
Expand All @@ -179,9 +179,9 @@ async def album(self, scrape_item: ScrapeItem) -> None:
new_scrape_item = self.create_scrape_item(
scrape_item,
link,
title,
True,
album_id,
new_title_part=title,
part_of_album=True,
album_id=album_id,
add_parent=scrape_item.url,
)
if not self.check_album_results(link, results):
Expand All @@ -200,7 +200,7 @@ async def image(self, scrape_item: ScrapeItem) -> None:
link = URL(soup.select_one("div[id=image-viewer] img").get("src"))
link = link.with_name(link.name.replace(".md.", ".").replace(".th.", "."))
except AttributeError:
raise ScrapeError(404, f"Could not find img source for {scrape_item.url}", origin=scrape_item) from None
raise ScrapeError(422, "Couldn't find img source", origin=scrape_item) from None

desc_rows = soup.select("p[class*=description-meta]")
date = None
Expand All @@ -227,12 +227,12 @@ async def handle_direct_link(self, scrape_item: ScrapeItem) -> None:

"""~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""

async def web_pager(self, url: URL) -> AsyncGenerator[BeautifulSoup]:
async def web_pager(self, scrape_item: ScrapeItem) -> AsyncGenerator[BeautifulSoup]:
"""Generator of website pages."""
page_url = await self.get_sort_by_new_url(url)
page_url = await self.get_sort_by_new_url(scrape_item.url)
while True:
async with self.request_limiter:
soup: BeautifulSoup = await self.client.get_soup(self.domain, page_url)
soup: BeautifulSoup = await self.client.get_soup(self.domain, page_url, origin=scrape_item)
next_page = soup.select_one(self.next_page_selector)
yield soup
if next_page:
Expand Down
4 changes: 2 additions & 2 deletions cyberdrop_dl/scraper/crawlers/cyberdrop_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ async def album(self, scrape_item: ScrapeItem) -> None:
title = self.create_title(soup.select_one("h1[id=title]").text, scrape_item.album_id, None)
except AttributeError:
raise ScrapeError(
404,
message="No album information found in response content",
422,
message="Unable to parse album information from response content",
origin=scrape_item,
) from None

Expand Down
6 changes: 3 additions & 3 deletions cyberdrop_dl/scraper/crawlers/gofile_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ async def album(self, scrape_item: ScrapeItem) -> None:
def check_json_response(self, json_resp: dict, scrape_item: ScrapeItem | None = None) -> None:
"""Parses and raises errors from json response."""
if json_resp["status"] == "error-notFound":
raise ScrapeError(404, "Album not found", origin=scrape_item)
raise ScrapeError(404, origin=scrape_item)

json_resp: dict = json_resp["data"]
is_password_protected = json_resp.get("password")
Expand Down Expand Up @@ -150,7 +150,7 @@ async def get_account_token(self, scrape_item: ScrapeItem) -> None:
async with self.request_limiter:
json_resp = await self.client.post_data(self.domain, create_account_address, data={})
if json_resp["status"] != "ok":
raise ScrapeError(403, "Couldn't generate GoFile token", origin=scrape_item)
raise ScrapeError(401, "Couldn't generate GoFile API token", origin=scrape_item)

self.api_key = json_resp["data"]["token"]
self.headers["Authorization"] = f"Bearer {self.api_key}"
Expand All @@ -170,6 +170,6 @@ async def get_website_token(self, scrape_item: ScrapeItem, update: bool = False)
text = await self.client.get_text(self.domain, self.js_address, origin=scrape_item)
match = re.search(WT_REGEX, str(text))
if not match:
raise ScrapeError(403, "Couldn't generate GoFile websiteToken", origin=scrape_item)
raise ScrapeError(401, "Couldn't generate GoFile websiteToken", origin=scrape_item)
self.website_token = match.group(1)
self.manager.cache_manager.save("gofile_website_token", self.website_token)
2 changes: 1 addition & 1 deletion cyberdrop_dl/scraper/crawlers/imgbox_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ async def album(self, scrape_item: ScrapeItem) -> None:
soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)

if "The specified gallery could not be found" in soup.text:
raise ScrapeError(404, f"Gallery not found: {scrape_item.url}", origin=scrape_item)
raise ScrapeError(404, origin=scrape_item)

scrape_item.album_id = scrape_item.url.parts[2]
scrape_item.part_of_album = True
Expand Down
12 changes: 7 additions & 5 deletions cyberdrop_dl/scraper/crawlers/imgur_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ async def album(self, scrape_item: ScrapeItem) -> None:
if self.imgur_client_id == "":
log("To scrape imgur content, you need to provide a client id", 30)
raise LoginError(message="No Imgur Client ID provided")
await self.check_imgur_credits()
await self.check_imgur_credits(scrape_item)
scrape_item.type = FILE_HOST_ALBUM
scrape_item.children = scrape_item.children_limit = 0

Expand Down Expand Up @@ -101,7 +101,7 @@ async def image(self, scrape_item: ScrapeItem) -> None:
if self.imgur_client_id == "":
log("To scrape imgur content, you need to provide a client id", 30)
raise LoginError(message="No Imgur Client ID provided")
await self.check_imgur_credits()
await self.check_imgur_credits(scrape_item)

image_id = scrape_item.url.parts[-1]
async with self.request_limiter:
Expand Down Expand Up @@ -129,9 +129,11 @@ async def handle_direct(self, scrape_item: ScrapeItem) -> None:

"""~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""

async def check_imgur_credits(self) -> None:
async def check_imgur_credits(self, scrape_item: ScrapeItem | None = None) -> None:
"""Checks the remaining credits."""
credits_obj = await self.client.get_json(self.domain, self.imgur_api / "credits", headers_inc=self.headers)
credits_obj = await self.client.get_json(
self.domain, self.imgur_api / "credits", headers_inc=self.headers, origin=scrape_item
)
self.imgur_client_remaining = credits_obj["data"]["ClientRemaining"]
if self.imgur_client_remaining < 100:
raise ScrapeError(429, "Imgur API rate limit reached")
raise ScrapeError(429, "Imgur API rate limit reached", origin=scrape_item)
20 changes: 5 additions & 15 deletions cyberdrop_dl/scraper/crawlers/mediafire_crawler.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
from __future__ import annotations

import calendar
import contextlib
import datetime
from typing import TYPE_CHECKING

from aiolimiter import AsyncLimiter
from mediafire import MediaFireApi, api
from yarl import URL

from cyberdrop_dl.clients.errors import MaxChildrenError, ScrapeError
from cyberdrop_dl.clients.errors import MediaFireError
from cyberdrop_dl.scraper.crawler import Crawler
from cyberdrop_dl.utils.data_enums_classes.url_objects import FILE_HOST_ALBUM, ScrapeItem
from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_filename_and_ext
Expand Down Expand Up @@ -48,17 +47,10 @@ async def folder(self, scrape_item: ScrapeItem) -> None:
try:
folder_details: dict[str, dict] = self.api.folder_get_info(folder_key=folder_key)
except api.MediaFireApiError as e:
raise ScrapeError(status=f"MF - {e.message}", origin=scrape_item) from None
raise MediaFireError(status=e.code, message=e.message, origin=scrape_item) from None

title = self.create_title(folder_details["folder_info"]["name"], folder_key, None)
scrape_item.type = FILE_HOST_ALBUM
scrape_item.children = scrape_item.children_limit = 0

with contextlib.suppress(IndexError, TypeError):
scrape_item.children_limit = (
self.manager.config_manager.settings_data.download_options.maximum_number_of_children[scrape_item.type]
)

scrape_item.set_type(FILE_HOST_ALBUM, self.manager)
scrape_item.album_id = folder_key
scrape_item.part_of_album = True

Expand All @@ -73,7 +65,7 @@ async def folder(self, scrape_item: ScrapeItem) -> None:
chunk_size=chunk_size,
)
except api.MediaFireApiError as e:
raise ScrapeError(status=f"MF - {e.message}", origin=scrape_item) from None
raise MediaFireError(status=e.code, message=e.message, origin=scrape_item) from None

files = folder_contents["folder_content"]["files"]

Expand All @@ -90,9 +82,7 @@ async def folder(self, scrape_item: ScrapeItem) -> None:
add_parent=scrape_item.url,
)
self.manager.task_group.create_task(self.run(new_scrape_item))
scrape_item.children += 1
if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit:
raise MaxChildrenError(origin=scrape_item)
scrape_item.add_children()

if folder_contents["folder_content"]["more_chunks"] == "yes":
chunk += 1
Expand Down
4 changes: 1 addition & 3 deletions cyberdrop_dl/scraper/crawlers/omegascans_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from cyberdrop_dl.clients.errors import MaxChildrenError, ScrapeError
from cyberdrop_dl.scraper.crawler import Crawler
from cyberdrop_dl.utils.data_enums_classes.url_objects import FILE_HOST_ALBUM, ScrapeItem
from cyberdrop_dl.utils.logger import log
from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_filename_and_ext

if TYPE_CHECKING:
Expand Down Expand Up @@ -65,7 +64,7 @@ async def series(self, scrape_item: ScrapeItem) -> None:
break

if not series_id:
raise ScrapeError(404, "series_id not found", origin=scrape_item)
raise ScrapeError(422, "Unable to parse series_id from html", origin=scrape_item)

page_number = 1
number_per_page = 30
Expand Down Expand Up @@ -101,7 +100,6 @@ async def chapter(self, scrape_item: ScrapeItem) -> None:
soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)

if "This chapter is premium" in soup.get_text():
log("Scrape Failed: This chapter is premium", 40)
raise ScrapeError(401, "This chapter is premium", origin=scrape_item)

title_parts = soup.select_one("title").get_text().split(" - ")
Expand Down
Loading

0 comments on commit 160902f

Please sign in to comment.