From b9178c92664c6ece4fa68ecf29f558a02f91a7f8 Mon Sep 17 00:00:00 2001
From: NTFSvolume <172021377+NTFSvolume@users.noreply.github.com>
Date: Fri, 8 Nov 2024 07:19:25 -0500
Subject: [PATCH] fix: async references

---
 cyberdrop_dl/clients/hash_client.py | 5 +++--
 cyberdrop_dl/main.py | 7 ++++---
 cyberdrop_dl/managers/live_manager.py | 17 ++++++++++------
 cyberdrop_dl/managers/real_debrid/api.py | 6 ++++--
 .../scraper/crawlers/celebforum_crawler.py | 16 +++++++--------
 .../scraper/crawlers/chevereto_crawler.py | 17 ++++++++--------
 .../scraper/crawlers/coomer_crawler.py | 12 +++++------
 .../scraper/crawlers/cyberdrop_crawler.py | 6 +++---
 .../scraper/crawlers/cyberfile_crawler.py | 12 +++++------
 .../scraper/crawlers/ehentai_crawler.py | 10 +++++-----
 .../scraper/crawlers/erome_crawler.py | 4 ++--
 .../scraper/crawlers/f95zone_crawler.py | 16 +++++++--------
 .../scraper/crawlers/fapello_crawler.py | 6 +++---
 .../scraper/crawlers/gofile_crawler.py | 4 ++--
 .../scraper/crawlers/hotpic_crawler.py | 2 +-
 .../scraper/crawlers/imageban_crawler.py | 10 +++++-----
 .../scraper/crawlers/imgbb_crawler.py | 8 ++++----
 .../scraper/crawlers/imgbox_crawler.py | 2 +-
 .../scraper/crawlers/imgur_crawler.py | 4 ++--
 .../scraper/crawlers/kemono_crawler.py | 16 +++++++--------
 .../scraper/crawlers/leakedmodels_crawler.py | 16 +++++++--------
 .../scraper/crawlers/mediafire_crawler.py | 8 ++++----
 .../scraper/crawlers/nekohouse_crawler.py | 12 +++++------
 .../scraper/crawlers/nudostar_crawler.py | 16 +++++++--------
 .../scraper/crawlers/nudostartv_crawler.py | 4 ++--
 .../scraper/crawlers/omegascans_crawler.py | 10 +++++-----
 .../scraper/crawlers/pimpandhost_crawler.py | 12 +++++------
 .../scraper/crawlers/pixeldrain_crawler.py | 12 +++++------
 .../scraper/crawlers/postimg_crawler.py | 2 +-
 .../scraper/crawlers/realbooru_crawler.py | 6 +++---
 .../scraper/crawlers/realdebrid_crawler.py | 20 +++++++++++--------
 .../scraper/crawlers/reddit_crawler.py | 9 +++++----
 .../scraper/crawlers/redgifs_crawler.py | 4 ++--
 .../scraper/crawlers/rule34vault_crawler.py | 12 +++++------
 .../scraper/crawlers/rule34xxx_crawler.py | 6 +++---
 .../scraper/crawlers/rule34xyz_crawler.py | 8 ++++----
 .../scraper/crawlers/scrolller_crawler.py | 2 +-
 .../scraper/crawlers/simpcity_crawler.py | 16 +++++++--------
 .../crawlers/socialmediagirls_crawler.py | 16 +++++++--------
 .../scraper/crawlers/tokyomotion_crawler.py | 13 ++++++------
 .../scraper/crawlers/toonily_crawler.py | 6 +++---
 .../scraper/crawlers/xbunker_crawler.py | 16 +++++++--------
 .../scraper/crawlers/xbunkr_crawler.py | 2 +-
 .../scraper/crawlers/xxxbunker_crawler.py | 5 +++--
 cyberdrop_dl/scraper/filters.py | 4 ++--
 cyberdrop_dl/scraper/scraper.py | 6 ++++--
 .../utils/database/tables/hash_table.py | 3 +--
 cyberdrop_dl/utils/utilities.py | 8 ++++----
 48 files changed, 226 insertions(+), 208 deletions(-)

diff --git a/cyberdrop_dl/clients/hash_client.py b/cyberdrop_dl/clients/hash_client.py
index d44f939b9..f4719bbfc 100644
--- a/cyberdrop_dl/clients/hash_client.py
+++ b/cyberdrop_dl/clients/hash_client.py
@@ -3,9 +3,10 @@
 import asyncio
 import time
 from collections import defaultdict
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
 from pathlib import Path
-from typing import TYPE_CHECKING, AsyncGenerator
+from typing import TYPE_CHECKING
 
 from send2trash import send2trash
 
@@ -35,7 +36,7 @@ async def _hash_directory_scanner_helper(manager: Manager, path: Path):
     start_time = time.perf_counter()
     async with hash_scan_directory_context(manager):
         await manager.hash_manager.hash_client.hash_directory(path)
-        await manager.progress_manager.print_stats(start_time)
+        manager.progress_manager.print_stats(start_time)
 
 
 class HashClient:
diff --git a/cyberdrop_dl/main.py b/cyberdrop_dl/main.py
index 0921562f3..6b7170ed9 100644
--- a/cyberdrop_dl/main.py
+++ b/cyberdrop_dl/main.py
@@ -43,8 +43,6 @@ def startup() -> Manager:
         if not manager.args_manager.immediate_download:
             program_ui(manager)
 
-        return manager
-
     except InvalidYamlError as e:
         print_to_console(e.message_rich)
         sys.exit(1)
@@ -53,6 +51,9 @@ def startup() -> Manager:
         print_to_console("Exiting...")
         sys.exit(0)
 
+    else:
+        return manager
+
 
 async def runtime(manager: Manager) -> None:
     """Main runtime loop for the program, this will run until all scraping and downloading is complete."""
@@ -185,7 +186,7 @@ async def director(manager: Manager) -> None:
            with manager.live_manager.get_main_live(stop=True):
                await runtime(manager)
                await post_runtime(manager)
-    except Exception as e:
+    except* Exception as e:
         log_with_color(
             f"An error occurred, please report this to the developer: {e}",
             "bold red",
diff --git a/cyberdrop_dl/managers/live_manager.py b/cyberdrop_dl/managers/live_manager.py
index fb825e65a..5bcb7c5df 100644
--- a/cyberdrop_dl/managers/live_manager.py
+++ b/cyberdrop_dl/managers/live_manager.py
@@ -1,12 +1,13 @@
 from __future__ import annotations
 
+from collections.abc import Generator
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Generator
+from typing import TYPE_CHECKING
 
 from rich.live import Live
 from rich.progress import Progress, SpinnerColumn, TextColumn
 
-from cyberdrop_dl.utils.logger import console
+from cyberdrop_dl.utils.logger import console, log
 
 if TYPE_CHECKING:
     from rich.layout import Layout
@@ -38,10 +39,14 @@ def get_live(self, layout: Layout, stop: bool = False) -> Generator[Live]:
             self.live.update(show, refresh=True)
             yield self.live
 
-        except Exception as e:
-            msg = f"Issue with rich live {e}"
-            raise Exception(msg) from e
-
+        except* Exception as e:
+            msg = f"Issue with rich live: {e}"
+            log(msg, 50, exc_info=True)
+            if isinstance(e, ExceptionGroup):
+                for sub_exception in e.exceptions:
+                    msg = f"Multiple exception caught: {type(sub_exception).__name__} - {sub_exception}"
+                    log(msg, 50, exc_info=sub_exception)
+            raise e
         finally:
             if stop:
                 self.live.stop()
diff --git a/cyberdrop_dl/managers/real_debrid/api.py b/cyberdrop_dl/managers/real_debrid/api.py
index a749a18b7..5721a597b 100644
--- a/cyberdrop_dl/managers/real_debrid/api.py
+++ b/cyberdrop_dl/managers/real_debrid/api.py
@@ -1,9 +1,10 @@
 from __future__ import annotations
 
 import time
+from collections.abc import Generator
 from contextlib import contextmanager
 from datetime import date, datetime, timedelta
-from typing import TYPE_CHECKING, Generator
+from typing import TYPE_CHECKING
 
 from requests import Session
 from requests.exceptions import RequestException
@@ -82,11 +83,12 @@ def handle_response(response: Response) -> dict | str | None:
         try:
             response.raise_for_status()
             JSONResp: dict = response.json()
-            return JSONResp
         except RequestException:
             raise RealDebridError(response) from None
         except AttributeError:
             return response.text
+        else:
+            return JSONResp
 
     @contextmanager
     def rate_limiter(self, buffer: float = 0.2) -> Generator:
diff --git a/cyberdrop_dl/scraper/crawlers/celebforum_crawler.py b/cyberdrop_dl/scraper/crawlers/celebforum_crawler.py
index 9b6807924..9e6e0a7f0 100644
--- a/cyberdrop_dl/scraper/crawlers/celebforum_crawler.py
+++
b/cyberdrop_dl/scraper/crawlers/celebforum_crawler.py @@ -117,7 +117,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: if scrape_post: date = int(post.select_one(self.post_date_selector).get(self.post_date_attribute)) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, thread_url, title, @@ -161,7 +161,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: async def post(self, scrape_item: ScrapeItem, post_content: Tag, post_number: int) -> None: """Scrapes a post.""" if self.manager.config_manager.settings_data["Download_Options"]["separate_posts"]: - scrape_item = await self.create_scrape_item(scrape_item, scrape_item.url, "") + scrape_item = self.create_scrape_item(scrape_item, scrape_item.url, "") scrape_item.add_to_parent_title("post-" + str(post_number)) scrape_item.type = FORUM_POST @@ -202,7 +202,7 @@ async def links(self, scrape_item: ScrapeItem, post_content: Tag) -> int: try: if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ -241,7 +241,7 @@ async def images(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ -270,7 +270,7 @@ async def videos(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = "https:" + link link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -290,7 +290,7 @@ async def embeds(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = link.replace("ifr", "watch") link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -321,7 +321,7 @@ async def attachments(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ -338,5 +338,5 @@ async def attachments(self, scrape_item: ScrapeItem, post_content: Tag) -> int: async def handle_internal_links(self, link: URL, scrape_item: ScrapeItem) -> None: """Handles internal links.""" filename, ext = get_filename_and_ext(link.name, True) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "Attachments", True) + new_scrape_item = self.create_scrape_item(scrape_item, link, "Attachments", True) await self.handle_file(link, new_scrape_item, 
filename, ext) diff --git a/cyberdrop_dl/scraper/crawlers/chevereto_crawler.py b/cyberdrop_dl/scraper/crawlers/chevereto_crawler.py index 2aeecbdd4..0bba39d03 100644 --- a/cyberdrop_dl/scraper/crawlers/chevereto_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/chevereto_crawler.py @@ -3,7 +3,8 @@ import calendar import datetime import re -from typing import TYPE_CHECKING, AsyncGenerator, ClassVar +from collections.abc import AsyncGenerator +from typing import TYPE_CHECKING, ClassVar from aiolimiter import AsyncLimiter from bs4 import BeautifulSoup @@ -27,7 +28,7 @@ class CheveretoCrawler(Crawler): - JPG_CHURCH_DOMAINS = { + JPG_CHURCH_DOMAINS: ClassVar[tuple[str, ...]] = { "jpg.homes", "jpg.church", "jpg.fish", @@ -42,7 +43,7 @@ class CheveretoCrawler(Crawler): "host.church", } - PRIMARY_BASE_DOMAINS = { + PRIMARY_BASE_DOMAINS: ClassVar[dict[str, URL]] = { "imagepond.net": URL("https://imagepond.net"), "jpg.church": URL("https://jpg5.su"), "img.kiwi": URL("https://img.kiwi"), @@ -104,7 +105,7 @@ async def profile(self, scrape_item: ScrapeItem) -> None: if link.startswith("/"): link = self.primary_base_domain / link[1:] link = URL(link) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -159,7 +160,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: sub_album_link = self.primary_base_domain / sub_album_link[1:] sub_album_link = URL(sub_album_link) - new_scrape_item = await self.create_scrape_item(scrape_item, sub_album_link, "", True) + new_scrape_item = self.create_scrape_item(scrape_item, sub_album_link, "", True) self.manager.task_group.create_task(self.run(new_scrape_item)) async for soup in self.web_pager(scrape_item.url): @@ -169,7 +170,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: if link.startswith("/"): link = self.primary_base_domain / link[1:] link = URL(link) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -203,7 +204,7 @@ async def image(self, scrape_item: ScrapeItem) -> None: break if date: - date = await self.parse_datetime(date) + date = self.parse_datetime(date) scrape_item.possible_datetime = date filename, ext = get_filename_and_ext(link.name) @@ -242,7 +243,7 @@ async def get_sort_by_new_url(url: URL) -> URL: return url.with_query({"sort": "date_desc", "page": 1}) @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" date = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S") return calendar.timegm(date.timetuple()) diff --git a/cyberdrop_dl/scraper/crawlers/coomer_crawler.py b/cyberdrop_dl/scraper/crawlers/coomer_crawler.py index 19b10028d..97766e0ca 100644 --- a/cyberdrop_dl/scraper/crawlers/coomer_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/coomer_crawler.py @@ -71,14 +71,14 @@ async def favorites(self, scrape_item: ScrapeItem) -> None: id = user["id"] service = user["service"] url = self.primary_base_domain / service / "user" / id - new_scrape_item = await self.create_scrape_item(scrape_item, url, None, True, None, None) + new_scrape_item = self.create_scrape_item(scrape_item, url, None, True, None, None) self.manager.task_group.create_task(self.run(new_scrape_item)) @error_handling_wrapper async def profile(self, scrape_item: ScrapeItem) -> None: """Scrapes a profile.""" offset = 0 - service, user = await self.get_service_and_user(scrape_item) + service, user = 
self.get_service_and_user(scrape_item) user_str = await self.get_user_str_from_profile(scrape_item) api_call = self.api_url / service / "user" / user scrape_item.type = FILE_HOST_PROFILE @@ -198,13 +198,13 @@ async def create_new_scrape_item( post_title = post_id + " - " + post_title new_title = self.create_title(user, None, None) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( old_scrape_item, link, new_title, True, None, - await self.parse_datetime(date), + self.parse_datetime(date), add_parent=add_parent, ) new_scrape_item.add_to_parent_title(post_title) @@ -223,7 +223,7 @@ async def get_user_str_from_profile(self, scrape_item: ScrapeItem) -> str: return soup.select_one("span[itemprop=name]").text @staticmethod - async def get_service_and_user(scrape_item: ScrapeItem) -> tuple[str, str]: + def get_service_and_user(scrape_item: ScrapeItem) -> tuple[str, str]: """Gets the service and user from a scrape item.""" user = scrape_item.url.parts[3] service = scrape_item.url.parts[1] @@ -238,7 +238,7 @@ async def get_service_user_and_post(scrape_item: ScrapeItem) -> tuple[str, str, return service, user, post @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string.""" date = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S") return calendar.timegm(date.timetuple()) diff --git a/cyberdrop_dl/scraper/crawlers/cyberdrop_crawler.py b/cyberdrop_dl/scraper/crawlers/cyberdrop_crawler.py index ce12d2860..c0f278178 100644 --- a/cyberdrop_dl/scraper/crawlers/cyberdrop_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/cyberdrop_crawler.py @@ -72,7 +72,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: date = soup.select("p[class=title]") if date: - date = await self.parse_datetime(date[-1].text) + date = self.parse_datetime(date[-1].text) links = soup.select("div[class*=image-container] a[class=image]") for link in links: @@ -81,7 +81,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: link = self.primary_base_domain.with_path(link) link = URL(link) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -146,7 +146,7 @@ def is_cdn(url: URL) -> bool: return bool(re.match(CDN_POSSIBILITIES, url.host)) @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" date = datetime.datetime.strptime(date, "%d.%m.%Y") return calendar.timegm(date.timetuple()) diff --git a/cyberdrop_dl/scraper/crawlers/cyberfile_crawler.py b/cyberdrop_dl/scraper/crawlers/cyberfile_crawler.py index ee6e99c47..de26146c0 100644 --- a/cyberdrop_dl/scraper/crawlers/cyberfile_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/cyberfile_crawler.py @@ -113,7 +113,7 @@ async def folder(self, scrape_item: ScrapeItem) -> None: elif file_id: link = URL(tile.get("dtfullurl")) if link: - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -125,7 +125,7 @@ async def folder(self, scrape_item: ScrapeItem) -> None: log(f"Couldn't find folder or file id for {scrape_item.url} element", 30) continue - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -188,7 +188,7 @@ async def shared(self, scrape_item: ScrapeItem) -> None: link = URL(tile.get("dtfullurl")) if link: - new_scrape_item = await self.create_scrape_item( + 
new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -201,7 +201,7 @@ async def shared(self, scrape_item: ScrapeItem) -> None: log(f"Couldn't find folder or file id for {scrape_item.url} element", 30) continue - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -277,7 +277,7 @@ async def handle_content_id(self, scrape_item: ScrapeItem, content_id: int) -> N file_detail_table = ajax_soup.select('table[class="table table-bordered table-striped"]')[-1] uploaded_row = file_detail_table.select("tr")[-2] uploaded_date = uploaded_row.select_one("td[class=responsiveTable]").text.strip() - uploaded_date = await self.parse_datetime(uploaded_date) + uploaded_date = self.parse_datetime(uploaded_date) scrape_item.possible_datetime = uploaded_date filename, ext = get_filename_and_ext(link.name) @@ -286,7 +286,7 @@ async def handle_content_id(self, scrape_item: ScrapeItem, content_id: int) -> N """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" date = datetime.datetime.strptime(date, "%d/%m/%Y %H:%M:%S") return calendar.timegm(date.timetuple()) diff --git a/cyberdrop_dl/scraper/crawlers/ehentai_crawler.py b/cyberdrop_dl/scraper/crawlers/ehentai_crawler.py index 0a1471af5..db65c3120 100644 --- a/cyberdrop_dl/scraper/crawlers/ehentai_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/ehentai_crawler.py @@ -40,7 +40,7 @@ async def fetch(self, scrape_item: ScrapeItem) -> None: await self.image(scrape_item) else: log(f"Scrape Failed: Unknown URL Path for {scrape_item.url}", 40) - await self.manager.progress_manager.scrape_stats_progress.add_failure("Unsupported Link") + self.manager.progress_manager.scrape_stats_progress.add_failure("Unsupported Link") self.scraping_progress.remove_task(task_id) @@ -51,7 +51,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item) title = self.create_title(soup.select_one("h1[id=gn]").get_text(), None, None) - date = await self.parse_datetime(soup.select_one("td[class=gdt2]").get_text()) + date = self.parse_datetime(soup.select_one("td[class=gdt2]").get_text()) scrape_item.type = FILE_HOST_ALBUM scrape_item.children = scrape_item.children_limit = 0 @@ -63,7 +63,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: images = soup.select("div[class=gdtm] div a") for image in images: link = URL(image.get("href")) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -86,7 +86,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: if next_page is not None: next_page = URL(next_page.get("href")) if next_page is not None: - new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "") + new_scrape_item = self.create_scrape_item(scrape_item, next_page, "") self.manager.task_group.create_task(self.run(new_scrape_item)) @error_handling_wrapper @@ -113,7 +113,7 @@ async def set_no_warnings(self, scrape_item: ScrapeItem) -> None: await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item) @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" if date.count(":") == 1: date = 
date + ":00" diff --git a/cyberdrop_dl/scraper/crawlers/erome_crawler.py b/cyberdrop_dl/scraper/crawlers/erome_crawler.py index 9d258e32a..722f5377b 100644 --- a/cyberdrop_dl/scraper/crawlers/erome_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/erome_crawler.py @@ -54,7 +54,7 @@ async def profile(self, scrape_item: ScrapeItem) -> None: for album in albums: link = URL(album["href"]) - new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) self.manager.task_group.create_task(self.run(new_scrape_item)) scrape_item.children += 1 if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit: @@ -63,7 +63,7 @@ async def profile(self, scrape_item: ScrapeItem) -> None: next_page = soup.select_one('a[rel="next"]') if next_page: next_page = next_page.get("href").split("page=")[-1] - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, scrape_item.url.with_query(f"page={next_page}"), "", diff --git a/cyberdrop_dl/scraper/crawlers/f95zone_crawler.py b/cyberdrop_dl/scraper/crawlers/f95zone_crawler.py index 2e498a158..a7e4b8a80 100644 --- a/cyberdrop_dl/scraper/crawlers/f95zone_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/f95zone_crawler.py @@ -117,7 +117,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: if scrape_post: date = int(post.select_one(self.post_date_selector).get(self.post_date_attribute)) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, thread_url, title, @@ -160,7 +160,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: async def post(self, scrape_item: ScrapeItem, post_content: Tag, post_number: int) -> None: """Scrapes a post.""" if self.manager.config_manager.settings_data["Download_Options"]["separate_posts"]: - scrape_item = await self.create_scrape_item(scrape_item, scrape_item.url, "") + scrape_item = self.create_scrape_item(scrape_item, scrape_item.url, "") scrape_item.add_to_parent_title("post-" + str(post_number)) scrape_item.type = FORUM_POST @@ -206,7 +206,7 @@ async def links(self, scrape_item: ScrapeItem, post_content: Tag) -> int: try: if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.host: await self.handle_internal_links(link, scrape_item) @@ -244,7 +244,7 @@ async def images(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.host: await self.handle_internal_links(link, scrape_item) @@ -273,7 +273,7 @@ async def videos(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = "https:" + link link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -294,7 +294,7 @@ async def embeds(self, scrape_item: 
ScrapeItem, post_content: Tag) -> int: link = link.replace("ifr", "watch") link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -325,7 +325,7 @@ async def attachments(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.host: await self.handle_internal_links(link, scrape_item) @@ -342,7 +342,7 @@ async def attachments(self, scrape_item: ScrapeItem, post_content: Tag) -> int: async def handle_internal_links(self, link: URL, scrape_item: ScrapeItem) -> None: """Handles internal links.""" filename, ext = get_filename_and_ext(link.name, True) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "Attachments", True) + new_scrape_item = self.create_scrape_item(scrape_item, link, "Attachments", True) await self.handle_file(link, new_scrape_item, filename, ext) @error_handling_wrapper diff --git a/cyberdrop_dl/scraper/crawlers/fapello_crawler.py b/cyberdrop_dl/scraper/crawlers/fapello_crawler.py index 090c57cc0..93395962a 100644 --- a/cyberdrop_dl/scraper/crawlers/fapello_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/fapello_crawler.py @@ -68,7 +68,7 @@ async def profile(self, scrape_item: ScrapeItem) -> None: if "javascript" in post.get("href"): video_tag = post.select_one("iframe") video_link = URL(video_tag.get("src")) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, video_link, "", @@ -78,7 +78,7 @@ async def profile(self, scrape_item: ScrapeItem) -> None: await self.handle_external_links(new_scrape_item) else: link = URL(post.get("href")) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -93,7 +93,7 @@ async def profile(self, scrape_item: ScrapeItem) -> None: if next_page: next_page = next_page.get("href") if next_page: - new_scrape_item = await self.create_scrape_item(scrape_item, URL(next_page), "") + new_scrape_item = self.create_scrape_item(scrape_item, URL(next_page), "") self.manager.task_group.create_task(self.run(new_scrape_item)) @error_handling_wrapper diff --git a/cyberdrop_dl/scraper/crawlers/gofile_crawler.py b/cyberdrop_dl/scraper/crawlers/gofile_crawler.py index 5d25737eb..28a77b0f2 100644 --- a/cyberdrop_dl/scraper/crawlers/gofile_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/gofile_crawler.py @@ -117,7 +117,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: content = contents[content_id] link = None if content["type"] == "folder": - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, self.primary_base_domain / "d" / content["code"], title, @@ -142,7 +142,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: except NoExtensionError: log(f"Scrape Failed: {link} (No File Extension)", 40) await self.manager.log_manager.write_scrape_error_log(link, " No File Extension") - await self.manager.progress_manager.scrape_stats_progress.add_failure("No File Extension") + 
self.manager.progress_manager.scrape_stats_progress.add_failure("No File Extension") scrape_item.children += 1 if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit: raise MaxChildrenError(origin=scrape_item) diff --git a/cyberdrop_dl/scraper/crawlers/hotpic_crawler.py b/cyberdrop_dl/scraper/crawlers/hotpic_crawler.py index ef1bf02d9..4cf5a7827 100644 --- a/cyberdrop_dl/scraper/crawlers/hotpic_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/hotpic_crawler.py @@ -36,7 +36,7 @@ async def fetch(self, scrape_item: ScrapeItem) -> None: await self.image(scrape_item) else: log(f"Scrape Failed: Unknown URL Path for {scrape_item.url}", 40) - await self.manager.progress_manager.scrape_stats_progress.add_failure("Unsupported Link") + self.manager.progress_manager.scrape_stats_progress.add_failure("Unsupported Link") self.scraping_progress.remove_task(task_id) diff --git a/cyberdrop_dl/scraper/crawlers/imageban_crawler.py b/cyberdrop_dl/scraper/crawlers/imageban_crawler.py index e663041ba..8b3c68b4d 100644 --- a/cyberdrop_dl/scraper/crawlers/imageban_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/imageban_crawler.py @@ -73,7 +73,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: link = URL("https://" + scrape_item.url.host + link_path) if link_path.startswith("/") else URL(link_path) - new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) self.manager.task_group.create_task(self.run(new_scrape_item)) scrape_item.children += 1 if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit: @@ -83,7 +83,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: if next_page: link_path = next_page.get("href") link = URL("https://" + scrape_item.url.host + link_path) if link_path.startswith("/") else URL(link_path) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "", True) + new_scrape_item = self.create_scrape_item(scrape_item, link, "", True) self.manager.task_group.create_task(self.run(new_scrape_item)) @error_handling_wrapper @@ -106,7 +106,7 @@ async def compilation(self, scrape_item: ScrapeItem) -> None: for image in images: link = URL(image.get("src")) - date = await self.parse_datetime(f"{(link.parts[2])}-{(link.parts[3])}-{(link.parts[4])}") + date = self.parse_datetime(f"{(link.parts[2])}-{(link.parts[3])}-{(link.parts[4])}") scrape_item.possible_datetime = date filename, ext = get_filename_and_ext(link.name) await self.handle_file(link, scrape_item, filename, ext) @@ -123,7 +123,7 @@ async def image(self, scrape_item: ScrapeItem) -> None: async with self.request_limiter: soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item) - date = await self.parse_datetime( + date = self.parse_datetime( f"{(scrape_item.url.parts[2])}-{(scrape_item.url.parts[3])}-{(scrape_item.url.parts[4])}", ) scrape_item.possible_datetime = date @@ -143,7 +143,7 @@ async def handle_direct(self, scrape_item: ScrapeItem) -> None: """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" date = datetime.datetime.strptime(date, "%Y-%m-%d") return calendar.timegm(date.timetuple()) diff --git 
a/cyberdrop_dl/scraper/crawlers/imgbb_crawler.py b/cyberdrop_dl/scraper/crawlers/imgbb_crawler.py index 2d8f5aec8..eba5c15b0 100644 --- a/cyberdrop_dl/scraper/crawlers/imgbb_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/imgbb_crawler.py @@ -69,7 +69,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: albums = soup.select("a[class='image-container --media']") for album in albums: sub_album_link = URL(album.get("href")) - new_scrape_item = await self.create_scrape_item(scrape_item, sub_album_link, title, True) + new_scrape_item = self.create_scrape_item(scrape_item, sub_album_link, title, True) self.manager.task_group.create_task(self.run(new_scrape_item)) async with self.request_limiter: @@ -82,7 +82,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: links = soup.select("a[class*=image-container]") for link in links: link = URL(link.get("href")) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -116,7 +116,7 @@ async def image(self, scrape_item: ScrapeItem) -> None: link = URL(soup.select_one("div[id=image-viewer-container] img").get("src")) date = soup.select_one("p[class*=description-meta] span").get("title") - date = await self.parse_datetime(date) + date = self.parse_datetime(date) scrape_item.possible_datetime = date filename, ext = get_filename_and_ext(link.name) @@ -132,7 +132,7 @@ async def handle_direct_link(self, scrape_item: ScrapeItem) -> None: """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" date = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S") return calendar.timegm(date.timetuple()) diff --git a/cyberdrop_dl/scraper/crawlers/imgbox_crawler.py b/cyberdrop_dl/scraper/crawlers/imgbox_crawler.py index 33c8739a3..cd554b25c 100644 --- a/cyberdrop_dl/scraper/crawlers/imgbox_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/imgbox_crawler.py @@ -99,7 +99,7 @@ async def image(self, scrape_item: ScrapeItem) -> None: """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" date = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S") return calendar.timegm(date.timetuple()) diff --git a/cyberdrop_dl/scraper/crawlers/imgur_crawler.py b/cyberdrop_dl/scraper/crawlers/imgur_crawler.py index 7a65cd0cb..0c20e9476 100644 --- a/cyberdrop_dl/scraper/crawlers/imgur_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/imgur_crawler.py @@ -80,7 +80,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: for image in JSON_Obj["data"]: link = URL(image["link"]) date = image["datetime"] - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -112,7 +112,7 @@ async def image(self, scrape_item: ScrapeItem) -> None: date = JSON_Obj["data"]["datetime"] link = URL(JSON_Obj["data"]["link"]) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "", True, date) + new_scrape_item = self.create_scrape_item(scrape_item, link, "", True, date) await self.handle_direct(new_scrape_item) @error_handling_wrapper diff --git a/cyberdrop_dl/scraper/crawlers/kemono_crawler.py 
b/cyberdrop_dl/scraper/crawlers/kemono_crawler.py index c338836fc..5b70e0803 100644 --- a/cyberdrop_dl/scraper/crawlers/kemono_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/kemono_crawler.py @@ -54,7 +54,7 @@ async def fetch(self, scrape_item: ScrapeItem) -> None: async def profile(self, scrape_item: ScrapeItem) -> None: """Scrapes a profile.""" offset = 0 - service, user = await self.get_service_and_user(scrape_item) + service, user = self.get_service_and_user(scrape_item) user_str = await self.get_user_str_from_profile(scrape_item) api_call = self.api_url / service / "user" / user scrape_item.type = FILE_HOST_PROFILE @@ -181,13 +181,13 @@ async def get_content_links(self, scrape_item: ScrapeItem, post: dict, user: str post_title = post_id + " - " + post_title new_title = self.create_title(user, None, None) - scrape_item = await self.create_scrape_item( + scrape_item = self.create_scrape_item( scrape_item, scrape_item.url, new_title, True, None, - await self.parse_datetime(date), + self.parse_datetime(date), ) scrape_item.add_to_parent_title(post_title) scrape_item.add_to_parent_title("Loose Files") @@ -211,7 +211,7 @@ async def get_content_links(self, scrape_item: ScrapeItem, post: dict, user: str for link in yarl_links: if "kemono" in link.host: continue - scrape_item = await self.create_scrape_item( + scrape_item = self.create_scrape_item( scrape_item, link, "", @@ -252,13 +252,13 @@ async def create_new_scrape_item( post_title = post_id + " - " + post_title new_title = self.create_title(user, None, None) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( old_scrape_item, link, new_title, True, None, - await self.parse_datetime(date), + self.parse_datetime(date), add_parent=add_parent, ) new_scrape_item.add_to_parent_title(post_title) @@ -279,7 +279,7 @@ async def get_user_str_from_profile(self, scrape_item: ScrapeItem) -> str: return soup.select_one("span[itemprop=name]").text @staticmethod - async def get_service_and_user(scrape_item: ScrapeItem) -> tuple[str, str]: + def get_service_and_user(scrape_item: ScrapeItem) -> tuple[str, str]: """Gets the service and user from a scrape item.""" user = scrape_item.url.parts[3] service = scrape_item.url.parts[1] @@ -294,7 +294,7 @@ async def get_service_user_and_post(scrape_item: ScrapeItem) -> tuple[str, str, return service, user, post @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" try: date = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S") diff --git a/cyberdrop_dl/scraper/crawlers/leakedmodels_crawler.py b/cyberdrop_dl/scraper/crawlers/leakedmodels_crawler.py index d009a0502..103cdf157 100644 --- a/cyberdrop_dl/scraper/crawlers/leakedmodels_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/leakedmodels_crawler.py @@ -125,7 +125,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: if scrape_post: date = int(post.select_one(self.post_date_selector).get(self.post_date_attribute)) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, thread_url, title, @@ -168,7 +168,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: async def post(self, scrape_item: ScrapeItem, post_content: Tag, post_number: int) -> None: """Scrapes a post.""" if self.manager.config_manager.settings_data["Download_Options"]["separate_posts"]: - scrape_item = await self.create_scrape_item(scrape_item, scrape_item.url, "") + scrape_item = 
self.create_scrape_item(scrape_item, scrape_item.url, "") scrape_item.add_to_parent_title("post-" + str(post_number)) scrape_item.type = FORUM_POST @@ -213,7 +213,7 @@ async def links(self, scrape_item: ScrapeItem, post_content: Tag) -> int: try: if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ -247,7 +247,7 @@ async def images(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ -276,7 +276,7 @@ async def videos(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = "https:" + link link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -296,7 +296,7 @@ async def embeds(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = link.replace("ifr", "watch") link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -326,7 +326,7 @@ async def attachments(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ -343,5 +343,5 @@ async def attachments(self, scrape_item: ScrapeItem, post_content: Tag) -> int: async def handle_internal_links(self, link: URL, scrape_item: ScrapeItem) -> None: """Handles internal links.""" filename, ext = get_filename_and_ext(link.name, True) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "Attachments", True) + new_scrape_item = self.create_scrape_item(scrape_item, link, "Attachments", True) await self.handle_file(link, new_scrape_item, filename, ext) diff --git a/cyberdrop_dl/scraper/crawlers/mediafire_crawler.py b/cyberdrop_dl/scraper/crawlers/mediafire_crawler.py index 8f2b17c29..97743135b 100644 --- a/cyberdrop_dl/scraper/crawlers/mediafire_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/mediafire_crawler.py @@ -76,9 +76,9 @@ async def folder(self, scrape_item: ScrapeItem) -> None: files = folder_contents["folder_content"]["files"] for file in files: - date = await self.parse_datetime(file["created"]) + date = self.parse_datetime(file["created"]) link = URL(file["links"]["normal_download"]) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -106,7 +106,7 @@ async def 
file(self, scrape_item: ScrapeItem) -> None: async with self.request_limiter: soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item) - date = await self.parse_datetime(soup.select("ul[class=details] li span")[-1].get_text()) + date = self.parse_datetime(soup.select("ul[class=details] li span")[-1].get_text()) scrape_item.possible_datetime = date link = URL(soup.select_one("a[id=downloadButton]").get("href")) filename, ext = get_filename_and_ext(link.name) @@ -115,7 +115,7 @@ async def file(self, scrape_item: ScrapeItem) -> None: """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" date = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S") return calendar.timegm(date.timetuple()) diff --git a/cyberdrop_dl/scraper/crawlers/nekohouse_crawler.py b/cyberdrop_dl/scraper/crawlers/nekohouse_crawler.py index 707086c45..d35c04fc3 100644 --- a/cyberdrop_dl/scraper/crawlers/nekohouse_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/nekohouse_crawler.py @@ -69,7 +69,7 @@ async def profile(self, scrape_item: ScrapeItem) -> None: """Scrapes a profile.""" soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item) offset, maximum_offset = await self.get_offsets(scrape_item, soup) - service, user = await self.get_service_and_user(scrape_item) + service, user = self.get_service_and_user(scrape_item) user_str = await self.get_user_str_from_profile(soup) service_call = self.primary_base_domain / service / "user" / user while offset <= maximum_offset: @@ -94,7 +94,7 @@ async def profile(self, scrape_item: ScrapeItem) -> None: continue post_link = self.primary_base_domain / post_url # Call on self.post to scrape the post by creating a new scrape item - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, post_link, "", @@ -270,13 +270,13 @@ async def create_new_scrape_item( post_title = post_id + " - " + post_title new_title = self.create_title(user, None, None) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( old_scrape_item, link, new_title, True, None, - await self.parse_datetime(date), + self.parse_datetime(date), add_parent=add_parent, ) new_scrape_item.add_to_parent_title(post_title) @@ -320,7 +320,7 @@ async def get_user_str_from_profile(self, soup: BeautifulSoup) -> str: return soup.select_one("span[itemprop=name]").text @staticmethod - async def get_service_and_user(scrape_item: ScrapeItem) -> tuple[str, str]: + def get_service_and_user(scrape_item: ScrapeItem) -> tuple[str, str]: """Gets the service and user from a scrape item.""" user = scrape_item.url.parts[3] service = scrape_item.url.parts[1] @@ -335,7 +335,7 @@ async def get_service_user_and_post(scrape_item: ScrapeItem) -> tuple[str, str, return service, user, post @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" try: date = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S") diff --git a/cyberdrop_dl/scraper/crawlers/nudostar_crawler.py b/cyberdrop_dl/scraper/crawlers/nudostar_crawler.py index cca047449..2fe0c746a 100644 --- a/cyberdrop_dl/scraper/crawlers/nudostar_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/nudostar_crawler.py @@ -117,7 
+117,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: if scrape_post: date = int(post.select_one(self.post_date_selector).get(self.post_date_attribute)) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, thread_url, title, @@ -160,7 +160,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: async def post(self, scrape_item: ScrapeItem, post_content: Tag, post_number: int) -> None: """Scrapes a post.""" if self.manager.config_manager.settings_data["Download_Options"]["separate_posts"]: - scrape_item = await self.create_scrape_item(scrape_item, scrape_item.url, "") + scrape_item = self.create_scrape_item(scrape_item, scrape_item.url, "") scrape_item.add_to_parent_title("post-" + str(post_number)) scrape_item.type = FORUM_POST @@ -202,7 +202,7 @@ async def links(self, scrape_item: ScrapeItem, post_content: Tag) -> int: try: if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ -238,7 +238,7 @@ async def images(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ -265,7 +265,7 @@ async def videos(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = "https:" + link link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -300,7 +300,7 @@ async def embeds(self, scrape_item: ScrapeItem, post_content: Tag) -> int: if link.endswith("/"): link = link[:-1] link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -327,7 +327,7 @@ async def attachments(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ -345,5 +345,5 @@ async def handle_internal_links(self, link: URL, scrape_item: ScrapeItem) -> Non """Handles internal links.""" temp_link = URL(str(link)[:-1]) if str(link).endswith("/") else link filename, ext = get_filename_and_ext(temp_link.name, True) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "Attachments", True) + new_scrape_item = self.create_scrape_item(scrape_item, link, "Attachments", True) await self.handle_file(link, new_scrape_item, filename, ext) diff --git 
a/cyberdrop_dl/scraper/crawlers/nudostartv_crawler.py b/cyberdrop_dl/scraper/crawlers/nudostartv_crawler.py index bb3be0b0e..d49a5a7c5 100644 --- a/cyberdrop_dl/scraper/crawlers/nudostartv_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/nudostartv_crawler.py @@ -41,12 +41,12 @@ async def profile(self, scrape_item: ScrapeItem) -> None: content = soup.select("div[id=list_videos_common_videos_list_items] div a") for page in content: link = URL(page.get("href")) - new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) await self.image(new_scrape_item) next_page = soup.select_one("li[class=next] a") if next_page: link = URL(next_page.get("href")) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") self.manager.task_group.create_task(self.run(new_scrape_item)) @error_handling_wrapper diff --git a/cyberdrop_dl/scraper/crawlers/omegascans_crawler.py b/cyberdrop_dl/scraper/crawlers/omegascans_crawler.py index 267d11c6d..83569073b 100644 --- a/cyberdrop_dl/scraper/crawlers/omegascans_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/omegascans_crawler.py @@ -77,7 +77,7 @@ async def series(self, scrape_item: ScrapeItem) -> None: for chapter in JSON_Obj["data"]: chapter_url = scrape_item.url / chapter["chapter_slug"] - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, chapter_url, "", @@ -112,13 +112,13 @@ async def chapter(self, scrape_item: ScrapeItem) -> None: date = soup.select('h2[class="font-semibold font-sans text-muted-foreground text-xs"]')[-1].get_text() try: - date = await self.parse_datetime_standard(date) + date = self.parse_datetime_standard(date) except ValueError: scripts = soup.select("script") for script in scripts: if "created" in script.get_text(): date = script.get_text().split('created_at\\":\\"')[1].split(".")[0] - date = await self.parse_datetime_other(date) + date = self.parse_datetime_other(date) break scrape_item.possible_datetime = date @@ -146,13 +146,13 @@ async def handle_direct_link(self, scrape_item: ScrapeItem) -> None: """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" @staticmethod - async def parse_datetime_standard(date: str) -> int: + def parse_datetime_standard(date: str) -> int: """Parses a datetime string into a unix timestamp.""" date = datetime.datetime.strptime(date, "%m/%d/%Y") return calendar.timegm(date.timetuple()) @staticmethod - async def parse_datetime_other(date: str) -> int: + def parse_datetime_other(date: str) -> int: """Parses a datetime string into a unix timestamp.""" date = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S") return calendar.timegm(date.timetuple()) diff --git a/cyberdrop_dl/scraper/crawlers/pimpandhost_crawler.py b/cyberdrop_dl/scraper/crawlers/pimpandhost_crawler.py index 6ec24c0b2..56389af07 100644 --- a/cyberdrop_dl/scraper/crawlers/pimpandhost_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/pimpandhost_crawler.py @@ -60,12 +60,12 @@ async def album(self, scrape_item: ScrapeItem) -> None: None, ) date = soup.select_one("span[class=date-time]").get("title") - date = await self.parse_datetime(date) + date = self.parse_datetime(date) files = soup.select('a[class*="image-wrapper center-cropped im-wr"]') for file in files: link = URL(file.get("href")) - new_scrape_item 
= await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -84,7 +84,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: next_page = next_page.get("href") if next_page.startswith("/"): next_page = URL("https://pimpandhost.com" + next_page) - new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "", True, None, date) + new_scrape_item = self.create_scrape_item(scrape_item, next_page, "", True, None, date) self.manager.task_group.create_task(self.run(new_scrape_item)) @error_handling_wrapper @@ -98,16 +98,16 @@ async def image(self, scrape_item: ScrapeItem) -> None: link = URL("https:" + link) if link.startswith("//") else URL(link) date = soup.select_one("span[class=date-time]").get("title") - date = await self.parse_datetime(date) + date = self.parse_datetime(date) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "", True, None, date) + new_scrape_item = self.create_scrape_item(scrape_item, link, "", True, None, date) filename, ext = get_filename_and_ext(link.name) await self.handle_file(link, new_scrape_item, filename, ext) """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" date = datetime.strptime(date, "%A, %B %d, %Y %I:%M:%S%p %Z") return calendar.timegm(date.timetuple()) diff --git a/cyberdrop_dl/scraper/crawlers/pixeldrain_crawler.py b/cyberdrop_dl/scraper/crawlers/pixeldrain_crawler.py index 241915983..9e8383f98 100644 --- a/cyberdrop_dl/scraper/crawlers/pixeldrain_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/pixeldrain_crawler.py @@ -62,7 +62,7 @@ async def folder(self, scrape_item: ScrapeItem) -> None: for file in JSON_Resp["files"]: link = await self.create_download_link(file["id"]) - date = await self.parse_datetime(file["date_upload"].replace("T", " ").split(".")[0].strip("Z")) + date = self.parse_datetime(file["date_upload"].replace("T", " ").split(".")[0].strip("Z")) try: filename, ext = get_filename_and_ext(file["name"]) except NoExtensionError: @@ -70,7 +70,7 @@ async def folder(self, scrape_item: ScrapeItem) -> None: filename, ext = get_filename_and_ext(file["name"] + "." 
+ file["mime_type"].split("/")[-1]) else: raise NoExtensionError from None - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -96,7 +96,7 @@ async def file(self, scrape_item: ScrapeItem) -> None: ) link = await self.create_download_link(JSON_Resp["id"]) - date = await self.parse_datetime(JSON_Resp["date_upload"].replace("T", " ").split(".")[0]) + date = self.parse_datetime(JSON_Resp["date_upload"].replace("T", " ").split(".")[0]) filename = ext = None try: filename, ext = get_filename_and_ext(JSON_Resp["name"]) @@ -112,7 +112,7 @@ async def file(self, scrape_item: ScrapeItem) -> None: lines = text.split("\n") for line in lines: link = URL(line) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, "", @@ -128,7 +128,7 @@ async def file(self, scrape_item: ScrapeItem) -> None: ) else: raise NoExtensionError from None - new_scrape_item = await self.create_scrape_item(scrape_item, link, "", False, None, date) + new_scrape_item = self.create_scrape_item(scrape_item, link, "", False, None, date) await self.handle_file(link, new_scrape_item, filename, ext) """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" @@ -138,7 +138,7 @@ async def create_download_link(self, file_id: str) -> URL: return (self.api_address / "file" / file_id).with_query("download") @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" try: date = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S") diff --git a/cyberdrop_dl/scraper/crawlers/postimg_crawler.py b/cyberdrop_dl/scraper/crawlers/postimg_crawler.py index c168d7d15..6a5718c30 100644 --- a/cyberdrop_dl/scraper/crawlers/postimg_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/postimg_crawler.py @@ -63,7 +63,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: for image in JSON_Resp["images"]: link = URL(image[4]) filename, ext = image[2], image[3] - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, diff --git a/cyberdrop_dl/scraper/crawlers/realbooru_crawler.py b/cyberdrop_dl/scraper/crawlers/realbooru_crawler.py index 78768935d..1435ddab0 100644 --- a/cyberdrop_dl/scraper/crawlers/realbooru_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/realbooru_crawler.py @@ -40,7 +40,7 @@ async def fetch(self, scrape_item: ScrapeItem) -> None: await self.file(scrape_item) else: log(f"Scrape Failed: Unknown URL Path for {scrape_item.url}", 40) - await self.manager.progress_manager.scrape_stats_progress.add_failure("Unsupported Link") + self.manager.progress_manager.scrape_stats_progress.add_failure("Unsupported Link") self.scraping_progress.remove_task(task_id) @@ -66,7 +66,7 @@ async def tag(self, scrape_item: ScrapeItem) -> None: if link.startswith("/"): link = f"{self.primary_base_url}{link}" link = URL(link, encoded=True) - new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) self.manager.task_group.create_task(self.run(new_scrape_item)) scrape_item.children += 1 if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit: @@ -77,7 +77,7 @@ async def tag(self, scrape_item: ScrapeItem) -> None: next_page = next_page.get("href") 
if next_page is not None: next_page = scrape_item.url.with_query(next_page[1:]) if next_page.startswith("?") else URL(next_page) - new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "") + new_scrape_item = self.create_scrape_item(scrape_item, next_page, "") self.manager.task_group.create_task(self.run(new_scrape_item)) @error_handling_wrapper diff --git a/cyberdrop_dl/scraper/crawlers/realdebrid_crawler.py b/cyberdrop_dl/scraper/crawlers/realdebrid_crawler.py index fa55a83a7..13142171e 100644 --- a/cyberdrop_dl/scraper/crawlers/realdebrid_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/realdebrid_crawler.py @@ -28,7 +28,7 @@ async def fetch(self, scrape_item: ScrapeItem) -> None: task_id = self.scraping_progress.add_task(scrape_item.url) scrape_item.url = await self.get_original_url(scrape_item) - if await self.manager.real_debrid_manager.is_supported_folder(scrape_item.url): + if self.manager.real_debrid_manager.is_supported_folder(scrape_item.url): await self.folder(scrape_item) else: await self.file(scrape_item) @@ -40,7 +40,7 @@ async def folder(self, scrape_item: ScrapeItem) -> None: """Scrapes a folder.""" original_url = scrape_item.url log(f"scraping folder with RealDebrid: {original_url}", 10) - folder_id = await self.manager.real_debrid_manager.guess_folder(original_url) + folder_id = self.manager.real_debrid_manager.guess_folder(original_url) scrape_item.album_id = folder_id scrape_item.part_of_album = True @@ -48,10 +48,10 @@ async def folder(self, scrape_item: ScrapeItem) -> None: scrape_item.add_to_parent_title(title) async with self.request_limiter: - links = await self.manager.real_debrid_manager.unrestrict_folder(original_url) + links = self.manager.real_debrid_manager.unrestrict_folder(original_url) for link in links: - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, "", @@ -77,7 +77,7 @@ async def file(self, scrape_item: ScrapeItem) -> None: scrape_item.part_of_album = True scrape_item.add_to_parent_title(title) async with self.request_limiter: - debrid_url = await self.manager.real_debrid_manager.unrestrict_link(original_url, password) + debrid_url = self.manager.real_debrid_manager.unrestrict_link(original_url, password) if await self.check_complete_from_referer(debrid_url): return @@ -102,7 +102,9 @@ def is_self_hosted(self, url: URL) -> bool: return any(subdomain in url.host for subdomain in ("download.", "my.")) and self.domain in url.host async def get_original_url(self, scrape_item: ScrapeItem) -> URL: - if self.is_self_hosted(scrape_item.url): + log(f"Input URL: {scrape_item.url}") + if not self.is_self_hosted(scrape_item.url) or self.domain not in scrape_item.url.host: + log(f"Parsed URL: {scrape_item.url}") return scrape_item.url parts_dict = {"parts": [], "query": [], "frag": []} @@ -118,9 +120,11 @@ async def get_original_url(self, scrape_item: ScrapeItem) -> URL: path = "/".join(parts_dict["parts"]) query = MultiDict() - for i in range(0, parts_dict["query"], 2): + for i in range(0, len(parts_dict["query"]), 2): query[parts_dict[i]] = parts_dict[i + 1] frag = parts_dict["frag"] if parts_dict["frag"] else None - return URL(f"https://{original_domain}").with_path(path).with_query(query).with_fragment(frag) + parsed_url = URL(f"https://{original_domain}").with_path(path).with_query(query).with_fragment(frag) + log(f"Parsed URL: {parsed_url}") + return parsed_url diff --git a/cyberdrop_dl/scraper/crawlers/reddit_crawler.py b/cyberdrop_dl/scraper/crawlers/reddit_crawler.py index 
f32f1eb40..14a5e1ec0 100644 --- a/cyberdrop_dl/scraper/crawlers/reddit_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/reddit_crawler.py @@ -1,7 +1,8 @@ from __future__ import annotations import contextlib -from typing import TYPE_CHECKING, AsyncIterator +from collections.abc import AsyncIterator +from typing import TYPE_CHECKING import aiohttp import asyncpraw @@ -38,7 +39,7 @@ async def fetch(self, scrape_item: ScrapeItem) -> None: if not self.reddit_personal_use_script or not self.reddit_secret: log("Reddit API credentials not found. Skipping.", 30) - await self.manager.progress_manager.scrape_stats_progress.add_failure("Failed Login") + self.manager.progress_manager.scrape_stats_progress.add_failure("Failed Login") self.scraping_progress.remove_task(task_id) return @@ -59,7 +60,7 @@ async def fetch(self, scrape_item: ScrapeItem) -> None: await self.media(scrape_item, reddit) else: log(f"Scrape Failed: Unknown URL Path for {scrape_item.url}", 40) - await self.manager.progress_manager.scrape_stats_progress.add_failure("Unknown") + self.manager.progress_manager.scrape_stats_progress.add_failure("Unknown") self.scraping_progress.remove_task(task_id) @@ -214,7 +215,7 @@ async def create_new_scrape_item( add_parent: URL | None = None, ) -> ScrapeItem: """Creates a new scrape item with the same parent as the old scrape item.""" - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( old_scrape_item, link, "", diff --git a/cyberdrop_dl/scraper/crawlers/redgifs_crawler.py b/cyberdrop_dl/scraper/crawlers/redgifs_crawler.py index 3b7f93ab9..094f9356f 100644 --- a/cyberdrop_dl/scraper/crawlers/redgifs_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/redgifs_crawler.py @@ -76,7 +76,7 @@ async def user(self, scrape_item: ScrapeItem) -> None: link = URL(links["sd"]) filename, ext = get_filename_and_ext(link.name) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, @@ -110,7 +110,7 @@ async def post(self, scrape_item: ScrapeItem) -> None: link = URL(links["hd"] if "hd" in links else links["sd"]) filename, ext = get_filename_and_ext(link.name) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, link, title, diff --git a/cyberdrop_dl/scraper/crawlers/rule34vault_crawler.py b/cyberdrop_dl/scraper/crawlers/rule34vault_crawler.py index 77c753e05..1b6a49825 100644 --- a/cyberdrop_dl/scraper/crawlers/rule34vault_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/rule34vault_crawler.py @@ -63,7 +63,7 @@ async def tag(self, scrape_item: ScrapeItem) -> None: if link.startswith("/"): link = f"{self.primary_base_url}{link}" link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) self.manager.task_group.create_task(self.run(new_scrape_item)) if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit: raise MaxChildrenError(origin=scrape_item) @@ -78,7 +78,7 @@ async def tag(self, scrape_item: ScrapeItem) -> None: ) else: next_page = scrape_item.url.with_path(f"/{scrape_item.url.parts[1]}?page=2", encoded=True) - new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "") + new_scrape_item = self.create_scrape_item(scrape_item, next_page, "") self.manager.task_group.create_task(self.run(new_scrape_item)) @error_handling_wrapper @@ -107,7 +107,7 
@@ async def playlist(self, scrape_item: ScrapeItem) -> None: if link.startswith("/"): link = f"{self.primary_base_url}{link}" link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) self.manager.task_group.create_task(self.run(new_scrape_item)) if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit: raise MaxChildrenError(origin=scrape_item) @@ -119,7 +119,7 @@ async def playlist(self, scrape_item: ScrapeItem) -> None: next_page = scrape_item.url.with_query({"page": int(page) + 1}) else: next_page = scrape_item.url.with_query({"page": 2}) - new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "") + new_scrape_item = self.create_scrape_item(scrape_item, next_page, "") self.manager.task_group.create_task(self.run(new_scrape_item)) @error_handling_wrapper @@ -128,7 +128,7 @@ async def file(self, scrape_item: ScrapeItem) -> None: async with self.request_limiter: soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item) - date = await self.parse_datetime( + date = self.parse_datetime( soup.select_one('div[class="posted-date-full text-secondary mt-4 ng-star-inserted"]').text, ) scrape_item.date = date @@ -159,7 +159,7 @@ async def file(self, scrape_item: ScrapeItem) -> None: """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" date = datetime.datetime.strptime(date, "%b %d, %Y, %I:%M:%S %p") return calendar.timegm(date.timetuple()) diff --git a/cyberdrop_dl/scraper/crawlers/rule34xxx_crawler.py b/cyberdrop_dl/scraper/crawlers/rule34xxx_crawler.py index 5db456140..fc1701981 100644 --- a/cyberdrop_dl/scraper/crawlers/rule34xxx_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/rule34xxx_crawler.py @@ -40,7 +40,7 @@ async def fetch(self, scrape_item: ScrapeItem) -> None: await self.file(scrape_item) else: log(f"Scrape Failed: Unknown URL Path for {scrape_item.url}", 40) - await self.manager.progress_manager.scrape_stats_progress.add_failure("Unsupported Link") + self.manager.progress_manager.scrape_stats_progress.add_failure("Unsupported Link") self.scraping_progress.remove_task(task_id) @@ -68,7 +68,7 @@ async def tag(self, scrape_item: ScrapeItem) -> None: if link.startswith("/"): link = f"{self.primary_base_url}{link}" link = URL(link, encoded=True) - new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) self.manager.task_group.create_task(self.run(new_scrape_item)) if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit: raise MaxChildrenError(origin=scrape_item) @@ -78,7 +78,7 @@ async def tag(self, scrape_item: ScrapeItem) -> None: next_page = next_page.get("href") if next_page is not None: next_page = scrape_item.url.with_query(next_page[1:]) if next_page.startswith("?") else URL(next_page) - new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "") + new_scrape_item = self.create_scrape_item(scrape_item, next_page, "") self.manager.task_group.create_task(self.run(new_scrape_item)) @error_handling_wrapper diff --git 
a/cyberdrop_dl/scraper/crawlers/rule34xyz_crawler.py b/cyberdrop_dl/scraper/crawlers/rule34xyz_crawler.py index 2b1353689..82a2f0e31 100644 --- a/cyberdrop_dl/scraper/crawlers/rule34xyz_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/rule34xyz_crawler.py @@ -62,7 +62,7 @@ async def tag(self, scrape_item: ScrapeItem) -> None: if link.startswith("/"): link = f"{self.primary_base_url}{link}" link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) self.manager.task_group.create_task(self.run(new_scrape_item)) if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit: raise MaxChildrenError(origin=scrape_item) @@ -74,7 +74,7 @@ async def tag(self, scrape_item: ScrapeItem) -> None: next_page = scrape_item.url.with_path(f"/{scrape_item.url.parts[1]}/page/{page + 1}") else: next_page = scrape_item.url.with_path(f"/{scrape_item.url.parts[1]}/page/2") - new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "") + new_scrape_item = self.create_scrape_item(scrape_item, next_page, "") self.manager.task_group.create_task(self.run(new_scrape_item)) @error_handling_wrapper @@ -83,7 +83,7 @@ async def file(self, scrape_item: ScrapeItem) -> None: async with self.request_limiter: soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item) - date = await self.parse_datetime( + date = self.parse_datetime( soup.select_one('div[class="posted ng-star-inserted"]').text.split("(")[1].split(")")[0], ) scrape_item.date = date @@ -108,7 +108,7 @@ async def file(self, scrape_item: ScrapeItem) -> None: """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix timestamp.""" date = datetime.datetime.strptime(date, "%b %d, %Y, %I:%M:%S %p") return calendar.timegm(date.timetuple()) diff --git a/cyberdrop_dl/scraper/crawlers/scrolller_crawler.py b/cyberdrop_dl/scraper/crawlers/scrolller_crawler.py index eb64e0d8d..5021f396d 100644 --- a/cyberdrop_dl/scraper/crawlers/scrolller_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/scrolller_crawler.py @@ -34,7 +34,7 @@ async def fetch(self, scrape_item: ScrapeItem) -> None: await self.subreddit(scrape_item) else: log(f"Scrape Failed: Unknown URL Path for {scrape_item.url}", 40) - await self.manager.progress_manager.scrape_stats_progress.add_failure("Unsupported Link") + self.manager.progress_manager.scrape_stats_progress.add_failure("Unsupported Link") self.scraping_progress.remove_task(task_id) diff --git a/cyberdrop_dl/scraper/crawlers/simpcity_crawler.py b/cyberdrop_dl/scraper/crawlers/simpcity_crawler.py index 5825e07c0..7913e384f 100644 --- a/cyberdrop_dl/scraper/crawlers/simpcity_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/simpcity_crawler.py @@ -122,7 +122,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: date = None with contextlib.suppress(AttributeError): date = int(post.select_one(self.post_date_selector).get(self.post_date_attribute)) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, thread_url, title, @@ -165,7 +165,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: async def post(self, scrape_item: ScrapeItem, post_content: Tag, post_number: 
int) -> None: """Scrapes a post.""" if self.manager.config_manager.settings_data["Download_Options"]["separate_posts"]: - scrape_item = await self.create_scrape_item(scrape_item, scrape_item.url, "") + scrape_item = self.create_scrape_item(scrape_item, scrape_item.url, "") scrape_item.add_to_parent_title("post-" + str(post_number)) scrape_item.type = FORUM_POST @@ -210,7 +210,7 @@ async def links(self, scrape_item: ScrapeItem, post_content: Tag) -> int: try: if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ -248,7 +248,7 @@ async def images(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts: continue @@ -277,7 +277,7 @@ async def videos(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = "https:" + link link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -312,7 +312,7 @@ async def embeds(self, scrape_item: ScrapeItem, post_content: Tag) -> int: if link.endswith("/"): link = link[:-1] link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -342,7 +342,7 @@ async def attachments(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ -359,5 +359,5 @@ async def attachments(self, scrape_item: ScrapeItem, post_content: Tag) -> int: async def handle_internal_links(self, link: URL, scrape_item: ScrapeItem) -> None: """Handles internal links.""" filename, ext = get_filename_and_ext(link.name, True) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "Attachments", True) + new_scrape_item = self.create_scrape_item(scrape_item, link, "Attachments", True) await self.handle_file(link, new_scrape_item, filename, ext) diff --git a/cyberdrop_dl/scraper/crawlers/socialmediagirls_crawler.py b/cyberdrop_dl/scraper/crawlers/socialmediagirls_crawler.py index da6449fab..eb4a722d9 100644 --- a/cyberdrop_dl/scraper/crawlers/socialmediagirls_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/socialmediagirls_crawler.py @@ -119,7 +119,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: if scrape_post: date = int(post.select_one(self.post_date_selector).get(self.post_date_attribute)) - new_scrape_item = await self.create_scrape_item( + 
new_scrape_item = self.create_scrape_item( scrape_item, thread_url, title, @@ -161,7 +161,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: async def post(self, scrape_item: ScrapeItem, post_content: Tag, post_number: int) -> None: """Scrapes a post.""" if self.manager.config_manager.settings_data["Download_Options"]["separate_posts"]: - scrape_item = await self.create_scrape_item(scrape_item, scrape_item.url, "") + scrape_item = self.create_scrape_item(scrape_item, scrape_item.url, "") scrape_item.add_to_parent_title("post-" + str(post_number)) scrape_item.type = FORUM_POST @@ -209,7 +209,7 @@ async def links(self, scrape_item: ScrapeItem, post_content: Tag) -> int: return new_children try: if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts or "smgmedia" in link.host: await self.handle_internal_links(link, scrape_item) @@ -248,7 +248,7 @@ async def images(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts or "smgmedia" in link.host: await self.handle_internal_links(link, scrape_item) @@ -278,7 +278,7 @@ async def videos(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = "https:" + link link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -313,7 +313,7 @@ async def embeds(self, scrape_item: ScrapeItem, post_content: Tag) -> int: if link.endswith("/"): link = link[:-1] link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -343,7 +343,7 @@ async def attachments(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts or "smgmedia" in link.host: await self.handle_internal_links(link, scrape_item) @@ -360,7 +360,7 @@ async def attachments(self, scrape_item: ScrapeItem, post_content: Tag) -> int: async def handle_internal_links(self, link: URL, scrape_item: ScrapeItem) -> None: """Handles internal links.""" filename, ext = get_filename_and_ext(link.name, True) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "Attachments", True) + new_scrape_item = self.create_scrape_item(scrape_item, link, "Attachments", True) await self.handle_file(link, new_scrape_item, filename, ext) @error_handling_wrapper diff --git a/cyberdrop_dl/scraper/crawlers/tokyomotion_crawler.py b/cyberdrop_dl/scraper/crawlers/tokyomotion_crawler.py index 
2137377a4..157509563 100644 --- a/cyberdrop_dl/scraper/crawlers/tokyomotion_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/tokyomotion_crawler.py @@ -2,8 +2,9 @@ import re from calendar import timegm +from collections.abc import AsyncGenerator from datetime import datetime, timedelta -from typing import TYPE_CHECKING, AsyncGenerator +from typing import TYPE_CHECKING from aiolimiter import AsyncLimiter from multidict import MultiDict @@ -125,7 +126,7 @@ async def albums(self, scrape_item: ScrapeItem) -> None: link = self.primary_base_domain / link[1:] link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "albums", add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, "albums", add_parent=scrape_item.url) await self.album(new_scrape_item) @error_handling_wrapper @@ -199,9 +200,9 @@ async def profile(self, scrape_item: ScrapeItem) -> None: new_parts = ["albums", "favorite/photos", "videos", "favorite/videos"] scrapers = [self.albums, self.album, self.playlist, self.playlist] - for part, scraper in zip(new_parts, scrapers): + for part, scraper in zip(new_parts, scrapers, strict=False): link = scrape_item.url / part - new_scrape_item = await self.create_scrape_item(scrape_item, link, "", add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, "", add_parent=scrape_item.url) await scraper(new_scrape_item) @error_handling_wrapper @@ -235,7 +236,7 @@ async def search(self, scrape_item: ScrapeItem) -> None: link = self.primary_base_domain / link[1:] link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "", add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, "", add_parent=scrape_item.url) await scraper(new_scrape_item) @error_handling_wrapper @@ -270,7 +271,7 @@ async def playlist(self, scrape_item: ScrapeItem) -> None: link = self.primary_base_domain / link[1:] link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "", add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, "", add_parent=scrape_item.url) await self.video(new_scrape_item) async def web_pager(self, url: URL) -> AsyncGenerator[BeautifulSoup]: diff --git a/cyberdrop_dl/scraper/crawlers/toonily_crawler.py b/cyberdrop_dl/scraper/crawlers/toonily_crawler.py index 2ed8df63a..30f22a2d1 100644 --- a/cyberdrop_dl/scraper/crawlers/toonily_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/toonily_crawler.py @@ -65,7 +65,7 @@ async def series(self, scrape_item: ScrapeItem) -> None: chapter_path = self.primary_base_domain / chapter_path[1:] else: chapter_path = URL(chapter_path) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, chapter_path, "", @@ -102,7 +102,7 @@ async def chapter(self, scrape_item: ScrapeItem) -> None: for script in scripts: if "datePublished" in script.get_text(): date = script.get_text().split('datePublished":"')[1].split("+")[0] - date = await self.parse_datetime(date) + date = self.parse_datetime(date) break scrape_item.possible_datetime = date if date else scrape_item.possible_datetime @@ -130,7 +130,7 @@ async def handle_direct_link(self, scrape_item: ScrapeItem) -> None: """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" @staticmethod - async def parse_datetime(date: str) -> int: + def parse_datetime(date: str) -> int: """Parses a datetime string into a unix 
timestamp.""" date = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S") return calendar.timegm(date.timetuple()) diff --git a/cyberdrop_dl/scraper/crawlers/xbunker_crawler.py b/cyberdrop_dl/scraper/crawlers/xbunker_crawler.py index b06eb5058..1787090b3 100644 --- a/cyberdrop_dl/scraper/crawlers/xbunker_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/xbunker_crawler.py @@ -121,7 +121,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: if scrape_post: date = int(post.select_one(self.post_date_selector).get(self.post_date_attribute)) - new_scrape_item = await self.create_scrape_item( + new_scrape_item = self.create_scrape_item( scrape_item, thread_url, title, @@ -164,7 +164,7 @@ async def forum(self, scrape_item: ScrapeItem) -> None: async def post(self, scrape_item: ScrapeItem, post_content: Tag, post_number: int) -> None: """Scrapes a post.""" if self.manager.config_manager.settings_data["Download_Options"]["separate_posts"]: - scrape_item = await self.create_scrape_item(scrape_item, scrape_item.url, "") + scrape_item = self.create_scrape_item(scrape_item, scrape_item.url, "") scrape_item.add_to_parent_title("post-" + str(post_number)) scrape_item.type = FORUM_POST @@ -208,7 +208,7 @@ async def links(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) try: if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts or self.extra_attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ -246,7 +246,7 @@ async def images(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts or self.extra_attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ -276,7 +276,7 @@ async def videos(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = "https:" + link link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -311,7 +311,7 @@ async def embeds(self, scrape_item: ScrapeItem, post_content: Tag) -> int: if link.endswith("/"): link = link[:-1] link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) new_children += 1 if scrape_item.children_limit and (new_children + scrape_item.children) >= scrape_item.children_limit: @@ -341,7 +341,7 @@ async def attachments(self, scrape_item: ScrapeItem, post_content: Tag) -> int: link = URL(link) if self.domain not in link.host: - new_scrape_item = await self.create_scrape_item(scrape_item, link, "") + new_scrape_item = self.create_scrape_item(scrape_item, link, "") await self.handle_external_links(new_scrape_item) elif self.attachment_url_part in link.parts or self.extra_attachment_url_part in link.parts: await self.handle_internal_links(link, scrape_item) @@ 
-358,5 +358,5 @@ async def attachments(self, scrape_item: ScrapeItem, post_content: Tag) -> int: async def handle_internal_links(self, link: URL, scrape_item: ScrapeItem) -> None: """Handles internal links.""" filename, ext = get_filename_and_ext(link.name, True) - new_scrape_item = await self.create_scrape_item(scrape_item, link, "Attachments", True) + new_scrape_item = self.create_scrape_item(scrape_item, link, "Attachments", True) await self.handle_file(link, new_scrape_item, filename, ext) diff --git a/cyberdrop_dl/scraper/crawlers/xbunkr_crawler.py b/cyberdrop_dl/scraper/crawlers/xbunkr_crawler.py index 862ef903a..4d5134b71 100644 --- a/cyberdrop_dl/scraper/crawlers/xbunkr_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/xbunkr_crawler.py @@ -64,7 +64,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: except NoExtensionError: log(f"Couldn't get extension for {link!s}", 30) continue - new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, title, True, add_parent=scrape_item.url) await self.handle_file(link, new_scrape_item, filename, ext) scrape_item.children += 1 if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit: diff --git a/cyberdrop_dl/scraper/crawlers/xxxbunker_crawler.py b/cyberdrop_dl/scraper/crawlers/xxxbunker_crawler.py index 97cd2fee5..f5f273953 100644 --- a/cyberdrop_dl/scraper/crawlers/xxxbunker_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/xxxbunker_crawler.py @@ -3,8 +3,9 @@ import asyncio import re from calendar import timegm +from collections.abc import AsyncGenerator from datetime import datetime, timedelta -from typing import TYPE_CHECKING, AsyncGenerator +from typing import TYPE_CHECKING from aiolimiter import AsyncLimiter from bs4 import BeautifulSoup @@ -157,7 +158,7 @@ async def playlist(self, scrape_item: ScrapeItem) -> None: link = self.primary_base_domain / link[1:] link = URL(link) - new_scrape_item = await self.create_scrape_item(scrape_item, link, title, add_parent=scrape_item.url) + new_scrape_item = self.create_scrape_item(scrape_item, link, title, add_parent=scrape_item.url) await self.video(new_scrape_item) async def web_pager(self, url: URL) -> AsyncGenerator[BeautifulSoup]: diff --git a/cyberdrop_dl/scraper/filters.py b/cyberdrop_dl/scraper/filters.py index 19e650ef8..bb1e5d2e5 100644 --- a/cyberdrop_dl/scraper/filters.py +++ b/cyberdrop_dl/scraper/filters.py @@ -61,8 +61,8 @@ def has_valid_extension(url: URL) -> bool: """Checks if the URL has a valid extension.""" try: _, ext = get_filename_and_ext(url.name) - valid_exts = FILE_FORMATS["Images"] | FILE_FORMATS["Videos"] | FILE_FORMATS["Audio"] - return ext in valid_exts except NoExtensionError: return False + else: + return ext in valid_exts diff --git a/cyberdrop_dl/scraper/scraper.py b/cyberdrop_dl/scraper/scraper.py index fbd6b2a96..495a205ab 100644 --- a/cyberdrop_dl/scraper/scraper.py +++ b/cyberdrop_dl/scraper/scraper.py @@ -580,13 +580,15 @@ async def send_to_crawler(self, scrape_item: ScrapeItem) -> None: log(f"Failed to send {scrape_item.url} to JDownloader\n{e.message}", 40) await self.manager.log_manager.write_unsupported_urls_log( scrape_item.url, - next(scrape_item.parents, None), + scrape_item.parents[0] if scrape_item.parents else None, ) self.manager.progress_manager.scrape_stats_progress.add_unsupported(sent_to_jdownloader=success) return log(f"Unsupported URL: {scrape_item.url}", 30) - await 
self.manager.log_manager.write_unsupported_urls_log(scrape_item.url, next(scrape_item.parents, None)) + await self.manager.log_manager.write_unsupported_urls_log( + scrape_item.url, scrape_item.parents[0] if scrape_item.parents else None + ) self.manager.progress_manager.scrape_stats_progress.add_unsupported() def filter_items(self, scrape_item: ScrapeItem) -> bool: diff --git a/cyberdrop_dl/utils/database/tables/hash_table.py b/cyberdrop_dl/utils/database/tables/hash_table.py index 2cfc82678..f0d59dcb1 100644 --- a/cyberdrop_dl/utils/database/tables/hash_table.py +++ b/cyberdrop_dl/utils/database/tables/hash_table.py @@ -128,7 +128,6 @@ async def insert_or_update_hash_db(self, hash_value: str, file: str, original_fi (hash_value, file_size, download_filename, folder, original_filename, referer), ) await self.db_conn.commit() - return True except IntegrityError as _: # Handle potential duplicate key (assuming a unique constraint on hash, filename, and folder) await cursor.execute( @@ -150,10 +149,10 @@ async def insert_or_update_hash_db(self, hash_value: str, file: str, original_fi ), ) await self.db_conn.commit() - return True except Exception as e: console.print(f"Error inserting/updating record: {e}") return False + return True async def get_all_unique_hashes(self) -> list: """Retrieves a list of (folder, filename) tuples based on a given hash. diff --git a/cyberdrop_dl/utils/utilities.py b/cyberdrop_dl/utils/utilities.py index 3ba2054a0..8b836fe4e 100644 --- a/cyberdrop_dl/utils/utilities.py +++ b/cyberdrop_dl/utils/utilities.py @@ -1,12 +1,12 @@ from __future__ import annotations -import asyncio import contextlib import os import re +from collections.abc import Callable from functools import wraps from pathlib import Path -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING import aiofiles import apprise @@ -42,7 +42,7 @@ async def wrapper(self: Crawler, *args, **kwargs): except RealDebridError as e: log_message_short = log_message = f"RealDebridError - {e.error}" e_ui_failure = f"RD - {e.error}" - except asyncio.TimeoutError: + except TimeoutError: log_message_short = log_message = e_ui_failure = "Timeout" except Exception as e: # noqa exc_info = e @@ -55,7 +55,7 @@ async def wrapper(self: Crawler, *args, **kwargs): log(f"Scrape Failed: {link} ({log_message})", 40, exc_info=exc_info) await self.manager.log_manager.write_scrape_error_log(link, log_message_short, origin) - await self.manager.progress_manager.scrape_stats_progress.add_failure(e_ui_failure) + self.manager.progress_manager.scrape_stats_progress.add_failure(e_ui_failure) return None return wrapper
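
The pattern behind most of the hunks above: helpers such as parse_datetime and create_scrape_item never await anything, so they are declared as plain `def` (or called as synchronous manager methods) and every call site drops the `await`; AsyncGenerator, AsyncIterator and Callable move from `typing` to `collections.abc` (the `typing` aliases are deprecated); `asyncio.TimeoutError` becomes the built-in `TimeoutError` it has been aliased to since Python 3.11; and `next(scrape_item.parents, None)` is replaced by indexing because `parents` is a list, not an iterator. The sketch below only illustrates that before/after shape; the class, data, and method bodies are invented for the example and are not part of the cyberdrop_dl codebase.

# A minimal, self-contained sketch of the refactor this commit applies across the
# crawlers. Illustrative only: ExampleCrawler and its data are made up and do not
# exist in cyberdrop_dl.
from __future__ import annotations

import asyncio
import calendar
from collections.abc import AsyncGenerator  # previously imported from `typing`
from datetime import datetime


class ExampleCrawler:
    @staticmethod
    def parse_datetime(date: str) -> int:
        """Parses a datetime string into a unix timestamp.

        The helper never awaits anything, so it is a plain function; before the fix
        it was declared `async def` and had to be awaited for no benefit.
        """
        parsed = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
        return calendar.timegm(parsed.timetuple())

    async def web_pager(self) -> AsyncGenerator[str]:
        """Async generator annotated with collections.abc.AsyncGenerator."""
        for page in ("2024-01-01 00:00:00", "2024-01-02 12:30:00"):
            yield page

    async def scrape(self) -> list[int]:
        """Call site after the fix: no `await` in front of the synchronous helper."""
        timestamps = []
        async for page in self.web_pager():
            timestamps.append(self.parse_datetime(page))  # was: await self.parse_datetime(page)
        return timestamps


if __name__ == "__main__":
    print(asyncio.run(ExampleCrawler().scrape()))

Running the sketch prints the two unix timestamps, which is the same observable behavior as the pre-fix async version; the change removes needless coroutine objects rather than altering results, which is why the hunks above are almost entirely one-line `await` removals and `async def` to `def` signature changes.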