From e5d91b2b398f1dd2b8b6c715ff28d9e93353a6a5 Mon Sep 17 00:00:00 2001 From: Jules-WinnfieldX Date: Sat, 16 Dec 2023 14:48:06 -0700 Subject: [PATCH] Revert and put out last 3.10 update. --- cyberdrop_dl/__init__.py | 2 +- cyberdrop_dl/downloader/downloader.py | 70 ++++-- cyberdrop_dl/main.py | 40 +-- cyberdrop_dl/managers/client_manager.py | 3 +- cyberdrop_dl/managers/download_manager.py | 36 ++- cyberdrop_dl/managers/manager.py | 9 +- cyberdrop_dl/scraper/crawler.py | 64 +++-- .../scraper/crawlers/bunkrr_crawler.py | 4 +- .../scraper/crawlers/coomer_crawler.py | 2 +- .../scraper/crawlers/cyberdrop_crawler.py | 2 +- .../scraper/crawlers/cyberfile_crawler.py | 4 +- .../scraper/crawlers/ehentai_crawler.py | 4 +- .../scraper/crawlers/erome_crawler.py | 4 +- .../scraper/crawlers/fapello_crawler.py | 7 +- .../scraper/crawlers/gofile_crawler.py | 2 +- .../scraper/crawlers/imageban_crawler.py | 4 +- .../scraper/crawlers/imgbb_crawler.py | 4 +- .../scraper/crawlers/imgkiwi_crawler.py | 2 +- .../scraper/crawlers/jpgchurch_crawler.py | 8 +- .../scraper/crawlers/kemono_crawler.py | 2 +- .../scraper/crawlers/mediafire_crawler.py | 2 +- .../scraper/crawlers/nudostartv_crawler.py | 2 +- .../scraper/crawlers/omegascans_crawler.py | 2 +- .../scraper/crawlers/pimpandhost_crawler.py | 4 +- .../scraper/crawlers/rule34xxx_crawler.py | 4 +- .../scraper/crawlers/rule34xyz_crawler.py | 4 +- .../scraper/crawlers/toonily_crawler.py | 2 +- cyberdrop_dl/scraper/scraper.py | 229 ++++++++++-------- cyberdrop_dl/ui/progress/file_progress.py | 5 +- cyberdrop_dl/ui/progress/scraping_progress.py | 6 +- cyberdrop_dl/utils/utilities.py | 11 +- pyproject.toml | 2 +- 32 files changed, 328 insertions(+), 218 deletions(-) diff --git a/cyberdrop_dl/__init__.py b/cyberdrop_dl/__init__.py index a9c316e20..56a2a5faa 100644 --- a/cyberdrop_dl/__init__.py +++ b/cyberdrop_dl/__init__.py @@ -1 +1 @@ -__version__ = "5.1.1" +__version__ = "5.0.124" diff --git a/cyberdrop_dl/downloader/downloader.py b/cyberdrop_dl/downloader/downloader.py index 773d9b3e7..754d6da17 100644 --- a/cyberdrop_dl/downloader/downloader.py +++ b/cyberdrop_dl/downloader/downloader.py @@ -18,6 +18,7 @@ from cyberdrop_dl.utils.utilities import CustomHTTPStatus, FILE_FORMATS, log if TYPE_CHECKING: + from asyncio import Queue from typing import Tuple from cyberdrop_dl.clients.download_client import DownloadClient @@ -83,46 +84,63 @@ def __init__(self, manager: Manager, domain: str): self.manager: Manager = manager self.domain: str = domain + self.complete = True + self.client: DownloadClient = field(init=False) + self.download_queue: Queue = field(init=False) self._file_lock = manager.download_manager.file_lock - self._semaphore: asyncio.Semaphore = field(init=False) - self._additional_headers = {} - self.processed_items: list = [] - self.waiting_items = 0 + self._unfinished_count = 0 self._current_attempt_filesize = {} + self._lock = asyncio.Lock() + + self.processed_items: list = [] + async def startup(self) -> None: """Starts the downloader""" + self.download_queue = await self.manager.queue_manager.get_download_queue(self.domain) self.client = self.manager.client_manager.downloader_session await self.set_additional_headers() - self._semaphore = asyncio.Semaphore(await self.manager.download_manager.get_download_limit(self.domain)) - async def run(self, media_item: MediaItem) -> None: + async def run_loop(self) -> None: """Runs the download loop""" - self.waiting_items += 1 - media_item.current_attempt = 0 - - await self._semaphore.acquire() - self.waiting_items -= 
1 - if not (media_item.url.path in self.processed_items): - self.processed_items.append(media_item.url.path) - await self.manager.progress_manager.download_progress.update_total() - - await log(f"Download Starting: {media_item.url}") - async with self.manager.client_manager.download_session_limit: - try: - await self.download(media_item) - except Exception as e: - await log(f"Download Failed: {media_item.url} with error {e}") - await log(traceback.format_exc()) - await self.manager.progress_manager.download_stats_progress.add_failure("Unknown") - await self.manager.progress_manager.download_progress.add_failed() - else: + while True: + media_item: MediaItem = await self.download_queue.get() + self.complete = False + self._unfinished_count += 1 + media_item.current_attempt = 0 + + await self._lock.acquire() + if not (media_item.url.path in self.processed_items): + self.processed_items.append(media_item.url.path) + self._lock.release() + await self.manager.progress_manager.download_progress.update_total() + + await log(f"Download Starting: {media_item.url}") + async with self.manager.client_manager.download_session_limit: + try: + await self.download(media_item) + except Exception as e: + await log(f"Download Failed: {media_item.url} with error {e}") + await log(traceback.format_exc()) + await self.manager.progress_manager.download_stats_progress.add_failure("Unknown") + await self.manager.progress_manager.download_progress.add_failed() + self._unfinished_count -= 1 + self.download_queue.task_done() + if self._unfinished_count == 0 and self.download_queue.empty(): + self.complete = True + continue + await log(f"Download Finished: {media_item.url}") - self._semaphore.release() + else: + self._lock.release() + self.download_queue.task_done() + self._unfinished_count -= 1 + if self._unfinished_count == 0 and self.download_queue.empty(): + self.complete = True """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" diff --git a/cyberdrop_dl/main.py b/cyberdrop_dl/main.py index 104171d3b..b0b6c9ae2 100644 --- a/cyberdrop_dl/main.py +++ b/cyberdrop_dl/main.py @@ -10,7 +10,6 @@ from cyberdrop_dl.managers.manager import Manager from cyberdrop_dl.scraper.scraper import ScrapeMapper from cyberdrop_dl.ui.ui import program_ui -from cyberdrop_dl.utils.backports.taskgroups import TaskGroup from cyberdrop_dl.utils.sorting import Sorter from cyberdrop_dl.utils.utilities import check_latest_pypi, log_with_color, check_partials_and_empty_folders, log @@ -39,11 +38,21 @@ def startup() -> Manager: async def runtime(manager: Manager) -> None: """Main runtime loop for the program, this will run until all scraping and downloading is complete""" scrape_mapper = ScrapeMapper(manager) + download_manager = manager.download_manager + asyncio.create_task(scrape_mapper.map_urls()) - # NEW CODE - async with TaskGroup() as task_group: - manager.task_group = task_group - await scrape_mapper.start() + if not manager.args_manager.retry: + await scrape_mapper.load_links() + else: + await scrape_mapper.load_failed_links() + + # Check completion + await asyncio.sleep(1) + while True: + scraper_complete = await scrape_mapper.check_complete() + downloader_complete = await download_manager.check_complete() + if scraper_complete and downloader_complete: + break async def director(manager: Manager) -> None: @@ -82,6 +91,9 @@ async def director(manager: Manager) -> None: try: with Live(manager.progress_manager.layout, refresh_per_second=10): await runtime(manager) + except 
(KeyboardInterrupt, SystemExit): + print("\nExiting...") + exit(1) except Exception as e: print("\nAn error occurred, please report this to the developer") print(e) @@ -119,19 +131,11 @@ async def director(manager: Manager) -> None: def main(): manager = startup() - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - with contextlib.suppress(RuntimeError): - try: - asyncio.run(director(manager)) - except KeyboardInterrupt: - print("\nTrying to Exit...") - try: - asyncio.run(manager.close()) - except Exception: - pass - exit(1) - sys.exit(0) + with contextlib.suppress(RuntimeError, asyncio.CancelledError): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + asyncio.run(director(manager)) + sys.exit(0) if __name__ == '__main__': diff --git a/cyberdrop_dl/managers/client_manager.py b/cyberdrop_dl/managers/client_manager.py index 2591cd13c..b27ebaff4 100644 --- a/cyberdrop_dl/managers/client_manager.py +++ b/cyberdrop_dl/managers/client_manager.py @@ -9,6 +9,7 @@ import certifi from aiohttp import ClientResponse from aiolimiter import AsyncLimiter +from multidict import CIMultiDictProxy from yarl import URL from cyberdrop_dl.clients.download_client import DownloadClient @@ -40,8 +41,6 @@ def __init__(self, manager: Manager): self.domain_rate_limits = { "bunkrr": AsyncLimiter(5, 1), "cyberdrop": AsyncLimiter(5, 1), - "coomer": AsyncLimiter(10, 1), - "kemono": AsyncLimiter(10, 1), "pixeldrain": AsyncLimiter(10, 1), "other": AsyncLimiter(25, 1) } diff --git a/cyberdrop_dl/managers/download_manager.py b/cyberdrop_dl/managers/download_manager.py index 1ce274103..08995190f 100644 --- a/cyberdrop_dl/managers/download_manager.py +++ b/cyberdrop_dl/managers/download_manager.py @@ -5,6 +5,7 @@ from base64 import b64encode from typing import TYPE_CHECKING +from cyberdrop_dl.downloader.downloader import Downloader from cyberdrop_dl.utils.utilities import FILE_FORMATS if TYPE_CHECKING: @@ -39,10 +40,33 @@ class DownloadManager: def __init__(self, manager: Manager): self.manager = manager self._download_instances: Dict = {} + self._download_instance_tasks: Dict = {} self.file_lock = FileLock() - self.download_limits = {'bunkr': 1, 'bunkrr': 1, 'cyberdrop': 1, 'coomer': 2, 'cyberfile': 2, 'kemono': 2, "pixeldrain": 2} + self.download_limits = {'bunkr': 1, 'bunkrr': 1, 'cyberdrop': 1, 'coomer': 8, 'cyberfile': 2, 'kemono': 8, "pixeldrain": 2} + + async def check_complete(self) -> bool: + """Checks if all download instances are complete""" + if not self._download_instances: + return True + + keys = list(self._download_instances.keys()) + for key in keys: + await self._download_instances[key].download_queue.join() + + await asyncio.sleep(1) + keys = list(self._download_instances.keys()) + for key in keys: + if not self._download_instances[key].download_queue.empty() or not self._download_instances[key].complete: + return False + return True + + async def close(self) -> None: + """Closes all download instances""" + for downloader in self._download_instance_tasks.values(): + for task in downloader: + task.cancel() async def get_download_limit(self, key: str) -> int: """Returns the download limit for a domain""" @@ -55,6 +79,16 @@ async def get_download_limit(self, key: str) -> int: instances = self.manager.config_manager.global_settings_data['Rate_Limiting_Options']['max_simultaneous_downloads_per_domain'] return instances + async def get_download_instance(self, key: str) -> Downloader: + """Returns a download instance""" + if key not in self._download_instances: + 
self._download_instances[key] = Downloader(self.manager, key) + await self._download_instances[key].startup() + self._download_instance_tasks[key] = [] + for i in range(await self.get_download_limit(key)): + self._download_instance_tasks[key].append(asyncio.create_task(self._download_instances[key].run_loop())) + return self._download_instances[key] + async def basic_auth(self, username, password) -> str: """Returns a basic auth token""" token = b64encode(f"{username}:{password}".encode('utf-8')).decode("ascii") diff --git a/cyberdrop_dl/managers/manager.py b/cyberdrop_dl/managers/manager.py index 767b951fe..ac3506421 100644 --- a/cyberdrop_dl/managers/manager.py +++ b/cyberdrop_dl/managers/manager.py @@ -1,6 +1,7 @@ import copy import json from dataclasses import field +from pathlib import Path from cyberdrop_dl import __version__ from cyberdrop_dl.managers.args_manager import ArgsManager @@ -12,8 +13,8 @@ from cyberdrop_dl.managers.log_manager import LogManager from cyberdrop_dl.managers.path_manager import PathManager from cyberdrop_dl.managers.progress_manager import ProgressManager +from cyberdrop_dl.managers.queue_manager import QueueManager from cyberdrop_dl.utils.args import config_definitions -from cyberdrop_dl.utils.backports.taskgroups import TaskGroup from cyberdrop_dl.utils.dataclasses.supported_domains import SupportedDomains from cyberdrop_dl.utils.transfer.first_time_setup import TransitionManager from cyberdrop_dl.utils.utilities import log @@ -26,6 +27,7 @@ def __init__(self): self.path_manager: PathManager = field(init=False) self.config_manager: ConfigManager = field(init=False) self.log_manager: LogManager = field(init=False) + self.queue_manager: QueueManager = QueueManager(self) self.db_manager: DBManager = field(init=False) self.client_manager: ClientManager = field(init=False) self.download_manager: DownloadManager = field(init=False) @@ -36,10 +38,6 @@ def __init__(self): self._loaded_args_config: bool = False self._made_portable: bool = False - self.task_group: TaskGroup = field(init=False) - self.task_list: list = [] - self.scrape_mapper = field(init=False) - def startup(self) -> None: """Startup process for the manager""" self.args_startup() @@ -169,3 +167,4 @@ async def args_logging(self) -> None: async def close(self) -> None: """Closes the manager""" await self.db_manager.close() + await self.download_manager.close() diff --git a/cyberdrop_dl/scraper/crawler.py b/cyberdrop_dl/scraper/crawler.py index 5e86b3653..99289788f 100644 --- a/cyberdrop_dl/scraper/crawler.py +++ b/cyberdrop_dl/scraper/crawler.py @@ -10,11 +10,12 @@ from yarl import URL from cyberdrop_dl.clients.errors import FailedLoginFailure -from cyberdrop_dl.downloader.downloader import Downloader from cyberdrop_dl.utils.dataclasses.url_objects import MediaItem, ScrapeItem from cyberdrop_dl.utils.utilities import log, get_download_path, remove_id, error_handling_wrapper if TYPE_CHECKING: + from asyncio import Queue + from cyberdrop_dl.clients.scraper_client import ScraperClient from cyberdrop_dl.managers.manager import Manager @@ -22,38 +23,54 @@ class Crawler(ABC): def __init__(self, manager: Manager, domain: str, folder_domain: str): self.manager = manager - self.downloader = field(init=False) self.scraping_progress = manager.progress_manager.scraping_progress self.client: ScraperClient = field(init=False) - self._lock = asyncio.Lock() self.domain = domain self.folder_domain = folder_domain + self.complete = False self.logged_in = field(init=False) self.scraped_items: list = [] - 
self.waiting_items = 0 + self.scraper_queue: Queue = field(init=False) + self.download_queue: Queue = field(init=False) + + self._lock = asyncio.Lock() async def startup(self) -> None: """Starts the crawler""" + self.scraper_queue = await self.manager.queue_manager.get_scraper_queue(self.domain) + self.download_queue = await self.manager.queue_manager.get_download_queue(self.domain) + self.client = self.manager.client_manager.scraper_session - self.downloader = Downloader(self.manager, self.domain) - await self.downloader.startup() - async def run(self, item: ScrapeItem) -> None: + async def finish_task(self) -> None: + self.scraper_queue.task_done() + if self.scraper_queue.empty(): + self.complete = True + + async def run_loop(self) -> None: """Runs the crawler loop""" - self.waiting_items += 1 - await self._lock.acquire() - self.waiting_items -= 1 - if item.url.path not in self.scraped_items: - await log(f"Scrape Starting: {item.url}") - self.scraped_items.append(item.url.path) - await self.fetch(item) - await log(f"Scrape Finished: {item.url}") - else: - await log(f"Skipping {item.url} as it has already been scraped") - self._lock.release() + while True: + item: ScrapeItem = await self.scraper_queue.get() + self.complete = False + + await self._lock.acquire() + if item.url.path not in self.scraped_items: + await log(f"Scrape Starting: {item.url}") + self.scraped_items.append(item.url.path) + self._lock.release() + await self.fetch(item) + await log(f"Scrape Finished: {item.url}") + else: + self._lock.release() + await log(f"Skipping {item.url} as it has already been scraped") + + await self.finish_task() + + if self.scraper_queue.empty(): + self.complete = True """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" @@ -63,7 +80,7 @@ async def fetch(self, scrape_item: ScrapeItem) -> None: raise NotImplementedError("Must override in child class") async def handle_file(self, url: URL, scrape_item: ScrapeItem, filename: str, ext: str) -> None: - """Finishes handling the file and hands it off to the downloader""" + """Finishes handling the file and hands it off to the download_queue""" if self.domain in ['cyberdrop', 'bunkrr']: original_filename, filename = await remove_id(self.manager, filename, ext) else: @@ -80,16 +97,17 @@ async def handle_file(self, url: URL, scrape_item: ScrapeItem, filename: str, ex if scrape_item.possible_datetime: media_item.datetime = scrape_item.possible_datetime + await self.download_queue.put(media_item) + + # if domains download limit is 1 join the queue if await self.manager.download_manager.get_download_limit(self.domain) == 1: - await self.downloader.run(media_item) - else: - self.manager.task_group.create_task(self.downloader.run(media_item)) + await self.download_queue.join() """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" async def handle_external_links(self, scrape_item: ScrapeItem) -> None: """Maps external links to the scraper class""" - self.manager.task_group.create_task(self.manager.scrape_mapper.map_url(scrape_item)) + await self.manager.queue_manager.url_objects_to_map.put(scrape_item) @error_handling_wrapper async def forum_login(self, login_url: URL, session_cookie: str, username: str, password: str, wait_time: int = 0) -> None: diff --git a/cyberdrop_dl/scraper/crawlers/bunkrr_crawler.py b/cyberdrop_dl/scraper/crawlers/bunkrr_crawler.py index 635f57136..3abf98e00 100644 --- 
a/cyberdrop_dl/scraper/crawlers/bunkrr_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/bunkrr_crawler.py @@ -65,7 +65,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: link = URL("https://" + scrape_item.url.host + link) link = URL(link) link = await self.get_stream_link(link) - self.manager.task_group.create_task(self.run(ScrapeItem(link, scrape_item.parent_title, True, date))) + await self.scraper_queue.put(ScrapeItem(link, scrape_item.parent_title, True, date)) @error_handling_wrapper async def video(self, scrape_item: ScrapeItem) -> None: @@ -144,4 +144,4 @@ async def set_cookies(self): if self.manager.config_manager.authentication_data['DDOS-Guard']['bunkrr_ddgid']: self.client.client_manager.cookies.update_cookies({"__ddgid_": self.manager.config_manager.authentication_data['DDOS-Guard']['bunkrr_ddgid']}, response_url=self.ddos_guard_domain) - self.cookies_set = True + self.cookies_set = True \ No newline at end of file diff --git a/cyberdrop_dl/scraper/crawlers/coomer_crawler.py b/cyberdrop_dl/scraper/crawlers/coomer_crawler.py index 6d67e337a..cf6f0c468 100644 --- a/cyberdrop_dl/scraper/crawlers/coomer_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/coomer_crawler.py @@ -151,7 +151,7 @@ async def create_new_scrape_item(self, link: URL, old_scrape_item: ScrapeItem, u new_title = await self.create_title(user, None, None) new_scrape_item = await self.create_scrape_item(old_scrape_item, link, new_title, True, await self.parse_datetime(date)) await new_scrape_item.add_to_parent_title(post_title) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" diff --git a/cyberdrop_dl/scraper/crawlers/cyberdrop_crawler.py b/cyberdrop_dl/scraper/crawlers/cyberdrop_crawler.py index d4c31ecd0..9dc124aa7 100644 --- a/cyberdrop_dl/scraper/crawlers/cyberdrop_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/cyberdrop_crawler.py @@ -53,7 +53,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: link = URL(link) new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True, date) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) @error_handling_wrapper async def file(self, scrape_item: ScrapeItem) -> None: diff --git a/cyberdrop_dl/scraper/crawlers/cyberfile_crawler.py b/cyberdrop_dl/scraper/crawlers/cyberfile_crawler.py index 0256e6be6..4cf3dfeaf 100644 --- a/cyberdrop_dl/scraper/crawlers/cyberfile_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/cyberfile_crawler.py @@ -71,7 +71,7 @@ async def folder(self, scrape_item: ScrapeItem) -> None: continue new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) page += 1 if page >= num_pages: @@ -106,7 +106,7 @@ async def shared(self, scrape_item: ScrapeItem) -> None: continue new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) page += 1 if page >= num_pages: diff --git a/cyberdrop_dl/scraper/crawlers/ehentai_crawler.py b/cyberdrop_dl/scraper/crawlers/ehentai_crawler.py index db72e5ce3..d43b15376 100644 --- a/cyberdrop_dl/scraper/crawlers/ehentai_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/ehentai_crawler.py @@ -49,7 +49,7 
@@ async def album(self, scrape_item: ScrapeItem) -> None: for image in images: link = URL(image.get('href')) new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True, date) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) next_page_opts = soup.select('td[onclick="document.location=this.firstChild.href"]') next_page = None @@ -61,7 +61,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: next_page = URL(next_page.get('href')) if next_page is not None: new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "") - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) @error_handling_wrapper async def image(self, scrape_item: ScrapeItem) -> None: diff --git a/cyberdrop_dl/scraper/crawlers/erome_crawler.py b/cyberdrop_dl/scraper/crawlers/erome_crawler.py index 3441de4d9..4fcdde50f 100644 --- a/cyberdrop_dl/scraper/crawlers/erome_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/erome_crawler.py @@ -43,13 +43,13 @@ async def profile(self, scrape_item: ScrapeItem) -> None: for album in albums: link = URL(album['href']) new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) next_page = soup.select_one('a[rel="next"]') if next_page: next_page = next_page.get("href").split("page=")[-1] new_scrape_item = await self.create_scrape_item(scrape_item, scrape_item.url.with_query(f"page={next_page}"), "") - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) @error_handling_wrapper async def album(self, scrape_item: ScrapeItem) -> None: diff --git a/cyberdrop_dl/scraper/crawlers/fapello_crawler.py b/cyberdrop_dl/scraper/crawlers/fapello_crawler.py index 55148ed36..49660c572 100644 --- a/cyberdrop_dl/scraper/crawlers/fapello_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/fapello_crawler.py @@ -50,18 +50,17 @@ async def profile(self, scrape_item: ScrapeItem) -> None: video_tag = post.select_one('iframe') video_link = URL(video_tag.get('src')) new_scrape_item = await self.create_scrape_item(scrape_item, video_link, "", True) - await self.handle_external_links(new_scrape_item) + await self.manager.queue_manager.url_objects_to_map.put(new_scrape_item) else: link = URL(post.get('href')) new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True) - await self.handle_external_links(new_scrape_item) + await self.scraper_queue.put(new_scrape_item) next_page = soup.select_one('div[id="next_page"] a') if next_page: next_page = next_page.get('href') if next_page: - new_scrape_item = ScrapeItem(URL(next_page), scrape_item.parent_title) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(ScrapeItem(URL(next_page), scrape_item.parent_title)) @error_handling_wrapper async def post(self, scrape_item: ScrapeItem) -> None: diff --git a/cyberdrop_dl/scraper/crawlers/gofile_crawler.py b/cyberdrop_dl/scraper/crawlers/gofile_crawler.py index 68606bb0b..c0d0af409 100644 --- a/cyberdrop_dl/scraper/crawlers/gofile_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/gofile_crawler.py @@ -73,7 +73,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: content = contents[content_id] if content["type"] == "folder": new_scrape_item = await self.create_scrape_item(scrape_item, URL(content["name"]), title, True) - 
self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) continue if content["link"] == "overloaded": link = URL(content["directLink"]) diff --git a/cyberdrop_dl/scraper/crawlers/imageban_crawler.py b/cyberdrop_dl/scraper/crawlers/imageban_crawler.py index fba89029e..d8c5b4498 100644 --- a/cyberdrop_dl/scraper/crawlers/imageban_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/imageban_crawler.py @@ -59,7 +59,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: link = URL(link_path) new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) next_page = soup.select_one('a[class*="page-link next"]') if next_page: @@ -69,7 +69,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: else: link = URL(link_path) new_scrape_item = await self.create_scrape_item(scrape_item, link, "", True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) @error_handling_wrapper async def compilation(self, scrape_item: ScrapeItem) -> None: diff --git a/cyberdrop_dl/scraper/crawlers/imgbb_crawler.py b/cyberdrop_dl/scraper/crawlers/imgbb_crawler.py index 59c25db67..584e15acc 100644 --- a/cyberdrop_dl/scraper/crawlers/imgbb_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/imgbb_crawler.py @@ -50,7 +50,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: for album in albums: sub_album_link = URL(album.get('href')) new_scrape_item = await self.create_scrape_item(scrape_item, sub_album_link, title, True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) async with self.request_limiter: soup = await self.client.get_BS4(self.domain, scrape_item.url / "sub") @@ -63,7 +63,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: for link in links: link = URL(link.get('href')) new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) link_next = soup.select_one('a[data-pagination=next]') if link_next is not None: diff --git a/cyberdrop_dl/scraper/crawlers/imgkiwi_crawler.py b/cyberdrop_dl/scraper/crawlers/imgkiwi_crawler.py index 546ceb217..d1f6466ba 100644 --- a/cyberdrop_dl/scraper/crawlers/imgkiwi_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/imgkiwi_crawler.py @@ -52,7 +52,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: for link in links: link = URL(link.get('href')) new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) link_next = soup.select_one('a[data-pagination=next]') if link_next is not None: diff --git a/cyberdrop_dl/scraper/crawlers/jpgchurch_crawler.py b/cyberdrop_dl/scraper/crawlers/jpgchurch_crawler.py index b6781fd80..39afaecb8 100644 --- a/cyberdrop_dl/scraper/crawlers/jpgchurch_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/jpgchurch_crawler.py @@ -10,7 +10,7 @@ from cyberdrop_dl.scraper.crawler import Crawler from cyberdrop_dl.utils.dataclasses.url_objects import ScrapeItem -from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_filename_and_ext +from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_filename_and_ext, FILE_FORMATS if TYPE_CHECKING: from 
cyberdrop_dl.managers.manager import Manager @@ -57,7 +57,7 @@ async def profile(self, scrape_item: ScrapeItem) -> None: for link in links: link = URL(link.get('href')) new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) link_next = soup.select_one('a[data-pagination=next]') if link_next is not None: @@ -80,7 +80,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: for album in albums: sub_album_link = URL(album.get('href')) new_scrape_item = await self.create_scrape_item(scrape_item, sub_album_link, title, True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) async with self.request_limiter: soup = await self.client.get_BS4(self.domain, scrape_item.url / "sub") @@ -93,7 +93,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: for link in links: link = URL(link.get('href')) new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) link_next = soup.select_one('a[data-pagination=next]') if link_next is not None: diff --git a/cyberdrop_dl/scraper/crawlers/kemono_crawler.py b/cyberdrop_dl/scraper/crawlers/kemono_crawler.py index a28271f51..3cc2fbca7 100644 --- a/cyberdrop_dl/scraper/crawlers/kemono_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/kemono_crawler.py @@ -166,7 +166,7 @@ async def create_new_scrape_item(self, link: URL, old_scrape_item: ScrapeItem, u new_title = await self.create_title(user, None, None) new_scrape_item = await self.create_scrape_item(old_scrape_item, link, new_title, True, await self.parse_datetime(date)) await new_scrape_item.add_to_parent_title(post_title) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" diff --git a/cyberdrop_dl/scraper/crawlers/mediafire_crawler.py b/cyberdrop_dl/scraper/crawlers/mediafire_crawler.py index 9b695a2df..886526bfa 100644 --- a/cyberdrop_dl/scraper/crawlers/mediafire_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/mediafire_crawler.py @@ -53,7 +53,7 @@ async def folder(self, scrape_item: ScrapeItem) -> None: date = await self.parse_datetime(file['created']) link = URL(file['links']['normal_download']) new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True, date) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) if folder_contents["folder_content"]["more_chunks"] == "yes": chunk += 1 diff --git a/cyberdrop_dl/scraper/crawlers/nudostartv_crawler.py b/cyberdrop_dl/scraper/crawlers/nudostartv_crawler.py index f7c0b6af0..c4f7beca7 100644 --- a/cyberdrop_dl/scraper/crawlers/nudostartv_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/nudostartv_crawler.py @@ -45,7 +45,7 @@ async def profile(self, scrape_item: ScrapeItem) -> None: if next_page: link = URL(next_page.get('href')) new_scrape_item = await self.create_scrape_item(scrape_item, link, "") - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) @error_handling_wrapper async def image(self, scrape_item: ScrapeItem) -> None: diff --git a/cyberdrop_dl/scraper/crawlers/omegascans_crawler.py 
b/cyberdrop_dl/scraper/crawlers/omegascans_crawler.py index c459409e5..8990a6761 100644 --- a/cyberdrop_dl/scraper/crawlers/omegascans_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/omegascans_crawler.py @@ -51,7 +51,7 @@ async def series(self, scrape_item: ScrapeItem) -> None: else: chapter_path = URL(chapter_path) new_scrape_item = await self.create_scrape_item(scrape_item, chapter_path, "", True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) @error_handling_wrapper async def chapter(self, scrape_item: ScrapeItem) -> None: diff --git a/cyberdrop_dl/scraper/crawlers/pimpandhost_crawler.py b/cyberdrop_dl/scraper/crawlers/pimpandhost_crawler.py index ff2078e13..78bb7f23b 100644 --- a/cyberdrop_dl/scraper/crawlers/pimpandhost_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/pimpandhost_crawler.py @@ -47,7 +47,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: for file in files: link = URL(file.get("href")) new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True, date) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) next_page = soup.select_one("li[class=next] a") if next_page: @@ -55,7 +55,7 @@ async def album(self, scrape_item: ScrapeItem) -> None: if next_page.startswith("/"): next_page = URL("https://pimpandhost.com" + next_page) new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "", True, date) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) @error_handling_wrapper async def image(self, scrape_item: ScrapeItem) -> None: diff --git a/cyberdrop_dl/scraper/crawlers/rule34xxx_crawler.py b/cyberdrop_dl/scraper/crawlers/rule34xxx_crawler.py index 19222ba18..9cfcf8df6 100644 --- a/cyberdrop_dl/scraper/crawlers/rule34xxx_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/rule34xxx_crawler.py @@ -55,7 +55,7 @@ async def tag(self, scrape_item: ScrapeItem) -> None: link = f"{self.primary_base_url}{link}" link = URL(link, encoded=True) new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) next_page = soup.select_one("a[alt=next]") if next_page is not None: @@ -66,7 +66,7 @@ async def tag(self, scrape_item: ScrapeItem) -> None: else: next_page = URL(next_page) new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "") - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) @error_handling_wrapper async def file(self, scrape_item: ScrapeItem) -> None: diff --git a/cyberdrop_dl/scraper/crawlers/rule34xyz_crawler.py b/cyberdrop_dl/scraper/crawlers/rule34xyz_crawler.py index 2eedd1cc2..cd73900f3 100644 --- a/cyberdrop_dl/scraper/crawlers/rule34xyz_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/rule34xyz_crawler.py @@ -50,7 +50,7 @@ async def tag(self, scrape_item: ScrapeItem) -> None: link = f"{self.primary_base_url}{link}" link = URL(link) new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) if not content: return @@ -60,7 +60,7 @@ async def tag(self, scrape_item: ScrapeItem) -> None: else: next_page = scrape_item.url.with_path(f"/{scrape_item.url.parts[1]}/page/2") new_scrape_item = await 
self.create_scrape_item(scrape_item, next_page, "") - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) @error_handling_wrapper async def file(self, scrape_item: ScrapeItem) -> None: diff --git a/cyberdrop_dl/scraper/crawlers/toonily_crawler.py b/cyberdrop_dl/scraper/crawlers/toonily_crawler.py index 4c5d61d2b..45623bb12 100644 --- a/cyberdrop_dl/scraper/crawlers/toonily_crawler.py +++ b/cyberdrop_dl/scraper/crawlers/toonily_crawler.py @@ -52,7 +52,7 @@ async def series(self, scrape_item: ScrapeItem) -> None: else: chapter_path = URL(chapter_path) new_scrape_item = await self.create_scrape_item(scrape_item, chapter_path, "", True) - self.manager.task_group.create_task(self.run(new_scrape_item)) + await self.scraper_queue.put(new_scrape_item) @error_handling_wrapper async def chapter(self, scrape_item: ScrapeItem) -> None: diff --git a/cyberdrop_dl/scraper/scraper.py b/cyberdrop_dl/scraper/scraper.py index fdb6e4b72..adf7c5b33 100644 --- a/cyberdrop_dl/scraper/scraper.py +++ b/cyberdrop_dl/scraper/scraper.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio import re from dataclasses import Field from pathlib import Path @@ -9,7 +10,6 @@ from yarl import URL from cyberdrop_dl.clients.errors import NoExtensionFailure, JDownloaderFailure -from cyberdrop_dl.downloader.downloader import Downloader from cyberdrop_dl.scraper.jdownloader import JDownloader from cyberdrop_dl.utils.dataclasses.url_objects import ScrapeItem, MediaItem from cyberdrop_dl.utils.utilities import log, get_filename_and_ext, get_download_path @@ -23,7 +23,6 @@ class ScrapeMapper: """This class maps links to their respective handlers, or JDownloader if they are unsupported""" def __init__(self, manager: Manager): - self.manager = manager self.mapping = {"bunkrr": self.bunkrr, "celebforum": self.celebforum, "coomer": self.coomer, "cyberdrop": self.cyberdrop, "cyberfile": self.cyberfile, "e-hentai": self.ehentai, "erome": self.erome, "fapello": self.fapello, "f95zone": self.f95zone, "gofile": self.gofile, @@ -39,10 +38,28 @@ def __init__(self, manager: Manager): "rule34.xyz": self.rule34xyz, "saint": self.saint, "scrolller": self.scrolller, "simpcity": self.simpcity, "socialmediagirls": self.socialmediagirls, "toonily": self.toonily, "xbunker": self.xbunker, "xbunkr": self.xbunkr, "bunkr": self.bunkrr} + self.download_mapping = {"bunkrr": "bunkrr", "celebforum": "celebforum", "coomer": "coomer", + "cyberdrop": "cyberdrop", "cyberfile": "cyberfile", "e-hentai": "e-hentai", + "erome": "erome", "fapello": "fapello", "f95zone": "f95zone", "gofile": "gofile", + "hotpic": "hotpic", "ibb.co": "imgbb", "imageban": "imageban", "imgbox": "imgbox", + "imgur": "imgur", "img.kiwi": "img.kiwi", "jpg.church": "jpg.church", + "jpg.homes": "jpg.church", "jpg.fish": "jpg.church", "jpg.fishing": "jpg.church", + "jpg.pet": "jpg.church", "jpeg.pet": "jpg.church", "jpg1.su": "jpg.church", + "jpg2.su": "jpg.church", "jpg3.su": "jpg.church", "kemono": "kemono", + "leakedmodels": "leakedmodels", "mediafire": "mediafire", + "nudostar.com": "nudostar.com", "nudostar.tv": "nudostartv", + "omegascans": "omegascans", "pimpandhost": "pimpandhost", "pixeldrain": "pixeldrain", + "postimg": "postimg", "reddit": "reddit", "redd.it": "reddit", "redgifs": "redgifs", + "rule34.xxx": "rule34.xxx", "rule34.xyz": "rule34.xyz", "saint": "saint", + "scrolller": "scrolller", "simpcity": "simpcity", + "socialmediagirls": "socialmediagirls", "toonily": "toonily", "xbunker": "xbunker", + 
"xbunkr": "xbunkr", "bunkr": "bunkrr"} self.existing_crawlers = {} - self.no_crawler_downloader = Downloader(self.manager, "no_crawler") + self.manager = manager self.jdownloader = JDownloader(self.manager) + self.complete = False + async def bunkrr(self) -> None: """Creates a Bunkr Crawler instance""" from cyberdrop_dl.scraper.crawlers.bunkrr_crawler import BunkrrCrawler @@ -240,35 +257,6 @@ async def xbunkr(self) -> None: """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" - async def start_scrapers(self) -> None: - """Starts all scrapers""" - for key in self.mapping: - await self.mapping[key]() - crawler = self.existing_crawlers[key] - await crawler.startup() - - async def start_jdownloader(self) -> None: - """Starts JDownloader""" - if self.jdownloader.enabled: - if isinstance(self.jdownloader.jdownloader_agent, Field): - await self.jdownloader.jdownloader_setup() - - async def start(self) -> None: - """Starts the orchestra""" - self.manager.scrape_mapper = self - - await self.start_scrapers() - await self.start_jdownloader() - - await self.no_crawler_downloader.startup() - - if not self.manager.args_manager.retry: - await self.load_links() - else: - await self.load_failed_links() - - """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" - async def regex_links(self, line: str) -> List: """Regex grab the links from the URLs.txt file This allows code blocks or full paragraphs to be copy and pasted into the URLs.txt""" @@ -302,7 +290,7 @@ async def load_links(self) -> None: await log("No valid links found.") for link in links: item = ScrapeItem(url=link, parent_title="") - self.manager.task_group.create_task(self.map_url(item)) + await self.manager.queue_manager.url_objects_to_map.put(item) async def load_failed_links(self) -> None: """Loads failed links from db""" @@ -312,9 +300,7 @@ async def load_failed_links(self) -> None: retry_path = Path(item[3]) item = ScrapeItem(link, parent_title="", part_of_album=True, retry=True, retry_path=retry_path) - self.manager.task_group.create_task(self.map_url(item)) - - """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" + await self.manager.queue_manager.url_objects_to_map.put(item) async def extension_check(self, url: URL) -> bool: """Checks if the URL has a valid extension""" @@ -328,72 +314,117 @@ async def extension_check(self, url: URL) -> bool: except NoExtensionFailure: return False - async def map_url(self, scrape_item: ScrapeItem) -> None: + """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" + + async def check_complete(self) -> bool: + await self.manager.queue_manager.url_objects_to_map.join() + + keys = list(self.existing_crawlers.keys()) + for key in keys: + await self.existing_crawlers[key].scraper_queue.join() + + await asyncio.sleep(1) + keys = list(self.existing_crawlers.keys()) + for key in keys: + if not self.existing_crawlers[key].complete: + return False + + if not self.manager.queue_manager.url_objects_to_map.empty(): + return False + + self.complete = True + return True + + async def map_urls(self) -> None: """Maps URLs to their respective handlers""" - if not scrape_item.url: - return - if not isinstance(scrape_item.url, URL): + while True: + scrape_item: ScrapeItem = await self.manager.queue_manager.url_objects_to_map.get() + + if not scrape_item.url: + 
self.manager.queue_manager.url_objects_to_map.task_done() + continue + if not isinstance(scrape_item.url, URL): + try: + scrape_item.url = URL(scrape_item.url) + except Exception as e: + self.manager.queue_manager.url_objects_to_map.task_done() + continue + try: - scrape_item.url = URL(scrape_item.url) + if not scrape_item.url.host: + self.manager.queue_manager.url_objects_to_map.task_done() + continue except Exception as e: - return - try: - if not scrape_item.url.host: - return - except Exception as e: - return - - skip = False - if self.manager.config_manager.settings_data['Ignore_Options']['skip_hosts']: - for skip_host in self.manager.config_manager.settings_data['Ignore_Options']['skip_hosts']: - if skip_host in scrape_item.url.host: - skip = True - break - if self.manager.config_manager.settings_data['Ignore_Options']['only_hosts']: - for only_host in self.manager.config_manager.settings_data['Ignore_Options']['only_hosts']: - if only_host not in scrape_item.url.host: - skip = True - break - - if str(scrape_item.url).endswith("/"): - if scrape_item.url.query_string: - query = scrape_item.url.query_string[:-1] - scrape_item.url = scrape_item.url.with_query(query) + self.manager.queue_manager.url_objects_to_map.task_done() + continue + + skip = False + if self.manager.config_manager.settings_data['Ignore_Options']['skip_hosts']: + for skip_host in self.manager.config_manager.settings_data['Ignore_Options']['skip_hosts']: + if skip_host in scrape_item.url.host: + skip = True + break + if self.manager.config_manager.settings_data['Ignore_Options']['only_hosts']: + for only_host in self.manager.config_manager.settings_data['Ignore_Options']['only_hosts']: + if only_host not in scrape_item.url.host: + skip = True + break + + if str(scrape_item.url).endswith("/"): + if scrape_item.url.query_string: + query = scrape_item.url.query_string[:-1] + scrape_item.url = scrape_item.url.with_query(query) + else: + scrape_item.url = scrape_item.url.with_path(scrape_item.url.path[:-1]) + + key = next((key for key in self.mapping if key in scrape_item.url.host.lower()), None) + download_key = next((self.download_mapping[key] for key in self.download_mapping if key in scrape_item.url.host.lower()), None) + + if key and not skip: + """If the crawler doesn't exist, create it, finally add the scrape item to it's queue""" + if not self.existing_crawlers.get(key): + start_handler = self.mapping[key] + await start_handler() + await self.existing_crawlers[key].startup() + await self.manager.download_manager.get_download_instance(download_key) + asyncio.create_task(self.existing_crawlers[key].run_loop()) + await self.existing_crawlers[key].scraper_queue.put(scrape_item) + self.manager.queue_manager.url_objects_to_map.task_done() + continue + elif skip: + await log(f"Skipping URL by Config Selections: {scrape_item.url}") + elif await self.extension_check(scrape_item.url): + await self.manager.download_manager.get_download_instance("no_crawler") + check_complete = await self.manager.db_manager.history_table.check_complete("no_crawler", scrape_item.url) + if check_complete: + await log(f"Skipping {scrape_item.url} as it has already been downloaded") + await self.manager.progress_manager.download_progress.add_previously_completed() + self.manager.queue_manager.url_objects_to_map.task_done() + continue + download_queue = await self.manager.queue_manager.get_download_queue("no_crawler") + await scrape_item.add_to_parent_title("Loose Files") + scrape_item.part_of_album = True + download_folder = await 
get_download_path(self.manager, scrape_item, "no_crawler") + filename, ext = await get_filename_and_ext(scrape_item.url.name) + media_item = MediaItem(scrape_item.url, scrape_item.url, download_folder, filename, ext, filename) + await download_queue.put(media_item) + elif self.jdownloader.enabled: + if isinstance(self.jdownloader.jdownloader_agent, Field): + await self.jdownloader.jdownloader_setup() + if not self.jdownloader.enabled: + await log(f"Unsupported URL: {scrape_item.url}") + await self.manager.log_manager.write_unsupported_urls_log(scrape_item.url) + await log(f"Sending unsupported URL to JDownloader: {scrape_item.url}") + try: + await self.jdownloader.direct_unsupported_to_jdownloader(scrape_item.url, scrape_item.parent_title) + except JDownloaderFailure as e: + await log(f"Failed to send {scrape_item.url} to JDownloader") + await log(e.message) + await self.manager.log_manager.write_unsupported_urls_log(scrape_item.url) else: - scrape_item.url = scrape_item.url.with_path(scrape_item.url.path[:-1]) - - key = next((key for key in self.mapping if key in scrape_item.url.host.lower()), None) - - if key and not skip: - scraper = self.existing_crawlers[key] - self.manager.task_group.create_task(scraper.run(scrape_item)) - return - - elif skip: - await log(f"Skipping URL by Config Selections: {scrape_item.url}") - - elif await self.extension_check(scrape_item.url): - check_complete = await self.manager.db_manager.history_table.check_complete("no_crawler", scrape_item.url) - if check_complete: - await log(f"Skipping {scrape_item.url} as it has already been downloaded") - await self.manager.progress_manager.download_progress.add_previously_completed() - return - await scrape_item.add_to_parent_title("Loose Files") - scrape_item.part_of_album = True - download_folder = await get_download_path(self.manager, scrape_item, "no_crawler") - filename, ext = await get_filename_and_ext(scrape_item.url.name) - media_item = MediaItem(scrape_item.url, scrape_item.url, download_folder, filename, ext, filename) - self.manager.task_group.create_task(self.no_crawler_downloader.run(media_item)) - - elif self.jdownloader.enabled: - await log(f"Sending unsupported URL to JDownloader: {scrape_item.url}") - try: - await self.jdownloader.direct_unsupported_to_jdownloader(scrape_item.url, scrape_item.parent_title) - except JDownloaderFailure as e: - await log(f"Failed to send {scrape_item.url} to JDownloader") - await log(e.message) + await log(f"Unsupported URL: {scrape_item.url}") await self.manager.log_manager.write_unsupported_urls_log(scrape_item.url) - else: - await log(f"Unsupported URL: {scrape_item.url}") - await self.manager.log_manager.write_unsupported_urls_log(scrape_item.url) + self.manager.queue_manager.url_objects_to_map.task_done() + if self.complete: + break diff --git a/cyberdrop_dl/ui/progress/file_progress.py b/cyberdrop_dl/ui/progress/file_progress.py index ee3be0d8c..c99533e40 100644 --- a/cyberdrop_dl/ui/progress/file_progress.py +++ b/cyberdrop_dl/ui/progress/file_progress.py @@ -55,8 +55,8 @@ async def get_queue_length(self) -> int: """Returns the number of tasks in the downloader queue""" total = 0 - for scraper in self.manager.scrape_mapper.existing_crawlers.values(): - total += scraper.downloader.waiting_items + for queue in self.manager.queue_manager.download_queues.values(): + total += queue.qsize() return total @@ -94,7 +94,6 @@ async def add_task(self, file: str, expected_size: Optional[int]) -> TaskID: else: task_id = 
self.progress.add_task(self.progress_str.format(color=self.color, description=description), total=expected_size) self.visible_tasks.append(task_id) - await self.redraw() return task_id async def remove_file(self, task_id: TaskID) -> None: diff --git a/cyberdrop_dl/ui/progress/scraping_progress.py b/cyberdrop_dl/ui/progress/scraping_progress.py index 03de8cfcd..d7d9d13db 100644 --- a/cyberdrop_dl/ui/progress/scraping_progress.py +++ b/cyberdrop_dl/ui/progress/scraping_progress.py @@ -45,8 +45,9 @@ async def get_queue_length(self) -> int: """Returns the number of tasks in the scraper queue""" total = 0 - for scraper in self.manager.scrape_mapper.existing_crawlers.values(): - total += scraper.waiting_items + total += self.manager.queue_manager.url_objects_to_map.qsize() + for queue in self.manager.queue_manager.scraper_queues.values(): + total += queue.qsize() return total @@ -80,7 +81,6 @@ async def add_task(self, url: URL) -> TaskID: else: task_id = self.progress.add_task(self.progress_str.format(color=self.color, description=str(url))) self.visible_tasks.append(task_id) - await self.redraw() return task_id async def remove_task(self, task_id: TaskID) -> None: diff --git a/cyberdrop_dl/utils/utilities.py b/cyberdrop_dl/utils/utilities.py index c5588a681..7dfc66005 100644 --- a/cyberdrop_dl/utils/utilities.py +++ b/cyberdrop_dl/utils/utilities.py @@ -221,4 +221,13 @@ async def check_latest_pypi(): return if current_version != latest_version: - await log_with_color(f"New version of cyberdrop-dl available: {latest_version}", "bold_red") + await log_with_color(f"\nNew version of cyberdrop-dl available: {latest_version}", "bold_red") + + # check python version + import sys + version = sys.version_info + if version.major == 3 and version.minor == 10: + await log_with_color("\nNew versions of this program will require python 3.11 specifically to run.", "bold_red") + await log_with_color("If you are seeing this message, that means you need to update python to continue getting Cyberdrop-DL updates.", "bold_red") + await log_with_color("You can see how to do that here: https://jules-winnfieldx.gitbook.io/cyberdrop-dl/quick-start", "bold_red") + diff --git a/pyproject.toml b/pyproject.toml index 54de97f0c..1fe620966 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "cyberdrop-dl" -version = "5.1.1" +version = "5.0.124" description = "Bulk downloader for multiple file hosts" authors = ["Jules Winnfield "] readme = "README.md"
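
The core pattern this revert restores is the asyncio.Queue producer/consumer loop (Downloader.run_loop, Crawler.run_loop) together with a `complete` flag that runtime() in main.py polls until scraping and downloading both drain. The sketch below is a minimal, self-contained illustration of that pattern only; QueueWorker, process(), and the two-task pool are simplified stand-ins, not code from this patch.

import asyncio


class QueueWorker:
    """Simplified stand-in for Downloader/Crawler: consume a queue in run_loop()."""

    def __init__(self) -> None:
        self.queue: asyncio.Queue = asyncio.Queue()
        self.complete = True  # mirrors the `complete` flag read by check_complete()

    async def run_loop(self) -> None:
        while True:
            item = await self.queue.get()
            self.complete = False
            try:
                await self.process(item)  # stand-in for download()/fetch()
            finally:
                self.queue.task_done()    # lets queue.join() return in check_complete()
                if self.queue.empty():
                    self.complete = True

    async def process(self, item: int) -> None:
        await asyncio.sleep(0)  # placeholder for real work


async def main() -> None:
    worker = QueueWorker()
    # Several consumer tasks can share one worker/queue, much like
    # get_download_instance() spawning one run_loop task per allowed slot.
    tasks = [asyncio.create_task(worker.run_loop()) for _ in range(2)]
    for n in range(5):
        await worker.queue.put(n)
    # runtime() waits the same way: join the queue, then poll the flag.
    await worker.queue.join()
    while not worker.complete:
        await asyncio.sleep(0.1)
    for task in tasks:
        task.cancel()


if __name__ == "__main__":
    asyncio.run(main())

The real implementations also guard `complete` with an unfinished-item counter and take a lock around `processed_items` / `scraped_items`; those details are omitted from this sketch for brevity.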