diff --git a/cyberdrop_dl/__init__.py b/cyberdrop_dl/__init__.py
index a7c419667..f84eed2c7 100644
--- a/cyberdrop_dl/__init__.py
+++ b/cyberdrop_dl/__init__.py
@@ -1 +1 @@
-__version__ = "5.3.27"
+__version__ = "5.3.28"
diff --git a/cyberdrop_dl/scraper/crawlers/rule34vault_crawler.py b/cyberdrop_dl/scraper/crawlers/rule34vault_crawler.py
new file mode 100644
index 000000000..50ab47c9d
--- /dev/null
+++ b/cyberdrop_dl/scraper/crawlers/rule34vault_crawler.py
@@ -0,0 +1,127 @@
+from __future__ import annotations
+
+import calendar
+import datetime
+from typing import TYPE_CHECKING
+
+from aiolimiter import AsyncLimiter
+from yarl import URL
+
+from cyberdrop_dl.scraper.crawler import Crawler
+from cyberdrop_dl.utils.dataclasses.url_objects import ScrapeItem
+from cyberdrop_dl.utils.utilities import get_filename_and_ext, error_handling_wrapper
+
+if TYPE_CHECKING:
+    from cyberdrop_dl.managers.manager import Manager
+
+
+class Rule34VaultCrawler(Crawler):
+    def __init__(self, manager: Manager):
+        super().__init__(manager, "rule34vault", "Rule34Vault")
+        self.primary_base_url = URL("https://rule34vault.com")
+        self.request_limiter = AsyncLimiter(10, 1)
+
+    """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""
+
+    async def fetch(self, scrape_item: ScrapeItem) -> None:
+        """Determines where to send the scrape item based on the url"""
+        task_id = await self.scraping_progress.add_task(scrape_item.url)
+
+        if "post" in scrape_item.url.parts:
+            await self.file(scrape_item)
+        elif "playlists" in scrape_item.url.parts:
+            await self.playlist(scrape_item)
+        else:
+            await self.tag(scrape_item)
+
+        await self.scraping_progress.remove_task(task_id)
+
+    @error_handling_wrapper
+    async def tag(self, scrape_item: ScrapeItem) -> None:
+        """Scrapes a tag"""
+        async with self.request_limiter:
+            soup = await self.client.get_BS4(self.domain, scrape_item.url)
+
+        title = await self.create_title(scrape_item.url.parts[1], None, None)
+
+        content_block = soup.select_one('div[class="grid ng-star-inserted"]')
+        content = content_block.select('a[class="box ng-star-inserted"]')
+        for file_page in content:
+            link = file_page.get('href')
+            if link.startswith("/"):
+                link = f"{self.primary_base_url}{link}"
+            link = URL(link)
+            new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True)
+            self.manager.task_group.create_task(self.run(new_scrape_item))
+        if not content:
+            return
+
+        if len(scrape_item.url.parts) > 2:
+            page = int(scrape_item.url.parts[-1])
+            next_page = scrape_item.url.with_path(f"/{scrape_item.url.parts[1]}/page/{page + 1}")
+        else:
+            next_page = scrape_item.url.with_path(f"/{scrape_item.url.parts[1]}/page/2")
+        new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "")
+        self.manager.task_group.create_task(self.run(new_scrape_item))
+
+    @error_handling_wrapper
+    async def playlist(self, scrape_item: ScrapeItem) -> None:
+        """Scrapes a playlist"""
+        async with self.request_limiter:
+            soup = await self.client.get_BS4(self.domain, scrape_item.url)
+
+        title_str = soup.select_one('div[class*=title]').text
+        title = await self.create_title(title_str, scrape_item.url.parts[-1], None)
+
+        content_block = soup.select_one('div[class="grid ng-star-inserted"]')
+        content = content_block.select('a[class="box ng-star-inserted"]')
+        for file_page in content:
+            link = file_page.get('href')
+            if link.startswith("/"):
+                link = f"{self.primary_base_url}{link}"
+            link = URL(link)
+            new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True)
+            self.manager.task_group.create_task(self.run(new_scrape_item))
+        if not content:
+            return
+
+        if scrape_item.url.query:
+            page = scrape_item.url.query.get("page")
+            next_page = scrape_item.url.with_query({"page": int(page) + 1})
+        else:
+            next_page = scrape_item.url.with_query({"page": 2})
+        new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "")
+        self.manager.task_group.create_task(self.run(new_scrape_item))
+
+    @error_handling_wrapper
+    async def file(self, scrape_item: ScrapeItem) -> None:
+        """Scrapes a post (image or video)"""
+        async with self.request_limiter:
+            soup = await self.client.get_BS4(self.domain, scrape_item.url)
+
+        date = await self.parse_datetime(soup.select_one('div[class="text-primary ng-star-inserted"]').text.split("(")[1].split(")")[0])
+        scrape_item.date = date
+
+        image = soup.select_one('img[class*="img ng-star-inserted"]')
+        if image:
+            link = image.get('src')
+            if link.startswith("/"):
+                link = f"{self.primary_base_url}{link}"
+            link = URL(link)
+            filename, ext = await get_filename_and_ext(link.name)
+            await self.handle_file(link, scrape_item, filename, ext)
+        video = soup.select_one("video source")
+        if video:
+            link = video.get('src')
+            if link.startswith("/"):
+                link = f"{self.primary_base_url}{link}"
+            link = URL(link)
+            filename, ext = await get_filename_and_ext(link.name)
+            await self.handle_file(link, scrape_item, filename, ext)
+
+    """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""
+
+    async def parse_datetime(self, date: str) -> int:
+        """Parses a datetime string into a unix timestamp"""
+        date = datetime.datetime.strptime(date, "%b %d, %Y, %I:%M:%S %p")
+        return calendar.timegm(date.timetuple())
diff --git a/cyberdrop_dl/scraper/scraper.py b/cyberdrop_dl/scraper/scraper.py
index 67614c8a2..7988611d2 100644
--- a/cyberdrop_dl/scraper/scraper.py
+++ b/cyberdrop_dl/scraper/scraper.py
@@ -36,8 +36,8 @@ def __init__(self, manager: Manager):
                         "mediafire": self.mediafire, "nudostar.com": self.nudostar, "nudostar.tv": self.nudostartv,
                         "omegascans": self.omegascans, "pimpandhost": self.pimpandhost, "pixeldrain": self.pixeldrain,
                         "postimg": self.postimg, "realbooru": self.realbooru, "reddit": self.reddit,
-                        "redd.it": self.reddit, "redgifs": self.redgifs, "rule34.xxx": self.rule34xxx,
-                        "rule34.xyz": self.rule34xyz, "saint": self.saint, "scrolller": self.scrolller,
+                        "redd.it": self.reddit, "redgifs": self.redgifs, "rule34vault": self.rule34vault, "rule34.xxx": self.rule34xxx,
+                        "rule34.xyz": self.rule34xyz, "saint": self.saint, "scrolller": self.scrolller,
                         "simpcity": self.simpcity, "socialmediagirls": self.socialmediagirls, "toonily": self.toonily,
                         "xbunker": self.xbunker, "xbunkr": self.xbunkr, "bunkr": self.bunkrr}
         self.existing_crawlers = {}
@@ -201,6 +201,11 @@ async def redgifs(self) -> None:
         from cyberdrop_dl.scraper.crawlers.redgifs_crawler import RedGifsCrawler
         self.existing_crawlers['redgifs'] = RedGifsCrawler(self.manager)
 
+    async def rule34vault(self) -> None:
+        """Creates a Rule34Vault Crawler instance"""
+        from cyberdrop_dl.scraper.crawlers.rule34vault_crawler import Rule34VaultCrawler
+        self.existing_crawlers['rule34vault'] = Rule34VaultCrawler(self.manager)
+
     async def rule34xxx(self) -> None:
         """Creates a Rule34XXX Crawler instance"""
         from cyberdrop_dl.scraper.crawlers.rule34xxx_crawler import Rule34XXXCrawler
diff --git a/cyberdrop_dl/utils/dataclasses/supported_domains.py b/cyberdrop_dl/utils/dataclasses/supported_domains.py
index eebda2dd9..99696162f 100644
--- a/cyberdrop_dl/utils/dataclasses/supported_domains.py
+++ b/cyberdrop_dl/utils/dataclasses/supported_domains.py
@@ -12,7 +12,7 @@ class SupportedDomains:
                        "jpg1.su", "jpg2.su", "jpg3.su", "jpg4.su", "host.church", "kemono",
                        "leakedmodels", "mediafire", "nudostar.com", "nudostar.tv", "omegascans",
                        "pimpandhost", "pixeldrain", "postimg", "realbooru",
-                       "reddit", "redd.it", "redgifs", "rule34.xxx", "rule34.xyz", "saint",
+                       "reddit", "redd.it", "redgifs", "rule34.xxx", "rule34.xyz", "rule34vault", "saint",
                        "scrolller", "simpcity", "socialmediagirls", "toonily", "xbunker",
                        "xbunkr")
 
diff --git a/pyproject.toml b/pyproject.toml
index b09a03f78..c63943caa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cyberdrop-dl"
-version = "5.3.27"
+version = "5.3.28"
 description = "Bulk downloader for multiple file hosts"
 authors = ["Jules Winnfield "]
 readme = "README.md"
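Note: the new crawler paginates two different ways. tag() rewrites the URL path to "/{tag}/page/N", while playlist() replaces a "?page=N" query parameter. A minimal standalone sketch of the yarl calls involved; the tag and playlist URLs below are made up for illustration, only their shapes are taken from the crawler code above:

    from yarl import URL

    # Tag listings: parts[0] is always "/" for an absolute URL, so parts[1] is
    # the tag name. Paginate by rewriting the path, as tag() does.
    tag_url = URL("https://rule34vault.com/sometag")
    next_tag_page = tag_url.with_path(f"/{tag_url.parts[1]}/page/2")
    print(next_tag_page)  # https://rule34vault.com/sometag/page/2

    # Playlists: paginate by bumping the query parameter, as playlist() does.
    playlist_url = URL("https://rule34vault.com/playlists/view/1234?page=2")
    next_playlist_page = playlist_url.with_query({"page": int(playlist_url.query["page"]) + 1})
    print(next_playlist_page)  # https://rule34vault.com/playlists/view/1234?page=3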
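parse_datetime() expects the parenthesized portion of the post's date element to match "%b %d, %Y, %I:%M:%S %p" and interprets it as UTC via calendar.timegm. A standalone sketch of that conversion; the sample date string is invented for illustration:

    import calendar
    import datetime

    def parse_datetime(date: str) -> int:
        """Parses a 'Jan 5, 2024, 3:42:10 PM' style string into a unix timestamp."""
        parsed = datetime.datetime.strptime(date, "%b %d, %Y, %I:%M:%S %p")
        return calendar.timegm(parsed.timetuple())  # treated as UTC, not local time

    print(parse_datetime("Jan 5, 2024, 3:42:10 PM"))  # 1704469330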