From 2f6bbdd067de080dcb2d60992aff9fd43ba37e13 Mon Sep 17 00:00:00 2001
From: NTFSvolume <172021377+NTFSvolume@users.noreply.github.com>
Date: Thu, 2 Jan 2025 22:16:58 -0500
Subject: [PATCH] fix: e-hentai crawler (#422)

* fix: e-hentai crawler

* fix: scrape_item reference

I have to watch out for these...

* refactor: simplify query string

* refactor: use a custom filename
---
 .../scraper/crawlers/ehentai_crawler.py       | 106 +++++++++---------
 1 file changed, 56 insertions(+), 50 deletions(-)

diff --git a/cyberdrop_dl/scraper/crawlers/ehentai_crawler.py b/cyberdrop_dl/scraper/crawlers/ehentai_crawler.py
index 147d02a6..68b62bdf 100644
--- a/cyberdrop_dl/scraper/crawlers/ehentai_crawler.py
+++ b/cyberdrop_dl/scraper/crawlers/ehentai_crawler.py
@@ -1,20 +1,20 @@
 from __future__ import annotations
 
 import calendar
-import contextlib
 import datetime
 from typing import TYPE_CHECKING
 
 from aiolimiter import AsyncLimiter
 from yarl import URL
 
-from cyberdrop_dl.clients.errors import MaxChildrenError
 from cyberdrop_dl.scraper.crawler import Crawler
 from cyberdrop_dl.utils.data_enums_classes.url_objects import FILE_HOST_ALBUM, ScrapeItem
 from cyberdrop_dl.utils.logger import log
 from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_filename_and_ext
 
 if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator
+
     from bs4 import BeautifulSoup
 
     from cyberdrop_dl.managers.manager import Manager
@@ -26,7 +26,9 @@ class EHentaiCrawler(Crawler):
     def __init__(self, manager: Manager) -> None:
         super().__init__(manager, "e-hentai", "E-Hentai")
         self.request_limiter = AsyncLimiter(10, 1)
-        self.warnings_set = False
+        self._warnings_set = False
+        self.next_page_selector = "td[onclick='document.location=this.firstChild.href']:contains('>') a"
+        self.next_page_attribute = "href"
 
     """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""
 
@@ -35,8 +37,6 @@ async def fetch(self, scrape_item: ScrapeItem) -> None:
         task_id = self.scraping_progress.add_task(scrape_item.url)
 
         if "g" in scrape_item.url.parts:
-            if not self.warnings_set:
-                await self.set_no_warnings(scrape_item)
             await self.album(scrape_item)
         elif "s" in scrape_item.url.parts:
             await self.image(scrape_item)
@@ -49,47 +49,34 @@ async def fetch(self, scrape_item: ScrapeItem) -> None:
     @error_handling_wrapper
     async def album(self, scrape_item: ScrapeItem) -> None:
         """Scrapes an album."""
-        async with self.request_limiter:
-            soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)
-
-        title = self.create_title(soup.select_one("h1[id=gn]").get_text(), None, None)
-        date = self.parse_datetime(soup.select_one("td[class=gdt2]").get_text())
-        scrape_item.type = FILE_HOST_ALBUM
-        scrape_item.children = scrape_item.children_limit = 0
-
-        with contextlib.suppress(IndexError, TypeError):
-            scrape_item.children_limit = (
-                self.manager.config_manager.settings_data.download_options.maximum_number_of_children[scrape_item.type]
-            )
-
-        images = soup.select("div[class=gdtm] div a")
-        for image in images:
-            link = URL(image.get("href"))
-            new_scrape_item = self.create_scrape_item(
-                scrape_item,
-                link,
-                title,
-                True,
-                None,
-                date,
-                add_parent=scrape_item.url,
-            )
-            self.manager.task_group.create_task(self.run(new_scrape_item))
-            scrape_item.children += 1
-            if scrape_item.children_limit and scrape_item.children >= scrape_item.children_limit:
-                raise MaxChildrenError(origin=scrape_item)
-
-        next_page_opts = soup.select('td[onclick="document.location=this.firstChild.href"]')
-        next_page = None
-        for maybe_next in next_page_opts:
-            if maybe_next.get_text() == ">":
-                next_page = maybe_next.select_one("a")
-                break
-        if next_page is not None:
-            next_page = URL(next_page.get("href"))
-        if next_page is not None:
-            new_scrape_item = self.create_scrape_item(scrape_item, next_page, "")
-            self.manager.task_group.create_task(self.run(new_scrape_item))
+        if not self._warnings_set:
+            await self.set_no_warnings(scrape_item)
+
+        title = date = None
+        gallery_id = scrape_item.url.parts[2]
+        scrape_item.url = scrape_item.url.with_query(None)
+        scrape_item.set_type(FILE_HOST_ALBUM, self.manager)
+
+        async for soup in self.web_pager(scrape_item):
+            if not title:
+                title = self.create_title(soup.select_one("h1[id=gn]").get_text())
+                date = self.parse_datetime(soup.select_one("td[class=gdt2]").get_text())
+
+            images = soup.select("div#gdt.gt200 a")
+            for image in images:
+                link = URL(image.get("href"))
+                new_scrape_item = self.create_scrape_item(
+                    scrape_item,
+                    link,
+                    title,
+                    part_of_album=True,
+                    album_id=gallery_id,
+                    possible_datetime=date,
+                    add_parent=scrape_item.url,
+                )
+
+                await self.image(new_scrape_item)
+                scrape_item.add_children()
 
     @error_handling_wrapper
     async def image(self, scrape_item: ScrapeItem) -> None:
@@ -99,20 +86,22 @@ async def image(self, scrape_item: ScrapeItem) -> None:
         async with self.request_limiter:
             soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)
 
+
         image = soup.select_one("img[id=img]")
         link = URL(image.get("src"))
         filename, ext = get_filename_and_ext(link.name)
-        await self.handle_file(link, scrape_item, filename, ext)
+        custom_filename, _ = get_filename_and_ext(f"{scrape_item.url.name}{ext}")
+        await self.handle_file(link, scrape_item, filename, ext, custom_filename=custom_filename)
 
     """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""
 
     @error_handling_wrapper
     async def set_no_warnings(self, scrape_item: ScrapeItem) -> None:
         """Sets the no warnings cookie."""
-        self.warnings_set = True
         async with self.request_limiter:
-            scrape_item.url = URL(str(scrape_item.url) + "/").update_query("nw=session")
-            await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)
+            url = scrape_item.url.update_query(nw="session")
+            await self.client.get_soup(self.domain, url, origin=scrape_item)
+        self._warnings_set = True
 
     @staticmethod
     def parse_datetime(date: str) -> int:
@@ -121,3 +110,20 @@ def parse_datetime(date: str) -> int:
             date = date + ":00"
         date = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
         return calendar.timegm(date.timetuple())
+
+    async def web_pager(self, scrape_item: ScrapeItem) -> AsyncGenerator[BeautifulSoup]:
+        """Generator of website pages."""
+        page_url = scrape_item.url
+        while True:
+            async with self.request_limiter:
+                soup: BeautifulSoup = await self.client.get_soup(self.domain, page_url, origin=scrape_item)
+            next_page = soup.select_one(self.next_page_selector)
+            yield soup
+            if next_page:
+                page_url = next_page.get(self.next_page_attribute)
+                if page_url:
+                    if page_url.startswith("/"):
+                        page_url = self.primary_base_domain / page_url[1:]
+                    page_url = URL(page_url)
+                    continue
+            break
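
Note on the new pagination flow: `album` now consumes pages through the `web_pager` async generator instead of re-queueing a scrape task per "next page" link. Below is a minimal, self-contained sketch of that generator pattern; `fetch_page` and `PAGES` are hypothetical stand-ins for the crawler's real `self.client.get_soup` call and HTTP responses, not part of the patch itself.

    import asyncio
    from collections.abc import AsyncGenerator

    # Hypothetical page graph: URL -> (content, next-page URL or None).
    # Stands in for real HTTP responses and the next-page link lookup.
    PAGES = {
        "/g/1/a/": ("gallery page 1", "/g/1/a/?p=1"),
        "/g/1/a/?p=1": ("gallery page 2", None),
    }

    async def fetch_page(url: str) -> tuple[str, str | None]:
        # Pretend network call; a real crawler would await an HTTP client here.
        await asyncio.sleep(0)
        return PAGES[url]

    async def web_pager(start_url: str) -> AsyncGenerator[str, None]:
        # Yield each page, then follow the next-page link until there is none,
        # mirroring the select_one(next_page_selector) loop in the patch.
        page_url: str | None = start_url
        while page_url:
            content, page_url = await fetch_page(page_url)
            yield content

    async def main() -> None:
        # The caller iterates lazily, like `async for soup in self.web_pager(...)`.
        async for content in web_pager("/g/1/a/"):
            print(content)

    asyncio.run(main())

One consequence of this design, visible in the `album` hunk: the gallery title and datetime are parsed once, from the first yielded page, and every image link found on any subsequent page is attached to that same album item.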