fix: handle encoded URLs from within any crawler (jbsparrow#475)
* fix: handle encoded URLs from within any crawler

* refactor: reduce code duplication

* refactor: make AsyncLimiter(10, 1) the default

* fix: crawler specific fixes

* docs: update changelog
NTFSvolume authored Jan 18, 2025
1 parent 35ff8bf commit 1dceff5
Showing 46 changed files with 952 additions and 1,618 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed

- bunkr reinforced link handling
- Handle encoded URLS found while scraping (every crawler)

## [6.2.0] - 2025-01-10

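The second new entry above is the core of this commit: every crawler now funnels scraped hrefs through a shared `parse_url` helper (added in `cyberdrop_dl/scraper/crawler.py` below) instead of wrapping strings in `URL(...)` ad hoc. A minimal sketch of the idea using yarl directly; the hostname is made up:

```python
from yarl import URL

def to_url(link_str: str) -> URL:
    # Same heuristic the new Crawler.parse_url uses: if the scraped string
    # already contains percent-escapes, tell yarl not to encode it again.
    return URL(link_str, encoded="%" in link_str)

# Hypothetical scraped href; the existing escapes survive untouched.
print(to_url("https://example.com/files/my%20photo.jpg"))
```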
8 changes: 4 additions & 4 deletions cyberdrop_dl/clients/errors.py
@@ -53,23 +53,23 @@ def __init__(self, *, message: str | None = None, origin: ScrapeItem | MediaItem


class PasswordProtectedError(CDLBaseError):
def __init__(self, *, message: str | None = None, origin: ScrapeItem | MediaItem | URL | None = None) -> None:
def __init__(self, message: str | None = None, *, origin: ScrapeItem | MediaItem | URL | None = None) -> None:
"""This error will be thrown when a file is password protected."""
ui_message = "Password Protected"
message = message or "File/Folder is password protected"
super().__init__(ui_message, message=message, origin=origin)


class MaxChildrenError(CDLBaseError):
def __init__(self, *, message: str | None = None, origin: ScrapeItem | MediaItem | URL | None = None) -> None:
def __init__(self, message: str | None = None, *, origin: ScrapeItem | MediaItem | URL | None = None) -> None:
"""This error will be thrown when an scrape item reaches its max number or children."""
ui_message = "Max Children Reached"
message = message or "Max number of children reached"
super().__init__(ui_message, message=message, origin=origin)


class DDOSGuardError(CDLBaseError):
def __init__(self, *, message: str | None = None, origin: ScrapeItem | MediaItem | URL | None = None) -> None:
def __init__(self, message: str | None = None, *, origin: ScrapeItem | MediaItem | URL | None = None) -> None:
"""This error will be thrown when DDoS-Guard is detected."""
ui_message = "DDoS-Guard"
message = message or "DDoS-Guard detected"
@@ -152,7 +152,7 @@ def __init__(


class LoginError(CDLBaseError):
def __init__(self, *, message: str | None = None, origin: ScrapeItem | MediaItem | URL | None = None) -> None:
def __init__(self, message: str | None = None, *, origin: ScrapeItem | MediaItem | URL | None = None) -> None:
"""This error will be thrown when the login fails for a site."""
ui_message = "Failed Login"
super().__init__(ui_message, message=message, origin=origin)
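The only change in this file is moving `message` in front of the `*` marker, so callers can pass it positionally while `origin` stays keyword-only. A small sketch of what that allows at call sites, assuming the package is importable as in the diff:

```python
from cyberdrop_dl.clients.errors import DDOSGuardError

err_old_style = DDOSGuardError(message="DDoS-Guard detected")  # still valid
err_new_style = DDOSGuardError("DDoS-Guard detected")          # now also valid
```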
4 changes: 2 additions & 2 deletions cyberdrop_dl/clients/scraper_client.py
@@ -9,7 +9,6 @@
from aiohttp_client_cache import CachedSession
from aiohttp_client_cache.response import CachedStreamReader
from bs4 import BeautifulSoup
from yarl import URL

from cyberdrop_dl.clients.errors import DDOSGuardError, InvalidContentTypeError
from cyberdrop_dl.utils.constants import DEBUG_VAR
@@ -19,6 +18,7 @@
from collections.abc import Callable

from multidict import CIMultiDictProxy
from yarl import URL

from cyberdrop_dl.managers.client_manager import ClientManager
from cyberdrop_dl.utils.data_enums_classes.url_objects import ScrapeItem
@@ -129,7 +129,7 @@ async def get_soup(
raise InvalidContentTypeError(message=f"Received {content_type}, was expecting text", origin=origin)
text = await CachedStreamReader(await response.read()).read()
if with_response_url:
return BeautifulSoup(text, "html.parser"), URL(response.url)
return BeautifulSoup(text, "html.parser"), response.url
return BeautifulSoup(text, "html.parser")

async def get_soup_and_return_url(
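`response.url` on an aiohttp response is already a `yarl.URL`, so wrapping it in `URL(...)` was redundant; dropping the wrap also lets the `yarl` import move under `TYPE_CHECKING`. A quick check of that assumption:

```python
import asyncio

import aiohttp
from yarl import URL

async def main() -> None:
    async with aiohttp.ClientSession() as session:
        async with session.get("https://example.com/") as response:
            assert isinstance(response.url, URL)  # no re-wrapping needed

asyncio.run(main())
```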
2 changes: 1 addition & 1 deletion cyberdrop_dl/config_definitions/global_settings.py
@@ -26,7 +26,7 @@ class General(BaseModel):
flaresolverr: HttpURL | None = None
max_file_name_length: PositiveInt = 95
max_folder_name_length: PositiveInt = 60
required_free_space: ByteSize = Field(DEFAULT_REQUIRED_FREE_SPACE, gt=MIN_REQUIRED_FREE_SPACE)
required_free_space: ByteSize = Field(DEFAULT_REQUIRED_FREE_SPACE, ge=MIN_REQUIRED_FREE_SPACE)

@field_serializer("required_free_space")
def human_readable(self, value: ByteSize | int) -> str:
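Switching the constraint from `gt` to `ge` means a config whose `required_free_space` equals the minimum is accepted instead of rejected. A sketch with an illustrative minimum (the project's real `MIN_REQUIRED_FREE_SPACE` and default may differ):

```python
from pydantic import BaseModel, ByteSize, Field

MIN_REQUIRED_FREE_SPACE = ByteSize(512 * 1024**2)  # illustrative: 512 MiB

class General(BaseModel):
    required_free_space: ByteSize = Field(ByteSize(5 * 1024**3), ge=MIN_REQUIRED_FREE_SPACE)

# Accepted with ge=; would raise a ValidationError with the old gt= constraint.
General(required_free_space=MIN_REQUIRED_FREE_SPACE)
```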
21 changes: 19 additions & 2 deletions cyberdrop_dl/scraper/crawler.py
@@ -8,6 +8,7 @@
from functools import wraps
from typing import TYPE_CHECKING, Any, ClassVar, Protocol

from aiolimiter import AsyncLimiter
from bs4 import BeautifulSoup
from yarl import URL

@@ -46,6 +47,7 @@ def __init__(self, manager: Manager, domain: str, folder_domain: str | None = No
self.client: ScraperClient = field(init=False)
self._semaphore = asyncio.Semaphore(20)
self.startup_lock = asyncio.Lock()
self.request_limiter = AsyncLimiter(10, 1)
self.ready: bool = False

self.domain = domain
@@ -247,7 +249,7 @@ async def check_complete_from_referer(self, scrape_item: ScrapeItem | URL) -> bo
return True
return False

async def get_album_results(self, album_id: str) -> bool | dict[Any, Any]:
async def get_album_results(self, album_id: str) -> dict[Any, Any]:
"""Checks whether an album has completed given its domain and album id."""
return await self.manager.db_manager.history_table.check_album(self.domain, album_id)

@@ -317,8 +319,23 @@ def add_separate_post_title(self, scrape_item: ScrapeItem, post: Post) -> None:
title = title_format.format(id=id, number=id, date=date, title=title)
scrape_item.add_to_parent_title(title)

def parse_url(self, link_str: str, relative_to: URL | None = None) -> URL:
assert link_str
assert isinstance(link_str, str)
encoded = "%" in link_str
base = relative_to or self.primary_base_domain
if link_str.startswith("?"):
link = base.with_query(link_str[1:])
elif link_str.startswith("//"):
link = URL("https:" + link_str, encoded=encoded)
elif link_str.startswith("/"):
link = base.joinpath(link_str[1:], encoded=encoded)
else:
link = URL(link_str, encoded=encoded)
return link


def create_task_id(func: Callable) -> None:
def create_task_id(func: Callable) -> Callable:
"""Wrapper handles task_id creation and removal for ScrapeItems"""

@wraps(func)
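The new `parse_url` method is what the per-crawler changes below call into. A standalone mirror of it that runs outside the class, with example inputs for each branch (the base domain is made up):

```python
from yarl import URL

PRIMARY_BASE_DOMAIN = URL("https://example-host.net")  # stands in for self.primary_base_domain

def parse_url(link_str: str, relative_to: URL | None = None) -> URL:
    """Standalone mirror of Crawler.parse_url from the diff above."""
    assert link_str and isinstance(link_str, str)
    encoded = "%" in link_str  # already-escaped links must not be re-encoded
    base = relative_to or PRIMARY_BASE_DOMAIN
    if link_str.startswith("?"):
        return base.with_query(link_str[1:])
    if link_str.startswith("//"):
        return URL("https:" + link_str, encoded=encoded)
    if link_str.startswith("/"):
        return base.joinpath(link_str[1:], encoded=encoded)
    return URL(link_str, encoded=encoded)

print(parse_url("/a/abc123"))                       # joined onto the base domain
print(parse_url("//cdn.example-host.net/f.jpg"))    # protocol-relative: https added
print(parse_url("?page=2"))                         # query attached to the base
print(parse_url("https://x.net/my%20file.jpg"))     # absolute: escapes kept verbatim
```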
34 changes: 15 additions & 19 deletions cyberdrop_dl/scraper/crawlers/bunkrr_crawler.py
@@ -5,7 +5,6 @@
import re
from typing import TYPE_CHECKING, ClassVar

from aiolimiter import AsyncLimiter
from yarl import URL

from cyberdrop_dl.clients.errors import NoExtensionError, ScrapeError
@@ -52,7 +51,6 @@ class BunkrrCrawler(Crawler):

def __init__(self, manager: Manager, site: str) -> None:
super().__init__(manager, site, "Bunkrr")
self.request_limiter = AsyncLimiter(10, 1)

"""~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""

@@ -83,40 +81,37 @@ async def album(self, scrape_item: ScrapeItem) -> None:
scrape_item.url = self.primary_base_domain.with_path(scrape_item.url.path)
album_id = scrape_item.url.parts[2]
scrape_item.album_id = album_id
scrape_item.part_of_album = True
results = await self.get_album_results(album_id)
scrape_item.set_type(FILE_HOST_ALBUM, self.manager)

async with self.request_limiter:
soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)

title = soup.select_one("title").text.rsplit(" | Bunkr")[0].strip()
title = self.create_title(title, scrape_item.url.parts[2], None)
title = self.create_title(title, album_id)
scrape_item.add_to_parent_title(title)

card_listings: list[Tag] = soup.select('div[class*="relative group/item theItem"]')
for card_listing in card_listings:
filename = card_listing.select_one('p[class*="theName"]').text
file_ext = "." + filename.split(".")[-1]
thumbnail = card_listing.select_one("img").get("src")
thumbnail: str = card_listing.select_one("img").get("src")
date_str = card_listing.select_one('span[class*="theDate"]').text.strip()
date = self.parse_datetime(date_str)
link = card_listing.find("a").get("href")
if link.startswith("/"):
link = URL("https://" + scrape_item.url.host + link)

link = URL(link)
link_str: str = card_listing.find("a").get("href")
link = self.parse_url(link_str, scrape_item.url.with_path("/"))
new_scrape_item = self.create_scrape_item(
scrape_item,
link,
part_of_album=True,
album_id=album_id,
possible_datetime=date,
add_parent=scrape_item.url,
)

valid_extensions = FILE_FORMATS["Images"] | FILE_FORMATS["Videos"]
src = thumbnail.replace("/thumbs/", "/")
src = URL(src, encoded=True)
src_str = thumbnail.replace("/thumbs/", "/")
src = self.parse_url(src_str)
src = src.with_suffix(file_ext).with_query(None)
if file_ext.lower() not in FILE_FORMATS["Images"]:
src = src.with_host(src.host.replace("i-", ""))
@@ -165,11 +160,11 @@ async def file(self, scrape_item: ScrapeItem) -> None:
link_container = soup.select_one("a.btn.ic-download-01")
src_selector = "href"

link = link_container.get(src_selector) if link_container else None
if not link:
link_str: str = link_container.get(src_selector) if link_container else None
if not link_str:
raise ScrapeError(422, "Couldn't find source", origin=scrape_item)

link = URL(link)
link = self.parse_url(link_str)
date = None
date_str = soup.select_one('span[class*="theDate"]')
if date_str:
@@ -212,7 +207,8 @@ async def handle_reinforced_link(self, url: URL, scrape_item: ScrapeItem) -> URL
link_container = soup.select('a[download*=""]')[-1]
except IndexError:
link_container = soup.select("a[class*=download]")[-1]
return URL(link_container.get("href"))
link_str: str = link_container.get("href")
return self.parse_url(link_str)

"""~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""

@@ -225,7 +221,7 @@ def is_reinforced_link(url: URL) -> bool:

@staticmethod
def is_stream_redirect(url: URL) -> bool:
return any(part in url.host.split(".") for part in ("cdn12",))
return any(part in url.host.split(".") for part in ("cdn12",)) or url.host == "cdn.bunkr.ru"

@staticmethod
def is_cdn(url: URL) -> bool:
@@ -235,8 +231,8 @@ def is_cdn(url: URL) -> bool:
@staticmethod
def parse_datetime(date: str) -> int:
"""Parses a datetime string into a unix timestamp."""
date = datetime.datetime.strptime(date, "%H:%M:%S %d/%m/%Y")
return calendar.timegm(date.timetuple())
parsed_date = datetime.datetime.strptime(date, "%H:%M:%S %d/%m/%Y")
return calendar.timegm(parsed_date.timetuple())

@staticmethod
def override_cdn(link: URL) -> URL:
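For album card links the crawler now passes `scrape_item.url.with_path("/")` as the base, so root-relative hrefs stay on whatever Bunkr mirror the album was opened on rather than being forced onto the primary domain. A small illustration of that resolution (mirror hostname and ids are hypothetical):

```python
from yarl import URL

album_url = URL("https://bunkr.ph/a/abc123")   # page the href was scraped from
href = "/f/some-file"                          # root-relative card link

base = album_url.with_path("/")                # what the crawler hands to parse_url
print(base.joinpath(href[1:]))                 # https://bunkr.ph/f/some-file
```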
66 changes: 28 additions & 38 deletions cyberdrop_dl/scraper/crawlers/chevereto_crawler.py
@@ -21,7 +21,7 @@
from cyberdrop_dl.utils.data_enums_classes.url_objects import ScrapeItem

CDN_PATTERNS = {
"jpg.church": r"^(?:https?:\/\/?)((jpg.church\/images)|(simp..jpg.church)|(jpg.fish\/images)|(simp..jpg.fish)|(jpg.fishing\/images)|(simp..jpg.fishing)|(simp..host.church)|(simp..jpg..su))(\/.*)",
"jpg5.su": r"^(?:https?:\/\/?)((jpg.church\/images)|(simp..jpg.church)|(jpg.fish\/images)|(simp..jpg.fish)|(jpg.fishing\/images)|(simp..jpg.fishing)|(simp..host.church)|(simp..jpg..su))(\/.*)",
"imagepond.net": r"^(?:https?:\/\/)?(media.imagepond.net\/.*)",
"img.kiwi": r"^(?:https?:\/\/)?img\.kiwi\/images\/.*",
}
@@ -78,9 +78,10 @@ def __init__(self, manager: Manager, site: str) -> None:
self.album_img_selector = "a[class='image-container --media'] img"
self.profile_item_selector = "a[class='image-container --media']"
self.profile_title_selector = 'meta[property="og:title"]'
self.images_parts = "image", "img", "images"
self.images_parts = "image", "img"
self.album_parts = "a", "album"
self.video_parts = "video", "videos"
self.direct_link_parts = ("images",)

"""~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""

@@ -97,6 +98,9 @@ async def fetch(self, scrape_item: ScrapeItem) -> None:
await self.image(scrape_item)
elif any(part in scrape_item.url.parts for part in self.video_parts):
await self.video(scrape_item)
elif any(part in scrape_item.url.parts for part in self.direct_link_parts):
filename, ext = get_filename_and_ext(scrape_item.url.name)
await self.handle_file(scrape_item.url, scrape_item, filename, ext)
else:
await self.profile(scrape_item)

@@ -106,17 +110,15 @@ async def profile(self, scrape_item: ScrapeItem) -> None:
async with self.request_limiter:
soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)

title = self.create_title(soup.select_one(self.profile_title_selector).get("content"), None, None)
title = self.create_title(soup.select_one(self.profile_title_selector).get("content"))

async for soup in self.web_pager(scrape_item):
links = soup.select(self.profile_item_selector)
for link in links:
link = link.get("href")
if not link:
link_str: str = link.get("href")
if not link_str:
continue
if link.startswith("/"):
link = self.primary_base_domain / link[1:]
link = URL(link)
link = self.parse_url(link_str)
new_scrape_item = self.create_scrape_item(
scrape_item,
link,
@@ -137,9 +139,8 @@ async def album(self, scrape_item: ScrapeItem) -> None:
scrape_item.url = scrape_item.url.with_query(None)

async with self.request_limiter:
sub_albums_soup: BeautifulSoup = await self.client.get_soup(
self.domain, scrape_item.url / "sub", origin=scrape_item
)
sub_albums = scrape_item.url / "sub"
sub_albums_soup: BeautifulSoup = await self.client.get_soup(self.domain, sub_albums, origin=scrape_item)

scrape_item.url = canonical_url

@@ -160,29 +161,20 @@ async def album(self, scrape_item: ScrapeItem) -> None:
if "This content is password protected" in sub_albums_soup.text:
raise PasswordProtectedError(message="Wrong password" if password else None, origin=scrape_item)

title = self.create_title(
sub_albums_soup.select_one(self.album_title_selector).get_text(),
album_id,
None,
)
title = self.create_title(sub_albums_soup.select_one(self.album_title_selector).get_text(), album_id)

sub_albums = sub_albums_soup.select(self.profile_item_selector)
for album in sub_albums:
sub_album_link = album.get("href")
if sub_album_link.startswith("/"):
sub_album_link = self.primary_base_domain / sub_album_link[1:]

sub_album_link = URL(sub_album_link)
new_scrape_item = self.create_scrape_item(scrape_item, sub_album_link, "", True)
link_str: str = album.get("href")
link = self.parse_url(link_str)
new_scrape_item = self.create_scrape_item(scrape_item, link)
self.manager.task_group.create_task(self.run(new_scrape_item))

async for soup in self.web_pager(scrape_item):
links = soup.select(self.album_img_selector)
for link in links:
link = link.get("src")
if link.startswith("/"):
link = self.primary_base_domain / link[1:]
link = URL(link)
link_str: str = link.get("src")
link = self.parse_url(link_str)
new_scrape_item = self.create_scrape_item(
scrape_item,
link,
@@ -222,7 +214,8 @@ async def _proccess_media_item(self, scrape_item: ScrapeItem, url_type: UrlType,
scrape_item.url = canonical_url

try:
link = URL(soup.select_one(selector[0]).get(selector[1]))
link_str: str = soup.select_one(selector[0]).get(selector[1])
link = self.parse_url(link_str)
link = link.with_name(link.name.replace(".md.", ".").replace(".th.", "."))
except AttributeError:
raise ScrapeError(422, f"Couldn't find {url_type.value} source", origin=scrape_item) from None
@@ -268,7 +261,8 @@ def get_canonical_url(self, scrape_item: ScrapeItem, url_type: UrlType = UrlType
name = scrape_item.url.parts[name_index]
_id = name.rsplit(".")[-1]
new_parts = scrape_item.url.parts[1:name_index] + (_id,)
return _id, scrape_item.url.with_path("/".join(new_parts))
new_path = "/" + "/".join(new_parts)
return _id, self.parse_url(new_path, scrape_item.url.with_path("/"))

async def web_pager(self, scrape_item: ScrapeItem) -> AsyncGenerator[BeautifulSoup]:
"""Generator of website pages."""
@@ -278,14 +272,10 @@ async def web_pager(self, scrape_item: ScrapeItem) -> AsyncGenerator[BeautifulSo
soup: BeautifulSoup = await self.client.get_soup(self.domain, page_url, origin=scrape_item)
next_page = soup.select_one(self.next_page_selector)
yield soup
if next_page:
page_url = next_page.get("href")
if page_url:
if page_url.startswith("/"):
page_url = self.primary_base_domain / page_url[1:]
page_url = URL(page_url)
continue
break
if not next_page:
break
page_url_str: str = next_page.get("href")
page_url = self.parse_url(page_url_str)

@staticmethod
async def get_sort_by_new_url(url: URL) -> URL:
Expand All @@ -294,8 +284,8 @@ async def get_sort_by_new_url(url: URL) -> URL:
@staticmethod
def parse_datetime(date: str) -> int:
"""Parses a datetime string into a unix timestamp."""
date = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
return calendar.timegm(date.timetuple())
parsed_date = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
return calendar.timegm(parsed_date.timetuple())

@staticmethod
def check_direct_link(url: URL) -> bool:
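In `get_canonical_url` the rebuilt path is now resolved through `parse_url` against the page's own host instead of being passed to `with_path` directly. A rough sketch of that transformation; the viewer URL layout is illustrative:

```python
from yarl import URL

url = URL("https://jpg5.su/img/some-image.abc123")   # hypothetical image viewer URL
name_index = url.parts.index(url.name)
_id = url.name.rsplit(".")[-1]                       # trailing id becomes the canonical name
new_path = "/" + "/".join(url.parts[1:name_index] + (_id,))
print(url.with_path("/").joinpath(new_path[1:]))     # https://jpg5.su/img/abc123
```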
(The remaining changed files in this commit are not shown here.)