Skip to content

Commit

Permalink
Merge branch 'master' into bunkr-albums.io
Browse files Browse the repository at this point in the history
  • Loading branch information
NTFSvolume authored Jan 22, 2025
2 parents b4d9ec6 + 9cb6be9 commit d4161c1
Show file tree
Hide file tree
Showing 31 changed files with 504 additions and 499 deletions.
24 changes: 20 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,33 @@ All notable changes to this project will be documented here. For more details, v
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## UNRELEASED
## [6.3.0] - 2025-01-24


### Added

- members.luscious.net support
- Coomer search results support
- Members.luscious.net support
- Coomer and Kemono search results support
- SendVid.com support
- Support for forum direct link URLs (attachments) as input URLs
- `--logs-expire-after` option: https://script-ware.gitbook.io/cyberdrop-dl/reference/configuration-options/settings/logs#logs_expire_after
- `--filename-regex-filter` option: https://script-ware.gitbook.io/cyberdrop-dl/reference/configuration-options/settings/ignore_options#filename_regex_filter

### Changed

- `--rotate-logs` now creates a subfolder by date

### Fixed

- bunkr reinforced link handling
- Bunkr reinforced link handling
- Handle encoded URLs found while scraping (every crawler)
- Imgur crawler
- MediaFire crawler
- JPG5 rate limit

### Deprecated

- Forums authentication settings (username, password and `xf_cookie`) will be removed in a future version

## [6.2.0] - 2025-01-10

Expand Down
6 changes: 3 additions & 3 deletions cyberdrop_dl/clients/hash_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def __init__(self, manager: Manager) -> None:
self.md5 = "md5"
self.sha256 = "sha256"
self.hashed_media_items: set[MediaItem] = set()
self.hashes_dict: defaultdict[defaultdict[set[Path]]] = defaultdict(lambda: defaultdict(set))
self.hashes_dict: defaultdict[str, defaultdict[str, set[Path]]] = defaultdict(lambda: defaultdict(set))

async def startup(self) -> None:
pass
Expand Down Expand Up @@ -168,9 +168,9 @@ async def get_file_hashes_dict(self) -> dict:
downloads = self.manager.path_manager.completed_downloads - self.hashed_media_items
for media_item in downloads:
if not media_item.complete_file.is_file():
return
continue
try:
self.hash_item(media_item)
await self.hash_item(media_item)
except Exception as e:
msg = f"Unable to hash file = {media_item.complete_file.resolve()}: {e}"
log(msg, 40)
Expand Down
1 change: 1 addition & 0 deletions cyberdrop_dl/config_definitions/config_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ class IgnoreOptions(BaseModel):
ignore_coomer_ads: bool = False
skip_hosts: list[NonEmptyStr] = []
only_hosts: list[NonEmptyStr] = []
filename_regex_filter: NonEmptyStr | None = None

@field_validator("skip_hosts", "only_hosts", mode="before")
@classmethod
Expand Down
7 changes: 6 additions & 1 deletion cyberdrop_dl/config_definitions/global_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class General(BaseModel):
flaresolverr: HttpURL | None = None
max_file_name_length: PositiveInt = 95
max_folder_name_length: PositiveInt = 60
required_free_space: ByteSize = Field(DEFAULT_REQUIRED_FREE_SPACE, ge=MIN_REQUIRED_FREE_SPACE)
required_free_space: ByteSize = DEFAULT_REQUIRED_FREE_SPACE

@field_serializer("required_free_space")
def human_readable(self, value: ByteSize | int) -> str:
Expand All @@ -43,6 +43,11 @@ def serialize(self, value: URL | str) -> str | None:
def convert_to_str(cls, value: URL | str) -> str | None:
return convert_to_str(value)

@field_validator("required_free_space", mode="after")
@classmethod
def override_min(cls, value: ByteSize) -> ByteSize:
    """Raise a too-small ``required_free_space`` up to ``MIN_REQUIRED_FREE_SPACE``.

    Values that are already at or above the minimum are returned unchanged,
    so a low setting is silently corrected instead of being rejected.
    """
    if MIN_REQUIRED_FREE_SPACE > value:
        return MIN_REQUIRED_FREE_SPACE
    return value


class RateLimitingOptions(BaseModel):
connection_timeout: PositiveInt = 15
Expand Down
2 changes: 1 addition & 1 deletion cyberdrop_dl/downloader/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def attempt_task_removal(self, media_item: MediaItem) -> None:
"""Attempts to remove the task from the progress bar."""
if media_item.task_id is not None:
with contextlib.suppress(ValueError):
self.manager.progress_manager.file_progress.remove_file(media_item.task_id)
self.manager.progress_manager.file_progress.remove_task(media_item.task_id)
media_item.task_id = None

"""~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""
Expand Down
4 changes: 2 additions & 2 deletions cyberdrop_dl/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
STARTUP_LOGGER_CONSOLE = None


def startup() -> Manager:
def startup() -> Manager | None:
"""Starts the program and returns the manager.
This will also run the UI for the program
Expand Down Expand Up @@ -188,7 +188,7 @@ def setup_logger(manager: Manager, config_name: str) -> None:
logger.addHandler(rich_handler)


def ui_error_handling_wrapper(func: Callable) -> None:
def ui_error_handling_wrapper(func: Callable) -> Callable:
"""Wrapper handles errors from the main UI."""

@wraps(func)
Expand Down
8 changes: 5 additions & 3 deletions cyberdrop_dl/managers/client_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,10 +163,12 @@ async def check_http_status(
if any(domain in response.url.host for domain in ("gofile", "imgur")):
with contextlib.suppress(ContentTypeError):
JSON_Resp: dict = await response.json()
if "status" in JSON_Resp and "notFound" in JSON_Resp["status"]:
status = JSON_Resp.get("status")
if status and isinstance(status, str) and "notFound" in status:
raise ScrapeError(404, origin=origin)
if "data" in JSON_Resp and "error" in JSON_Resp["data"]:
raise ScrapeError(JSON_Resp["status"], JSON_Resp["data"]["error"], origin=origin)
data = JSON_Resp.get("data")
if data and isinstance(data, dict) and "error" in data:
raise ScrapeError(status, data["error"], origin=origin)

response_text = None
with contextlib.suppress(UnicodeDecodeError):
Expand Down
10 changes: 10 additions & 0 deletions cyberdrop_dl/managers/config_manager.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations

import contextlib
import os
import re
import shutil
from dataclasses import field
from time import sleep
Expand Down Expand Up @@ -34,6 +36,7 @@ def __init__(self, manager: Manager) -> None:
self.authentication_data: AuthSettings = field(init=False)
self.settings_data: ConfigSettings = field(init=False)
self.global_settings_data: GlobalSettings = field(init=False)
self.valid_filename_filter_regex = False

def startup(self) -> None:
"""Startup process for the config manager."""
Expand Down Expand Up @@ -66,6 +69,13 @@ def load_configs(self) -> None:
self._set_apprise_fixed()
self._set_pydantic_config()

def post_config_load_validation(self) -> None:
    """Record whether the configured filename regex filter is a usable pattern.

    Sets ``self.valid_filename_filter_regex`` to ``True`` only when
    ``settings_data.ignore_options.filename_regex_filter`` is set and
    compiles as a regular expression; in every other case the flag is
    ``False``. An invalid pattern is not raised to the caller.
    """
    # Reset first so the result of an earlier validation cannot survive a
    # later call where the filter was removed or became invalid.
    self.valid_filename_filter_regex = False

    pattern = self.settings_data.ignore_options.filename_regex_filter
    if not pattern:
        return

    try:
        re.compile(pattern)
    except re.error:
        # Leave the flag False; the pattern cannot be used for filtering.
        return

    self.valid_filename_filter_regex = True

@staticmethod
def get_model_fields(model: type[BaseModel], *, exclude_unset: bool = True) -> set[str]:
fields = set()
Expand Down
6 changes: 3 additions & 3 deletions cyberdrop_dl/managers/live_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
if TYPE_CHECKING:
from collections.abc import Generator

from rich.layout import Layout
from rich.console import RenderableType

from cyberdrop_dl.managers.manager import Manager

Expand All @@ -35,7 +35,7 @@ def __init__(self, manager: Manager) -> None:
self.placeholder.add_task("running with no UI", total=100, completed=0)

@contextmanager
def get_live(self, layout: Layout, stop: bool = False) -> Generator[Live]:
def get_live(self, layout: RenderableType, stop: bool = False) -> Generator[Live]:
show = self.placeholder if self.no_ui else layout
try:
self.live.start()
Expand All @@ -48,7 +48,7 @@ def get_live(self, layout: Layout, stop: bool = False) -> Generator[Live]:
@contextmanager
def get_main_live(self, stop: bool = False) -> Generator[Live]:
"""Main UI startup and context manager."""
layout = self.manager.progress_manager.layout
layout = self.manager.progress_manager.main_runtime_layout
with self.get_live(layout, stop=stop) as live:
yield live

Expand Down
7 changes: 7 additions & 0 deletions cyberdrop_dl/managers/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def adjust_for_simpcity(self) -> None:

async def async_startup(self) -> None:
"""Async startup process for the manager."""
self.config_manager.post_config_load_validation()
self.args_logging()

if not isinstance(self.client_manager, ClientManager):
Expand Down Expand Up @@ -208,6 +209,12 @@ def args_logging(self) -> None:
log(f"Using Settings: \n{config_settings}", 10)
log(f"Using Global Settings: \n{global_settings}", 10)

if (
self.config_manager.settings_data.ignore_options.filename_regex_filter
and not self.config_manager.valid_filename_filter_regex
):
log("Regex pattern of filename filter is invalid. Regex check has been disabled", 40)

async def close(self) -> None:
"""Closes the manager."""
await self.db_manager.close()
Expand Down
27 changes: 13 additions & 14 deletions cyberdrop_dl/managers/progress_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from cyberdrop_dl.utils.logger import log, log_spacer, log_with_color

if TYPE_CHECKING:
from rich.console import RenderableType

from cyberdrop_dl.managers.manager import Manager


Expand Down Expand Up @@ -44,34 +46,31 @@ def __init__(self, manager: Manager) -> None:

self.ui_refresh_rate = manager.config_manager.global_settings_data.ui_options.refresh_rate

self.layout: Layout = field(init=False)
self.hash_remove_layout: Layout = field(init=False)
self.hash_layout: Layout = field(init=False)
self.sort_layout: Layout = field(init=False)
self.main_runtime_layout: Layout = field(init=False)
self.hash_remove_layout: RenderableType = field(init=False)
self.hash_layout: RenderableType = field(init=False)
self.sort_layout: RenderableType = field(init=False)

def startup(self) -> None:
"""Startup process for the progress manager."""
progress_layout = Layout()
progress_layout.split_column(
Layout(name="upper", ratio=2, minimum_size=8),
Layout(renderable=self.scraping_progress.get_progress(), name="Scraping", ratio=2),
Layout(renderable=self.file_progress.get_progress(), name="Downloads", ratio=2),
Layout(renderable=self.scraping_progress.get_renderable(), name="Scraping", ratio=2),
Layout(renderable=self.file_progress.get_renderable(), name="Downloads", ratio=2),
)
progress_layout["upper"].split_row(
Layout(renderable=self.download_progress.get_progress(), name="Files", ratio=1),
Layout(renderable=self.scrape_stats_progress.get_progress(), name="Scrape Failures", ratio=1),
Layout(renderable=self.download_stats_progress.get_progress(), name="Download Failures", ratio=1),
)

hash_remove_layout = Layout()
hash_remove_layout = self.hash_progress.get_removed_progress()

self.layout = progress_layout
self.hash_remove_layout = hash_remove_layout
self.hash_layout = self.hash_progress.get_hash_progress()
self.sort_layout = self.sort_progress.get_progress()
self.main_runtime_layout = progress_layout
self.hash_remove_layout = self.hash_progress.get_removed_progress()
self.hash_layout = self.hash_progress.get_renderable()
self.sort_layout = self.sort_progress.get_renderable()

def print_stats(self, start_time: timedelta | float) -> None:
def print_stats(self, start_time: float) -> None:
"""Prints the stats of the program."""
end_time = time.perf_counter()
runtime = timedelta(seconds=int(end_time - start_time))
Expand Down
1 change: 1 addition & 0 deletions cyberdrop_dl/scraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
from cyberdrop_dl.scraper.crawlers.rule34xyz_crawler import Rule34XYZCrawler
from cyberdrop_dl.scraper.crawlers.saint_crawler import SaintCrawler
from cyberdrop_dl.scraper.crawlers.scrolller_crawler import ScrolllerCrawler
from cyberdrop_dl.scraper.crawlers.sendvid_crawler import SendVidCrawler
from cyberdrop_dl.scraper.crawlers.simpcity_crawler import SimpCityCrawler
from cyberdrop_dl.scraper.crawlers.socialmediagirls_crawler import SocialMediaGirlsCrawler
from cyberdrop_dl.scraper.crawlers.titsintops_crawler import TitsInTopsCrawler
Expand Down
55 changes: 28 additions & 27 deletions cyberdrop_dl/scraper/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import asyncio
import copy
import re
from abc import ABC, abstractmethod
from dataclasses import field
from datetime import datetime
Expand Down Expand Up @@ -138,27 +139,32 @@ async def handle_file(
"""~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""

async def check_skip_by_config(self, media_item: MediaItem) -> bool:
skip = False
settings = self.manager.config_manager.settings_data
if (
settings.download_options.skip_referer_seen_before
and await self.manager.db_manager.temp_referer_table.check_referer(media_item.referer)
):
log(f"Download skip {media_item.url} as referer has been seen before", 10)
return True

if self.manager.config_manager.settings_data.download_options.skip_referer_seen_before:
skip = await self.manager.db_manager.temp_referer_table.check_referer(media_item.referer)
skip_hosts = settings.ignore_options.skip_hosts
if skip_hosts and any(host in media_item.url.host for host in skip_hosts):
log(f"Download skip {media_item.url} due to skip_hosts config", 10)
return True

if skip:
log(f"Download skip {media_item.url} as referer has been seen before", 10)
only_hosts = settings.ignore_options.only_hosts
if only_hosts and not any(host in media_item.url.host for host in only_hosts):
log(f"Download skip {media_item.url} due to only_hosts config", 10)
return True

if not skip and self.manager.config_manager.settings_data.ignore_options.skip_hosts:
skip_hosts = self.manager.config_manager.settings_data.ignore_options.skip_hosts
if any(host in media_item.url.host for host in skip_hosts):
log(f"Download skip {media_item.url} due to skip_hosts config", 10)
skip = True
valid_regex_filter = self.manager.config_manager.valid_filename_filter_regex
regex_filter = self.manager.config_manager.settings_data.ignore_options.filename_regex_filter

if not skip and self.manager.config_manager.settings_data.ignore_options.only_hosts:
only_hosts = self.manager.config_manager.settings_data.ignore_options.only_hosts
if not any(host in media_item.url.host for host in only_hosts):
log(f"Download skip {media_item.url} due to only_hosts config", 10)
skip = True
if valid_regex_filter and re.search(regex_filter, media_item.filename):
log(f"Download skip {media_item.url} due to filename regex filter config", 10)
return True

return skip
return False

def check_post_number(self, post_number: int, current_post_number: int) -> tuple[bool, bool]:
"""Checks if the program should scrape the current post."""
Expand Down Expand Up @@ -324,17 +330,12 @@ def parse_url(self, link_str: str, relative_to: URL | None = None) -> URL:
assert isinstance(link_str, str)
encoded = "%" in link_str
base = relative_to or self.primary_base_domain
if link_str.startswith("?"):
link = base.with_query(link_str[1:])
elif link_str.startswith("/?"):
link = base.with_query(link_str[2:])
elif link_str.startswith("//"):
link = URL("https:" + link_str, encoded=encoded)
elif link_str.startswith("/"):
link = base.joinpath(link_str[1:], encoded=encoded)
else:
link = URL(link_str, encoded=encoded)
return link
new_url = URL(link_str, encoded=encoded)
if not new_url.absolute:
new_url = base.join(new_url)
if not new_url.scheme:
new_url = new_url.with_scheme(base.scheme or "https")
return new_url


def create_task_id(func: Callable) -> Callable:
Expand Down
2 changes: 1 addition & 1 deletion cyberdrop_dl/scraper/crawlers/bunkrr_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def is_reinforced_link(url: URL) -> bool:

@staticmethod
def is_stream_redirect(url: URL) -> bool:
return any(part in url.host.split(".") for part in ("cdn12",)) or url.host == "cdn.bunkr.ru"
return any(part in url.host for part in ("cdn12", "cdn-")) or url.host == "cdn.bunkr.ru"

@staticmethod
def is_cdn(url: URL) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion cyberdrop_dl/scraper/crawlers/chevereto_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class CheveretoCrawler(Crawler):
def __init__(self, manager: Manager, site: str) -> None:
super().__init__(manager, site, self.FOLDER_DOMAINS.get(site, "Chevereto"))
self.primary_base_domain = self.PRIMARY_BASE_DOMAINS.get(site, URL(f"https://{site}"))
self.request_limiter = AsyncLimiter(2, 1)
self.request_limiter = AsyncLimiter(1, 1)
self.next_page_selector = "a[data-pagination=next]"
self.album_title_selector = "a[data-text=album-name]"
self.album_img_selector = "a[class='image-container --media'] img"
Expand Down
Loading

0 comments on commit d4161c1

Please sign in to comment.