This repository has been archived by the owner on Jul 5, 2024. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1 parent 852342d · commit 7769595
Showing 5 changed files with 137 additions and 5 deletions.
@@ -1 +1 @@
-__version__ = "5.3.27"
+__version__ = "5.3.28"
@@ -0,0 +1,127 @@
from __future__ import annotations

import calendar
import datetime
from typing import TYPE_CHECKING

from aiolimiter import AsyncLimiter
from yarl import URL

from cyberdrop_dl.scraper.crawler import Crawler
from cyberdrop_dl.utils.dataclasses.url_objects import ScrapeItem
from cyberdrop_dl.utils.utilities import get_filename_and_ext, error_handling_wrapper

if TYPE_CHECKING:
    from cyberdrop_dl.managers.manager import Manager


class Rule34VaultCrawler(Crawler):
    def __init__(self, manager: Manager):
        super().__init__(manager, "rule34vault", "Rule34Vault")
        self.primary_base_url = URL("https://rule34vault.com")
        self.request_limiter = AsyncLimiter(10, 1)

    """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""

    async def fetch(self, scrape_item: ScrapeItem) -> None:
        """Determines where to send the scrape item based on the url"""
        task_id = await self.scraping_progress.add_task(scrape_item.url)

        if "post" in scrape_item.url.parts:
            await self.file(scrape_item)
        elif "playlists" in scrape_item.url.parts:
            await self.playlist(scrape_item)
        else:
            await self.tag(scrape_item)

        await self.scraping_progress.remove_task(task_id)

    @error_handling_wrapper
    async def tag(self, scrape_item: ScrapeItem) -> None:
        """Scrapes an album"""
        async with self.request_limiter:
            soup = await self.client.get_BS4(self.domain, scrape_item.url)

        title = await self.create_title(scrape_item.url.parts[1], None, None)

        content_block = soup.select_one('div[class="grid ng-star-inserted"]')
        content = content_block.select('a[class="box ng-star-inserted"]')
        for file_page in content:
            link = file_page.get('href')
            if link.startswith("/"):
                link = f"{self.primary_base_url}{link}"
            link = URL(link)
            new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True)
            self.manager.task_group.create_task(self.run(new_scrape_item))
        if not content:
            return

        if len(scrape_item.url.parts) > 2:
            page = int(scrape_item.url.parts[-1])
            next_page = scrape_item.url.with_path(f"/{scrape_item.url.parts[1]}/page/{page + 1}")
        else:
            next_page = scrape_item.url.with_path(f"/{scrape_item.url.parts[1]}/page/2")
        new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "")
        self.manager.task_group.create_task(self.run(new_scrape_item))

    @error_handling_wrapper
    async def playlist(self, scrape_item: ScrapeItem) -> None:
        """Scrapes a playlist"""
        async with self.request_limiter:
            soup = await self.client.get_BS4(self.domain, scrape_item.url)

        title_str = soup.select_one('div[class*=title]').text
        title = await self.create_title(title_str, scrape_item.url.parts[-1], None)

        content_block = soup.select_one('div[class="grid ng-star-inserted"]')
        content = content_block.select('a[class="box ng-star-inserted"]')
        for file_page in content:
            link = file_page.get('href')
            if link.startswith("/"):
                link = f"{self.primary_base_url}{link}"
            link = URL(link)
            new_scrape_item = await self.create_scrape_item(scrape_item, link, title, True)
            self.manager.task_group.create_task(self.run(new_scrape_item))
        if not content:
            return

        if scrape_item.url.query:
            page = scrape_item.url.query.get("page")
            next_page = scrape_item.url.with_query({"page": int(page) + 1})
        else:
            next_page = scrape_item.url.with_query({"page": 2})
        new_scrape_item = await self.create_scrape_item(scrape_item, next_page, "")
        self.manager.task_group.create_task(self.run(new_scrape_item))

    @error_handling_wrapper
    async def file(self, scrape_item: ScrapeItem) -> None:
        """Scrapes an image"""
        async with self.request_limiter:
            soup = await self.client.get_BS4(self.domain, scrape_item.url)

        date = await self.parse_datetime(soup.select_one('div[class="text-primary ng-star-inserted"]').text.split("(")[1].split(")")[0])
        scrape_item.date = date

        image = soup.select_one('img[class*="img ng-star-inserted"]')
        if image:
            link = image.get('src')
            if link.startswith("/"):
                link = f"{self.primary_base_url}{link}"
            link = URL(link)
            filename, ext = await get_filename_and_ext(link.name)
            await self.handle_file(link, scrape_item, filename, ext)
        video = soup.select_one("video source")
        if video:
            link = video.get('src')
            if link.startswith("/"):
                link = f"{self.primary_base_url}{link}"
            link = URL(link)
            filename, ext = await get_filename_and_ext(link.name)
            await self.handle_file(link, scrape_item, filename, ext)

    """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""

    async def parse_datetime(self, date: str) -> int:
        """Parses a datetime string into a unix timestamp"""
        date = datetime.datetime.strptime(date, "%b %d, %Y, %I:%M:%S %p")
        return calendar.timegm(date.timetuple())
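For reference, the two helpers above can be exercised in isolation. The first snippet below is a minimal sketch of the conversion parse_datetime performs; the date string is a hypothetical example that matches the "%b %d, %Y, %I:%M:%S %p" format the crawler expects. The second sketches the yarl URL manipulation used for tag and playlist pagination; the example URLs are likewise hypothetical and not taken from the commit.

    # Sketch of the parse_datetime conversion (example date string is hypothetical).
    import calendar
    import datetime

    raw = "Jan 5, 2024, 3:42:10 PM"  # hypothetical value scraped from a post page
    parsed = datetime.datetime.strptime(raw, "%b %d, %Y, %I:%M:%S %p")
    print(calendar.timegm(parsed.timetuple()))  # unix timestamp, interpreted as UTC

    # Sketch of the pagination URL building (example URLs are hypothetical).
    from yarl import URL

    tag_url = URL("https://rule34vault.com/sometag")
    print(tag_url.with_path(f"/{tag_url.parts[1]}/page/2"))  # .../sometag/page/2
    playlist_url = URL("https://rule34vault.com/playlists/view/1234?page=1")
    print(playlist_url.with_query({"page": 2}))  # .../playlists/view/1234?page=2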
@@ -1,6 +1,6 @@
[tool.poetry]
name = "cyberdrop-dl"
-version = "5.3.27"
+version = "5.3.28"
description = "Bulk downloader for multiple file hosts"
authors = ["Jules Winnfield <[email protected]>"]
readme = "README.md"