
Commit

Merge branch 'master' into pydantic
NTFSvolume committed Nov 28, 2024
2 parents d638751 + 3ff244e commit 4ad95a1
Showing 8 changed files with 126 additions and 102 deletions.
4 changes: 2 additions & 2 deletions cyberdrop_dl/clients/scraper_client.py
@@ -101,7 +101,7 @@ async def get_soup(
origin,
)
# retry request with flaresolverr cookies
if self.client_manager.check_ddos_guard(soup):
if self.client_manager.check_ddos_guard(soup) or self.client_manager.check_cloudflare(soup):
if not retry:
raise DDOSGuardError(message="Unable to access website with flaresolverr cookies") from None
return await self.get_soup(domain, url, client_session, origin, with_response_url, retry=False)
@@ -174,7 +174,7 @@ async def get_text(
await self.client_manager.check_http_status(response, origin=origin)
except DDOSGuardError:
soup, _ = await self.client_manager.flaresolverr.get(url, client_session, origin)
if self.client_manager.check_ddos_guard(soup):
if self.client_manager.check_ddos_guard(soup) or self.client_manager.check_cloudflare(soup):
if not retry:
raise DDOSGuardError(message="Unable to access website with flaresolverr cookies") from None
return await self.get_text(domain, url, client_session, origin, retry=False)
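
Note on the two hunks above: both extend the same guard, so that after a plain request or a FlareSolverr retry the parsed page is checked for a Cloudflare challenge as well as DDoS-Guard, with exactly one retry before giving up. A minimal standalone sketch of that retry-once pattern, with the fetch and challenge-check callables left abstract (the names below are illustrative, not from the codebase):

    from typing import Awaitable, Callable

    from bs4 import BeautifulSoup

    async def fetch_with_challenge_retry(
        fetch: Callable[[], Awaitable[BeautifulSoup]],
        is_challenge: Callable[[BeautifulSoup], bool],
        retry: bool = True,
    ) -> BeautifulSoup:
        # Parse the page, re-check for a DDoS-Guard/Cloudflare challenge,
        # and allow exactly one retry before raising.
        soup = await fetch()
        if is_challenge(soup):
            if not retry:
                raise RuntimeError("Unable to access website with flaresolverr cookies")
            return await fetch_with_challenge_retry(fetch, is_challenge, retry=False)
        return soup
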
36 changes: 27 additions & 9 deletions cyberdrop_dl/managers/client_manager.py
@@ -41,6 +41,9 @@
".lds-ring",
]

CLOUDFLARE_CHALLENGE_TITLES = ["Simpcity Cuck Detection"]
CLOUDFLARE_CHALLENGE_SELECTORS = ["captchawrapper", "cf-turnstile"]


class ClientManager:
"""Creates a 'client' that can be referenced by scraping or download sessions."""
@@ -122,6 +125,15 @@ async def check_http_status(
message = DOWNLOAD_ERROR_ETAGS.get(headers.get("ETag"))
raise DownloadError(HTTPStatus.NOT_FOUND, message=message, origin=origin)

response_text = None
with contextlib.suppress(UnicodeDecodeError):
response_text = await response.text()

if response_text:
soup = BeautifulSoup(response_text, "html.parser")
if cls.check_ddos_guard(soup) or cls.check_cloudflare(soup):
raise DDOSGuardError(origin=origin)

if HTTPStatus.OK <= status < HTTPStatus.BAD_REQUEST:
return

@@ -133,15 +145,6 @@ async def check_http_status(
if "data" in JSON_Resp and "error" in JSON_Resp["data"]:
raise ScrapeError(JSON_Resp["status"], JSON_Resp["data"]["error"], origin=origin)

response_text = None
with contextlib.suppress(UnicodeDecodeError):
response_text = await response.text()

if response_text:
soup = BeautifulSoup(response_text, "html.parser")
if cls.check_ddos_guard(soup):
raise DDOSGuardError(origin=origin)

status = status if headers.get("Content-Type") else CustomHTTPStatus.IM_A_TEAPOT
message = "No content-type in response header" if headers.get("Content-Type") else None

@@ -170,6 +173,21 @@ def check_ddos_guard(soup: BeautifulSoup) -> bool:

return False

@staticmethod
def check_cloudflare(soup: BeautifulSoup) -> bool:
if soup.title:
for title in CLOUDFLARE_CHALLENGE_TITLES:
challenge_found = title.casefold() == soup.title.string.casefold()
if challenge_found:
return True

for selector in CLOUDFLARE_CHALLENGE_SELECTORS:
challenge_found = soup.find(selector)
if challenge_found:
return True

return False

async def close(self) -> None:
await self.flaresolverr._destroy_session()

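
The new check_cloudflare mirrors check_ddos_guard: it compares the page <title> against known challenge titles and then looks for challenge markers in the markup. Note that check_http_status now runs this sniff before the early return on 2xx/3xx statuses, so challenge pages served with a success status are caught as well. A standalone sketch of how the added method behaves, using made-up HTML and only bs4 (the logic is copied from the hunk above so it runs on its own):

    from bs4 import BeautifulSoup

    CLOUDFLARE_CHALLENGE_TITLES = ["Simpcity Cuck Detection"]
    CLOUDFLARE_CHALLENGE_SELECTORS = ["captchawrapper", "cf-turnstile"]

    def check_cloudflare(soup: BeautifulSoup) -> bool:
        # Title match first, then challenge elements anywhere in the document.
        if soup.title:
            for title in CLOUDFLARE_CHALLENGE_TITLES:
                if title.casefold() == soup.title.string.casefold():
                    return True
        for selector in CLOUDFLARE_CHALLENGE_SELECTORS:
            if soup.find(selector):
                return True
        return False

    challenge_page = "<html><head><title>Simpcity Cuck Detection</title></head><body></body></html>"
    regular_page = "<html><head><title>Some album</title></head><body><p>ok</p></body></html>"
    assert check_cloudflare(BeautifulSoup(challenge_page, "html.parser")) is True
    assert check_cloudflare(BeautifulSoup(regular_page, "html.parser")) is False
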
2 changes: 1 addition & 1 deletion cyberdrop_dl/scraper/crawler.py
@@ -95,7 +95,7 @@ async def handle_file(
"""Finishes handling the file and hands it off to the downloader."""
if custom_filename:
original_filename, filename = filename, custom_filename
elif self.domain in ["cyberdrop", "bunkrr"]:
elif self.domain in ["cyberdrop"]:
original_filename, filename = remove_file_id(self.manager, filename, ext)
else:
original_filename = filename
78 changes: 58 additions & 20 deletions cyberdrop_dl/scraper/crawlers/bunkrr_crawler.py
@@ -20,9 +20,29 @@

from cyberdrop_dl.managers.manager import Manager

CDN_POSSIBILITIES = re.compile(
r"^(?:(?:(?:media-files|cdn|c|pizza|cdn-burger|cdn-nugget|burger|taquito|pizza|fries|meatballs|milkshake|kebab|nachos|ramen|wiener)[0-9]{0,2})|(?:(?:big-taco-|cdn-pizza|cdn-meatballs|cdn-milkshake|i.kebab|i.fries|i-nugget|i-milkshake|i-nachos|i-ramen|i-wiener)[0-9]{0,2}(?:redir)?))\.bunkr?\.[a-z]{2,3}$",
)
BASE_CDNS = [
"big-taco",
"burger",
"c",
"cdn",
"fries",
"kebab",
"meatballs",
"milkshake",
"nachos",
"nugget",
"pizza",
"ramen",
"soup",
"taquito",
"wiener",
]

EXTENDED_CDNS = [f"cdn-{cdn}" for cdn in BASE_CDNS]
IMAGE_CDNS = [f"i-{cdn}" for cdn in BASE_CDNS]
CDNS = BASE_CDNS + EXTENDED_CDNS + IMAGE_CDNS
CDN_REGEX_STR = r"^(?:(?:(" + "|".join(CDNS) + r")[0-9]{0,2}(?:redir)?))\.bunkr?\.[a-z]{2,3}$"
CDN_POSSIBILITIES = re.compile(CDN_REGEX_STR)


class BunkrrCrawler(Crawler):
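
The hand-maintained CDN regex is replaced by one generated from BASE_CDNS plus its cdn- and i- prefixed variants. A standalone sketch of the generated pattern and the kind of hostnames it accepts (the hostnames below are made up for illustration):

    import re

    BASE_CDNS = [
        "big-taco", "burger", "c", "cdn", "fries", "kebab", "meatballs", "milkshake",
        "nachos", "nugget", "pizza", "ramen", "soup", "taquito", "wiener",
    ]
    EXTENDED_CDNS = [f"cdn-{cdn}" for cdn in BASE_CDNS]  # e.g. "cdn-pizza"
    IMAGE_CDNS = [f"i-{cdn}" for cdn in BASE_CDNS]       # e.g. "i-kebab"
    CDNS = BASE_CDNS + EXTENDED_CDNS + IMAGE_CDNS
    CDN_REGEX_STR = r"^(?:(?:(" + "|".join(CDNS) + r")[0-9]{0,2}(?:redir)?))\.bunkr?\.[a-z]{2,3}$"
    CDN_POSSIBILITIES = re.compile(CDN_REGEX_STR)

    # An optional two-digit suffix and an optional "redir" suffix are both allowed.
    for host in ("milkshake.bunkr.ru", "cdn-pizza12.bunkr.su", "i-fries2redir.bunkr.su"):
        assert CDN_POSSIBILITIES.match(host)
    assert not CDN_POSSIBILITIES.match("stream.bunkr.su")
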
@@ -38,16 +58,16 @@ def __init__(self, manager: Manager, site: str) -> None:
async def fetch(self, scrape_item: ScrapeItem) -> None:
"""Determines where to send the scrape item based on the url."""
task_id = self.scraping_progress.add_task(scrape_item.url)
scrape_item.url = self.get_stream_link(scrape_item.url)

if scrape_item.url.host.startswith("get"):
scrape_item.url = await self.reinforced_link(scrape_item.url)
if not scrape_item.url:
return
scrape_item.url = self.get_stream_link(scrape_item.url)

if "a" in scrape_item.url.parts:
await self.album(scrape_item)
elif self.is_cdn(scrape_item.url):
await self.handle_direct_link(scrape_item)
else:
await self.file(scrape_item)

@@ -113,9 +133,9 @@ async def album(self, scrape_item: ScrapeItem) -> None:
msg = "No image found, reverting to parent"
raise FileNotFoundError(msg)

filename, ext = get_filename_and_ext(src.name)
src_filename, ext = get_filename_and_ext(src.name)
if not self.check_album_results(src, results):
await self.handle_file(src, new_scrape_item, filename, ext)
await self.handle_file(src, new_scrape_item, src_filename, ext, custom_filename=filename)

except FileNotFoundError:
self.manager.task_group.create_task(self.run(new_scrape_item))
@@ -134,6 +154,11 @@ async def file(self, scrape_item: ScrapeItem) -> None:
async with self.request_limiter:
soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)

"""
Some old page details may have the uuid as the title instead of the filename.
Commenting out this code ensures we always get the actual filename from `get.bunkr.su`, at the expense of one additional request
# try video
link_container = soup.select_one("video > source")
src_selector = "src"
@@ -144,28 +169,39 @@
# fallback for everything else
if not link_container:
link_container = soup.select_one("a.btn.ic-download-01")
src_selector = "href"
"""
link_container = soup.select_one("a.btn.ic-download-01")
src_selector = "href"

link = link_container.get(src_selector) if link_container else None

if not link:
raise ScrapeError(404, f"Could not find source for: {scrape_item.url}", origin=scrape_item)

link = URL(link)
await self.handle_direct_link(scrape_item, link, fallback_filename=soup.select_one("h1").text)

async def handle_direct_link(
self, scrape_item: ScrapeItem, url: URL | None = None, fallback_filename: str | None = None
) -> None:
"""Handles direct links (CDNs URLs) before sending them to the downloader.
If `link` is not supplied, `scrape_item.url` will be used by default
`fallback_filename` will only be used if the link has not `n` query parameter"""
link = url or scrape_item.url
if "get" in link.host:
link: URL = await self.reinforced_link(link)
if not link:
return
try:
filename, ext = get_filename_and_ext(link.name)
src_filename, ext = get_filename_and_ext(link.name)
except NoExtensionError:
if "get" in link.host:
link = await self.reinforced_link(link)
if not link:
return
filename, ext = get_filename_and_ext(link.name)
else:
filename, ext = get_filename_and_ext(scrape_item.url.name)

await self.handle_file(link, scrape_item, filename, ext)
src_filename, ext = get_filename_and_ext(scrape_item.url.name)
filename = link.query.get("n") or fallback_filename
if not url:
scrape_item = self.create_scrape_item(scrape_item, URL("https://get.bunkrr.su/"), "")
await self.handle_file(link, scrape_item, src_filename, ext, custom_filename=filename)

@error_handling_wrapper
async def reinforced_link(self, url: URL) -> URL:
@@ -188,7 +224,9 @@ def is_cdn(url: URL) -> bool:
return bool(re.match(CDN_POSSIBILITIES, url.host))

def get_stream_link(self, url: URL) -> URL:
"""Gets the stream link for a given url."""
"""DEPRECATED: NO LONGER WORKS.
Gets the stream link for a given url."""
if not self.is_cdn(url):
return url

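
Worth noting from the hunks above: handle_direct_link now keeps the name taken from the CDN path as src_filename and passes a separate display name to handle_file as custom_filename, preferring the `n` query parameter and falling back to the page's <h1> text. A small sketch of that split using yarl (the URL below is made up):

    from yarl import URL

    link = URL("https://milkshake.bunkr.ru/abc123-deadbeef.mp4?n=My%20Holiday%20Video.mp4")
    src_filename = link.name                # "abc123-deadbeef.mp4", the CDN-side name
    fallback_filename = "My Holiday Video"  # would come from soup.select_one("h1").text
    filename = link.query.get("n") or fallback_filename  # "My Holiday Video.mp4"
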
45 changes: 15 additions & 30 deletions cyberdrop_dl/scraper/crawlers/coomer_crawler.py
@@ -14,8 +14,6 @@
from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_filename_and_ext

if TYPE_CHECKING:
from bs4 import BeautifulSoup

from cyberdrop_dl.managers.manager import Manager


@@ -78,8 +76,7 @@ async def favorites(self, scrape_item: ScrapeItem) -> None:
async def profile(self, scrape_item: ScrapeItem) -> None:
"""Scrapes a profile."""
offset = 0
service, user = self.get_service_and_user(scrape_item)
user_str = await self.get_user_str_from_profile(scrape_item)
service, user, user_str = await self.get_user_info(scrape_item)
api_call = self.api_url / service / "user" / user
scrape_item.type = FILE_HOST_PROFILE
scrape_item.children = scrape_item.children_limit = 0
@@ -109,11 +106,11 @@ async def profile(self, scrape_item: ScrapeItem) -> None:
@error_handling_wrapper
async def post(self, scrape_item: ScrapeItem) -> None:
"""Scrapes a post."""
service, user, post_id = await self.get_service_user_and_post(scrape_item)
user_str = await self.get_user_str_from_post(scrape_item)
service, user, post_id, user_str = await self.get_user_info(scrape_item)
api_call = self.api_url / service / "user" / user / "post" / post_id
async with self.request_limiter:
post = await self.client.get_json(self.domain, api_call, origin=scrape_item)
post = post.get("post")
await self.handle_post_content(scrape_item, post, user, user_str)

@error_handling_wrapper
@@ -207,32 +204,20 @@ async def create_new_scrape_item(
new_scrape_item.add_to_parent_title(post_title)
self.manager.task_group.create_task(self.run(new_scrape_item))

async def get_user_str_from_post(self, scrape_item: ScrapeItem) -> str:
"""Gets the user string from a scrape item."""
async with self.request_limiter:
soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)
return soup.select_one("a[class=post__user-name]").text

async def get_user_str_from_profile(self, scrape_item: ScrapeItem) -> str:
"""Gets the user string from a scrape item."""
async with self.request_limiter:
soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)
return soup.select_one("span[itemprop=name]").text

@staticmethod
def get_service_and_user(scrape_item: ScrapeItem) -> tuple[str, str]:
"""Gets the service and user from a scrape item."""
user = scrape_item.url.parts[3]
service = scrape_item.url.parts[1]
return service, user

@staticmethod
async def get_service_user_and_post(scrape_item: ScrapeItem) -> tuple[str, str, str]:
"""Gets the service, user and post id from a scrape item."""
async def get_user_info(self, scrape_item: ScrapeItem) -> tuple[str, str, str, str]:
"""Gets the user info from a scrape item."""
user = scrape_item.url.parts[3]
service = scrape_item.url.parts[1]
post = scrape_item.url.parts[5]
return service, user, post
try:
post = scrape_item.url.parts[5]
except IndexError:
post = None
profile_api_url = self.api_url / service / "user" / user / "profile"
async with self.request_limiter:
profile_json: dict = await self.client.get_json(self.domain, profile_api_url, origin=scrape_item)
if post:
return service, user, post, profile_json["name"]
return service, user, profile_json["name"]

@staticmethod
def parse_datetime(date: str) -> int:
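
The Coomer crawler (and the Kemono crawler below) folds the two HTML-scraping helpers into a single get_user_info that reads service, user, and an optional post id straight from the URL parts and fetches the display name from the profile API instead of the profile page. A standalone sketch of the URL handling (the URL below is illustrative, and the api_url base is assumed to match the crawler's existing self.api_url):

    from yarl import URL

    api_url = URL("https://coomer.su/api/v1")  # assumed base URL; the crawler already stores this
    scrape_url = URL("https://coomer.su/onlyfans/user/example_user/post/12345")

    service = scrape_url.parts[1]   # "onlyfans"
    user = scrape_url.parts[3]      # "example_user"
    try:
        post = scrape_url.parts[5]  # "12345"; profile URLs have no parts[5]
    except IndexError:
        post = None

    profile_api_url = api_url / service / "user" / user / "profile"
    # The crawler awaits self.client.get_json(...) on this URL and uses
    # profile_json["name"] as user_str instead of scraping the page HTML.
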
47 changes: 15 additions & 32 deletions cyberdrop_dl/scraper/crawlers/kemono_crawler.py
@@ -15,8 +15,6 @@
from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_filename_and_ext

if TYPE_CHECKING:
from bs4 import BeautifulSoup

from cyberdrop_dl.managers.manager import Manager


@@ -54,8 +52,7 @@ async def fetch(self, scrape_item: ScrapeItem) -> None:
async def profile(self, scrape_item: ScrapeItem) -> None:
"""Scrapes a profile."""
offset = 0
service, user = self.get_service_and_user(scrape_item)
user_str = await self.get_user_str_from_profile(scrape_item)
service, user, user_str = await self.get_user_info(scrape_item)
api_call = self.api_url / service / "user" / user
scrape_item.type = FILE_HOST_PROFILE
scrape_item.children = scrape_item.children_limit = 0
@@ -114,11 +111,11 @@ async def discord(self, scrape_item: ScrapeItem) -> None:
@error_handling_wrapper
async def post(self, scrape_item: ScrapeItem) -> None:
"""Scrapes a post."""
service, user, post_id = await self.get_service_user_and_post(scrape_item)
user_str = await self.get_user_str_from_post(scrape_item)
service, user, post_id, user_str = await self.get_user_info(scrape_item)
api_call = self.api_url / service / "user" / user / "post" / post_id
async with self.request_limiter:
post = await self.client.get_json(self.domain, api_call, origin=scrape_item)
post = post.get("post")
await self.handle_post_content(scrape_item, post, user, user_str)

@error_handling_wrapper
@@ -262,34 +259,20 @@ async def create_new_scrape_item(
new_scrape_item.add_to_parent_title(post_title)
self.manager.task_group.create_task(self.run(new_scrape_item))

@error_handling_wrapper
async def get_user_str_from_post(self, scrape_item: ScrapeItem) -> str:
"""Gets the user string from a scrape item."""
async with self.request_limiter:
soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)
return soup.select_one("a[class=post__user-name]").text

@error_handling_wrapper
async def get_user_str_from_profile(self, scrape_item: ScrapeItem) -> str:
"""Gets the user string from a scrape item."""
async with self.request_limiter:
soup: BeautifulSoup = await self.client.get_soup(self.domain, scrape_item.url, origin=scrape_item)
return soup.select_one("span[itemprop=name]").text

@staticmethod
def get_service_and_user(scrape_item: ScrapeItem) -> tuple[str, str]:
"""Gets the service and user from a scrape item."""
async def get_user_info(self, scrape_item: ScrapeItem) -> tuple[str, str, str, str]:
"""Gets the user info from a scrape item."""
user = scrape_item.url.parts[3]
service = scrape_item.url.parts[1]
return service, user

@staticmethod
async def get_service_user_and_post(scrape_item: ScrapeItem) -> tuple[str, str, str]:
"""Gets the service, user and post id from a scrape item."""
user = scrape_item.url.parts[3]
service = scrape_item.url.parts[1]
post = scrape_item.url.parts[5]
return service, user, post
try:
post = scrape_item.url.parts[5]
except IndexError:
post = None
profile_api_url = self.api_url / service / "user" / user / "profile"
async with self.request_limiter:
profile_json: dict = await self.client.get_json(self.domain, profile_api_url, origin=scrape_item)
if post:
return service, user, post, profile_json["name"]
return service, user, profile_json["name"]

@staticmethod
def parse_datetime(date: str) -> int:
