This repository has been archived by the owner on Jul 5, 2024. It is now read-only.

Commit

Change clients to use proxy for every request (why wasn't it before?)
Jules-WinnfieldX committed Nov 21, 2023
1 parent af7dc9e commit df29884
Showing 5 changed files with 30 additions and 26 deletions.
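
For context on the change itself: aiohttp's request methods take the proxy per call (the proxy= keyword), so storing the proxy once on the Client and forwarding it from every get/post/head request, as this commit does, replaces threading a separate proxy argument through the download helpers. Below is a minimal sketch of that pattern, assuming only aiohttp; the ProxiedSession class, its method names, and the example URL are illustrative and not part of this repository.

import asyncio
from typing import Optional

import aiohttp


class ProxiedSession:
    """Illustrative sketch: store the proxy once, pass it on every request."""

    def __init__(self, proxy: Optional[str] = None) -> None:
        # e.g. "http://127.0.0.1:8080" (hypothetical value); None means connect directly
        self.proxy = proxy
        self.session: Optional[aiohttp.ClientSession] = None

    async def __aenter__(self) -> "ProxiedSession":
        self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, *exc) -> None:
        if self.session is not None:
            await self.session.close()

    async def get_text(self, url: str) -> str:
        # aiohttp's ClientSession.get() accepts the proxy per request,
        # so every helper forwards self.proxy instead of taking its own argument.
        assert self.session is not None
        async with self.session.get(url, proxy=self.proxy) as response:
            return await response.text()


async def main() -> None:
    async with ProxiedSession(proxy=None) as client:
        print(len(await client.get_text("https://example.com")))


if __name__ == "__main__":
    asyncio.run(main())
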
2 changes: 1 addition & 1 deletion cyberdrop_dl/__init__.py
@@ -1 +1 @@
__version__ = "4.2.225"
__version__ = "4.2.226"
44 changes: 24 additions & 20 deletions cyberdrop_dl/client/client.py
@@ -44,14 +44,15 @@ async def wrapper(self, *args, **kwargs):
 class Client:
     """Creates a 'client' that can be referenced by scraping or download sessions"""
     def __init__(self, ratelimit: int, throttle: float, secure: bool, connect_timeout: int, read_timeout: int,
-                 user_agent: str):
+                 user_agent: str, proxy: str):
         self.connect_timeout = connect_timeout
         self.read_timeout = read_timeout
         self.ratelimit = ratelimit
         self.throttle = throttle
         self.simultaneous_session_limit = asyncio.Semaphore(50)
         self.user_agent = user_agent
         self.verify_ssl = secure
+        self.proxy = proxy
         self.ssl_context = ssl.create_default_context(cafile=certifi.where()) if secure else False
         self.cookies = aiohttp.CookieJar(quote_cookie=False)

@@ -69,7 +70,7 @@ def __init__(self, client: Client) -> None:

     @scrape_limit
     async def get_BS4(self, url: URL) -> BeautifulSoup:
-        async with self.client_session.get(url, ssl=self.client.ssl_context) as response:
+        async with self.client_session.get(url, ssl=self.client.ssl_context, proxy=self.client.proxy) as response:
             content_type = response.headers.get('Content-Type')
             assert content_type is not None
             if not any(s in content_type.lower() for s in ("html", "text")):
@@ -79,55 +80,59 @@ async def get_BS4(self, url: URL) -> BeautifulSoup:

     @scrape_limit
     async def get_BS4_and_url(self, url: URL) -> Tuple[BeautifulSoup, URL]:
-        async with self.client_session.get(url, ssl=self.client.ssl_context) as response:
+        async with self.client_session.get(url, ssl=self.client.ssl_context, proxy=self.client.proxy) as response:
             text = await response.text()
             soup = BeautifulSoup(text, 'html.parser')
             return soup, URL(response.url)

     @scrape_limit
     async def get_json(self, url: URL, params: Optional[Dict] = None, headers_inc: Optional[Dict] = None) -> Dict:
         headers = {**self.headers, **headers_inc} if headers_inc else self.headers
-        async with self.client_session.get(url, ssl=self.client.ssl_context, params=params, headers=headers) as response:
+        async with self.client_session.get(url, ssl=self.client.ssl_context, params=params, headers=headers,
+                                           proxy=self.client.proxy) as response:
             return json.loads(await response.content.read())

     @scrape_limit
     async def get_json_with_headers(self, url: URL, params: Optional[Dict] = None,
                                     headers_inc: Optional[Dict] = None) -> tuple[Any, CIMultiDictProxy[str]]:
         headers = {**self.headers, **headers_inc} if headers_inc else self.headers
-        async with self.client_session.get(url, ssl=self.client.ssl_context, params=params, headers=headers) as response:
+        async with self.client_session.get(url, ssl=self.client.ssl_context, params=params, headers=headers,
+                                           proxy=self.client.proxy) as response:
             content = await response.content.read()
             return json.loads(content), response.headers

     @scrape_limit
     async def get_text(self, url: URL) -> str:
-        async with self.client_session.get(url, ssl=self.client.ssl_context) as response:
+        async with self.client_session.get(url, ssl=self.client.ssl_context, proxy=self.client.proxy) as response:
             return await response.text()

     @scrape_limit
     async def post(self, url: URL, data: Dict) -> Dict:
-        async with self.client_session.post(url, data=data, headers=self.headers, ssl=self.client.ssl_context) as response:
+        async with self.client_session.post(url, data=data, headers=self.headers, ssl=self.client.ssl_context,
+                                            proxy=self.client.proxy) as response:
             return json.loads(await response.content.read())

     @scrape_limit
     async def post_with_auth(self, url: URL, data: Dict, auth: aiohttp.BasicAuth) -> Dict:
-        async with self.client_session.post(url, data=data, headers=self.headers, ssl=self.client.ssl_context, auth=auth) as response:
+        async with self.client_session.post(url, data=data, headers=self.headers, ssl=self.client.ssl_context, auth=auth,
+                                            proxy=self.client.proxy) as response:
             return json.loads(await response.content.read())

     @scrape_limit
     async def get_no_resp(self, url: URL, headers: Dict) -> None:
-        async with self.client_session.get(url, headers=headers, ssl=self.client.ssl_context):
+        async with self.client_session.get(url, headers=headers, ssl=self.client.ssl_context, proxy=self.client.proxy):
             pass

     @scrape_limit
     async def post_data_no_resp(self, url: URL, data: Dict) -> None:
-        async with self.client_session.post(url, data=data, headers=self.headers, ssl=self.client.ssl_context):
+        async with self.client_session.post(url, data=data, headers=self.headers, ssl=self.client.ssl_context, proxy=self.client.proxy):
             pass

     @scrape_limit
     async def head(self, url: URL, headers_inc: Dict, allow_redirects=True) -> tuple[CIMultiDictProxy[str], URL]:
         headers = {**self.headers, **headers_inc} if headers_inc else self.headers
         async with self.client_session.head(url, headers=headers, ssl=self.client.ssl_context, raise_for_status=False,
-                                            allow_redirects=allow_redirects) as response:
+                                            allow_redirects=allow_redirects, proxy=self.client.proxy) as response:
             return response.headers, response.url

     async def exit_handler(self) -> None:
@@ -162,7 +167,7 @@ async def _append_content(self, file: Path, content: aiohttp.StreamReader,
             else:
                 update_progress(len(chunk))

-    async def _download(self, media: MediaItem, current_throttle: float, proxy: str, headers: Dict,
+    async def _download(self, media: MediaItem, current_throttle: float, headers: Dict,
                         save_content: Callable[[aiohttp.StreamReader], Coroutine[Any, Any, None]], file: Path) -> None:
         headers['Referer'] = str(media.referer)
         headers['user-agent'] = self.client.user_agent
@@ -171,7 +176,7 @@ async def _download(self, media: MediaItem, current_throttle: float, proxy: str,
         await self._throttle(current_throttle, media.url.host)

         async with self.client_session.get(media.url, headers=headers, ssl=self.client.ssl_context,
-                                            raise_for_status=True, proxy=proxy) as resp:
+                                            raise_for_status=True, proxy=self.client.proxy) as resp:
             content_type = resp.headers.get('Content-Type')
             if not content_type:
                 raise DownloadFailure(status=CustomHTTPStatus.IM_A_TEAPOT, message="No content-type in response header")
@@ -210,34 +215,33 @@ async def _throttle(self, delay: float, host: str) -> None:
             await asyncio.sleep(remaining)

     async def download_file(self, Progress_Master: ProgressMaster, media: MediaItem, file: Path,
-                            current_throttle: float, resume_point: int, proxy: str, headers: Dict,
-                            file_task: TaskID) -> None:
+                            current_throttle: float, resume_point: int, headers: Dict, file_task: TaskID) -> None:

         async def save_content(content: aiohttp.StreamReader) -> None:
             await Progress_Master.FileProgress.advance_file(file_task, resume_point)
             await self._append_content(file, content, functools.partial(Progress_Master.FileProgress.advance_file, file_task))

-        await self._download(media, current_throttle, proxy, headers, save_content, file)
+        await self._download(media, current_throttle, headers, save_content, file)

     async def old_download_file(self, media: MediaItem, file: Path, current_throttle: float, resume_point: int,
-                                proxy: str, headers: Dict, size: int) -> None:
+                                headers: Dict, size: int) -> None:

         async def save_content(content: aiohttp.StreamReader) -> None:
             task_description = adjust_title(f"{media.url.host}: {media.filename}")
             with tqdm(total=size + resume_point, unit_scale=True, unit='B', leave=False,
                       initial=resume_point, desc=task_description) as progress:
                 await self._append_content(file, content, lambda chunk_len: progress.update(chunk_len))

-        await self._download(media, current_throttle, proxy, headers, save_content, file)
+        await self._download(media, current_throttle, headers, save_content, file)

-    async def get_filesize(self, url: URL, referer: str, current_throttle: float, headers: Dict, proxy: str) -> int:
+    async def get_filesize(self, url: URL, referer: str, current_throttle: float, headers: Dict) -> int:
         headers['Referer'] = referer
         headers['user-agent'] = self.client.user_agent

         assert url.host is not None
         await self._throttle(current_throttle, url.host)
         async with self.client_session.get(url, headers=headers, ssl=self.client.ssl_context,
-                                            raise_for_status=False, proxy=proxy) as resp:
+                                            raise_for_status=False, proxy=self.client.proxy) as resp:
             if resp.status > 206:
                 if "Server" in resp.headers:
                     if resp.headers["Server"] == "ddos-guard":
4 changes: 2 additions & 2 deletions cyberdrop_dl/downloader/downloaders.py
@@ -236,7 +236,7 @@ async def download_file(self, album: str, media: MediaItem, url_path: str, album

         if not await self.CDL_Helper.SQL_Helper.sql_check_old_existing(url_path) and download_bool:
             await self.download_session.download_file(self.Progress_Master, media, partial_file, self.throttle,
-                                                       resume_point, self.CDL_Helper.proxy, headers, file_task)
+                                                       resume_point, headers, file_task)
             partial_file.rename(complete_file)

         await self.CDL_Helper.SQL_Helper.mark_complete(url_path, original_filename)
@@ -328,7 +328,7 @@ async def check_file_exists(self, complete_file: Path, partial_file: Path, media
         while True:
             if not expected_size:
                 expected_size = await self.download_session.get_filesize(media.url, str(media.referer),
-                                                                          current_throttle, headers, self.CDL_Helper.proxy)
+                                                                          current_throttle, headers)
             if not complete_file.exists() and not partial_file.exists():
                 break

4 changes: 2 additions & 2 deletions cyberdrop_dl/downloader/old_downloaders.py
@@ -197,7 +197,7 @@ async def download_file(self, album: str, media: MediaItem, url_path: str) -> No

         if not await self.SQL_Helper.sql_check_old_existing(url_path) and download_bool:
             await self.download_session.old_download_file(media, partial_file, current_throttle, resume_point,
-                                                           self.proxy, headers, expected_size)
+                                                           headers, expected_size)
             partial_file.rename(complete_file)

         await self.SQL_Helper.mark_complete(url_path, original_filename)
@@ -288,7 +288,7 @@ async def check_file_exists(self, complete_file: Path, partial_file: Path, media
         while True:
             if not expected_size:
                 expected_size = await self.download_session.get_filesize(media.url, str(media.referer),
-                                                                          current_throttle, headers, self.proxy)
+                                                                          current_throttle, headers)

             if not complete_file.exists() and not partial_file.exists():
                 break
2 changes: 1 addition & 1 deletion cyberdrop_dl/main.py
@@ -301,7 +301,7 @@ async def director(args: Dict, links: List) -> None:
     links = await consolidate_links(args, links)
     client = Client(args['Ratelimiting']['ratelimit'], args['Ratelimiting']['throttle'],
                     args['Runtime']['allow_insecure_connections'], args["Ratelimiting"]["connection_timeout"],
-                    args["Ratelimiting"]["read_timeout"], args['Runtime']['user_agent'])
+                    args["Ratelimiting"]["read_timeout"], args['Runtime']['user_agent'], args['Runtime']['proxy'])
     SQL_Helper = SQLHelper(args['Ignore']['ignore_history'], args['Ignore']['ignore_cache'], args['Files']['db_file'])
     Scraper = ScrapeMapper(args, client, SQL_Helper, False, error_writer, cache_manager)

