From 492ed0a13e50f8291afddb74890873125276f093 Mon Sep 17 00:00:00 2001 From: Karim Rahal Date: Thu, 29 Jun 2023 00:48:38 -0700 Subject: [PATCH] Refactor and add timeout logging --- scripts/bq-to-raw-url.sh | 3 +++ scripts/filter_popularity.py | 11 ++++------- scripts/github_downloader.py | 24 +++++++++++++----------- scripts/utils.py | 18 ------------------ 4 files changed, 20 insertions(+), 36 deletions(-) create mode 100755 scripts/bq-to-raw-url.sh delete mode 100644 scripts/utils.py diff --git a/scripts/bq-to-raw-url.sh b/scripts/bq-to-raw-url.sh new file mode 100755 index 0000000..611ea2c --- /dev/null +++ b/scripts/bq-to-raw-url.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +jq -r '"https://raw.githubusercontent.com/\(.repo_name)/HEAD/\(.path)"' diff --git a/scripts/filter_popularity.py b/scripts/filter_popularity.py index 8a0f0fa..08436f6 100644 --- a/scripts/filter_popularity.py +++ b/scripts/filter_popularity.py @@ -1,12 +1,10 @@ +import aiohttp import argparse import asyncio import logging import sys -from utils import create_client, setup_logging - - class PopularityChecker: def __init__(self, client, min_stars): self._client = client @@ -63,7 +61,7 @@ async def print_if_popular(self, query): async def main(): - setup_logging() + logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument( "-s", @@ -86,9 +84,8 @@ async def main(): headers = {} if args.token: headers["Authorization"] = f"Token {args.token}" - async with create_client( - request_limit=args.limit, headers=headers - ) as client: + conn = aiohttp.TCPConnector(limit=args.limit) + async with aiohttp.ClientSession(connector=conn, headers=headers) as client: checker = PopularityChecker(client=client, min_stars=args.stars) queries = (l.rstrip("\n") for l in sys.stdin) tasks = [] diff --git a/scripts/github_downloader.py b/scripts/github_downloader.py index 52a0027..0247fb0 100644 --- a/scripts/github_downloader.py +++ b/scripts/github_downloader.py @@ 
-2,9 +2,9 @@ import asyncio import logging import sys +import aiohttp from aiopath import AsyncPath -from utils import create_client, setup_logging class NotFoundException(Exception): @@ -21,8 +21,7 @@ def _url_to_path(url): async def download(self, url): async with self._client.get(url) as resp: - if resp.status == 404: - raise NotFoundException + resp.raise_for_status() return await resp.text() async def save(self, url, content): @@ -30,12 +29,16 @@ async def save(self, url, content): await path.parent.mkdir(parents=True, exist_ok=True) await path.write_text(content) - async def download_and_save(self, url): + async def download_and_save(self, url, retry_sleep=30): logging.info("Downloading %s", url) try: content = await self.download(url) - except NotFoundException: - logging.warning("File not found: %s", url) + except aiohttp.ClientResponseError as e: + if e.status == 429: + logging.warning("Retrying in %s seconds (got 429-ed): %s", retry_sleep, url) + await asyncio.sleep(retry_sleep) + return await self.download_and_save(url, retry_sleep * 2) + logging.error("Status code %s != 200 received: %s", e.status, url) return except Exception as e: logging.exception("Exception occurred while downloading: %s", str(e)) @@ -48,7 +51,7 @@ async def download_and_save(self, url): async def main(): - setup_logging() + logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument( "-o", "--output", help="Output file path (default: 'out')", default="out" @@ -61,15 +64,14 @@ async def main(): type=int, ) args = parser.parse_args() - async with create_client(request_limit=args.limit) as client: + conn = aiohttp.TCPConnector(limit=args.limit) + async with aiohttp.ClientSession(connector=conn) as client: downloader = GitHubDownloader(client=client, output_path=args.output) - i = 0 tasks = [] urls = (l.rstrip("\n") for l in sys.stdin) for url in urls: - i += 1 tasks.append(downloader.download_and_save(url)) - if 
i % args.limit == 0: + if len(tasks) == args.limit: await asyncio.gather(*tasks) tasks = [] await asyncio.gather(*tasks) diff --git a/scripts/utils.py b/scripts/utils.py deleted file mode 100644 index 9dfad86..0000000 --- a/scripts/utils.py +++ /dev/null @@ -1,18 +0,0 @@ -import logging - -from aiohttp import TCPConnector -from aiohttp_retry import FibonacciRetry, RetryClient - - -def setup_logging(): - logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO) - - -def create_client(request_limit, *args, **kwargs): - return RetryClient( - raise_for_status=False, - retry_options=FibonacciRetry(attempts=float("inf"), statuses=[429, 500, 502, 503, 504], max_timeout=float("inf")), - connector=TCPConnector(limit=request_limit), - *args, - **kwargs - )