Skip to content

Commit

Permalink
Refactor and add timeout logging
Browse files Browse the repository at this point in the history
  • Loading branch information
KarimPwnz committed Jun 29, 2023
1 parent 2e39edc commit 492ed0a
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 36 deletions.
3 changes: 3 additions & 0 deletions scripts/bq-to-raw-url.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Read BigQuery GitHub-dataset rows (JSON objects with .repo_name and .path)
# from stdin and print the corresponding raw.githubusercontent.com URL for
# each, one per line. HEAD resolves to the repository's default branch.
jq -r '"https://raw.githubusercontent.com/\(.repo_name)/HEAD/\(.path)"'
11 changes: 4 additions & 7 deletions scripts/filter_popularity.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import aiohttp
import argparse
import asyncio
import logging
import sys


from utils import create_client, setup_logging


class PopularityChecker:
def __init__(self, client, min_stars):
self._client = client
Expand Down Expand Up @@ -63,7 +61,7 @@ async def print_if_popular(self, query):


async def main():
setup_logging()
logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)
parser = argparse.ArgumentParser()
parser.add_argument(
"-s",
Expand All @@ -86,9 +84,8 @@ async def main():
headers = {}
if args.token:
headers["Authorization"] = f"Token {args.token}"
async with create_client(
request_limit=args.limit, headers=headers
) as client:
conn = aiohttp.TCPConnector(limit=args.limit)
async with aiohttp.ClientSession(connector=conn) as client:
checker = PopularityChecker(client=client, min_stars=args.stars)
queries = (l.rstrip("\n") for l in sys.stdin)
tasks = []
Expand Down
24 changes: 13 additions & 11 deletions scripts/github_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
import asyncio
import logging
import sys
import aiohttp

from aiopath import AsyncPath
from utils import create_client, setup_logging


class NotFoundException(Exception):
Expand All @@ -21,21 +21,24 @@ def _url_to_path(url):

async def download(self, url):
    """Fetch *url* with the shared HTTP client and return the body as text.

    Raises NotFoundException for a 404 response; any other error status
    propagates as an aiohttp.ClientResponseError via raise_for_status().
    """
    async with self._client.get(url) as response:
        # Missing files are common enough to deserve a dedicated exception.
        if response.status == 404:
            raise NotFoundException
        response.raise_for_status()
        body = await response.text()
        return body

async def save(self, url, content):
    """Write *content* beneath output_path at the relative path derived from *url*.

    Parent directories are created on demand, so any URL-shaped path can be
    persisted without preparing the tree first.
    """
    target = self.output_path / AsyncPath(self._url_to_path(url))
    await target.parent.mkdir(parents=True, exist_ok=True)
    await target.write_text(content)

async def download_and_save(self, url):
async def download_and_save(self, url, retry_sleep=30):
logging.info("Downloading %s", url)
try:
content = await self.download(url)
except NotFoundException:
logging.warning("File not found: %s", url)
except aiohttp.ClientResponseError as e:
if e.status == 429:
logging.warning("Retrying in %s seconds (got 429-ed): %s", retry_sleep, url)
await asyncio.sleep(retry_sleep)
return await self.download_and_save(url, retry_sleep * 2)
logging.error("Status code %s != 200 received: %s", e.status, url)
return
except Exception as e:
logging.exception("Exception occurred while downloading: %s", str(e))
Expand All @@ -48,7 +51,7 @@ async def download_and_save(self, url):


async def main():
setup_logging()
logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)
parser = argparse.ArgumentParser()
parser.add_argument(
"-o", "--output", help="Output file path (default: 'out')", default="out"
Expand All @@ -61,15 +64,14 @@ async def main():
type=int,
)
args = parser.parse_args()
async with create_client(request_limit=args.limit) as client:
conn = aiohttp.TCPConnector(limit=args.limit)
async with aiohttp.ClientSession(connector=conn) as client:
downloader = GitHubDownloader(client=client, output_path=args.output)
i = 0
tasks = []
urls = (l.rstrip("\n") for l in sys.stdin)
for url in urls:
i += 1
tasks.append(downloader.download_and_save(url))
if i % args.limit == 0:
if len(tasks) == args.limit:
await asyncio.gather(*tasks)
tasks = []
await asyncio.gather(*tasks)
Expand Down
18 changes: 0 additions & 18 deletions scripts/utils.py

This file was deleted.

0 comments on commit 492ed0a

Please sign in to comment.