From d3a4cd56a4c2444f2c50dd1df97ced7faea334c3 Mon Sep 17 00:00:00 2001
From: NTFSvolume <172021377+NTFSvolume@users.noreply.github.com>
Date: Sat, 4 Jan 2025 08:16:58 -0500
Subject: [PATCH] perf: remove crawler scrape lock

This PR removes the async lock from the crawlers and replaces it with a
semaphore with a capacity of 20.

In reality, neither the lock nor the semaphore is needed: requests are
actually limited by the `request_limiter` of each crawler, not by the lock.
However, I could not remove the lock outright because that would break the
logic of the UI scrape queue, which is why I replaced it with a semaphore
instead.

The lock made each crawler behave synchronously, so the `request_limiter`
was never close to being reached (see the lock vs. semaphore sketch after
the diff).

This change only affects the crawlers. The downloaders already have a
semaphore with a separate capacity per domain, so they are not affected.
The default capacity for each downloader is 3, defined by
`--max-simultaneous-downloads-per-domain`.

## Disadvantages

The only drawback of replacing the lock with a semaphore (or eventually
removing the lock altogether) is that the `request_limiter` is defined per
crawler, and almost all crawlers currently use a generic `10 requests / sec`
limit. Some crawlers may require fine-tuning the limiter to make sure CDL
does not trigger `429`s (see the limiter-tuning sketch after the diff).
---
 cyberdrop_dl/scraper/crawler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cyberdrop_dl/scraper/crawler.py b/cyberdrop_dl/scraper/crawler.py
index e92f3337..f7ff5746 100644
--- a/cyberdrop_dl/scraper/crawler.py
+++ b/cyberdrop_dl/scraper/crawler.py
@@ -44,7 +44,7 @@ def __init__(self, manager: Manager, domain: str, folder_domain: str | None = No
         self.downloader = field(init=False)
         self.scraping_progress = manager.progress_manager.scraping_progress
         self.client: ScraperClient = field(init=False)
-        self._lock = asyncio.Lock()
+        self._semaphore = asyncio.Semaphore(20)
         self.domain = domain
         self.folder_domain = folder_domain or domain.capitalize()
 
@@ -65,7 +65,7 @@ async def run(self, item: ScrapeItem) -> None:
         if not item.url.host:
             return
         self.waiting_items += 1
-        async with self._lock:
+        async with self._semaphore:
             self.waiting_items -= 1
             if item.url.path_qs not in self.scraped_items:
                 log(f"Scraping: {item.url}", 20)
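
Lock vs. semaphore sketch (illustration only, not CDL code): the toy coroutines below stand in for crawler tasks and show why a single `asyncio.Lock` serializes them while `asyncio.Semaphore(20)` lets up to 20 run concurrently. The task count and sleep duration are made up.

```python
import asyncio
import time


async def scrape_one(gate: asyncio.Lock | asyncio.Semaphore) -> None:
    async with gate:              # the crawler wraps its work in this gate
        await asyncio.sleep(0.1)  # stand-in for one scrape / network request


async def run_40(gate: asyncio.Lock | asyncio.Semaphore) -> float:
    start = time.perf_counter()
    await asyncio.gather(*(scrape_one(gate) for _ in range(40)))
    return time.perf_counter() - start


async def main() -> None:
    # A Lock admits one task at a time, so 40 tasks take roughly 40 * 0.1 s.
    print(f"lock:      {await run_40(asyncio.Lock()):.2f}s")
    # Semaphore(20) admits 20 at once, so the same 40 tasks take roughly 2 * 0.1 s.
    print(f"semaphore: {await run_40(asyncio.Semaphore(20)):.2f}s")


asyncio.run(main())
```

With the lock, the per-crawler `request_limiter` never sees more than one in-flight request, which is why it was never close to being reached.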
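Limiter-tuning sketch, assuming the per-crawler `request_limiter` behaves like an `aiolimiter.AsyncLimiter` token bucket (the PR does not name the implementation); the host name and limiter values below are hypothetical. With the lock gone, the limiter is the only throttle, so a crawler whose host starts answering with `429`s can be slowed down by lowering its rate:

```python
import asyncio

from aiolimiter import AsyncLimiter  # assumption: an AsyncLimiter-style token bucket

# The generic default the PR mentions: roughly 10 requests per second.
default_limiter = AsyncLimiter(10, 1)

# A stricter, per-crawler override for a host that starts returning 429s
# now that the lock no longer serializes requests: 2 requests per second.
strict_limiter = AsyncLimiter(2, 1)


async def fetch(limiter: AsyncLimiter, url: str) -> str:
    async with limiter:  # waits here until the rate limiter has capacity
        # ... perform the real HTTP request; a no-op sleep keeps the sketch runnable
        await asyncio.sleep(0)
        return url


async def main() -> None:
    urls = [f"https://picky-host.example/page/{i}" for i in range(10)]
    done = await asyncio.gather(*(fetch(strict_limiter, u) for u in urls))
    print(f"fetched {len(done)} pages at the reduced rate")


asyncio.run(main())
```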