From 1b012fa96dc60d565a1f889415548fe0e37af4e2 Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Mon, 26 Feb 2024 12:52:31 +1100 Subject: [PATCH] Limit the concurrency when fetching pages For a production test case, BookProvider::getPages() was opening 3492 concurrent connections to the Wikimedia servers. That exceeds the default maximum file descriptor limit, and there is a risk that the tool will be blocked by WMF sysadmins. So, limit the concurrency to 10 connections. --- src/BookProvider.php | 11 +++++------ src/Util/Api.php | 29 +++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/BookProvider.php b/src/BookProvider.php index ff77aad1..37d39665 100644 --- a/src/BookProvider.php +++ b/src/BookProvider.php @@ -193,16 +193,15 @@ protected function domDocumentFromHtml( $html ) { * @return Page[] */ protected function getPages( $pages ) { - $promises = []; - + $pageTitles = []; foreach ( $pages as $id => $page ) { - $promises[$id] = $this->api->getPageAsync( $page->title ); + $pageTitles[$id] = $page->title; } - + $contents = $this->api->getPageBatch( $pageTitles ); foreach ( $pages as $id => $page ) { - $page->content = $this->domDocumentFromHtml( $promises[$id]->wait() ); + $page->content = isset( $contents[$id] ) + ? 
$this->domDocumentFromHtml( $contents[$id] ) : null; } - return $pages; } diff --git a/src/Util/Api.php b/src/Util/Api.php index 4a00b1c4..525c4107 100644 --- a/src/Util/Api.php +++ b/src/Util/Api.php @@ -9,6 +9,7 @@ use GuzzleHttp\Client; use GuzzleHttp\ClientInterface; use GuzzleHttp\HandlerStack; +use GuzzleHttp\Promise\Each; use GuzzleHttp\Promise\PromiseInterface; use Kevinrob\GuzzleCache\CacheMiddleware; use Kevinrob\GuzzleCache\Storage\Psr6CacheStorage; @@ -28,6 +29,7 @@ class Api { private const USER_AGENT = 'Wikisource Export/0.1'; private const CONNECT_TIMEOUT = 10; // in seconds private const REQUEST_TIMEOUT = 60; // in seconds + private const MAX_CONNECTIONS = 10; /** @var string */ private $lang = ''; @@ -252,6 +254,8 @@ public function completeQuery( $params ) { } /** + * Get a page asynchronously, with unlimited concurrency + * * @param string $title the title of the page * @return PromiseInterface promise with the content of a page */ @@ -269,6 +273,31 @@ function ( $reason ) use ( $title ) { ); } + /** + * Get a batch of pages, with a concurrency limit + * + * @param string[] $titles + * @return string[] The contents of the pages + */ + public function getPageBatch( $titles ) { + $requests = function () use ( $titles ) { + foreach ( $titles as $id => $title ) { + yield $id => $this->getPageAsync( $title ); + } + }; + $this->logger->debug( "Sending request for " . count( $titles ) . " titles" ); + $texts = []; + Each::ofLimit( + $requests(), + self::MAX_CONNECTIONS, + function ( $text, $id ) use ( &$texts ) { + $texts[$id] = $text; + } + )->wait(); + $this->logger->debug( "Got responses for " . count( $texts ) . " pages" ); + return $texts; + } + /** * @param string $url the url * @return string the file content