Skip to content

Commit

Permalink
Limit the concurrency when fetching pages
Browse files Browse the repository at this point in the history
For a production test case, BookProvider::getPages() was opening 3492
concurrent connections to the Wikimedia servers. That exceeds the
default maximum file descriptor limit, and there is a risk that the
tool will be blocked by WMF sysadmins.

So, limit the concurrency to 10 connections.
  • Loading branch information
tstarling authored and Tpt committed Feb 26, 2024
1 parent f8d968a commit 1b012fa
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 6 deletions.
11 changes: 5 additions & 6 deletions src/BookProvider.php
Original file line number Diff line number Diff line change
Expand Up @@ -193,16 +193,15 @@ protected function domDocumentFromHtml( $html ) {
* @return Page[]
*/
protected function getPages( $pages ) {
	// Collect the titles to fetch, keyed by the caller's page ids so the
	// results can be matched back to the right Page objects.
	$pageTitles = [];
	foreach ( $pages as $id => $page ) {
		$pageTitles[$id] = $page->title;
	}

	// Fetch everything through the batched API so the number of
	// simultaneous connections is bounded (previously one connection was
	// opened per page, which could exhaust the file descriptor limit).
	$contents = $this->api->getPageBatch( $pageTitles );
	foreach ( $pages as $id => $page ) {
		// A missing entry means the fetch for that title failed;
		// leave the page content as null rather than aborting the batch.
		$page->content = isset( $contents[$id] )
			? $this->domDocumentFromHtml( $contents[$id] ) : null;
	}

	return $pages;
}

Expand Down
29 changes: 29 additions & 0 deletions src/Util/Api.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
use GuzzleHttp\Client;
use GuzzleHttp\ClientInterface;
use GuzzleHttp\HandlerStack;
use GuzzleHttp\Promise\Each;
use GuzzleHttp\Promise\PromiseInterface;
use Kevinrob\GuzzleCache\CacheMiddleware;
use Kevinrob\GuzzleCache\Storage\Psr6CacheStorage;
Expand All @@ -28,6 +29,7 @@ class Api {
private const USER_AGENT = 'Wikisource Export/0.1';
private const CONNECT_TIMEOUT = 10; // in seconds
private const REQUEST_TIMEOUT = 60; // in seconds
private const MAX_CONNECTIONS = 10;

/** @var string */
private $lang = '';
Expand Down Expand Up @@ -252,6 +254,8 @@ public function completeQuery( $params ) {
}

/**
* Get a page asynchronously, with unlimited concurrency
*
* @param string $title the title of the page
* @return PromiseInterface promise with the content of a page
*/
Expand All @@ -269,6 +273,31 @@ function ( $reason ) use ( $title ) {
);
}

/**
 * Fetch the contents of several pages, capping how many requests are
 * in flight at the same time.
 *
 * @param string[] $titles Page titles, keyed by an arbitrary identifier
 * @return string[] The contents of the pages, keyed as in $titles
 */
public function getPageBatch( $titles ) {
	$this->logger->debug( "Sending request for " . count( $titles ) . " titles" );

	// Yield one promise per title lazily: Each::ofLimit() pulls from this
	// generator only as in-flight slots free up, so at most
	// MAX_CONNECTIONS requests are ever open at once.
	$promiseGenerator = function () use ( $titles ) {
		foreach ( $titles as $id => $title ) {
			yield $id => $this->getPageAsync( $title );
		}
	};

	$results = [];
	Each::ofLimit(
		$promiseGenerator(),
		self::MAX_CONNECTIONS,
		function ( $text, $id ) use ( &$results ) {
			$results[$id] = $text;
		}
	)->wait();

	$this->logger->debug( "Got responses for " . count( $results ) . " pages" );
	return $results;
}

/**
* @param string $url the url
* @return string the file content
Expand Down

0 comments on commit 1b012fa

Please sign in to comment.