Skip to content

Commit

Permalink
Add execution time limit
Browse files Browse the repository at this point in the history
  • Loading branch information
VincentLanglet committed Dec 11, 2024
1 parent b9fb6cf commit bfdc077
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 0 deletions.
68 changes: 68 additions & 0 deletions src/Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ class Crawler

protected ?int $currentCrawlLimit = null;

/** Unix timestamp (seconds, from time()) taken when startCrawling() begins; reset to null once that crawl finishes. */
protected ?int $startedAt = null;

/** Seconds accumulated by the crawls that have already finished on this instance. */
protected int $executionTime = 0;

/** Upper bound, in seconds, on the execution time summed over all crawls; null disables the check. */
protected ?int $totalExecutionTimeLimit = null;

/** Upper bound, in seconds, on the execution time of the crawl in progress; null disables the check. */
protected ?int $currentExecutionTimeLimit = null;

protected int $maximumResponseSize = 1024 * 1024 * 2;

protected ?int $maximumDepth = null;
Expand Down Expand Up @@ -174,6 +182,44 @@ public function getCurrentCrawlCount(): int
return $this->currentUrlCount;
}

/**
 * Configure the time budget shared by every crawl started on this instance.
 *
 * @param int $totalExecutionTimeLimitInSecond Maximum cumulative execution time, in seconds.
 *
 * @return self The crawler itself, so calls can be chained.
 */
public function setTotalExecutionTimeLimit(int $totalExecutionTimeLimitInSecond): self
{
    $this->totalExecutionTimeLimit = $totalExecutionTimeLimitInSecond;

    return $this;
}

/**
 * Read back the cumulative execution time limit.
 *
 * @return int|null The limit in seconds, or null when no total limit has been configured.
 */
public function getTotalExecutionTimeLimit(): ?int
{
    return $this->totalExecutionTimeLimit;
}

/**
 * Execution time spent so far across all crawls of this instance.
 *
 * @return int Seconds recorded for finished crawls plus the seconds elapsed in the crawl in progress.
 */
public function getTotalExecutionTime(): int
{
    $elapsedInCurrentCrawl = $this->getCurrentExecutionTime();

    return $this->executionTime + $elapsedInCurrentCrawl;
}

/**
 * Configure the time budget applied to each individual crawl.
 *
 * @param int $currentExecutionTimeLimitInSecond Maximum execution time of a single crawl, in seconds.
 *
 * @return self The crawler itself, so calls can be chained.
 */
public function setCurrentExecutionTimeLimit(int $currentExecutionTimeLimitInSecond): self
{
    $this->currentExecutionTimeLimit = $currentExecutionTimeLimitInSecond;

    return $this;
}

/**
 * Read back the per-crawl execution time limit.
 *
 * @return int|null The limit in seconds, or null when no per-crawl limit has been configured.
 */
public function getCurrentExecutionTimeLimit(): ?int
{
    return $this->currentExecutionTimeLimit;
}

/**
 * Execution time of the crawl in progress.
 *
 * @return int Whole seconds elapsed since the start timestamp was recorded,
 *             or 0 when no start timestamp is set.
 */
public function getCurrentExecutionTime(): int
{
    // A null start timestamp means there is nothing to measure.
    return $this->startedAt === null
        ? 0
        : time() - $this->startedAt;
}

public function setMaximumDepth(int $maximumDepth): self
{
$this->maximumDepth = $maximumDepth;
Expand Down Expand Up @@ -412,6 +458,8 @@ public function getBaseUrl(): UriInterface

public function startCrawling(UriInterface|string $baseUrl)
{
$this->startedAt = time();

if (! $baseUrl instanceof UriInterface) {
$baseUrl = new Uri($baseUrl);
}
Expand Down Expand Up @@ -445,6 +493,9 @@ public function startCrawling(UriInterface|string $baseUrl)
foreach ($this->crawlObservers as $crawlObserver) {
$crawlObserver->finishedCrawling();
}

$this->executionTime += time() - $this->startedAt;
$this->startedAt = null; // To reset currentExecutionTime
}

public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null, ?UriInterface $originalUrl = null): ?Node
Expand Down Expand Up @@ -480,6 +531,7 @@ protected function startCrawlingQueue(): void
{
while (
$this->reachedCrawlLimits() === false &&
$this->reachedTimeLimits() === false &&
$this->crawlQueue->hasPendingUrls()
) {
$pool = new Pool($this->client, $this->getCrawlRequests(), [
Expand All @@ -504,6 +556,7 @@ protected function getCrawlRequests(): Generator
{
while (
$this->reachedCrawlLimits() === false &&
$this->reachedTimeLimits() === false &&
$crawlUrl = $this->crawlQueue->getPendingUrl()
) {
if (
Expand Down Expand Up @@ -556,4 +609,19 @@ public function reachedCrawlLimits(): bool

return false;
}

/**
 * Tell whether any configured execution time limit has been reached.
 *
 * The cumulative limit is checked first, then the per-crawl limit.
 * A limit left at null is ignored.
 *
 * @return bool True as soon as one configured limit is met or exceeded, false otherwise.
 */
public function reachedTimeLimits(): bool
{
    $totalLimit = $this->getTotalExecutionTimeLimit();
    if ($totalLimit !== null && $totalLimit <= $this->getTotalExecutionTime()) {
        return true;
    }

    $currentLimit = $this->getCurrentExecutionTimeLimit();

    return $currentLimit !== null && $currentLimit <= $this->getCurrentExecutionTime();
}
}
38 changes: 38 additions & 0 deletions tests/CrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,44 @@ function ($url) {
assertCrawledUrlCount(3);
});

// Checks that the total execution time limit acts as one budget shared by successive crawls.
it('respects the total execution time limit', function () {
$baseUrl = 'http://localhost:8080';

$crawler = createCrawler()
->setMaximumDepth(2)
->setDelayBetweenRequests(500) // 500 ms between requests
->setTotalExecutionTimeLimit(2)
->setCrawlProfile(new CrawlSubdomains($baseUrl));

$crawler->startCrawling($baseUrl);

// With a 500 ms delay per URL, only four URLs can be crawled in 2 seconds.
assertCrawledUrlCount(4);

// The first run consumed the whole 2-second budget, so the second run is
// expected to crawl nothing and the count stays at four.
$crawler->startCrawling($baseUrl);

assertCrawledUrlCount(4);
});

// Checks that the current execution time limit applies to each crawl separately.
it('respects the current execution time limit', function () {
$baseUrl = 'http://localhost:8080';

$crawler = createCrawler()
->setMaximumDepth(2)
->setDelayBetweenRequests(500) // 500 ms between requests
->setCurrentExecutionTimeLimit(2)
->setCrawlProfile(new CrawlSubdomains($baseUrl));

$crawler->startCrawling($baseUrl);

// With a 500 ms delay per URL, only four URLs can be crawled in 2 seconds.
assertCrawledUrlCount(4);

// The per-crawl timer restarts on every startCrawling() call, so the second
// run gets a fresh 2 seconds. NOTE(review): the expected 8 presumably counts
// URLs logged across both runs; confirm against assertCrawledUrlCount().
$crawler->startCrawling($baseUrl);

assertCrawledUrlCount(8);
});

function javascriptInjectedUrls(): array
{
return [[
Expand Down

0 comments on commit bfdc077

Please sign in to comment.