diff --git a/README.md b/README.md
index 69bd2fd..9d0ef20 100644
--- a/README.md
+++ b/README.md
@@ -224,7 +224,7 @@ Crawler::create()
     ->setConcurrency(1) // now all urls will be crawled one by one
 ```
 
-## Defining Crawl Limits
+## Defining Crawl and Time Limits
 
 By default, the crawler continues until it has crawled every page it can find. This behavior might cause issues if you are working in an environment with limitations such as a serverless environment.
 
@@ -232,8 +232,11 @@ The crawl behavior can be controlled with the following two options:
 
 - **Total Crawl Limit** (`setTotalCrawlLimit`): This limit defines the maximal count of URLs to crawl.
 - **Current Crawl Limit** (`setCurrentCrawlLimit`): This defines how many URLs are processed during the current crawl.
+- **Total Execution Time Limit** (`setTotalExecutionTimeLimit`): This limit defines the maximal execution time of the crawl in seconds, accumulated across all crawls.
+- **Current Execution Time Limit** (`setCurrentExecutionTimeLimit`): This limits the execution time of the current crawl in seconds.
 
-Let's take a look at some examples to clarify the difference between these two methods.
+Let's take a look at some examples to clarify the difference between `setTotalCrawlLimit` and `setCurrentCrawlLimit`.
+The difference between `setTotalExecutionTimeLimit` and `setCurrentExecutionTimeLimit` is analogous.
 
 ### Example 1: Using the total crawl limit
 
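As an illustration of the two new options above (a minimal sketch, not part of the patch), the time limits can be combined with the same fluent `Crawler::create()` API shown earlier in the README; `$url` and the chosen limits are placeholders, and `Crawler` is assumed to be imported from `src/Crawler.php` below:

```php
Crawler::create()
    ->setTotalExecutionTimeLimit(60)   // stop once 60 seconds have been spent across all crawls
    ->setCurrentExecutionTimeLimit(25) // and never spend more than 25 seconds in the current crawl
    ->startCrawling($url);
```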
diff --git a/src/Crawler.php b/src/Crawler.php
index 141ab02..f5e826f 100644
--- a/src/Crawler.php
+++ b/src/Crawler.php
@@ -43,6 +43,14 @@ class Crawler
 
     protected ?int $currentCrawlLimit = null;
 
+    protected ?int $startedAt = null;
+
+    protected int $executionTime = 0;
+
+    protected ?int $totalExecutionTimeLimit = null;
+
+    protected ?int $currentExecutionTimeLimit = null;
+
     protected int $maximumResponseSize = 1024 * 1024 * 2;
 
     protected ?int $maximumDepth = null;
@@ -174,6 +182,44 @@ public function getCurrentCrawlCount(): int
         return $this->currentUrlCount;
     }
 
+    public function setTotalExecutionTimeLimit(int $totalExecutionTimeLimitInSecond): self
+    {
+        $this->totalExecutionTimeLimit = $totalExecutionTimeLimitInSecond;
+
+        return $this;
+    }
+
+    public function getTotalExecutionTimeLimit(): ?int
+    {
+        return $this->totalExecutionTimeLimit;
+    }
+
+    public function getTotalExecutionTime(): int
+    {
+        return $this->executionTime + $this->getCurrentExecutionTime();
+    }
+
+    public function setCurrentExecutionTimeLimit(int $currentExecutionTimeLimitInSecond): self
+    {
+        $this->currentExecutionTimeLimit = $currentExecutionTimeLimitInSecond;
+
+        return $this;
+    }
+
+    public function getCurrentExecutionTimeLimit(): ?int
+    {
+        return $this->currentExecutionTimeLimit;
+    }
+
+    public function getCurrentExecutionTime(): int
+    {
+        if (is_null($this->startedAt)) {
+            return 0;
+        }
+
+        return time() - $this->startedAt;
+    }
+
     public function setMaximumDepth(int $maximumDepth): self
     {
         $this->maximumDepth = $maximumDepth;
@@ -412,6 +458,8 @@ public function getBaseUrl(): UriInterface
 
     public function startCrawling(UriInterface|string $baseUrl)
     {
+        $this->startedAt = time();
+
         if (! $baseUrl instanceof UriInterface) {
             $baseUrl = new Uri($baseUrl);
         }
@@ -445,6 +493,9 @@ public function startCrawling(UriInterface|string $baseUrl)
         foreach ($this->crawlObservers as $crawlObserver) {
             $crawlObserver->finishedCrawling();
         }
+
+        $this->executionTime += time() - $this->startedAt;
+        $this->startedAt = null; // Reset the current execution time
     }
 
     public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null, ?UriInterface $originalUrl = null): ?Node
@@ -480,6 +531,7 @@ protected function startCrawlingQueue(): void
     {
         while (
             $this->reachedCrawlLimits() === false &&
+            $this->reachedTimeLimits() === false &&
             $this->crawlQueue->hasPendingUrls()
         ) {
             $pool = new Pool($this->client, $this->getCrawlRequests(), [
@@ -504,6 +556,7 @@ protected function getCrawlRequests(): Generator
     {
         while (
             $this->reachedCrawlLimits() === false &&
+            $this->reachedTimeLimits() === false &&
             $crawlUrl = $this->crawlQueue->getPendingUrl()
         ) {
             if (
@@ -556,4 +609,19 @@ public function reachedCrawlLimits(): bool
 
         return false;
     }
+
+    public function reachedTimeLimits(): bool
+    {
+        $totalExecutionTimeLimit = $this->getTotalExecutionTimeLimit();
+        if (! is_null($totalExecutionTimeLimit) && $this->getTotalExecutionTime() >= $totalExecutionTimeLimit) {
+            return true;
+        }
+
+        $currentExecutionTimeLimit = $this->getCurrentExecutionTimeLimit();
+        if (! is_null($currentExecutionTimeLimit) && $this->getCurrentExecutionTime() >= $currentExecutionTimeLimit) {
+            return true;
+        }
+
+        return false;
+    }
 }
diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php
index 0e7b80a..c580e5a 100644
--- a/tests/CrawlerTest.php
+++ b/tests/CrawlerTest.php
@@ -517,6 +517,44 @@ function ($url) {
     assertCrawledUrlCount(3);
 });
 
+it('respects the total execution time limit', function () {
+    $baseUrl = 'http://localhost:8080';
+
+    $crawler = createCrawler()
+        ->setMaximumDepth(2)
+        ->setDelayBetweenRequests(500) // 500ms
+        ->setTotalExecutionTimeLimit(2)
+        ->setCrawlProfile(new CrawlSubdomains($baseUrl));
+
+    $crawler->startCrawling($baseUrl);
+
+    // With a 500ms delay between requests, only four URLs can be crawled in 2 seconds.
+    assertCrawledUrlCount(4);
+
+    $crawler->startCrawling($baseUrl);
+
+    assertCrawledUrlCount(4);
+});
+
+it('respects the current execution time limit', function () {
+    $baseUrl = 'http://localhost:8080';
+
+    $crawler = createCrawler()
+        ->setMaximumDepth(2)
+        ->setDelayBetweenRequests(500) // 500ms
+        ->setCurrentExecutionTimeLimit(2)
+        ->setCrawlProfile(new CrawlSubdomains($baseUrl));
+
+    $crawler->startCrawling($baseUrl);
+
+    // With a 500ms delay between requests, only four URLs can be crawled in 2 seconds.
+    assertCrawledUrlCount(4);
+
+    $crawler->startCrawling($baseUrl);
+
+    assertCrawledUrlCount(11);
+});
+
 function javascriptInjectedUrls(): array
 {
     return [[
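The expectations in the two tests added above follow from the accumulation logic in `startCrawling()`: the total execution time carries over between calls, while the current execution time restarts with each call. A rough sketch of that difference (a reading aid only, not part of the patch), reusing the test suite's `createCrawler()` helper with a placeholder `$url`:

```php
// Total limit: the time budget is shared by every startCrawling() call.
$crawler = createCrawler()->setTotalExecutionTimeLimit(2);
$crawler->startCrawling($url); // crawls for up to 2 seconds
$crawler->startCrawling($url); // budget already spent; no further URLs are crawled

// Current limit: each startCrawling() call gets a fresh time budget.
$crawler = createCrawler()->setCurrentExecutionTimeLimit(2);
$crawler->startCrawling($url); // crawls for up to 2 seconds
$crawler->startCrawling($url); // fresh 2-second budget, so crawling continues
```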