diff --git a/src/Handlers/CrawlRequestFulfilled.php b/src/Handlers/CrawlRequestFulfilled.php index 6b16e16..476dfb0 100644 --- a/src/Handlers/CrawlRequestFulfilled.php +++ b/src/Handlers/CrawlRequestFulfilled.php @@ -32,6 +32,10 @@ public function __construct(protected Crawler $crawler) public function __invoke(ResponseInterface $response, $index) { $body = $this->getBody($response); + if (empty($body)) { + usleep($this->crawler->getDelayBetweenRequests()); + return; + } $robots = new CrawlerRobots( $response->getHeaders(), diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php index 2ecbed9..0e7b80a 100644 --- a/tests/CrawlerTest.php +++ b/tests/CrawlerTest.php @@ -436,8 +436,35 @@ public function shouldCrawl(UriInterface $url): bool ->startCrawling('http://localhost:8080/content-types'); $urls = [ - ['url' => 'http://localhost:8080/content-types/music.html', 'foundOn' => 'http://localhost:8080/content-types/music.mp3'], - ['url' => 'http://localhost:8080/content-types/video.html', 'foundOn' => 'http://localhost:8080/content-types/video.mkv'], + ['url' => 'http://localhost:8080/content-types/music.mp3', 'foundOn' => 'http://localhost:8080/content-types'], + ['url' => 'http://localhost:8080/content-types/video.mkv', 'foundOn' => 'http://localhost:8080/content-types'], + ['url' => 'http://localhost:8080/content-types/normal.html', 'foundOn' => 'http://localhost:8080/content-types'], + ]; + + expect($urls)->sequence( + function ($url) { + $url->notToBeCrawled(); + }, + function ($url) { + $url->notToBeCrawled(); + }, + function ($url) { + $url->toBeCrawledOnce(); + }, + ); + + assertCrawledUrlCount(2); +}); + +it('will only crawl correct mime types when asked to when executing javascript', function () { + createCrawler() + ->executeJavaScript() + ->setParseableMimeTypes(['text/html', 'text/plain']) + ->startCrawling('http://localhost:8080/content-types'); + + $urls = [ + ['url' => 'http://localhost:8080/content-types/music.mp3', 'foundOn' => 'http://localhost:8080/content-types'], + ['url' => 'http://localhost:8080/content-types/video.mkv', 'foundOn' => 'http://localhost:8080/content-types'], ['url' => 'http://localhost:8080/content-types/normal.html', 'foundOn' => 'http://localhost:8080/content-types'], ]; @@ -453,7 +480,7 @@ function ($url) { }, ); - assertCrawledUrlCount(4); + assertCrawledUrlCount(2); }); it('will crawl all content types when not explicitly whitelisted', function () {