Skip to content

Commit

Permalink
Fix setParseableMimeTypes() (#470)
Browse files: browse the repository at this point in the history
* Do not continue with the rest of the crawl handling if the response has no body

* Remove erroneous ->only

---------

Co-authored-by: David Racovan <[email protected]>
  • Loading branch information
superpenguin612 and David Racovan authored Jul 31, 2024
1 parent 6230c45 commit c659f2f
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 3 deletions.
4 changes: 4 additions & 0 deletions src/Handlers/CrawlRequestFulfilled.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ public function __construct(protected Crawler $crawler)
public function __invoke(ResponseInterface $response, $index)
{
$body = $this->getBody($response);
if (empty($body)) {
usleep($this->crawler->getDelayBetweenRequests());
return;
}

$robots = new CrawlerRobots(
$response->getHeaders(),
Expand Down
33 changes: 30 additions & 3 deletions tests/CrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -436,8 +436,35 @@ public function shouldCrawl(UriInterface $url): bool
->startCrawling('http://localhost:8080/content-types');

$urls = [
['url' => 'http://localhost:8080/content-types/music.html', 'foundOn' => 'http://localhost:8080/content-types/music.mp3'],
['url' => 'http://localhost:8080/content-types/video.html', 'foundOn' => 'http://localhost:8080/content-types/video.mkv'],
['url' => 'http://localhost:8080/content-types/music.mp3', 'foundOn' => 'http://localhost:8080/content-types'],
['url' => 'http://localhost:8080/content-types/video.mkv', 'foundOn' => 'http://localhost:8080/content-types'],
['url' => 'http://localhost:8080/content-types/normal.html', 'foundOn' => 'http://localhost:8080/content-types'],
];

expect($urls)->sequence(
function ($url) {
$url->notToBeCrawled();
},
function ($url) {
$url->notToBeCrawled();
},
function ($url) {
$url->toBeCrawledOnce();
},
);

assertCrawledUrlCount(2);
});

it('will only crawl correct mime types when asked to when executing javascript', function () {
createCrawler()
->executeJavaScript()
->setParseableMimeTypes(['text/html', 'text/plain'])
->startCrawling('http://localhost:8080/content-types');

$urls = [
['url' => 'http://localhost:8080/content-types/music.mp3', 'foundOn' => 'http://localhost:8080/content-types'],
['url' => 'http://localhost:8080/content-types/video.mkv', 'foundOn' => 'http://localhost:8080/content-types'],
['url' => 'http://localhost:8080/content-types/normal.html', 'foundOn' => 'http://localhost:8080/content-types'],
];

Expand All @@ -453,7 +480,7 @@ function ($url) {
},
);

assertCrawledUrlCount(4);
assertCrawledUrlCount(2);
});

it('will crawl all content types when not explicitly whitelisted', function () {
Expand Down

0 comments on commit c659f2f

Please sign in to comment.