diff --git a/CHANGELOG.md b/CHANGELOG.md index 54b0a08..c82d137 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ All notable changes to `spatie/crawler` will be documented in this file. +## 4.0.4 - 2018-03-20 + +- do not follow links whose `rel` attribute contains the `nofollow` keyword + ## 4.0.3 - 2018-03-02 - Support both `Illuminate`'s and `Tighten`'s `Collection`. diff --git a/src/Crawler.php b/src/Crawler.php index ff6887d..cb55fbe 100644 --- a/src/Crawler.php +++ b/src/Crawler.php @@ -414,6 +414,10 @@ protected function extractAllLinks(string $html, UriInterface $foundOnUrl) $domCrawler = new DomCrawler($html, $foundOnUrl); return collect($domCrawler->filterXpath('//a')->links()) + ->reject(function (Link $link) { + /* `rel` holds space-separated, case-insensitive link types, so look for the `nofollow` token instead of comparing the whole attribute value. */ + return in_array('nofollow', preg_split('/\s+/', strtolower($link->getNode()->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY), true); + }) ->map(function (Link $link) { try { return new Uri($link->getUri()); diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php index 8378ba5..03c6297 100644 --- a/tests/CrawlerTest.php +++ b/tests/CrawlerTest.php @@ -342,6 +342,17 @@ public function it_crawls_subdomains() ]); } + /** @test */ + public function it_should_not_follow_nofollow_links() + { + Crawler::create() + ->setCrawlObserver(new CrawlLogger()) + ->setMaximumDepth(1) + ->startCrawling('http://localhost:8080'); + + $this->assertNotCrawled([['url' => 'http://localhost:8080/nofollow', 'foundOn' => 'http://localhost:8080/']]); + } + protected function regularUrls(): array { return [ diff --git a/tests/server/server.js b/tests/server/server.js index b9d2de3..e624a79 100644 --- a/tests/server/server.js +++ b/tests/server/server.js @@ -3,7 +3,7 @@ let app = require('express')(); app.get('/', function (request, response) { - response.end('Link1Link2Link4EmailTelephone'); + response.end('Link1Link2Link4EmailTelephoneNo follow'); }); app.get('/link1', function (request, response) { @@ -14,6 +14,10 @@ app.get('/javascript', function (request, response) { response.end('This page can only be reached if JavaScript is being
executed'); }); +app.get('/nofollow', function (request, response) { + response.end('This page should not be crawled'); +}); + app.get('/link2', function (request, response) { response.end('You are on link2Link3SubdomainSubdomain2'); });