diff --git a/CHANGELOG.md b/CHANGELOG.md
index 54b0a08..c82d137 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,10 @@
All notable changes to `spatie/crawler` will be documented in this file.
+## 4.0.4 - 2018-03-20
+
+- Do not follow links that have the `rel` attribute set to `nofollow`.
+
## 4.0.3 - 2018-03-02
- Support both `Illuminate`'s and `Tighten`'s `Collection`.
diff --git a/src/Crawler.php b/src/Crawler.php
index ff6887d..cb55fbe 100644
--- a/src/Crawler.php
+++ b/src/Crawler.php
@@ -414,6 +414,9 @@ protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
$domCrawler = new DomCrawler($html, $foundOnUrl);
return collect($domCrawler->filterXpath('//a')->links())
+ ->reject(function (Link $link) { // `rel` is a space-separated, case-insensitive token list
+ return (bool) preg_match('/(?:^|\s)nofollow(?:\s|$)/i', $link->getNode()->getAttribute('rel'));
+ })
->map(function (Link $link) {
try {
return new Uri($link->getUri());
diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php
index 8378ba5..03c6297 100644
--- a/tests/CrawlerTest.php
+++ b/tests/CrawlerTest.php
@@ -342,6 +342,17 @@ public function it_crawls_subdomains()
]);
}
+ /** @test */
+ public function it_should_not_follow_nofollow_links()
+ {
+ // The crawler must never request /nofollow, which would be found on the index page.
+ $nofollowPage = ['url' => 'http://localhost:8080/nofollow', 'foundOn' => 'http://localhost:8080/'];
+ $crawler = Crawler::create()->setCrawlObserver(new CrawlLogger())->setMaximumDepth(1);
+ $crawler->startCrawling('http://localhost:8080');
+
+ $this->assertNotCrawled([$nofollowPage]);
+ }
+
protected function regularUrls(): array
{
return [
diff --git a/tests/server/server.js b/tests/server/server.js
index b9d2de3..e624a79 100644
--- a/tests/server/server.js
+++ b/tests/server/server.js
@@ -3,7 +3,7 @@
let app = require('express')();
app.get('/', function (request, response) {
- response.end('Link1Link2Link4EmailTelephone');
+ response.end('Link1Link2Link4EmailTelephoneNo follow');
});
app.get('/link1', function (request, response) {
@@ -14,6 +14,10 @@ app.get('/javascript', function (request, response) {
response.end('This page can only be reached if JavaScript is being executed');
});
+app.get('/nofollow', function (request, response) {
+ response.end('This page should not be crawler');
+});
+
app.get('/link2', function (request, response) {
response.end('You are on link2Link3SubdomainSubdomain2');
});