Skip to content

Commit

Permalink
do not follow links that have attribute `rel` set to `nofollow`
Browse files Browse the repository at this point in the history
  • Loading branch information
freekmurze committed Mar 20, 2018
1 parent 5b29aac commit 66ca6b3
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

All notable changes to `spatie/crawler` will be documented in this file.

## 4.0.4 - 2018-03-20

- do not follow links that have attribute `rel` set to `nofollow`

## 4.0.3 - 2018-03-02

- Support both `Illuminate`'s and `Tighten`'s `Collection`.
Expand Down
3 changes: 3 additions & 0 deletions src/Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,9 @@ protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
$domCrawler = new DomCrawler($html, $foundOnUrl);

return collect($domCrawler->filterXpath('//a')->links())
->reject(function(Link $link) {
return $link->getNode()->getAttribute('rel') === 'nofollow';
})
->map(function (Link $link) {
try {
return new Uri($link->getUri());
Expand Down
11 changes: 11 additions & 0 deletions tests/CrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,17 @@ public function it_crawls_subdomains()
]);
}

/**
 * Crawls the local test server starting at its root page and checks that
 * the /nofollow URL found on that page is not reported as crawled.
 *
 * @test
 */
public function it_should_not_follow_nofollow_links()
{
    $crawler = Crawler::create()
        ->setCrawlObserver(new CrawlLogger())
        ->setMaximumDepth(1);

    $crawler->startCrawling('http://localhost:8080');

    // The root page links to /nofollow with rel="nofollow"; it must stay untouched.
    $skippedUrls = [
        ['url' => 'http://localhost:8080/nofollow', 'foundOn' => 'http://localhost:8080/'],
    ];

    $this->assertNotCrawled($skippedUrls);
}

protected function regularUrls(): array
{
return [
Expand Down
6 changes: 5 additions & 1 deletion tests/server/server.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
let app = require('express')();

// Root page of the test server: responds with a fixed set of anchor tags
// (relative links, a mailto: link, a tel: link).
app.get('/', function (request, response) {
// NOTE(review): the two response.end() calls below look like the removed/added
// pair of a diff hunk rather than two intended calls. Only the second body
// contains the rel="nofollow" link to /nofollow — confirm which single call
// the real server.js keeps.
response.end('<a href="/link1">Link1</a><a href="/link2">Link2</a><a href="dir/link4">Link4</a><a href="mailto:[email protected]">Email</a><a href="tel:123">Telephone</a>');
response.end('<a href="/link1">Link1</a><a href="/link2">Link2</a><a href="dir/link4">Link4</a><a href="mailto:[email protected]">Email</a><a href="tel:123">Telephone</a><a href="/nofollow" rel="nofollow">No follow</a>');
});

app.get('/link1', function (request, response) {
Expand All @@ -14,6 +14,10 @@ app.get('/javascript', function (request, response) {
response.end('This page can only be reached if JavaScript is being executed');
});

// Page behind the rel="nofollow" link; the crawler test asserts that this
// URL is never crawled.
app.get('/nofollow', (request, response) => {
    const body = 'This page should not be crawler';
    response.end(body);
});

// /link2 page: links on to /link3 and to two pages hosted on subdomains of
// localhost:8080.
app.get('/link2', (request, response) => {
    const body = 'You are on link2<a href="/link3">Link3</a><a href="http://sub.localhost:8080/subdomainpage">Subdomain</a><a href="http://subdomain.sub.localhost:8080/subdomainpage">Subdomain2</a>';
    response.end(body);
});
Expand Down

0 comments on commit 66ca6b3

Please sign in to comment.