Merge pull request #145 from spatie/respect-robots
Respect robots
brendt authored May 8, 2018
2 parents 016fca9 + 80e1a7b commit 0d1e607
Showing 9 changed files with 286 additions and 68 deletions.
5 changes: 1 addition & 4 deletions .travis.yml
@@ -23,7 +23,4 @@ before_script:
- travis_retry composer update ${COMPOSER_FLAGS} --no-interaction --prefer-source

script:
- vendor/bin/phpunit --coverage-text --coverage-clover=coverage.clover

after_script:
- php vendor/bin/ocular code-coverage:upload --format=php-clover coverage.clover
- vendor/bin/phpunit
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,10 @@

All notable changes to `spatie/crawler` will be documented in this file.

## 4.1.0 - 2018-05-08

- Respect robots.txt

## 4.0.5 - 2018-04-30

- improved extensibility by removing php native type hinting of url, queue and crawler pool Closures
15 changes: 15 additions & 0 deletions README.md
@@ -155,6 +155,21 @@ This package comes with three `CrawlProfiles` out of the box:
- `CrawlInternalUrls`: this profile will only crawl the internal urls of a host.
- `CrawlSubdomainUrls`: this profile will only crawl the internal urls of a host and its subdomains (a usage sketch follows below).

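A hedged usage sketch for the profiles above, assuming the package's `setCrawlProfile()` setter; `MyCrawlObserver` is a hypothetical observer class of your own:

```php
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlInternalUrls;

// Sketch only: restrict the crawl to internal urls of the given host.
// MyCrawlObserver stands in for your own crawl observer implementation.
Crawler::create()
    ->setCrawlProfile(new CrawlInternalUrls('https://example.com'))
    ->setCrawlObserver(new MyCrawlObserver())
    ->startCrawling('https://example.com');
```
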
### Ignoring robots.txt and robots meta

By default, the crawler will respect robots data. It is possible to disable these checks like so:

```php
Crawler::create()
->ignoreRobots()
...
```
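
For reference, a minimal end-to-end sketch with the robots checks disabled; `MyCrawlObserver` is a hypothetical observer, while `setCrawlObserver()` and `startCrawling()` are the calls used in the tests of this pull request:

```php
use Spatie\Crawler\Crawler;

// Sketch only: crawl a site while ignoring robots.txt, robots meta tags and robots headers.
Crawler::create()
    ->ignoreRobots()
    ->setCrawlObserver(new MyCrawlObserver())
    ->startCrawling('https://example.com');
```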

Robots data can come from a `robots.txt` file, meta tags, or response headers.
More information on the spec can be found here: [http://www.robotstxt.org/](http://www.robotstxt.org/).

Parsing robots data is done by our package [spatie/robots-txt](https://github.com/spatie/robots-txt).

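A minimal sketch of the `spatie/robots-txt` calls this pull request relies on, assuming `$response` is a PSR-7 response and `$html` its body (mirroring `src/Crawler.php` below):

```php
use GuzzleHttp\Psr7\Uri;
use Spatie\Robots\RobotsTxt;
use Spatie\Robots\RobotsMeta;
use Spatie\Robots\RobotsHeaders;

// robots.txt rules, resolved from the site root as in Crawler::createRobotsTxt()
$robotsTxt = RobotsTxt::create((new Uri('https://example.com'))->withPath('/robots.txt'));
$mayCrawl = $robotsTxt->allows('https://example.com/some-page');

// X-Robots-Tag response headers and <meta name="robots"> tags
$robotsHeaders = RobotsHeaders::create($response->getHeaders());
$robotsMeta = RobotsMeta::create($html);

$mayIndex = $robotsHeaders->mayIndex() && $robotsMeta->mayIndex();
$mayFollow = $robotsHeaders->mayFollow() && $robotsMeta->mayFollow();
```
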
## Setting the number of concurrent requests

To improve the speed of the crawl, the package crawls 10 urls concurrently by default. If you want to change that number, you can use the `setConcurrency` method.
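
A hedged sketch, reusing the hypothetical observer from the examples above:

```php
use Spatie\Crawler\Crawler;

// Sketch only: crawl one url at a time instead of the default 10.
Crawler::create()
    ->setConcurrency(1)
    ->setCrawlObserver(new MyCrawlObserver())
    ->startCrawling('https://example.com');
```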
1 change: 1 addition & 0 deletions composer.json
@@ -21,6 +21,7 @@
"guzzlehttp/psr7": "^1.4",
"nicmart/tree": "^0.2.7",
"spatie/browsershot": "^3.14",
"spatie/robots-txt": "^1.0.0",
"symfony/dom-crawler": "^4.0",
"tightenco/collect": "^5.6"
},
99 changes: 94 additions & 5 deletions src/Crawler.php
@@ -8,8 +8,11 @@
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\Request;
use Spatie\Robots\RobotsTxt;
use InvalidArgumentException;
use Spatie\Robots\RobotsMeta;
use GuzzleHttp\RequestOptions;
use Spatie\Robots\RobotsHeaders;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Psr\Http\Message\StreamInterface;
@@ -52,6 +55,9 @@ class Crawler
/** @var int|null */
protected $maximumDepth = null;

/** @var bool */
protected $respectRobots = true;

/** @var \Tree\Node\Node */
protected $depthTree;

@@ -61,6 +67,9 @@ class Crawler
/** @var Browsershot */
protected $browsershot = null;

/** @var \Spatie\Robots\RobotsTxt */
protected $robotsTxt = null;

protected static $defaultClientOptions = [
RequestOptions::COOKIES => true,
RequestOptions::CONNECT_TIMEOUT => 10,
@@ -145,6 +154,26 @@ public function setMaximumDepth(int $maximumDepth)
return $this;
}

/**
* @return $this
*/
public function ignoreRobots()
{
$this->respectRobots = false;

return $this;
}

/**
* @return $this
*/
public function respectRobots()
{
$this->respectRobots = true;

return $this;
}

/**
* @param CrawlQueue $crawlQueue
*
@@ -238,7 +267,11 @@ public function startCrawling($baseUrl)

$crawlUrl = CrawlUrl::create($this->baseUrl);

$this->addToCrawlQueue($crawlUrl);
$this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

if ($this->robotsTxt->allows((string) $crawlUrl->url)) {
$this->addToCrawlQueue($crawlUrl);
}

$this->depthTree = new Node((string) $this->baseUrl);

@@ -257,6 +290,17 @@ protected function startCrawlingQueue()
'options' => $this->client->getConfig(),
'fulfilled' => function (ResponseInterface $response, $index) {
$crawlUrl = $this->crawlQueue->getUrlById($index);

$body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

$robotsHeaders = RobotsHeaders::create($response->getHeaders());

$robotsMeta = RobotsMeta::create($body);

if (! $this->mayIndex($robotsHeaders, $robotsMeta)) {
return;
}

$this->handleCrawled($response, $crawlUrl);

if (! $this->crawlProfile instanceof CrawlSubdomains) {
@@ -265,7 +309,9 @@ protected function startCrawlingQueue()
}
}

$body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);
if (! $this->mayFollow($robotsHeaders, $robotsMeta)) {
return;
}

$this->addAllLinksToCrawlQueue(
$body,
@@ -301,9 +347,14 @@ protected function convertBodyToString(StreamInterface $bodyStream, $readMaximum
return $body;
}

protected function createRobotsTxt(UriInterface $uri): RobotsTxt
{
return RobotsTxt::create($uri->withPath('/robots.txt'));
}

/**
* @param ResponseInterface|null $response
* @param CrawlUrl $crawlUrl
* @param CrawlUrl $crawlUrl
*/
protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
{
@@ -318,7 +369,7 @@ protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl

/**
* @param RequestException $exception
* @param CrawlUrl $crawlUrl
* @param CrawlUrl $crawlUrl
*/
protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)
{
@@ -393,6 +444,10 @@ protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUr

protected function shouldCrawl(Node $node): bool
{
if ($this->respectRobots) {
return $this->robotsTxt->allows($node->getValue());
}

if (is_null($this->maximumDepth)) {
return true;
}
@@ -401,7 +456,7 @@ protected function shouldCrawl(Node $node): bool
}

/**
* @param string $html
* @param string $html
* @param \Psr\Http\Message\UriInterface $foundOnUrl
*
* @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
@@ -505,4 +560,38 @@ protected function maximumCrawlCountReached(): bool

return $this->crawledUrlCount >= $this->maximumCrawlCount;
}

protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
{
if (! $this->respectRobots) {
return true;
}

if (! $robotsHeaders->mayIndex()) {
return false;
}

if (! $robotsMeta->mayIndex()) {
return false;
}

return true;
}

protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
{
if (! $this->respectRobots) {
return true;
}

if (! $robotsHeaders->mayFollow()) {
return false;
}

if (! $robotsMeta->mayFollow()) {
return false;
}

return true;
}
}
90 changes: 90 additions & 0 deletions tests/CrawlerRobotsTest.php
@@ -0,0 +1,90 @@
<?php

namespace Spatie\Crawler\Test;

use Spatie\Crawler\Crawler;

class CrawlerRobotsTest extends TestCase
{
public function setUp()
{
parent::setUp();

$this->skipIfTestServerIsNotRunning();

$this->resetLog();
}

/** @test */
public function it_should_not_follow_robots_txt_disallowed_links()
{
$this->createCrawler()
->startCrawling('http://localhost:8080');

$this->assertNotCrawled([['url' => 'http://localhost:8080/txt-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

/** @test */
public function it_does_not_allow_a_root_ignored_url()
{
$this->createCrawler()
->startCrawling('http://localhost:8080/txt-disallow');

$this->assertNotCrawled([['url' => 'http://localhost:8080/txt-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

/** @test */
public function it_should_follow_robots_txt_disallowed_links_when_robots_are_ignored()
{
$this->createCrawler()
->ignoreRobots()
->startCrawling('http://localhost:8080');

$this->assertCrawledOnce([['url' => 'http://localhost:8080/txt-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

/** @test */
public function it_should_not_follow_robots_meta_disallowed_links()
{
$this->createCrawler()
->startCrawling('http://localhost:8080');

$this->assertNotCrawled([['url' => 'http://localhost:8080/meta-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

/** @test */
public function it_should_follow_robots_meta_disallowed_links_when_robots_are_ignored()
{
$this->createCrawler()
->ignoreRobots()
->startCrawling('http://localhost:8080');

$this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

/** @test */
public function it_should_not_follow_robots_header_disallowed_links()
{
$this->createCrawler()
->startCrawling('http://localhost:8080');

$this->assertNotCrawled([['url' => 'http://localhost:8080/header-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

/** @test */
public function it_should_follow_robots_header_disallowed_links_when_robots_are_ignored()
{
$this->createCrawler()
->ignoreRobots()
->startCrawling('http://localhost:8080');

$this->assertCrawledOnce([['url' => 'http://localhost:8080/header-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

private function createCrawler(): Crawler
{
return Crawler::create()
->setMaximumDepth(1)
->setCrawlObserver(new CrawlLogger());
}
}