Merge pull request #145 from spatie/respect-robots
Respect robots
brendt authored May 8, 2018
2 parents 016fca9 + 80e1a7b commit 0d1e607
Showing 9 changed files with 286 additions and 68 deletions.
5 changes: 1 addition & 4 deletions .travis.yml
@@ -23,7 +23,4 @@ before_script:
- travis_retry composer update ${COMPOSER_FLAGS} --no-interaction --prefer-source

script:
- vendor/bin/phpunit --coverage-text --coverage-clover=coverage.clover

after_script:
- php vendor/bin/ocular code-coverage:upload --format=php-clover coverage.clover
- vendor/bin/phpunit
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,10 @@

All notable changes to `spatie/crawler` will be documented in this file.

## 4.1.0 - 2018-05-08

- Respect robots.txt

## 4.0.5 - 2018-04-30

- improved extensibility by removing php native type hinting of url, queue and crawler pool Closures
15 changes: 15 additions & 0 deletions README.md
@@ -155,6 +155,21 @@ This package comes with three `CrawlProfiles` out of the box:
- `CrawlInternalUrls`: this profile will only crawl the internal urls of a host.
- `CrawlSubdomainUrls`: this profile will only crawl the internal urls of a host and its subdomains (a usage sketch follows below).

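A hedged usage sketch for the profiles above, assuming the package's `setCrawlProfile()` setter; `MyCrawlObserver` is a hypothetical observer class of your own:

```php
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlInternalUrls;

// Sketch only: restrict the crawl to internal urls of the given host.
// MyCrawlObserver stands in for your own crawl observer implementation.
Crawler::create()
    ->setCrawlProfile(new CrawlInternalUrls('https://example.com'))
    ->setCrawlObserver(new MyCrawlObserver())
    ->startCrawling('https://example.com');
```
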
### Ignoring robots.txt and robots meta

By default, the crawler will respect robots data. It is possible to disable these checks like so:

```php
Crawler::create()
->ignoreRobots()
...
```
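
For reference, a minimal end-to-end sketch with the robots checks disabled; `MyCrawlObserver` is a hypothetical observer, while `setCrawlObserver()` and `startCrawling()` are the calls used in the tests of this pull request:

```php
use Spatie\Crawler\Crawler;

// Sketch only: crawl a site while ignoring robots.txt, robots meta tags and robots headers.
Crawler::create()
    ->ignoreRobots()
    ->setCrawlObserver(new MyCrawlObserver())
    ->startCrawling('https://example.com');
```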

Robots data can come from a `robots.txt` file, meta tags, or response headers.
More information on the spec can be found here: [http://www.robotstxt.org/](http://www.robotstxt.org/).

Parsing robots data is done by our package [spatie/robots-txt](https://github.com/spatie/robots-txt).

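A minimal sketch of the `spatie/robots-txt` calls this pull request relies on, assuming `$response` is a PSR-7 response and `$html` its body (mirroring `src/Crawler.php` below):

```php
use GuzzleHttp\Psr7\Uri;
use Spatie\Robots\RobotsTxt;
use Spatie\Robots\RobotsMeta;
use Spatie\Robots\RobotsHeaders;

// robots.txt rules, resolved from the site root as in Crawler::createRobotsTxt()
$robotsTxt = RobotsTxt::create((new Uri('https://example.com'))->withPath('/robots.txt'));
$mayCrawl = $robotsTxt->allows('https://example.com/some-page');

// X-Robots-Tag response headers and <meta name="robots"> tags
$robotsHeaders = RobotsHeaders::create($response->getHeaders());
$robotsMeta = RobotsMeta::create($html);

$mayIndex = $robotsHeaders->mayIndex() && $robotsMeta->mayIndex();
$mayFollow = $robotsHeaders->mayFollow() && $robotsMeta->mayFollow();
```
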
## Setting the number of concurrent requests

To improve the speed of the crawl, the package crawls 10 urls concurrently by default. If you want to change that number, you can use the `setConcurrency` method.
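
A hedged sketch, reusing the hypothetical observer from the examples above:

```php
use Spatie\Crawler\Crawler;

// Sketch only: crawl one url at a time instead of the default 10.
Crawler::create()
    ->setConcurrency(1)
    ->setCrawlObserver(new MyCrawlObserver())
    ->startCrawling('https://example.com');
```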
1 change: 1 addition & 0 deletions composer.json
@@ -21,6 +21,7 @@
"guzzlehttp/psr7": "^1.4",
"nicmart/tree": "^0.2.7",
"spatie/browsershot": "^3.14",
"spatie/robots-txt": "^1.0.0",
"symfony/dom-crawler": "^4.0",
"tightenco/collect": "^5.6"
},
99 changes: 94 additions & 5 deletions src/Crawler.php
@@ -8,8 +8,11 @@
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\Request;
use Spatie\Robots\RobotsTxt;
use InvalidArgumentException;
use Spatie\Robots\RobotsMeta;
use GuzzleHttp\RequestOptions;
use Spatie\Robots\RobotsHeaders;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Psr\Http\Message\StreamInterface;
@@ -52,6 +55,9 @@ class Crawler
/** @var int|null */
protected $maximumDepth = null;

/** @var bool */
protected $respectRobots = true;

/** @var \Tree\Node\Node */
protected $depthTree;

@@ -61,6 +67,9 @@ class Crawler
/** @var Browsershot */
protected $browsershot = null;

/** @var \Spatie\Robots\RobotsTxt */
protected $robotsTxt = null;

protected static $defaultClientOptions = [
RequestOptions::COOKIES => true,
RequestOptions::CONNECT_TIMEOUT => 10,
@@ -145,6 +154,26 @@ public function setMaximumDepth(int $maximumDepth)
return $this;
}

/**
* @return $this
*/
public function ignoreRobots()
{
$this->respectRobots = false;

return $this;
}

/**
* @return $this
*/
public function respectRobots()
{
$this->respectRobots = true;

return $this;
}

/**
* @param CrawlQueue $crawlQueue
*
@@ -238,7 +267,11 @@ public function startCrawling($baseUrl)

$crawlUrl = CrawlUrl::create($this->baseUrl);

$this->addToCrawlQueue($crawlUrl);
$this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

if ($this->robotsTxt->allows((string) $crawlUrl->url)) {
$this->addToCrawlQueue($crawlUrl);
}

$this->depthTree = new Node((string) $this->baseUrl);

@@ -257,6 +290,17 @@ protected function startCrawlingQueue()
'options' => $this->client->getConfig(),
'fulfilled' => function (ResponseInterface $response, $index) {
$crawlUrl = $this->crawlQueue->getUrlById($index);

$body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

$robotsHeaders = RobotsHeaders::create($response->getHeaders());

$robotsMeta = RobotsMeta::create($body);

if (! $this->mayIndex($robotsHeaders, $robotsMeta)) {
return;
}

$this->handleCrawled($response, $crawlUrl);

if (! $this->crawlProfile instanceof CrawlSubdomains) {
@@ -265,7 +309,9 @@ protected function startCrawlingQueue()
}
}

$body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);
if (! $this->mayFollow($robotsHeaders, $robotsMeta)) {
return;
}

$this->addAllLinksToCrawlQueue(
$body,
@@ -301,9 +347,14 @@ protected function convertBodyToString(StreamInterface $bodyStream, $readMaximum
return $body;
}

protected function createRobotsTxt(UriInterface $uri): RobotsTxt
{
return RobotsTxt::create($uri->withPath('/robots.txt'));
}

/**
* @param ResponseInterface|null $response
* @param CrawlUrl $crawlUrl
* @param CrawlUrl $crawlUrl
*/
protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
{
@@ -318,7 +369,7 @@ protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl

/**
* @param RequestException $exception
* @param CrawlUrl $crawlUrl
* @param CrawlUrl $crawlUrl
*/
protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)
{
@@ -393,6 +444,10 @@ protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUr

protected function shouldCrawl(Node $node): bool
{
if ($this->respectRobots) {
return $this->robotsTxt->allows($node->getValue());
}

if (is_null($this->maximumDepth)) {
return true;
}
@@ -401,7 +456,7 @@ protected function shouldCrawl(Node $node): bool
}

/**
* @param string $html
* @param string $html
* @param \Psr\Http\Message\UriInterface $foundOnUrl
*
* @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
@@ -505,4 +560,38 @@ protected function maximumCrawlCountReached(): bool

return $this->crawledUrlCount >= $this->maximumCrawlCount;
}

protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
{
if (! $this->respectRobots) {
return true;
}

if (! $robotsHeaders->mayIndex()) {
return false;
}

if (! $robotsMeta->mayIndex()) {
return false;
}

return true;
}

protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
{
if (! $this->respectRobots) {
return true;
}

if (! $robotsHeaders->mayFollow()) {
return false;
}

if (! $robotsMeta->mayFollow()) {
return false;
}

return true;
}
}
90 changes: 90 additions & 0 deletions tests/CrawlerRobotsTest.php
@@ -0,0 +1,90 @@
<?php

namespace Spatie\Crawler\Test;

use Spatie\Crawler\Crawler;

class CrawlerRobotsTest extends TestCase
{
public function setUp()
{
parent::setUp();

$this->skipIfTestServerIsNotRunning();

$this->resetLog();
}

/** @test */
public function it_should_not_follow_robots_txt_disallowed_links()
{
$this->createCrawler()
->startCrawling('http://localhost:8080');

$this->assertNotCrawled([['url' => 'http://localhost:8080/txt-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

/** @test */
public function it_does_not_allow_a_root_ignored_url()
{
$this->createCrawler()
->startCrawling('http://localhost:8080/txt-disallow');

$this->assertNotCrawled([['url' => 'http://localhost:8080/txt-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

/** @test */
public function it_should_follow_robots_txt_disallowed_links_when_robots_are_ignored()
{
$this->createCrawler()
->ignoreRobots()
->startCrawling('http://localhost:8080');

$this->assertCrawledOnce([['url' => 'http://localhost:8080/txt-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

/** @test */
public function it_should_not_follow_robots_meta_disallowed_links()
{
$this->createCrawler()
->startCrawling('http://localhost:8080');

$this->assertNotCrawled([['url' => 'http://localhost:8080/meta-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

/** @test */
public function it_should_follow_robots_meta_disallowed_links_when_robots_are_ignored()
{
$this->createCrawler()
->ignoreRobots()
->startCrawling('http://localhost:8080');

$this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

/** @test */
public function it_should_not_follow_robots_header_disallowed_links()
{
$this->createCrawler()
->startCrawling('http://localhost:8080');

$this->assertNotCrawled([['url' => 'http://localhost:8080/header-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

/** @test */
public function it_should_follow_robots_header_disallowed_links_when_robots_are_ignored()
{
$this->createCrawler()
->ignoreRobots()
->startCrawling('http://localhost:8080');

$this->assertCrawledOnce([['url' => 'http://localhost:8080/header-disallow', 'foundOn' => 'http://localhost:8080/']]);
}

private function createCrawler(): Crawler
{
return Crawler::create()
->setMaximumDepth(1)
->setCrawlObserver(new CrawlLogger());
}
}