diff --git a/src/Crawler.php b/src/Crawler.php
index 547d3b0..3d54d46 100644
--- a/src/Crawler.php
+++ b/src/Crawler.php
@@ -447,7 +447,7 @@ public function startCrawling(UriInterface|string $baseUrl)
         }
     }
 
-    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null): ?Node
+    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null, ?UriInterface $originalUrl = null): ?Node
     {
         if (is_null($this->maximumDepth)) {
             return new Node((string) $url);
@@ -457,7 +457,7 @@ public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node
 
         $returnNode = null;
 
-        if ($node->getValue() === (string) $parentUrl) {
+        if ($node->getValue() === (string) $parentUrl || $node->getValue() === (string) $originalUrl) {
             $newNode = new Node((string) $url);
 
             $node->addChild($newNode);
@@ -466,7 +466,7 @@ public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node
         }
 
         foreach ($node->getChildren() as $currentNode) {
-            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);
+            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode, $originalUrl);
 
             if (! is_null($returnNode)) {
                 break;
diff --git a/src/Handlers/CrawlRequestFulfilled.php b/src/Handlers/CrawlRequestFulfilled.php
index 4f45b49..7df172a 100644
--- a/src/Handlers/CrawlRequestFulfilled.php
+++ b/src/Handlers/CrawlRequestFulfilled.php
@@ -62,8 +62,9 @@ public function __invoke(ResponseInterface $response, $index)
         }
 
         $baseUrl = $this->getBaseUrl($response, $crawlUrl);
+        $originalUrl = $crawlUrl->url;
 
-        $this->urlParser->addFromHtml($body, $baseUrl);
+        $this->urlParser->addFromHtml($body, $baseUrl, $originalUrl);
 
         usleep($this->crawler->getDelayBetweenRequests());
     }
diff --git a/src/UrlParsers/LinkUrlParser.php b/src/UrlParsers/LinkUrlParser.php
index 69684ca..a48dcf4 100644
--- a/src/UrlParsers/LinkUrlParser.php
+++ b/src/UrlParsers/LinkUrlParser.php
@@ -21,15 +21,15 @@ public function __construct(Crawler $crawler)
         $this->crawler = $crawler;
     }
 
-    public function addFromHtml(string $html, UriInterface $foundOnUrl): void
+    public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void
     {
         $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);
 
         collect($allLinks)
             ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
             ->map(fn (Url $url) => $this->normalizeUrl($url))
-            ->filter(function (Url $url) use ($foundOnUrl) {
-                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) {
+            ->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
+                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, null, $originalUrl)) {
                     return false;
                 }
diff --git a/src/UrlParsers/SitemapUrlParser.php b/src/UrlParsers/SitemapUrlParser.php
index 04bf47a..ef614f2 100644
--- a/src/UrlParsers/SitemapUrlParser.php
+++ b/src/UrlParsers/SitemapUrlParser.php
@@ -20,15 +20,15 @@ public function __construct(Crawler $crawler)
         $this->crawler = $crawler;
     }
 
-    public function addFromHtml(string $html, UriInterface $foundOnUrl): void
+    public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void
     {
         $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);
 
         collect($allLinks)
             ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
             ->map(fn (Url $url) => $this->normalizeUrl($url))
-            ->filter(function (Url $url) use ($foundOnUrl) {
-                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) {
+            ->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
+                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, null, $originalUrl)) {
                     return false;
                 }
diff --git a/src/UrlParsers/UrlParser.php b/src/UrlParsers/UrlParser.php
index 24ed582..e46a120 100644
--- a/src/UrlParsers/UrlParser.php
+++ b/src/UrlParsers/UrlParser.php
@@ -9,5 +9,5 @@ interface UrlParser
 {
     public function __construct(Crawler $crawler);
 
-    public function addFromHtml(string $html, UriInterface $foundOnUrl): void;
+    public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void;
 }
diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php
index b762356..ccfd953 100644
--- a/tests/CrawlerTest.php
+++ b/tests/CrawlerTest.php
@@ -364,6 +364,18 @@ public function shouldCrawl(UriInterface $url): bool
     assertCrawledUrlCount(3);
 });
 
+it('should handle redirects correctly when max depth is specified', function () {
+    createCrawler([
+        RequestOptions::ALLOW_REDIRECTS => [
+            'track_redirects' => true,
+        ],
+    ])
+        ->setMaximumDepth(5)
+        ->startCrawling('http://localhost:8080/redirect-home/');
+
+    expect(['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/'])->toBeCrawledOnce();
+});
+
 it('respects the requested delay between requests', function () {
     $baseUrl = 'http://localhost:8080';
diff --git a/tests/server/server.js b/tests/server/server.js
index b0dd455..bfc8e3e 100644
--- a/tests/server/server.js
+++ b/tests/server/server.js
@@ -70,6 +70,10 @@ app.get('/meta-nofollow', function (request, response) {
     response.end('\n\nno follow it');
 });
 
+app.get('/redirect-home/', function (request, response) {
+    response.redirect(301, '/');
+});
+
 app.get('/dir1/internal-redirect-entry/', function (request, response) {
     response.end('trapped trap-start');
 });
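
For context on the mechanics: the crux of the patch is the parent match inside addToDepthTree(). Before this change, links discovered after a redirect reported the post-redirect (effective) URL as their parent, but the depth tree only contained the original queued URL, so the match failed, addToDepthTree() returned null, and the parsers dropped every link found on the redirect target. The sketch below is illustrative only, not part of the patch; it assumes nicmart/tree's Node (which matches the new Node() / getValue() / addChild() API used in Crawler.php) and reuses the URLs from the new test.

    <?php

    // Illustrative sketch only -- not part of the patch above.
    // Assumes nicmart/tree's Node, matching the API used in Crawler.php.
    require __DIR__ . '/vendor/autoload.php';

    use Tree\Node\Node;

    // The depth tree is rooted at the URL the crawl started from.
    $root = new Node('http://localhost:8080/redirect-home/');

    // '/redirect-home/' 301s to '/', so links found there report the
    // *effective* URL as their parent ...
    $foundOnUrl = 'http://localhost:8080/';

    // ... while the tree only knows the *original* queued URL.
    $originalUrl = 'http://localhost:8080/redirect-home/';

    // Old parent match: nothing in the tree equals $foundOnUrl, so
    // addToDepthTree() returns null and the link is filtered out.
    var_dump($root->getValue() === $foundOnUrl);  // bool(false)

    // New parent match: the pre-redirect URL is accepted as well, so the
    // child node attaches and depth accounting continues from the root.
    var_dump($root->getValue() === $foundOnUrl || $root->getValue() === $originalUrl);  // bool(true)

Threading $originalUrl through the UrlParser interface also keeps the change backward compatible: the new parameter defaults to null, and a null $originalUrl casts to an empty string that can never equal a node's URL value, so crawls that involve no redirects behave exactly as before.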