diff --git a/Makefile b/Makefile index b103142..8391c77 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ deps: venv ## install requiements dev-env: deps clean ## Install Development Version - $(PYTHON) -m pip uninstall deadlinks -y + pip uninstall deadlinks -y pip install -e . diff --git a/deadlinks/robots_txt.py b/deadlinks/robots_txt.py index 89b0d32..93b6934 100644 --- a/deadlinks/robots_txt.py +++ b/deadlinks/robots_txt.py @@ -23,10 +23,9 @@ """ # -- Imports ------------------------------------------------------------------- -from typing import (Any) +from typing import (Any, List, Tuple) -from reppy.robots import Robots -from reppy.exceptions import ReppyException +from urllib.robotparser import (RobotFileParser, Entry) from .request import user_agent from .url import URL @@ -46,18 +45,68 @@ def allowed(self, url: URL) -> bool: # We actually can't find out is there robots.txt or not # so we going to allow all in this case. - if self.state is False: + if self.state is False or self.state.allow_all: return True - return bool(self.state.allowed(str(url), user_agent)) + if not self.state.last_checked and self.state.disallow_all: + return False + + # find entry + return allowed(matched_rules(self._entry(), url)) def request(self, url: str) -> None: """ Perform robots.txt request """ - if not (self.state is None): + if self.state is not None: return try: - self.state = Robots.fetch(url) + self.state = RobotFileParser() + self.state.set_url(url) + self.state.read() - except ReppyException: + except Exception: self.state = False + + # This is mostly logic transferred from robotparser.py, + # but we try to follow the 2019 extension of Google's robots.txt + # protocol for allowed and disallowed paths.
+ # https://www.contentkingapp.com/blog/implications-of-new-robots-txt-rfc/ + # https://tools.ietf.org/html/draft-koster-rep-04 + + def _entry(self) -> Entry: + + for entry in self.state.entries: + if entry.applies_to(user_agent): + return entry + + return self.state.default_entry + + +def matched_rules(entry: Entry, url: URL) -> List[Tuple[bool, str]]: + result: List[Tuple[str, bool]] = [] + + path = url.path + if not path: + path = "/" + + for line in entry.rulelines: + if not line.applies_to(path): + continue + + if len(line.path) > len(path): + continue + + result.append(( + line.allowance, + line.path, + )) + + return sorted(result, key=lambda x: x[1]) + + +def allowed(rules: List[Tuple[bool, str]]) -> bool: + + if not rules: + return True + + return rules[-1][0] diff --git a/requirements.txt b/requirements.txt index 9a91053..0baac94 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,7 +27,6 @@ idna>=2.8 requests>=2.22.0 click>=7.0 urllib3>=1.25.6 -reppy==0.4.14 six==1.15.0 PyOpenSSL==19.1.0; python_full_version < '3.6.0' diff --git a/tests/features/tests_robots.py b/tests/features/tests_robots.py index 7563b1a..bbc25e3 100644 --- a/tests/features/tests_robots.py +++ b/tests/features/tests_robots.py @@ -11,20 +11,15 @@ # -- Imports ------------------------------------------------------------------- import pytest -from flaky import flaky from copy import deepcopy as copy -from typing import (Optional, Dict) +from typing import Dict from ..utils import Page from deadlinks import (Settings, Crawler) -from deadlinks import user_agent -from deadlinks import ( - DeadlinksIgnoredURL, - DeadlinksSettingsBase, -) +from deadlinks import DeadlinksIgnoredURL server_pages = { '^/$': Page("".join(["%s" % (x, x) for x in range(1, 101)])).exists(), @@ -97,13 +92,16 @@ def test_failed_domain(): from random import choice from string import ascii_lowercase - domain = "http://%s.com/" % ''.join([choice(ascii_lowercase) for x in range(42)]) + domain = "http://%s.com/" % 
''.join(choice(ascii_lowercase) for x in range(42)) c = Crawler(Settings(domain)) c.start() assert len(c.failed) == 1 +# The Allow rule is deeper (more specific) than the Disallow rule. +# https://www.contentkingapp.com/blog/implications-of-new-robots-txt-rfc/ +# https://tools.ietf.org/html/draft-koster-rep-04 def test_failed_google(): c = Crawler(