Skip to content

Commit

Permalink
feature: Removing reppy dependency
Browse files Browse the repository at this point in the history
Because reppy isn't supported anymore
( see seomoz/reppy#122 ), its functionality is
replaced by the standard-library class urllib.robotparser.RobotFileParser
with a small Google-oriented extension.
  • Loading branch information
butuzov committed Apr 11, 2021
1 parent decb06f commit 184bd58
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 18 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ deps: venv ## install requiements


dev-env: deps clean ## Install Development Version
$(PYTHON) -m pip uninstall deadlinks -y
pip uninstall deadlinks -y
pip install -e .


Expand Down
63 changes: 55 additions & 8 deletions deadlinks/robots_txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,9 @@
"""

# -- Imports -------------------------------------------------------------------
from typing import (Any)
from typing import (Any, List, Tuple)

from reppy.robots import Robots
from reppy.exceptions import ReppyException
from urllib.robotparser import (RobotFileParser, Entry)

from .request import user_agent
from .url import URL
Expand All @@ -46,18 +45,66 @@ def allowed(self, url: URL) -> bool:

# We can't tell whether robots.txt exists or not,
# so we allow everything in this case.
if self.state is False:
if self.state is False or self.state.allow_all:
return True

return bool(self.state.allowed(str(url), user_agent))
if not self.state.last_checked and self.state.disallow_all:
return False

# find entry
return self._allowed(self._results(self._entry(), url))

def request(self, url: str) -> None:
""" Perform robots.txt request """
if not (self.state is None):
if self.state is not None:
return

try:
self.state = Robots.fetch(url)
self.state = RobotFileParser()
self.state.set_url(url)
self.state.read()

except ReppyException:
except Exception:
self.state = False

# This is mostly logic transferred from robotparser.py,
# but we try to follow the 2019 Google extension of the robots.txt
# protocol for resolving Allow and Disallow paths.
# https://www.contentkingapp.com/blog/implications-of-new-robots-txt-rfc/
# https://tools.ietf.org/html/draft-koster-rep-04

def _entry(self) -> Entry:
    """ Select the robots.txt group that governs our user agent.

    Returns the first entry of the parsed robots.txt whose
    ``applies_to(user_agent)`` check succeeds; when none does, the
    parser's ``default_entry`` is returned instead.
    """
    candidates = (group for group in self.state.entries if group.applies_to(user_agent))

    return next(candidates, self.state.default_entry)

def _results(self, entry: Entry, url: URL) -> List[Tuple[str, bool]]:
result: List[Tuple[str, bool]] = []

path = url.path
if not path:
path = "/"

for line in entry.rulelines:
if not line.applies_to(path):
continue

if len(line.path) > len(path):
continue

result.append((
line.allowance,
line.path,
))

return sorted(result, key=lambda x: x[1])

def _allowed(self, rules: List[Tuple[str, bool]]) -> bool:

if not rules:
return True

return rules[-1][0]
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ idna>=2.8
requests>=2.22.0
click>=7.0
urllib3>=1.25.6
reppy==0.4.14
six==1.15.0
PyOpenSSL==19.1.0; python_full_version < '3.6.0'

Expand Down
14 changes: 6 additions & 8 deletions tests/features/tests_robots.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,15 @@
# -- Imports -------------------------------------------------------------------

import pytest
from flaky import flaky

from copy import deepcopy as copy
from typing import (Optional, Dict)
from typing import Dict

from ..utils import Page

from deadlinks import (Settings, Crawler)
from deadlinks import user_agent

from deadlinks import (
DeadlinksIgnoredURL,
DeadlinksSettingsBase,
)
from deadlinks import DeadlinksIgnoredURL

server_pages = {
'^/$': Page("".join(["<a href='/link-%s'>%s</a>" % (x, x) for x in range(1, 101)])).exists(),
Expand Down Expand Up @@ -97,13 +92,16 @@ def test_failed_domain():
from random import choice
from string import ascii_lowercase

domain = "http://%s.com/" % ''.join([choice(ascii_lowercase) for x in range(42)])
domain = "http://%s.com/" % ''.join(choice(ascii_lowercase) for x in range(42))
c = Crawler(Settings(domain))
c.start()

assert len(c.failed) == 1


# Allow is deeper than Disallow.
# https://www.contentkingapp.com/blog/implications-of-new-robots-txt-rfc/
# https://tools.ietf.org/html/draft-koster-rep-04
def test_failed_google():

c = Crawler(
Expand Down

0 comments on commit 184bd58

Please sign in to comment.