feature: Removing reppy dependency #94

Merged 1 commit on Apr 11, 2021
2 changes: 1 addition & 1 deletion Makefile
@@ -42,7 +42,7 @@ deps: venv ## install requiements


dev-env: deps clean ## Install Development Version
$(PYTHON) -m pip uninstall deadlinks -y
pip uninstall deadlinks -y
pip install -e .


65 changes: 57 additions & 8 deletions deadlinks/robots_txt.py
@@ -23,10 +23,9 @@
"""

# -- Imports -------------------------------------------------------------------
from typing import (Any)
from typing import (Any, List, Tuple)

from reppy.robots import Robots
from reppy.exceptions import ReppyException
from urllib.robotparser import RobotFileParser

from .request import user_agent
from .url import URL
@@ -46,18 +45,68 @@ def allowed(self, url: URL) -> bool:

# We actually can't find out whether robots.txt exists or not,
# so we are going to allow everything in that case.
if self.state is False:
if self.state is False or self.state.allow_all:
return True

return bool(self.state.allowed(str(url), user_agent))
if not self.state.last_checked and self.state.disallow_all:
return False

# find entry
return allowed(matched_rules(self._entry(), url))

def request(self, url: str) -> None:
""" Perform robots.txt request """
if not (self.state is None):
if self.state is not None:
return

try:
self.state = Robots.fetch(url)
self.state = RobotFileParser()
self.state.set_url(url)
self.state.read()

except ReppyException:
except Exception:
self.state = False

# This is mostly logic transferred from robotparser.py, but we try to
# follow the 2019 extension of Google's Robots Exclusion Protocol draft
# and honour both allowed and disallowed paths.
# https://www.contentkingapp.com/blog/implications-of-new-robots-txt-rfc/
# https://tools.ietf.org/html/draft-koster-rep-04

def _entry(self) -> Any:

for entry in self.state.entries:
if entry.applies_to(user_agent):
return entry

return self.state.default_entry


def matched_rules(entry: Any, url: URL) -> List[Tuple[bool, str]]:
result: List[Tuple[bool, str]] = []

path = url.path
if not path:
path = "/"

for line in entry.rulelines:
if not line.applies_to(path):
continue

if len(line.path) > len(path):
continue

result.append((
line.allowance,
line.path,
))

return sorted(result, key=lambda x: x[1])


def allowed(rules: List[Tuple[bool, str]]) -> bool:

if not rules:
return True

return rules[-1][0]
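
The comment block in robots_txt.py above references the 2019 draft extension of the Robots Exclusion Protocol, under which the most specific matching rule wins, so a deeper Allow can override a broader Disallow. Below is a minimal sketch of how the new matched_rules() and allowed() helpers behave under that rule. It is not part of the diff: the robots.txt body and the paths are invented, it assumes a deadlinks build containing this patch is importable, and SimpleNamespace stands in for deadlinks' URL class because matched_rules() only reads the .path attribute.

from types import SimpleNamespace
from urllib.robotparser import RobotFileParser

from deadlinks.robots_txt import allowed, matched_rules

parser = RobotFileParser()
parser.parse([
    "User-agent: *",
    "Disallow: /docs",
    "Allow: /docs/api",
])

# Rules for "User-agent: *" land in default_entry, which _entry() falls back to.
entry = parser.default_entry

# matched_rules() only looks at url.path, so a simple stand-in object is enough here.
page = SimpleNamespace(path="/docs/api/index.html")
rules = matched_rules(entry, page)
print(rules)           # [(False, '/docs'), (True, '/docs/api')]
print(allowed(rules))  # True: the deeper Allow sorts last and wins

other = SimpleNamespace(path="/docs/internal")
print(allowed(matched_rules(entry, other)))  # False: only the Disallow matches

Since every matched rule is a prefix of the same path, sorting by the path string orders the rules by length, and allowed() simply takes the verdict of the last, most specific one.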
1 change: 0 additions & 1 deletion requirements.txt
@@ -27,7 +27,6 @@ idna>=2.8
requests>=2.22.0
click>=7.0
urllib3>=1.25.6
reppy==0.4.14
six==1.15.0
PyOpenSSL==19.1.0; python_full_version < '3.6.0'

Expand Down
14 changes: 6 additions & 8 deletions tests/features/tests_robots.py
@@ -11,20 +11,15 @@
# -- Imports -------------------------------------------------------------------

import pytest
from flaky import flaky

from copy import deepcopy as copy
from typing import (Optional, Dict)
from typing import Dict

from ..utils import Page

from deadlinks import (Settings, Crawler)
from deadlinks import user_agent

from deadlinks import (
DeadlinksIgnoredURL,
DeadlinksSettingsBase,
)
from deadlinks import DeadlinksIgnoredURL

server_pages = {
'^/$': Page("".join(["<a href='/link-%s'>%s</a>" % (x, x) for x in range(1, 101)])).exists(),
@@ -97,13 +92,16 @@ def test_failed_domain():
from random import choice
from string import ascii_lowercase

domain = "http://%s.com/" % ''.join([choice(ascii_lowercase) for x in range(42)])
domain = "http://%s.com/" % ''.join(choice(ascii_lowercase) for x in range(42))
c = Crawler(Settings(domain))
c.start()

assert len(c.failed) == 1


# Allow is deeper than Disallow.
# https://www.contentkingapp.com/blog/implications-of-new-robots-txt-rfc/
# https://tools.ietf.org/html/draft-koster-rep-04
def test_failed_google():

c = Crawler(
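
The body of test_failed_google is truncated in this view, but the comment preceding it points at the same RFC draft. For contrast, here is a small sketch of why the crawler no longer relies on the stock matcher. It is not from the test suite: the robots.txt content and URL are invented, and it only illustrates that urllib.robotparser's can_fetch() returns the verdict of the first matching rule in file order, so a broad Disallow shadows a more specific Allow, whereas the longest-match behaviour now implemented in deadlinks permits the URL.

from urllib.robotparser import RobotFileParser

parser = RobotFileParser()
parser.parse([
    "User-agent: *",
    "Disallow: /docs",
    "Allow: /docs/api",
])
parser.modified()  # mark robots.txt as fetched so can_fetch() does not bail out early

# First-match semantics: the broad Disallow is hit before the specific Allow.
print(parser.can_fetch("deadlinks", "http://example.com/docs/api/page"))  # False

# Under the draft-koster-rep longest-match rule the same URL is allowed,
# which is what the matched_rules()/allowed() helpers in robots_txt.py implement.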