Skip to content

Commit

Permalink
Merge pull request #278 from openzim/redirection_loop
Browse files Browse the repository at this point in the history
Avoid and detect redirection loops
  • Loading branch information
benoit74 authored May 24, 2024
2 parents dcfb025 + 9907fe6 commit e9ab5e3
Showing 1 changed file with 33 additions and 2 deletions.
35 changes: 33 additions & 2 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,10 @@ def gather_information_from_warc(self):
redirection_zim_path = normalize(
HttpUrl(urljoin(url, redirect_location))
)
self.redirections[zim_path] = redirection_zim_path
# Redirections to the same ZIM path have to be ignored (this occurs,
# for instance, when redirecting from http to https)
if zim_path != redirection_zim_path:
self.redirections[zim_path] = redirection_zim_path
else:
logger.warning(f"Redirection target is empty for {zim_path}")
else:
Expand Down Expand Up @@ -468,12 +471,16 @@ def gather_information_from_warc(self):
logger.debug(f"Favicon: {self.favicon_url or self.favicon_path}")
main_page_found = True

logger.info(f"Expecting {len(self.expected_zim_items)} ZIM entries to files")

if not main_page_found:
raise KeyError(
f"Unable to find WARC record for main page: {self.main_path}, aborting"
)

redirections_to_ignore = set()

logger.debug(f"Preparing {len(self.redirections)} redirections")
for redirect_source, redirect_target in self.redirections.items():
# if the URL is already expected, then just ignore the redirection
if redirect_source in self.expected_zim_items:
Expand All @@ -486,22 +493,46 @@ def gather_information_from_warc(self):
final_redirect_target in self.redirections
and final_redirect_target != redirect_source
):
# If the redirection target redirects to itself, we have finished looping.
# This should not happen here (it should be handled earlier), but it is
# better to check than to end up in an endless loop
if final_redirect_target == self.redirections[final_redirect_target]:
logger.warning(
f"Redirection to self found for {final_redirect_target}"
)
break
final_redirect_target = self.redirections[final_redirect_target]

if final_redirect_target in self.expected_zim_items:
if final_redirect_target == redirect_source:
# If the redirect target is the source ... we obviously have an issue
logger.warning(
f"Redirection loop found for {redirect_source}, will be ignored"
)
redirections_to_ignore.add(redirect_source)
elif final_redirect_target in self.expected_zim_items:
# if the final redirection target is included in the ZIM, simply add
# the redirection source to the list of expected ZIM items so that URLs
# are properly rewritten
self.expected_zim_items.add(redirect_source)
else:
# otherwise add it to a temporary list of items that will have to be
# dropped from the list of redirections to create
logger.warning(
f"Redirection target of {redirect_source} is missing "
f"({final_redirect_target} is not expected in the ZIM)"
)
redirections_to_ignore.add(redirect_source)

logger.debug(f"{len(redirections_to_ignore)} redirections will be ignored")

# update the list of redirections to create
for redirect_source in redirections_to_ignore:
self.redirections.pop(redirect_source)

logger.info(
f"Expecting {len(self.expected_zim_items)} ZIM entries including redirects"
)

def find_icon_and_language(self, record, content):
soup = BeautifulSoup(content, "html.parser")

Expand Down

0 comments on commit e9ab5e3

Please sign in to comment.