diff --git a/CHANGELOG.md b/CHANGELOG.md index 422afa6..c64f9ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- Handle case where the redirect target is bad / unsupported (#332 and #356) + ## [2.0.3] - 2024-07-24 ### Changed diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 16f6ce5..5fd0483 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -412,15 +412,27 @@ def gather_information_from_warc(self): # check for duplicates, might happen due to fuzzy rules if zim_path not in self.redirections: if redirect_location := record.http_headers.get("Location"): - redirection_zim_path = normalize( - HttpUrl(urljoin(url, redirect_location)) - ) - # Redirection to same ZIM path have to be ignored (occurs for - # instance when redirecting from http to https) - if zim_path != redirection_zim_path: - self.redirections[zim_path] = redirection_zim_path + try: + redirection_zim_path = normalize( + HttpUrl(urljoin(url, redirect_location)) + ) + # Redirection to same ZIM path have to be ignored (occurs + # for instance when redirecting from http to https) + if zim_path != redirection_zim_path: + self.redirections[zim_path] = redirection_zim_path + except Exception as exc: + # Ignore exceptions in redirection handling, this is too + # common to have bad redirections target just like we have + # many bad URLs in HTML code + logger.debug( + f"Failed to process redirection of " + f"{zim_path.value} to {redirect_location} : {exc} ; " + "no ZIM item will be created" + ) else: - logger.warning(f"Redirection target is empty for {zim_path}") + logger.warning( + f"Redirection target is empty for {zim_path.value}" + ) else: self.expected_zim_items.add(zim_path) diff --git a/test-website/Caddyfile b/test-website/Caddyfile index ebdd8e3..b5854f7 100644 --- a/test-website/Caddyfile +++ b/test-website/Caddyfile @@ -75,6 +75,8 @@ 
redir /bad-redir-loop-B /bad-redir-loop-C 307 redir /bad-redir-loop-C /bad-redir-loop-D 307 redir /bad-redir-loop-D /bad-redir-loop-B 307 + redir /bad-redir-target-A https://I%20mNotAhostname 307 + redir /bad-redir-target-B intent://example.com/path#Intent;scheme=http;package=com.example.myapp;component=com.example.myapp/.MainActivity;end 307 header /content-types/script1.js Content-Type application/javascript header /content-types/script2.js Content-Type text/javascript diff --git a/test-website/content/redirection-loops.html b/test-website/content/bad-redirections.html similarity index 65% rename from test-website/content/redirection-loops.html rename to test-website/content/bad-redirections.html index 702893b..122abf7 100644 --- a/test-website/content/redirection-loops.html +++ b/test-website/content/bad-redirections.html @@ -13,14 +13,20 @@ -

Redirection loops

+

Bad redirections

-

Links below are indefinitely redirecting

+

Links below are indefinitely redirecting and hence not working

Redirect to self through loop

Redirect to inner-loop

+

Links below are targeting something which is not working

+ +

Redirect to silly HTTP URL

+ +

Redirect to an intent (not working inside ZIM)

+ diff --git a/test-website/content/index.html b/test-website/content/index.html index 4e68ab3..8197da2 100644 --- a/test-website/content/index.html +++ b/test-website/content/index.html @@ -47,7 +47,7 @@
  • Base href
  • onxxx HTML events
  • links to folder instead of file
  • -
  • Bad redirections loops
  • +
  • Bad redirections
  • Handling of content types
  • diff --git a/tests/data/bad-redirections.warc.gz b/tests/data/bad-redirections.warc.gz new file mode 100644 index 0000000..16b231f Binary files /dev/null and b/tests/data/bad-redirections.warc.gz differ diff --git a/tests/data/redir-loops.warc.gz b/tests/data/redir-loops.warc.gz deleted file mode 100644 index 64a804b..0000000 Binary files a/tests/data/redir-loops.warc.gz and /dev/null differ diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index c60b06f..10f230b 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -788,24 +788,24 @@ def test_http_return_codes(self, tmp_path): zim_output, f"website.test.openzim.org/{ignored_website_items}" ) - def test_redirection_loops(self, tmp_path): - zim_output = "test-redir-loops.zim" + def test_bad_redirections(self, tmp_path): + zim_output = "test-bad-redirections.zim" main( [ - os.path.join(TEST_DATA_DIR, "redir-loops.warc.gz"), + os.path.join(TEST_DATA_DIR, "bad-redirections.warc.gz"), "--output", str(tmp_path), "--zim-file", zim_output, "--name", - "test-redir-loops", + "test-bad-redirections", ] ) zim_output = tmp_path / zim_output for exising_website_items in [ - "redirection-loops.html", + "bad-redirections.html", ]: self.assert_item_exist( zim_output, f"website.test.openzim.org/{exising_website_items}" @@ -816,6 +816,8 @@ def test_redirection_loops(self, tmp_path): "/bad-redir-loop-B", "/bad-redir-loop-C", "/bad-redir-loop-D", + "/bad-redir-target-A", + "/bad-redir-target-B", ]: self.assert_item_does_not_exist( zim_output, f"website.test.openzim.org/{ignored_website_items}"