From be3c0e3919005fa5bbb6a1243e946109ddb1ada9 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 7 Jun 2024 09:00:50 +0000
Subject: [PATCH] Use same automatic encoding detection for all contents

Only HTML content was benefiting from automatic encoding detection.
CSS used a custom tinycss detection method.

Now, all content types (HTML, JS, CSS and JSON) use the same automatic
encoding detection.
---
 src/warc2zim/content_rewriting/generic.py |   6 +-
 tests/test_rewriting.py                   | 125 ++++++++++++++++++++++
 2 files changed, 128 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_rewriting.py

diff --git a/src/warc2zim/content_rewriting/generic.py b/src/warc2zim/content_rewriting/generic.py
index c9f0ea1..e5aefd4 100644
--- a/src/warc2zim/content_rewriting/generic.py
+++ b/src/warc2zim/content_rewriting/generic.py
@@ -186,7 +186,7 @@ def rewrite_html(self, pre_head_template: Template, post_head_template: Template
 
     @no_title
     def rewrite_css(self) -> str | bytes:
-        return CssRewriter(self.url_rewriter, base_href=None).rewrite(self.content)
+        return CssRewriter(self.url_rewriter, base_href=None).rewrite(self.content_str)
 
     @no_title
     def rewrite_js(self, opts: dict[str, Any]) -> str | bytes:
@@ -197,11 +197,11 @@ def rewrite_js(self, opts: dict[str, Any]) -> str | bytes:
             notify_js_module=self.js_module_found,
             base_href=None,
         )
-        return rewriter.rewrite(self.content.decode(), opts)
+        return rewriter.rewrite(self.content_str, opts)
 
     @no_title
     def rewrite_jsonp(self) -> str | bytes:
-        content = self.content.decode()
+        content = self.content_str
         match = JSONP_REGEX.match(content)
         if not match:
             return content
diff --git a/tests/test_rewriting.py b/tests/test_rewriting.py
new file mode 100644
index 0000000..b2897fc
--- /dev/null
+++ b/tests/test_rewriting.py
@@ -0,0 +1,125 @@
+import io
+
+import pytest
+from jinja2 import Template
+from warcio import StatusAndHeaders
+from warcio.recordloader import ArcWarcRecord
+
+from warc2zim.content_rewriting.generic import Rewriter
+from warc2zim.url_rewriting import ZimPath
+
+
+@pytest.fixture(scope="module")
+def rewrite_generator():
+    """Return a helper that wraps raw bytes in a WARC response and rewrites them"""
+
+    def generate_and_call(
+        content: bytes = b"dummy", content_type: str = "text/html; charset=UTF-8"
+    ):
+        rec_headers = StatusAndHeaders(
+            "WARC/1.1",
+            headers=[("WARC-Target-URI", "http://www.example.com")],
+        )
+        http_headers = StatusAndHeaders(
+            "HTTP/1.1 200 OK",
+            headers=[("Content-Type", content_type)],
+        )
+        return Rewriter(
+            ZimPath("www.example.com"),
+            ArcWarcRecord(
+                "warc",  # format = warc
+                "response",  # rec_type = response
+                rec_headers,
+                io.BytesIO(content),
+                http_headers,
+                "application/http; msgtype=response",
+                len(content),
+            ),
+            set(),
+            set(),
+            set(),
+        ).rewrite(Template(""), Template(""))
+
+    yield generate_and_call
+
+
+@pytest.mark.parametrize(
+    "content_str, encoding, content_type",
+    [
+        pytest.param("Bérénice", "UTF-8", "text/html", id="html_content_utf8_auto"),
+        pytest.param("Bérénice", "UTF-8", "text/css", id="css_content_utf8_auto"),
+        pytest.param(
+            "Bérénice", "UTF-8", "text/javascript", id="js_content_utf8_auto"
+        ),
+        pytest.param(
+            "Bérénice", "UTF-8", "youdontknowme", id="unknown_content_utf8_auto"
+        ),
+        pytest.param("Bérénice", "ISO-8859-1", "text/html", id="html_content_iso_auto"),
+        pytest.param("Bérénice", "ISO-8859-1", "text/css", id="css_content_iso_auto"),
+        pytest.param(
+            "Bérénice", "ISO-8859-1", "text/javascript", id="js_content_iso_auto"
+        ),
+        pytest.param(
+            "Bérénice", "ISO-8859-1", "youdontknowme", id="unknown_content_iso_auto"
+        ),
+        pytest.param(
+            "Bérénice",
+            "UTF-8",
+            "text/html; charset=UTF-8",
+            id="html_content_utf8_declared",
+        ),
+        pytest.param(
+            "Bérénice",
+            "UTF-8",
+            "text/css; charset=UTF-8",
+            id="css_content_utf8_declared",
+        ),
+        pytest.param(
+            "Bérénice",
+            "UTF-8",
+            "text/javascript; charset=UTF-8",
+            id="js_content_utf8_declared",
+        ),
+        pytest.param(
+            "Bérénice",
+            "UTF-8",
+            "youdontknowme; charset=UTF-8",
+            id="unknown_content_utf8_declared",
+        ),
+        pytest.param(
+            "Bérénice",
+            "ISO-8859-1",
+            "text/html; charset=ISO-8859-1",
+            id="html_content_iso_declared",
+        ),
+        pytest.param(
+            "Bérénice",
+            "ISO-8859-1",
+            "text/css; charset=ISO-8859-1",
+            id="css_content_iso_declared",
+        ),
+        pytest.param(
+            "Bérénice",
+            "ISO-8859-1",
+            "text/javascript; charset=ISO-8859-1",
+            id="js_content_iso_declared",
+        ),
+        pytest.param(
+            "Bérénice",
+            "ISO-8859-1",
+            "youdontknowme; charset=ISO-8859-1",
+            id="unknown_content_iso_declared",
+        ),
+    ],
+)
+def test_generic_rewriting_encoding_handling(
+    rewrite_generator, content_str, encoding, content_type
+):
+    """Test handling of encoding in various content types"""
+    content_bytes = content_str.encode(encoding)
+    (_, content) = rewrite_generator(content=content_bytes, content_type=content_type)
+    if isinstance(content, bytes):
+        # we return original bytes if content is not rewritten
+        assert content == content_bytes
+    else:
+        assert content == content_str