Skip to content

Commit

Permalink
Use same automatic encoding detection for all contents
Browse files Browse the repository at this point in the history
Only HTML content was benefiting from automatic encoding detection.
CSS used a custom tinycss detection method.
Now, all content types (HTML, JS, CSS and JSON) use the same automatic
encoding detection.
  • Loading branch information
benoit74 committed Jun 10, 2024
1 parent 25b2804 commit be3c0e3
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 3 deletions.
6 changes: 3 additions & 3 deletions src/warc2zim/content_rewriting/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def rewrite_html(self, pre_head_template: Template, post_head_template: Template

@no_title
def rewrite_css(self) -> str | bytes:
return CssRewriter(self.url_rewriter, base_href=None).rewrite(self.content)
return CssRewriter(self.url_rewriter, base_href=None).rewrite(self.content_str)

@no_title
def rewrite_js(self, opts: dict[str, Any]) -> str | bytes:
Expand All @@ -197,11 +197,11 @@ def rewrite_js(self, opts: dict[str, Any]) -> str | bytes:
notify_js_module=self.js_module_found,
base_href=None,
)
return rewriter.rewrite(self.content.decode(), opts)
return rewriter.rewrite(self.content_str, opts)

@no_title
def rewrite_jsonp(self) -> str | bytes:
content = self.content.decode()
content = self.content_str
match = JSONP_REGEX.match(content)
if not match:
return content
Expand Down
125 changes: 125 additions & 0 deletions tests/test_rewriting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import io

import pytest
from jinja2 import Template
from warcio import StatusAndHeaders
from warcio.recordloader import ArcWarcRecord

from warc2zim.content_rewriting.generic import Rewriter
from warc2zim.url_rewriting import ZimPath


@pytest.fixture(scope="module")
def rewrite_generator():
    """Provide a factory that runs a payload through a generic ``Rewriter``.

    The returned callable wraps ``content`` in a synthetic WARC ``response``
    record targeting ``http://www.example.com`` and served with the given
    ``Content-Type`` header, then returns whatever ``Rewriter.rewrite``
    returns for that record.
    """

    def generate_and_call(
        content: bytes = b"dummy", content_type: str = "text/html; charset=UTF-8"
    ):
        rec_headers = StatusAndHeaders(
            "WARC/1.1",
            headers=[("WARC-Target-URI", "http://www.example.com")],
        )
        http_headers = StatusAndHeaders(
            "HTTP/1.1 200 OK",
            headers=[("Content-Type", content_type)],
        )
        record = ArcWarcRecord(
            "warc",  # format = warc
            "response",  # rec_type = response
            rec_headers,
            io.BytesIO(content),
            http_headers,
            "application/http; msgtype=response",
            len(content),
        )
        # NOTE(review): the three empty sets are positional Rewriter arguments
        # whose meaning is not visible from this file -- confirm against
        # Rewriter.__init__. The two empty templates are presumably the
        # pre-head / post-head templates used for HTML rewriting.
        return Rewriter(
            ZimPath("www.example.com"),
            record,
            set(),
            set(),
            set(),
        ).rewrite(Template(""), Template(""))

    # No teardown is needed, so the factory is returned rather than yielded.
    return generate_and_call


@pytest.mark.parametrize(
    "content_str, encoding, content_type",
    [
        pytest.param("Bérénice", "UTF-8", "text/html", id="html_content_utf8_auto"),
        pytest.param("Bérénice", "UTF-8", "text/css", id="css_content_utf8_auto"),
        pytest.param(
            "Bérénice", "UTF-8", "text/javascript", id="js_content_utf8_auto"
        ),
        pytest.param(
            "Bérénice", "UTF-8", "youdontknowme", id="unknown_content_utf8_auto"
        ),
        pytest.param("Bérénice", "ISO-8859-1", "text/html", id="html_content_iso_auto"),
        pytest.param("Bérénice", "ISO-8859-1", "text/css", id="css_content_iso_auto"),
        pytest.param(
            "Bérénice", "ISO-8859-1", "text/javascript", id="js_content_iso_auto"
        ),
        pytest.param(
            "Bérénice", "ISO-8859-1", "youdontknowme", id="unknown_content_iso_auto"
        ),
        pytest.param(
            "Bérénice",
            "UTF-8",
            "text/html; charset=UTF-8",
            id="html_content_utf8_declared",
        ),
        pytest.param(
            "Bérénice",
            "UTF-8",
            "text/css; charset=UTF-8",
            id="css_content_utf8_declared",
        ),
        pytest.param(
            "Bérénice",
            "UTF-8",
            "text/javascript; charset=UTF-8",
            id="js_content_utf8_declared",
        ),
        pytest.param(
            "Bérénice",
            "UTF-8",
            "youdontknowme; charset=UTF-8",
            id="unknown_content_utf8_declared",
        ),
        pytest.param(
            "Bérénice",
            "ISO-8859-1",
            "text/html; charset=ISO-8859-1",
            id="html_content_iso_declared",
        ),
        pytest.param(
            "Bérénice",
            "ISO-8859-1",
            "text/css; charset=ISO-8859-1",
            id="css_content_iso_declared",
        ),
        pytest.param(
            "Bérénice",
            "ISO-8859-1",
            "text/javascript; charset=ISO-8859-1",
            id="js_content_iso_declared",
        ),
        pytest.param(
            "Bérénice",
            "ISO-8859-1",
            "youdontknowme; charset=ISO-8859-1",
            id="unknown_content_iso_declared",
        ),
    ],
)
def test_generic_rewriting_encoding_handling(
    rewrite_generator, content_str, encoding, content_type
):
    """Test handling of encoding in various content types.

    Each case encodes ``content_str`` with ``encoding``, serves it with
    ``content_type`` (with or without a declared charset) and checks that the
    rewriter hands back either the untouched bytes or the correctly decoded
    text.
    """
    content_bytes = content_str.encode(encoding)
    _, content = rewrite_generator(content=content_bytes, content_type=content_type)
    if isinstance(content, bytes):
        # Content that is not rewritten is returned as the original bytes.
        assert content == content_bytes
    else:
        assert content == content_str

0 comments on commit be3c0e3

Please sign in to comment.