Skip to content

Commit

Permalink
Merge pull request #302 from openzim/encoding_detection
Browse files Browse the repository at this point in the history
Use same automatic encoding detection for all contents
  • Loading branch information
benoit74 authored Jun 11, 2024
2 parents 25b2804 + be3c0e3 commit a9de85d
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 3 deletions.
6 changes: 3 additions & 3 deletions src/warc2zim/content_rewriting/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def rewrite_html(self, pre_head_template: Template, post_head_template: Template

@no_title
def rewrite_css(self) -> str | bytes:
return CssRewriter(self.url_rewriter, base_href=None).rewrite(self.content)
return CssRewriter(self.url_rewriter, base_href=None).rewrite(self.content_str)

@no_title
def rewrite_js(self, opts: dict[str, Any]) -> str | bytes:
Expand All @@ -197,11 +197,11 @@ def rewrite_js(self, opts: dict[str, Any]) -> str | bytes:
notify_js_module=self.js_module_found,
base_href=None,
)
return rewriter.rewrite(self.content.decode(), opts)
return rewriter.rewrite(self.content_str, opts)

@no_title
def rewrite_jsonp(self) -> str | bytes:
content = self.content.decode()
content = self.content_str
match = JSONP_REGEX.match(content)
if not match:
return content
Expand Down
125 changes: 125 additions & 0 deletions tests/test_rewriting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import io

import pytest
from jinja2 import Template
from warcio import StatusAndHeaders
from warcio.recordloader import ArcWarcRecord

from warc2zim.content_rewriting.generic import Rewriter
from warc2zim.url_rewriting import ZimPath


@pytest.fixture(scope="module")
def rewrite_generator():
    """Provide a callable that runs the generic rewriter on in-memory content.

    The callable wraps ``content`` in a WARC "response" record whose target
    URI is ``http://www.example.com`` and whose HTTP ``Content-Type`` header
    is ``content_type``, then returns the result of ``Rewriter.rewrite``
    called with two empty Jinja2 templates.
    """

    def generate_and_call(
        content: bytes = b"dummy", content_type: str = "text/html; charset=UTF-8"
    ):
        rec_headers = StatusAndHeaders(
            "WARC/1.1",
            headers=[("WARC-Target-URI", "http://www.example.com")],
        )
        http_headers = StatusAndHeaders(
            "HTTP/1.1 200 OK",
            headers=[("Content-Type", content_type)],
        )
        record = ArcWarcRecord(
            "warc",  # format = warc
            "response",  # rec_type = response
            rec_headers,
            io.BytesIO(content),
            http_headers,
            "application/http; msgtype=response",
            len(content),  # payload length, matching the BytesIO stream above
        )
        return Rewriter(
            ZimPath("www.example.com"),
            record,
            set(),
            set(),
            set(),
        ).rewrite(Template(""), Template(""))

    yield generate_and_call


@pytest.mark.parametrize(
    "content_str, encoding, content_type",
    [
        # Encoding not declared in the Content-Type header: must be detected.
        pytest.param("Bérénice", "UTF-8", "text/html", id="html_content_utf8_auto"),
        pytest.param("Bérénice", "UTF-8", "text/css", id="css_content_utf8_auto"),
        pytest.param(
            "Bérénice", "UTF-8", "text/javascript", id="js_content_utf8_auto"
        ),
        pytest.param(
            "Bérénice", "UTF-8", "youdontknowme", id="unknown_content_utf8_auto"
        ),
        pytest.param("Bérénice", "ISO-8859-1", "text/html", id="html_content_iso_auto"),
        pytest.param("Bérénice", "ISO-8859-1", "text/css", id="css_content_iso_auto"),
        pytest.param(
            "Bérénice", "ISO-8859-1", "text/javascript", id="js_content_iso_auto"
        ),
        pytest.param(
            "Bérénice", "ISO-8859-1", "youdontknowme", id="unknown_content_iso_auto"
        ),
        # Encoding explicitly declared through the charset parameter.
        pytest.param(
            "Bérénice",
            "UTF-8",
            "text/html; charset=UTF-8",
            id="html_content_utf8_declared",
        ),
        pytest.param(
            "Bérénice",
            "UTF-8",
            "text/css; charset=UTF-8",
            id="css_content_utf8_declared",
        ),
        pytest.param(
            "Bérénice",
            "UTF-8",
            "text/javascript; charset=UTF-8",
            id="js_content_utf8_declared",
        ),
        pytest.param(
            "Bérénice",
            "UTF-8",
            "youdontknowme; charset=UTF-8",
            id="unknown_content_utf8_declared",
        ),
        pytest.param(
            "Bérénice",
            "ISO-8859-1",
            "text/html; charset=ISO-8859-1",
            id="html_content_iso_declared",
        ),
        pytest.param(
            "Bérénice",
            "ISO-8859-1",
            "text/css; charset=ISO-8859-1",
            id="css_content_iso_declared",
        ),
        pytest.param(
            "Bérénice",
            "ISO-8859-1",
            "text/javascript; charset=ISO-8859-1",
            id="js_content_iso_declared",
        ),
        pytest.param(
            "Bérénice",
            "ISO-8859-1",
            "youdontknowme; charset=ISO-8859-1",
            id="unknown_content_iso_declared",
        ),
    ],
)
def test_generic_rewriting_encoding_handling(
    rewrite_generator, content_str, encoding, content_type
):
    """Test handling of encoding in various content types.

    ``content_str`` is encoded with ``encoding`` and passed through the
    rewriter with the given ``content_type``. The output must either be the
    untouched original bytes or a string equal to ``content_str``.
    """
    content_bytes = content_str.encode(encoding)
    (_, content) = rewrite_generator(content=content_bytes, content_type=content_type)
    if isinstance(content, bytes):
        # we return original bytes if content is not rewritten
        assert content == content_bytes
    else:
        assert content == content_str

0 comments on commit a9de85d

Please sign in to comment.