diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9197dfb..d726ae2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,11 +11,11 @@ repos: hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.3 + rev: v0.4.8 hooks: - id: ruff - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.361 + rev: v1.1.367 hooks: - id: pyright name: pyright (system) diff --git a/openzim.toml b/openzim.toml index 3456315..e57b1b4 100644 --- a/openzim.toml +++ b/openzim.toml @@ -6,7 +6,7 @@ execute_after=[ [files.assets.actions."wombat.js"] action="get_file" -source="https://cdn.jsdelivr.net/npm/@webrecorder/wombat@3.7.0/dist/wombat.js" +source="https://cdn.jsdelivr.net/npm/@webrecorder/wombat@3.7.5/dist/wombat.js" target_file="wombat.js" [files.assets.actions."wombatSetup.js"] # fallback if this script has not been properly build (should happen only in dev) diff --git a/pyproject.toml b/pyproject.toml index 30a26ee..3e91902 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] # jinja2 is required to generate JS and Python rules at build time -requires = ["hatchling", "hatch-openzim==0.2.1", "jinja2==3.1.3"] +requires = ["hatchling", "hatch-openzim==0.2.1", "jinja2==3.1.4"] build-backend = "hatchling.build" [project] @@ -10,16 +10,16 @@ description = "Convert WARC to ZIM" readme = "README.md" dependencies = [ "warcio==1.7.4", - "requests==2.31.0", + "requests==2.32.3", "zimscraperlib==3.3.2", - "jinja2==3.1.3", + "jinja2==3.1.4", "chardet==5.2.0", # to support possible brotli content in warcs, must be added separately "brotlipy==0.7.0", "cdxj_indexer==1.4.5", "tinycss2==1.3.0", "beautifulsoup4==4.12.3", # used to parse base href - "lxml==5.2.1", # used by beautifulsoup4 for parsing html + "lxml==5.2.2", # used by beautifulsoup4 for parsing html ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] @@ -39,17 +39,17 @@ scripts = [ ] lint = [ "black==24.4.2", - "ruff==0.4.3", + "ruff==0.4.8", ] check = [ - "pyright==1.1.361", + "pyright==1.1.367", ] test = [ - "pytest==8.2.0", - "coverage==7.5.0", + "pytest==8.2.2", + "coverage==7.5.3", ] dev = [ - "pre-commit==3.6.2", + "pre-commit==3.7.1", "debugpy==1.8.1", "warc2zim[scripts]", "warc2zim[lint]", diff --git a/src/warc2zim/content_rewriting/generic.py b/src/warc2zim/content_rewriting/generic.py index 624ea24..04b9c1e 100644 --- a/src/warc2zim/content_rewriting/generic.py +++ b/src/warc2zim/content_rewriting/generic.py @@ -147,6 +147,8 @@ def get_rewrite_mode(self, record, mimetype): f"mimetype: {mimetype}, resourcetype: {resourcetype})" ) + return resourcetype_rewrite_mode + def get_resourcetype_rewrite_mode(self, record, resourcetype, mimetype): """Get current record rewrite mode based on WARC-Resource-Type and mimetype""" @@ -160,12 +162,12 @@ def get_resourcetype_rewrite_mode(self, record, resourcetype, mimetype): if resourcetype == "stylesheet": return "css" - if resourcetype in ["script", "fetch"] and ( + if resourcetype in ["script", "fetch", "xhr"] and ( mimetype == "application/json" or self.path.value.endswith(".json") ): return "json" - if resourcetype == "script" and mimetype in [ + if resourcetype in ["script", "xhr"] and mimetype in [ "text/javascript", "application/javascript", "application/x-javascript", diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index 27532b6..e62020b 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -796,3 +796,31 @@ def test_redirection_loops(self, tmp_path): self.assert_item_does_not_exist( zim_output, f"website.test.openzim.org/{ignored_website_items}" ) + + def test_content_resource_types(self, tmp_path): + zim_output = "tests_en_content-resource-types.zim" + + main( + [ + os.path.join(TEST_DATA_DIR, "content-resource-types.warc.gz"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "tests_en_content-resource-types", + ] + ) + zim_output = tmp_path / zim_output + + res = self.get_article( + zim_output, "website.test.openzim.org/content-types/index.html" + ) + assert b"" in res # simple check that rewriting has been done + + for js_file in [ + "website.test.openzim.org/content-types/script1.js", + "website.test.openzim.org/content-types/script2.js", + ]: + res = self.get_article(zim_output, js_file) + assert b"wombat" in res # simple check that rewriting has been done