Skip to content

Commit

Permalink
Detect rewrite mode based on WARC-Resource-Type when available
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Jun 11, 2024
1 parent ae15473 commit a0be3c6
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- Use the new `WARC-Resource-Type` header to decide rewrite mode (when present in WARC) (#296)

### Fixed

- Drop `integrity` attribute in HTML `<script>` and `<link>` tags (#298)
Expand Down
58 changes: 58 additions & 0 deletions src/warc2zim/content_rewriting/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,64 @@ def rewrite(
return ("", self.content)

def get_rewrite_mode(self, record, mimetype):
    """Get current record rewrite mode

    The rewrite mode is used to decide which kind of resource we have (html, css,
    js, ...) and this is used to decide how it should be parsed and rewritten.

    When the record carries a ``WARC-Resource-Type`` header, the mode is derived
    from that header (via ``get_resourcetype_rewrite_mode``). When the header
    value is falsy, the mode derived from the mimetype alone is used instead.

    Args:
        record: WARC record; ``record.rec_headers`` is looked up for the
            ``WARC-Resource-Type`` header.
        mimetype: mimetype of the record content.

    Returns:
        The rewrite mode, or None when no rewrite mode applies.

    Raises:
        Exception: if the ``WARC-Resource-Type`` header value is not a string.
    """
    mimetype_rewrite_mode = self.get_mimetype_rewrite_mode(record, mimetype)

    resourcetype = record.rec_headers["WARC-Resource-Type"]
    if not resourcetype:
        return mimetype_rewrite_mode  # fallback for WARCs without resource type
    if not isinstance(resourcetype, str):
        raise Exception(f"Unsupported resourcetype class: {resourcetype.__class__}")
    # Normalize so that comparisons downstream are case/whitespace insensitive
    resourcetype = resourcetype.lower().strip()

    resourcetype_rewrite_mode = self.get_resourcetype_rewrite_mode(
        record, resourcetype, mimetype
    )

    # Both modes are computed so that any behavior change versus the
    # mimetype-only detection is visible in the logs.
    if mimetype_rewrite_mode != resourcetype_rewrite_mode:
        logger.warning(
            f"Rewrite mode has changed in 2.0.1 for {self.path.value} record: was "
            f"{mimetype_rewrite_mode}, now is {resourcetype_rewrite_mode} ("
            f"mimetype: {mimetype}, resourcetype: {resourcetype})"
        )

    # The resource-type based mode is the one to use when the header is present;
    # without this return the method would implicitly return None.
    return resourcetype_rewrite_mode

def get_resourcetype_rewrite_mode(self, record, resourcetype, mimetype):
    """Get current record rewrite mode based on WARC-Resource-Type and mimetype

    Mapping implemented here:
    - "document": "html" for GET records (a record without a ``method``
      attribute is treated as GET), None otherwise
    - "stylesheet": "css"
    - "script" / "fetch" with a JSON mimetype or a path ending in ".json": "json"
    - "script" with a JavaScript mimetype: "jsonp" when a JSONP callback is
      extracted from the original URL, "javascript" otherwise
    - anything else: None
    """
    if resourcetype == "document":
        # TODO : Handle header "Accept" == "application/json"
        http_method = getattr(record, "method", "GET")
        return "html" if http_method == "GET" else None

    if resourcetype == "stylesheet":
        return "css"

    # Only scripts and fetched resources can map to a mode past this point
    if resourcetype not in ("script", "fetch"):
        return None

    # Path is only inspected when the mimetype alone does not say JSON
    if mimetype == "application/json" or self.path.value.endswith(".json"):
        return "json"

    javascript_mimetypes = (
        "text/javascript",
        "application/javascript",
        "application/x-javascript",
    )
    if resourcetype != "script" or mimetype not in javascript_mimetypes:
        return None

    return "jsonp" if extract_jsonp_callback(self.orig_url_str) else "javascript"

def get_mimetype_rewrite_mode(self, record, mimetype):
"""Get current record rewrite mode based on mimetype"""

if mimetype == "text/html":
if getattr(record, "method", "GET") == "POST":
return None
Expand Down
Binary file added tests/data/content-resource-types.warc.gz
Binary file not shown.

0 comments on commit a0be3c6

Please sign in to comment.