Skip to content

Commit

Permalink
Merge pull request #120 from openzim/embed_tags
Browse files Browse the repository at this point in the history
Rewrite embed tags to provide link to original content online
  • Loading branch information
benoit74 authored Dec 16, 2024
2 parents e33d18e + 33791d7 commit 81e2369
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 2 deletions.
24 changes: 24 additions & 0 deletions scraper/src/mindtouch2zim/html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,3 +251,27 @@ def rewrite_img_tags(
+ [("src", new_attr_value)]
)
return f"<img {values}{'/>' if auto_close else '>'}"


@html_rules.rewrite_tag()
def rewrite_embed_tags(
    tag: str,
    attrs: AttrsList,
    *,
    auto_close: bool,
):
    """Replace an ``<embed>`` tag that has a ``src`` with a link to that source.

    Args:
        tag: name of the tag being processed; only ``embed`` is handled here.
        attrs: attributes of the tag; only ``src`` is read.
        auto_close: whether the tag was written self-closed (``<embed ... />``).

    Returns:
        ``None`` when the tag is not an ``embed`` or carries no (non-empty)
        ``src``, i.e. nothing to rewrite; otherwise the replacement HTML: a short
        notice followed by a link to the original ``src``.
    """
    # Function-scope import so this block does not depend on the module-level
    # import list.
    from html import escape as html_escape

    if tag != "embed":
        return
    if not (src_value := get_attr_value_from(attrs, "src")):
        return  # no need to rewrite this embed without src

    # The value is interpolated both inside a double-quoted attribute and as text
    # content, so it must be HTML-escaped (quotes included); otherwise a source
    # containing `"`, `<`, `>` or `&` would break or inject markup in the output.
    safe_src = html_escape(src_value, quote=True)

    # When the original tag was not self-closed, a bare `<embed>` is emitted after
    # the link.
    # NOTE(review): presumably this keeps a later `</embed>` end tag balanced -
    # confirm against the rewriter behavior.
    trailing_tag = "" if auto_close else "<embed>"

    # There is 99% chance the embed src is not inside the ZIM, so we assume it is not
    # (we can't know anyway with current software architecture)
    return (
        "This content is not inside the ZIM. "
        f'View content online at <a href="{safe_src}" target="_blank">'
        f"{safe_src}"
        "</a>"
        f"{trailing_tag}"
    )
33 changes: 31 additions & 2 deletions scraper/tests/test_html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,8 +248,10 @@ def test_html_iframe_rewriting(
</video>""",
"""<video class="mt-media" controls="controls" preload="auto">
<source src="" type="video/mp4" />
<embed class="mt-media" src="" autoplay="False" autostart="False" scale="tofit" """
"""wmode="opaque" allowfullscreen="true" />
This content is not inside the ZIM. View content online at """
'<a href="https://svs.gsfc.nasa.gov/vis/a000000/a003600/a003658/thermohaline_conveyor_30fps.mp4"'
' target="_blank">https://svs.gsfc.nasa.gov/vis/a000000/a003600/a003658/thermohaline_conveyor_30fps.mp4</a>'
"""
</video>""",
id="video_src",
),
Expand Down Expand Up @@ -277,6 +279,33 @@ def test_html_unknown_src_href_rewriting(
assert html_rewriter.rewrite(source_html).content == expected_html


@pytest.mark.parametrize(
    "source_html, expected_html",
    [
        pytest.param(
            # Multi-line <embed> tag, one attribute per line, self-closed.
            "<embed\n"
            'class="mt-media"\n'
            'src="https://svs.gsfc.nasa.gov/vis/a000000/a003600/a003658/thermohaline_conveyor_30fps.mp4"\n'
            'autoplay="False"\n'
            'autostart="False"\n'
            'scale="tofit"\n'
            'wmode="opaque"\n'
            'allowfullscreen="true"\n'
            "/>",
            # The whole tag is replaced by a notice plus a link to the src URL.
            "This content is not inside the ZIM. "
            "View content online at "
            '<a href="https://svs.gsfc.nasa.gov/vis/a000000/a003600/a003658/thermohaline_conveyor_30fps.mp4"'
            ' target="_blank">'
            "https://svs.gsfc.nasa.gov/vis/a000000/a003600/a003658/thermohaline_conveyor_30fps.mp4"
            "</a>",
            id="embed_src",
        ),
    ],
)
def test_html_embed_rewriting(
    html_rewriter: HtmlRewriter, source_html: str, expected_html: str
):
    """Check that an ``<embed>`` with a ``src`` is rewritten to the expected HTML."""
    rewritten = html_rewriter.rewrite(source_html)
    assert rewritten.content == expected_html


@pytest.mark.parametrize(
"source_html, expected_html, expected_items_to_download",
[
Expand Down

0 comments on commit 81e2369

Please sign in to comment.