Skip to content

Commit

Permalink
fixup! Be more tolerant to invalid css.
Browse files Browse the repository at this point in the history
  • Loading branch information
mgautierfr committed Feb 7, 2024
1 parent 7cb9682 commit d6b1c89
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 25 deletions.
25 changes: 14 additions & 11 deletions src/warc2zim/content_rewriting/css.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,23 @@
from tinycss2.serializer import serialize_url

from warc2zim.content_rewriting import UrlRewriterProto
from warc2zim.content_rewriting.rx_replacer import RxRewriter, m2str
from warc2zim.content_rewriting.rx_replacer import RxRewriter


class FallbackRegexCssRewriter(RxRewriter):
def __init__(self, url_rewriter: UrlRewriterProto):
rules = [
(
re.compile(r"""url\((?P<quote>['"])?.+?(?P=quote)(?<!\\)\)"""),
m2str(url_rewriter),
re.compile(r"""url\((?P<quote>['"])?(?P<url>.+?)(?P=quote)(?<!\\)\)"""),
lambda m_object, _opts: "".join(
[
"url(",
m_object["quote"],
url_rewriter(m_object["url"]),
m_object["quote"],
")",
]
),
)
]
super().__init__(rules)
Expand All @@ -28,6 +36,7 @@ def __init__(self, url_rewriter: UrlRewriterProto):
class CssRewriter:
def __init__(self, url_rewriter: UrlRewriterProto):
self.url_rewriter = url_rewriter
self.fallback_rewriter = FallbackRegexCssRewriter(url_rewriter)

def rewrite(self, content: str | bytes) -> str:
if isinstance(content, bytes):
Expand All @@ -39,12 +48,7 @@ def rewrite(self, content: str | bytes) -> str:
try:
output = serialize(rules)
except Exception:
fallback_rewriter = FallbackRegexCssRewriter(self.url_rewriter)
if isinstance(content, bytes):
content = content.decode()
return fallback_rewriter.rewrite_content(
content, {} # pyright: ignore[reportArgumentType]
)
return self.fallback_rewriter.rewrite_content(content, {})
return output

def rewrite_inline(self, content: str) -> str:
Expand All @@ -54,8 +58,7 @@ def rewrite_inline(self, content: str) -> str:
output = serialize(rules)
return output
except Exception:
fallback_rewriter = FallbackRegexCssRewriter(self.url_rewriter)
return fallback_rewriter.rewrite_content(content, {})
return self.fallback_rewriter.rewrite_content(content, {})

def process_list(self, components: Iterable[ast.Node]):
if components: # May be null
Expand Down
56 changes: 42 additions & 14 deletions tests/test_css_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,40 +38,68 @@ def no_rewrite_content(request):
def test_no_rewrite(no_rewrite_content):
assert (
CssRewriter(ArticleUrlRewriter(no_rewrite_content.article_url, set())).rewrite(
no_rewrite_content.input_
no_rewrite_content.input_bytes
)
== no_rewrite_content.expected.decode()
== no_rewrite_content.expected_bytes.decode()
)


@pytest.fixture(
params=[
ContentForTests(b'"border:'),
ContentForTests(b"border: solid 1px #c0c0c0; width= 100%"),
ContentForTests(b"width:"),
ContentForTests(b"border-bottom-width: 1px;border-bottom-color: #c0c0c0;w"),
ContentForTests('"border:'),
ContentForTests("border: solid 1px #c0c0c0; width= 100%"),
# Despite being invalid, tinycss2 parses it as a "width" property without a value.
ContentForTests("width:", "width:;"),
ContentForTests("border-bottom-width: 1px;border-bottom-color: #c0c0c0;w"),
ContentForTests(
'background: url("http://exemple.com/foo.png"); width=',
'background: url("exemple.com/foo.png"); width=',
),
]
)
def invalid_content(request):
def invalid_content_inline(request):
yield request.param


def test_invalid_css(invalid_content):
def test_invalid_css_inline(invalid_content_inline):
assert (
CssRewriter(ArticleUrlRewriter(invalid_content.article_url, set())).rewrite(
invalid_content.input_
)
== invalid_content.expected.decode()
CssRewriter(
ArticleUrlRewriter(invalid_content_inline.article_url, set())
).rewrite_inline(invalid_content_inline.input_str)
== invalid_content_inline.expected_str
)


@pytest.fixture(
params=[ContentForTests('p{background: url("http://exemple.com/foo.png"); width=}')]
params=[
# tinycss2 parses `"border:}` as a string with an unexpected EOF inside the string.
# At serialization, tinycss2 tries to recover and closes the opened rule.
ContentForTests(b'p {"border:}', b'p {"border:}}'),
ContentForTests(b'"p {border:}'),
ContentForTests(b"p { border: solid 1px #c0c0c0; width= 100% }"),
ContentForTests(b"p { width: }"),
ContentForTests(
b"p { border-bottom-width: 1px;border-bottom-color: #c0c0c0;w }"
),
ContentForTests(
b'p { background: url("http://exemple.com/foo.png"); width= }',
b'p { background: url("exemple.com/foo.png"); width= }',
),
]
)
def invalid_and_rewrite(request):
def invalid_content(request):
yield request.param


def test_invalid_cssl(invalid_content):
assert (
CssRewriter(ArticleUrlRewriter(invalid_content.article_url, set())).rewrite(
invalid_content.input_bytes
)
== invalid_content.expected_bytes.decode()
)


def test_rewrite():
content = b"""
/* A comment with a link : http://foo.com */
Expand Down

0 comments on commit d6b1c89

Please sign in to comment.