Skip to content

Commit

Permalink
Don't ignore images (#2)
Browse files Browse the repository at this point in the history
An image at the beginning of a reply was incorrectly ignored. This commit fixes that.

Originally reported in closeio#22 and solved by @andreip in closeio#26

In his words:
"Couldn't think of a different approach: since an img isn't really a block, it will never have any text within it, so there's no point in generating different HTML in the get_line_info functions. Instead, what was missing was treating it as a special case: we don't want to slice a line from the HTML by looking only at the plain-text lines, since that could slice off an img; we also need to look at the start/end refs for replaced tags.

See more about replaced elements (https://developer.mozilla.org/en-US/docs/Web/CSS/Replaced_element). I think it might be worth adding a few more things to the list, e.g. video, embed, etc. I'm not sure about iframe and how it would be treated in lxml parsing, though; I suppose you could have an iframe with just an image in it, in which case you'd still want to keep it.

The full list would be a total of 9 replaced elements (or 10 if we also count input, although I'm not sure of all the examples where that would generate something even if it apparently has no text in it)."
  • Loading branch information
afzalIbnSH authored Sep 16, 2021
1 parent 426b517 commit d26bd16
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 9 deletions.
6 changes: 3 additions & 3 deletions quotequail/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,9 @@ def unwrap_html(html):
"type": typ,
}

top_range = _html.trim_slice(lines, top_range)
main_range = _html.trim_slice(lines, main_range)
bottom_range = _html.trim_slice(lines, bottom_range)
top_range = _html.trim_slice(lines, top_range, start_refs, end_refs)
main_range = _html.trim_slice(lines, main_range, start_refs, end_refs)
bottom_range = _html.trim_slice(lines, bottom_range, start_refs, end_refs)

if top_range:
top_tree = _html.slice_tree(
Expand Down
26 changes: 20 additions & 6 deletions quotequail/_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
"td",
"th",
]

# replaced by binary data, so should be preserved in HTML no matter the text
# around it.
REPLACED_TAGS = ["img"]
BEGIN = "begin"
END = "end"

Expand Down Expand Up @@ -71,7 +73,11 @@ def trim_tree_before(element, include_element=True, keep_head=True):
el = parent_el


def trim_slice(lines, slice_tuple):
def is_replaced(el):
    """
    Return True if `el` has a string tag whose lowercased name is listed in
    REPLACED_TAGS, False otherwise.
    """
    tag = el.tag
    # Elements whose tag is not a string can never match a tag name.
    if not isinstance(tag, string_class):
        return False
    return tag.lower() in REPLACED_TAGS


def trim_slice(lines, slice_tuple, start_refs, end_refs):
"""
Trim a slice tuple (begin, end) so it starts at the first non-empty line
(obtained via indented_tree_line_generator / get_line_info) and ends at the
Expand All @@ -92,11 +98,19 @@ def _empty(line):
slice_end = len(lines)

# Trim from beginning
while slice_start < slice_end and _empty(lines[slice_start]):
while (
slice_start < slice_end
and _empty(lines[slice_start])
and not is_replaced(start_refs[slice_start][0])
):
slice_start += 1

# Trim from end
while slice_end > slice_start and _empty(lines[slice_end - 1]):
while (
slice_end > slice_start
and _empty(lines[slice_end - 1])
and not is_replaced(end_refs[slice_end - 1][0])
):
slice_end -= 1

return (slice_start, slice_end)
Expand Down Expand Up @@ -173,9 +187,9 @@ def slice_tree(tree, start_refs, end_refs, slice_tuple, html_copy=None):
new_tree = tree

if start_ref:
include_start = start_ref[1] == BEGIN
include_start = start_ref[1] == BEGIN or is_replaced(start_ref[0])
if end_ref:
include_end = end_ref[1] == END
include_end = end_ref[1] == END or is_replaced(end_ref[0])

# If start_ref is the same as end_ref, and we don't include the element,
# we are removing the entire tree. We need to handle this separately,
Expand Down
13 changes: 13 additions & 0 deletions tests/test_quotequail.py
Original file line number Diff line number Diff line change
Expand Up @@ -949,6 +949,19 @@ def test_gmail_reply(self):
},
)

# Regression test: an <img> that is the first thing inside the quoted
# blockquote must still be present at the start of the unwrapped "html".
def test_reply_with_image(self):
# Input: top text, an "On ... wrote:" reply header, then a blockquote that
# begins with an image followed by text.
# NOTE(review): "[email protected]" looks like a placeholder left by e-mail
# obfuscation in the page this was copied from - confirm the real address
# against the repository.
html = 'Test 2.<br><br>On Jun 05, 2018, at 09:56 AM, John Doe &lt;[email protected]&gt; wrote:<br><blockquote><img src="https://example.com" class="fr-fic fr-dib"><br>Some text 1.<br><br>Bart</blockquote>'
self.assertEqual(
unwrap_html(html),
{
"date": "Jun 05, 2018, at 09:56 AM",
"from": "John Doe <[email protected]>",
# The image is kept as the first node of the quoted content.
"html": u'<div><img src="https://example.com" class="fr-fic fr-dib"><br>Some text 1.<br><br>Bart</div>',
"html_top": u"Test 2.",
"type": "reply",
},
)

def test_outlook_forward(self):
data = self.read_file("outlook_forward.html")
result = unwrap_html(data)
Expand Down

0 comments on commit d26bd16

Please sign in to comment.