Don't ignore images (#2)

An image at the beginning of a reply was incorrectly ignored; this commit fixes that. Originally reported in closeio#22 and solved by @andreip in closeio#26. In his words: "Couldn't think of a different approach: an img isn't really a block, so it will never have text within it, and there is no point in generating different HTML in the get_line_info functions. Instead, what was missing was treating it as a special case: we don't want to slice a line from the HTML by looking only at the plain-text lines, since that could slice off an img; we also need to look at the start/end refs for replaced tags. See more about replaced elements (https://developer.mozilla.org/en-US/docs/Web/CSS/Replaced_element). I think it might be worth adding a few more things to the list, e.g. video, embed, etc.; I'm not sure about iframe and how that would be treated in lxml parsing, but I suppose you could have an iframe with just an image in it, in which case you'd still want to keep it. The full list would be a total of 9 replaced elements (or 10 if we also count input, although I'm not sure of all the cases where that would generate something even if it apparently has no text in it)."
freightwalla · Sep 16, 2021 · d26bd16 · d26bd16
1 parent 426b517
commit d26bd16
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 9 deletions.
diff --git a/quotequail/__init__.py b/quotequail/__init__.py
@@ -146,9 +146,9 @@ def unwrap_html(html):
             "type": typ,
         }
 
-        top_range = _html.trim_slice(lines, top_range)
-        main_range = _html.trim_slice(lines, main_range)
-        bottom_range = _html.trim_slice(lines, bottom_range)
+        top_range = _html.trim_slice(lines, top_range, start_refs, end_refs)
+        main_range = _html.trim_slice(lines, main_range, start_refs, end_refs)
+        bottom_range = _html.trim_slice(lines, bottom_range, start_refs, end_refs)
 
         if top_range:
             top_tree = _html.slice_tree(

diff --git a/quotequail/_html.py b/quotequail/_html.py
@@ -22,7 +22,9 @@
     "td",
     "th",
 ]
-
+# replaced by binary data, so should be preserved in HTML no matter the text
+# around it.
+REPLACED_TAGS = ["img"]
 BEGIN = "begin"
 END = "end"
 
@@ -71,7 +73,11 @@ def trim_tree_before(element, include_element=True, keep_head=True):
         el = parent_el
 
 
-def trim_slice(lines, slice_tuple):
+def is_replaced(el):
+    return isinstance(el.tag, string_class) and el.tag.lower() in REPLACED_TAGS
+
+
+def trim_slice(lines, slice_tuple, start_refs, end_refs):
     """
     Trim a slice tuple (begin, end) so it starts at the first non-empty line
     (obtained via indented_tree_line_generator / get_line_info) and ends at the
@@ -92,11 +98,19 @@ def _empty(line):
         slice_end = len(lines)
 
     # Trim from beginning
-    while slice_start < slice_end and _empty(lines[slice_start]):
+    while (
+        slice_start < slice_end
+        and _empty(lines[slice_start])
+        and not is_replaced(start_refs[slice_start][0])
+    ):
         slice_start += 1
 
     # Trim from end
-    while slice_end > slice_start and _empty(lines[slice_end - 1]):
+    while (
+        slice_end > slice_start
+        and _empty(lines[slice_end - 1])
+        and not is_replaced(end_refs[slice_end - 1][0])
+    ):
         slice_end -= 1
 
     return (slice_start, slice_end)
@@ -173,9 +187,9 @@ def slice_tree(tree, start_refs, end_refs, slice_tuple, html_copy=None):
         new_tree = tree
 
     if start_ref:
-        include_start = start_ref[1] == BEGIN
+        include_start = start_ref[1] == BEGIN or is_replaced(start_ref[0])
     if end_ref:
-        include_end = end_ref[1] == END
+        include_end = end_ref[1] == END or is_replaced(end_ref[0])
 
     # If start_ref is the same as end_ref, and we don't include the element,
     # we are removing the entire tree. We need to handle this separately,

diff --git a/tests/test_quotequail.py b/tests/test_quotequail.py
@@ -949,6 +949,19 @@ def test_gmail_reply(self):
             },
         )
 
+    def test_reply_with_image(self):
+        html = 'Test 2.<br><br>On Jun 05, 2018, at 09:56 AM, John Doe &lt;[email protected]&gt; wrote:<br><blockquote><img src="https://example.com" class="fr-fic fr-dib"><br>Some text 1.<br><br>Bart</blockquote>'
+        self.assertEqual(
+            unwrap_html(html),
+            {
+                "date": "Jun 05, 2018, at 09:56 AM",
+                "from": "John Doe <[email protected]>",
+                "html": u'<div><img src="https://example.com" class="fr-fic fr-dib"><br>Some text 1.<br><br>Bart</div>',
+                "html_top": u"Test 2.",
+                "type": "reply",
+            },
+        )
+
     def test_outlook_forward(self):
         data = self.read_file("outlook_forward.html")
         result = unwrap_html(data)