diff --git a/quotequail/__init__.py b/quotequail/__init__.py index 21e0ab8..a316961 100644 --- a/quotequail/__init__.py +++ b/quotequail/__init__.py @@ -146,9 +146,9 @@ def unwrap_html(html): "type": typ, } - top_range = _html.trim_slice(lines, top_range) - main_range = _html.trim_slice(lines, main_range) - bottom_range = _html.trim_slice(lines, bottom_range) + top_range = _html.trim_slice(lines, top_range, start_refs, end_refs) + main_range = _html.trim_slice(lines, main_range, start_refs, end_refs) + bottom_range = _html.trim_slice(lines, bottom_range, start_refs, end_refs) if top_range: top_tree = _html.slice_tree( diff --git a/quotequail/_html.py b/quotequail/_html.py index 69ffca5..5e95b5a 100644 --- a/quotequail/_html.py +++ b/quotequail/_html.py @@ -22,7 +22,9 @@ "td", "th", ] - +# replaced by binary data, so should be preserved in HTML no matter the text +# around it. +REPLACED_TAGS = ["img"] BEGIN = "begin" END = "end" @@ -71,7 +73,11 @@ def trim_tree_before(element, include_element=True, keep_head=True): el = parent_el -def trim_slice(lines, slice_tuple): +def is_replaced(el): + return isinstance(el.tag, string_class) and el.tag.lower() in REPLACED_TAGS + + +def trim_slice(lines, slice_tuple, start_refs, end_refs): """ Trim a slice tuple (begin, end) so it starts at the first non-empty line (obtained via indented_tree_line_generator / get_line_info) and ends at the @@ -92,11 +98,19 @@ def _empty(line): slice_end = len(lines) # Trim from beginning - while slice_start < slice_end and _empty(lines[slice_start]): + while ( + slice_start < slice_end + and _empty(lines[slice_start]) + and not is_replaced(start_refs[slice_start][0]) + ): slice_start += 1 # Trim from end - while slice_end > slice_start and _empty(lines[slice_end - 1]): + while ( + slice_end > slice_start + and _empty(lines[slice_end - 1]) + and not is_replaced(end_refs[slice_end - 1][0]) + ): slice_end -= 1 return (slice_start, slice_end) @@ -173,9 +187,9 @@ def slice_tree(tree, start_refs, end_refs, slice_tuple, html_copy=None): new_tree = tree if start_ref: - include_start = start_ref[1] == BEGIN + include_start = start_ref[1] == BEGIN or is_replaced(start_ref[0]) if end_ref: - include_end = end_ref[1] == END + include_end = end_ref[1] == END or is_replaced(end_ref[0]) # If start_ref is the same as end_ref, and we don't include the element, # we are removing the entire tree. We need to handle this separately, diff --git a/tests/test_quotequail.py b/tests/test_quotequail.py index d2de31a..8dcd513 100644 --- a/tests/test_quotequail.py +++ b/tests/test_quotequail.py @@ -949,6 +949,19 @@ def test_gmail_reply(self): }, ) + def test_reply_with_image(self): + html = 'Test 2.

On Jun 05, 2018, at 09:56 AM, John Doe <john@example.com> wrote:

Some text 1.

Bart
' + self.assertEqual( + unwrap_html(html), + { + "date": "Jun 05, 2018, at 09:56 AM", + "from": "John Doe ", + "html": u'

Some text 1.

Bart
', + "html_top": u"Test 2.", + "type": "reply", + }, + ) + def test_outlook_forward(self): data = self.read_file("outlook_forward.html") result = unwrap_html(data)