From 4431512dca81043812aecab7e0a0f59f0290e6c6 Mon Sep 17 00:00:00 2001
From: Kris Powell <krispowell@avocacapital.com>
Date: Fri, 6 Nov 2015 10:39:37 +0000
Subject: [PATCH 01/14] Add xhtml inline png image support

Images are already present in the Document model when read from RTF.
This change converts any PNG images found into <img> tags with inline
base64 encoded data elements. Other images (for example WMF alternatives
and jpegs) are ignored.

Previous behaviour was to write out the hex-encoded image string.
---
 pyth/plugins/xhtml/writer.py | 40 +++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 12 deletions(-)
diff --git a/pyth/plugins/xhtml/writer.py b/pyth/plugins/xhtml/writer.py
index 37bca07..3e40c29 100644
--- a/pyth/plugins/xhtml/writer.py
+++ b/pyth/plugins/xhtml/writer.py
@@ -6,6 +6,7 @@
 
 from pyth import document
 from pyth.format import PythWriter
+import base64
 
 from cStringIO import StringIO
 
@@ -50,26 +51,31 @@ def __init__(self, doc, target, cssClasses=True, pretty=False):
             document.List: self._list,
             document.Paragraph: self._paragraph
         }
-        
+        self.paragraphContentDispatch = {
+            document.Text: self._text,
+            document.Image: self._image,
+        }
+
 
     def go(self):
 
         self.listLevel = -1
-        
+
         tag = Tag("div")
-        
+
         for element in self.document.content:
             handler = self.paragraphDispatch[element.__class__]
             tag.content.extend(handler(element))
 
         tag.render(self.target)
         return self.target
-    
+
 
     def _paragraph(self, paragraph):
         p = Tag("p")
-        for text in paragraph.content:
-            p.content.append(self._text(text))
+        for item in paragraph.content:
+            handler = self.paragraphContentDispatch[item.__class__]
+            p.content.append(handler(item))
 
         if self.pretty:
             return [_prettyBreak, p, _prettyBreak]
@@ -79,12 +85,12 @@ def _paragraph(self, paragraph):
 
     def _list(self, lst):
         self.listLevel += 1
-        
+
         ul = Tag("ul")
 
         if self.cssClasses:
             ul.attrs['class'] = 'pyth_list_%s' % self.listLevel
-        
+
         for entry in lst.content:
             li = Tag("li")
             for element in entry.content:
@@ -93,7 +99,7 @@ def _list(self, lst):
             ul.content.append(li)
 
         self.listLevel -= 1
-            
+
         return [ul]
 
 
@@ -124,13 +130,23 @@ def _text(self, text):
 
         return tag
 
+    def _image(self, image):  # document.Image -> inline <img> Tag
+        if image.properties.get(u'pngblip'):  # only images flagged as PNG are rendered
+            tag = Tag("img")
+            image_data = bytearray.fromhex(image.content[0])
+            base64_image = base64.b64encode(image_data)
+            tag.attrs['src'] = "data:image/png;base64,{}".format(base64_image)
+            tag.attrs['alt'] = 'Inline image'
+            return tag
+        else:
+            return Tag(None)
 
 
 _prettyBreak = object()
 
 
 class Tag(object):
-    
+
     def __init__(self, tag, attrs=None, content=None):
         self.tag = tag
         self.attrs = attrs or {}
@@ -155,13 +171,13 @@ def render(self, target):
 
         if self.tag is not None:
             target.write('</%s>' % self.tag)
-        
+
 
     def attrString(self):
         return " ".join(
             '%s="%s"' % (k, quoteAttr(v))
             for (k, v) in self.attrs.iteritems())
-            
+
 
     def __repr__(self):
         return "T(%s)[%s]" % (self.tag, repr(self.content))

From a1f08cc8e49afd28b6dfad624d30fecccf95d745 Mon Sep 17 00:00:00 2001
From: Kris Powell <krispowell@avocacapital.com>
Date: Fri, 13 Nov 2015 09:03:12 +0000
Subject: [PATCH 02/14] Add image support from html to rtf as well

The prior commit took images read from RTF and wrote them to XHTML as inline PNG images.
This commit adds the reverse: inline PNG images in XHTML are read into the Document model,
and writing those Documents to RTF now includes the inline image.

Width and height (px values in the img style attribute) are converted as well, assuming a
standard conversion of 15 twips per pixel.

Only PNG images are supported.
---
 examples/writing/htmlToRtf15.py   | 12 +++++++
 examples/writing/rtf15ToXhtml.py  | 12 +++++++
 pyth/document.py                  | 18 +++++-----
 pyth/plugins/rtf15/reader.py      | 26 +++++++-------
 pyth/plugins/rtf15/writer.py      | 50 +++++++++++++++++++-------
 pyth/plugins/xhtml/reader.py      | 49 ++++++++++++++++++++++++++
 pyth/plugins/xhtml/writer.py      | 19 +++++++++-
 tests/html/sample-with-image.html |  3 ++
 tests/rtfs/sample-with-image.rtf  | 58 +++++++++++++++++++++++++++++++
 tests/test_readrtf15.py           | 11 ++++++
 tests/test_readxhtml.py           | 16 +++++++++
 tests/test_writertf15.py          | 19 ++++++++++
 tests/test_writexhtml.py          | 19 ++++++++++
 13 files changed, 276 insertions(+), 36 deletions(-)
 create mode 100644 examples/writing/htmlToRtf15.py
 create mode 100644 examples/writing/rtf15ToXhtml.py
 create mode 100644 tests/html/sample-with-image.html
 create mode 100644 tests/rtfs/sample-with-image.rtf
 create mode 100644 tests/test_writertf15.py
 create mode 100644 tests/test_writexhtml.py

diff --git a/examples/writing/htmlToRtf15.py b/examples/writing/htmlToRtf15.py
new file mode 100644
index 0000000..df7e8d5
--- /dev/null
+++ b/examples/writing/htmlToRtf15.py
@@ -0,0 +1,12 @@
+from pyth.plugins.xhtml.reader import XHTMLReader
+from pyth.plugins.rtf15.writer import Rtf15Writer
+import sys
+
+if len(sys.argv) > 1:  # optional argument: path of the XHTML file to convert
+    filename = sys.argv[1]
+else:
+    filename = "tests/html/sample-with-image.html"
+with open(filename, "rb") as source:  # close the input even if parsing fails
+    doc = XHTMLReader.read(source)
+
+print Rtf15Writer.write(doc).getvalue()
diff --git a/examples/writing/rtf15ToXhtml.py b/examples/writing/rtf15ToXhtml.py
new file mode 100644
index 0000000..d350e9d
--- /dev/null
+++ b/examples/writing/rtf15ToXhtml.py
@@ -0,0 +1,12 @@
+from pyth.plugins.xhtml.writer import XHTMLWriter
+from pyth.plugins.rtf15.reader import Rtf15Reader
+import sys
+
+if len(sys.argv) > 1:  # optional argument: path of the RTF file to convert
+    filename = sys.argv[1]
+else:
+    filename = "tests/rtfs/sample-with-image.rtf"
+with open(filename, "rb") as source:  # close the input even if parsing fails
+    doc = Rtf15Reader.read(source)
+
+print XHTMLWriter.write(doc).getvalue()
diff --git a/pyth/document.py b/pyth/document.py
index 864d519..d0cb6d8 100644
--- a/pyth/document.py
+++ b/pyth/document.py
@@ -7,7 +7,7 @@ class _PythBase(object):
     def __init__(self, properties={}, content=[]):
         self.properties = {}
         self.content = []
-        
+
         for (k,v) in properties.iteritems():
             self[k] = v
 
@@ -33,7 +33,7 @@ def append(self, item):
 
         If the item is of the wrong type, and if this element has a sub-type,
         then try to create such a sub-type and insert the item into that, instead.
-        
+
         This happens recursively, so (in python-markup):
           L [ u'Foo' ]
         actually creates:
@@ -51,7 +51,7 @@ def append(self, item):
                     okay = False
             else:
                 okay = False
-                
+
         if not okay:
             raise TypeError("Wrong content type for %s: %s (%s)" % (
                 self.__class__.__name__, repr(type(item)), repr(item)))
@@ -94,10 +94,10 @@ class Image(Paragraph):
     """
     An image is stored in bytes. All properties of images from the rtf definition are allowed.
     """
-    
-    validProperties = ('emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', 'dibitmap', 
-                       'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', 'picw', 'pich', 'picwgoal', 
-                       'pichgoal', 'picscalex', 'picscaley', 'picscaled', 'piccropt', 'piccropb', 'piccropr', 
+
+    validProperties = ('emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', 'dibitmap',
+                       'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', 'picw', 'pich', 'picwgoal',
+                       'pichgoal', 'picscalex', 'picscaley', 'picscaled', 'piccropt', 'piccropb', 'piccropr',
                        'piccropl', 'picbmp', 'picbpp', 'bin', 'blipupi', 'blipuid', 'bliptag', 'wbitmap')
     contentType = bytes
 
@@ -122,7 +122,7 @@ class List(Paragraph):
 
     validProperties = ()
     contentType = ListEntry
-    
+
 
 
 class Document(_PythBase):
@@ -130,6 +130,6 @@ class Document(_PythBase):
     Top-level item. One document is exactly one file.
     Documents consist of a list of paragraphs.
     """
-    
+
     validProperties = ('title', 'subject', 'author')
     contentType = Paragraph
diff --git a/pyth/plugins/rtf15/reader.py b/pyth/plugins/rtf15/reader.py
index 7a74162..f569afa 100644
--- a/pyth/plugins/rtf15/reader.py
+++ b/pyth/plugins/rtf15/reader.py
@@ -58,9 +58,9 @@
 
 # All the ones named by number in my 2.6 encodings dir, and those listed above
 _CODEPAGES_BY_NUMBER = dict(
-    (x, "cp%s" % x) for x in (37, 424, 437, 500, 737, 775, 850, 852, 855, 856, 
+    (x, "cp%s" % x) for x in (37, 424, 437, 500, 737, 775, 850, 852, 855, 856,
                               857, 860, 861, 862, 863, 864, 865, 866, 869, 874,
-                              875, 932, 936, 949, 950, 1006, 1026, 1140, 1250, 
+                              875, 932, 936, 949, 950, 1006, 1026, 1140, 1250,
                               1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361))
 
 # Miscellaneous, incomplete
@@ -224,7 +224,7 @@ def __init__(self, doc, clean_paragraphs=True):
     def flushRun(self):
         if self.block is None:
             self.block = document.Paragraph()
-        
+
         if self.isImage:
             self.block.content.append(
                 document.Image(self.propStack[-1].copy(),
@@ -321,7 +321,7 @@ def handle_Para(self, para):
             self.listStack[-1].append(l)
 
         self.block = None
-    
+
     def handle_Pict(self, pict):
         self.flushRun()
         self.isImage = True
@@ -354,7 +354,7 @@ def handle_ImageMarker(self, marker):
                 del self.propStack[-1][marker.name]
             else:
                 self.propStack[-1][marker.name] = True
-    
+
 
 
 class Group(object):
@@ -398,11 +398,11 @@ def handle(self, control, digits):
         if control == '*':
             self.destination = True
             return
-        
-        if self.image and control in ['emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', 
-                                      'dibitmap', 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', 
-                                      'picw', 'pich', 'picwgoal', 'pichgoal', 'picscalex', 'picscaley', 
-                                      'picscaled', 'piccropt', 'piccropb', 'piccropr', 'piccropl', 'picbmp', 
+
+        if self.image and control in ['emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile',
+                                      'dibitmap', 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes',
+                                      'picw', 'pich', 'picwgoal', 'pichgoal', 'picscalex', 'picscaley',
+                                      'picscaled', 'piccropt', 'piccropb', 'piccropr', 'piccropl', 'picbmp',
                                       'picbpp', 'bin', 'blipupi', 'blipuid', 'bliptag', 'wbitmap']:
             self.content.append(ImageMarker(control, digits))
             return
@@ -650,7 +650,7 @@ def handle_tab(self):
 
     def handle_trowd(self):
         self.content.append(u'\n')
-        
+
     #Handle the image tag
     def handle_pict(self):
         p = Pict()
@@ -658,7 +658,7 @@ def handle_pict(self):
         self.image = p
         #Remove the destination control group of the parent, so that the image is preserved
         self.parent.destination = False
-    
+
     def handle_field(self):
         def finalize():
             if len(self.content) != 2:
@@ -745,7 +745,7 @@ def __init__(self):
 
     def __repr__(self):
         return "!Image!"
-            
+
 class Para(ReadableMarker):
     listLevel = None
 
diff --git a/pyth/plugins/rtf15/writer.py b/pyth/plugins/rtf15/writer.py
index bf00511..f7bc85a 100644
--- a/pyth/plugins/rtf15/writer.py
+++ b/pyth/plugins/rtf15/writer.py
@@ -4,6 +4,7 @@
 http://www.biblioscape.com/rtf15_spec.htm
 """
 
+import binascii
 from pyth import document
 from pyth.format import PythWriter
 
@@ -55,12 +56,16 @@ def __init__(self, doc, target, family):
             document.List: self._list,
             document.Paragraph: self._paragraph
         }
+        self.paragraphContentDispatch = {
+            document.Text: self._text,
+            document.Image: self._image,
+        }
 
 
     def go(self):
         self.listLevel = -1
         self.addSpacing = None
-        
+
         self.target.write('{')
         self._writeHeader()
         self._writeDocument()
@@ -105,7 +110,7 @@ def _getFontTable(self):
         # We need Symbol for list bullets
         output.append(r'{\f%d\fnil\fprq0\fcharset128 Symbol;}' % (i+1))
         self.symbolFontNumber = i+1
-        
+
         output.append('}')
         return "".join(output)
 
@@ -138,7 +143,7 @@ def _getListTable(self):
 
         output.append('}}')
         return "".join(output)
-    
+
 
     def _getListOverrides(self):
         # I have no idea what the point is of this,
@@ -153,7 +158,7 @@ def _getRevTable(self):
 
     # -----------------------------------------------
     # Document section
-    
+
 
     def _writeDocument(self):
 
@@ -193,14 +198,15 @@ def _paragraph(self, paragraph, spacing=PARAGRAPH_SPACING):
         if self.addSpacing is not None:
             self.target.write(r'\sb%d' % self.addSpacing)
             self.addSpacing = None
-        
+
         # Space after the paragraph,
         # expressed in units of god-knows-what
         self.target.write(r'\sa%d{' % spacing)
-        
-        for text in paragraph.content:
-            self._text(text)
-            
+
+        for item in paragraph.content:
+            handler = self.paragraphContentDispatch[item.__class__]
+            handler(item)
+
         self.target.write(r'}\par\pard' '\n')
 
 
@@ -241,12 +247,12 @@ def _text(self, text):
         for prop in text.properties:
             if prop in _styleFlags:
                 props.append(_styleFlags[prop])
-        
+
         if props:
             self.target.write("".join(props) + " ")
 
-        
-        for run in text.content:                    
+
+        for run in text.content:
             for unichar in run:
                 if unichar == '\n':
                     self.target.write(r'\line ')
@@ -257,7 +263,7 @@ def _text(self, text):
                     self.target.write(str(unichar))
                 else:
                     self.target.write(r'\u%d?' % point)
-            
+
         if props:
             self.target.write("".join("%s0" % p for p in props) + " ")
 
@@ -266,3 +272,21 @@ def _text(self, text):
 
         if 'url' in text.properties:
             self.target.write('}}')
+
+    def _image(self, image):  # write an image as a \pict group wrapped in an INCLUDEPICTURE field
+        self.target.write(r'{\field{\*\fldinst{\f0\fs20\cf0 INCLUDEPICTURE  "cid:image001.png@01CDC656.1C7FFF50" \\* MERGEFORMATINET }}{\fldrslt{\*\shppict{\pict')
+        properties = "".join('\\' + prop + (val if val != True else '') for prop, val in image.properties.iteritems())
+        self.target.write(properties)  # one control word per property; flags (value True) get no parameter
+        self.target.write(' \n')
+        image_data = binascii.hexlify(image.content[0])  # picture data is written as hex text
+        for i in chunk(image_data):  # chunk() default size: 200 hex characters per line
+            self.target.write(i)
+            self.target.write('\n')
+        self.target.write(r'}}}}')  # closes \pict, \shppict, \fldrslt and \field
+
+def chunk(data, size=200):
+    """Yield consecutive slices of ``data``, each at most ``size`` items long."""
+    # range() supplies the slice starts; the last slice may be shorter and
+    # empty data yields nothing. size=0 raises ValueError instead of looping.
+    for start in range(0, len(data), size):
+        yield data[start:start + size]
diff --git a/pyth/plugins/xhtml/reader.py b/pyth/plugins/xhtml/reader.py
index 775bf58..8e9710f 100644
--- a/pyth/plugins/xhtml/reader.py
+++ b/pyth/plugins/xhtml/reader.py
@@ -2,6 +2,8 @@
 Read documents from xhtml
 """
 
+import base64
+
 import BeautifulSoup
 
 from pyth import document
@@ -9,6 +11,9 @@
 from pyth.plugins.xhtml.css import CSS
 
 
+BASE64_PNG_IMG_SRC = 'data:image/png;base64,'
+
+
 class XHTMLReader(PythReader):
 
     @classmethod
@@ -110,6 +115,21 @@ def url(self, node):
         else:
             return self.link_callback(a_node.get('href'))
 
+    def dimensions(self, node):
+        """
+        Return (width, height) as ints in pixels, taken from the ``px`` declarations in the
+        node's style attribute. Either value is None when it is not declared in px.
+        """
+        try:
+            style = node['style']
+        except KeyError:  # raised when the node has no style attribute
+            return None, None
+        else:
+            declarations = self.css.parse_declarations(style)
+            width = _parse_px(declarations.get('width', None))
+            height = _parse_px(declarations.get('height', None))
+            return width, height
+
     def process_text(self, node):
         """
         Return a pyth Text object from a BeautifulSoup node or None if
@@ -161,5 +181,34 @@ def process_into(self, node, obj):
             new_obj = document.ListEntry()
             obj.append(new_obj)
             obj = new_obj
+        elif node.name == 'img':
+            if node.get('src', '').startswith(BASE64_PNG_IMG_SRC):
+                base64_data = node['src'][len(BASE64_PNG_IMG_SRC):]
+                new_obj = document.Image()
+                new_obj.append(base64.b64decode(base64_data))
+                new_obj['pngblip'] = True
+                width, height = self.dimensions(node)
+                if height:
+                    height = unicode(_px_to_twips(height))
+                    new_obj['pich'] = height
+                    new_obj['pichgoal'] = height
+                if width:
+                    width = unicode(_px_to_twips(width))
+                    new_obj['picw'] = width
+                    new_obj['picwgoal'] = width
+                new_obj['picscalex'] = '100'
+                new_obj['picscaley'] = '100'
+
+                obj.content.append(new_obj)
+                return  # img is not allowed to have children as per DTD
         for child in node:
             self.process_into(child, obj)
+
+
+def _parse_px(node):  # '<N>px' -> int N (fractions rounded); anything else -> None
+    node = (node or '').strip().lower()
+    return int(round(float(node[:-2]))) if node.endswith('px') else None
+
+
+def _px_to_twips(px):  # pixels -> twips at a fixed 15 twips per pixel
+    return px * 15
diff --git a/pyth/plugins/xhtml/writer.py b/pyth/plugins/xhtml/writer.py
index 3e40c29..eb0879d 100644
--- a/pyth/plugins/xhtml/writer.py
+++ b/pyth/plugins/xhtml/writer.py
@@ -136,7 +136,15 @@ def _image(self, image):
             image_data = bytearray.fromhex(image.content[0])
             base64_image = base64.b64encode(image_data)
             tag.attrs['src'] = "data:image/png;base64,{}".format(base64_image)
-            tag.attrs['alt'] = 'Inline image'
+            height = image['pichgoal']
+            width = image['picwgoal']
+            if width or height:
+                styles = []
+                styles.append(_twips_to_style_px('width', width))
+                styles.append(_twips_to_style_px('height', height))
+                style = ';'.join(s for s in styles if s)
+                if style:
+                    tag.attrs['style'] = style
             return tag
         else:
             return Tag(None)
@@ -195,3 +203,12 @@ def quoteAttr(text):
     return quoteText(text).replace(
         u'"', u"&quot;").replace(
         u"'", u"&apos;")
+
+
+def _twips_to_style_px(tag, twips):  # -> CSS "<tag>:<N>px", or None if twips is unusable
+    try:
+        twips = int(twips)
+    except (TypeError, ValueError):  # missing (None) or non-numeric dimension
+        return None  # the caller drops falsy entries when joining the style string
+    px = int(round(twips / 15.0))  # 15 twips per pixel, rounded to the nearest pixel
+    return "{}:{}px".format(tag, px)
diff --git a/tests/html/sample-with-image.html b/tests/html/sample-with-image.html
new file mode 100644
index 0000000..f1eea0e
--- /dev/null
+++ b/tests/html/sample-with-image.html
@@ -0,0 +1,3 @@
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADIAAAAyCAIAAACRXR/mAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAEiJJREFUeNpsWWlwW9d1fhse8LCDIACCBAlKlEStlBRtlmVRkmU5cZzJ2HEctc7Yru1kMlnaaSeTSf+0P/o/P1p3knGSxkviJLbHkyadJJIl25FjWdUuUytJiTslkiBBkASxvLXfufcBhDx9JB8fHt6799yzfOc754onz5yRBEEQRYlO7MBHxxHw49iObTuWZdm2bZq2ZZk42/hv8XsOHqNnHX4h1EbAhVQ78FFmh6Io9bPH42n8iMdwwR/mbymSiA+CO4DIPmB0mo1NaFs2voIEOFkWhuAC8cNpOLg0XKD/V6z6obDjMzfxJM5cLJzpe0GiDzKTiQ9KkjFFCI7iMN0Ito1fs6YnLhPOXFtC7ai/XpeJz8c/NkpWF4Wryv3IbuE1RSbtifyOJNbsyGzKTEmTkzktC/Ip3IQ1aep2rB9iw7EiSk0g2I5LpHg8dUXijCFoYNsuV3XDMnVdx0MKF4m9TsPhQaHmYSKTjIvA5XPqLsVtzXRcPwSag//Qs6Zp6ZjGMK1Kparr5QqOaqm0XFouQoL5QmFxYQk3q+VScblYKpWXl5cr5bJe1SGVB+JDMqZbtjJ3ia6ria6PmbQc+LxlVXS9WsHrmKmKU7FYxHDLmA2jLy0tLhVLS5ilWKqU8UW5VMLDmEmvVg0DujCZXmy2BNIGKU+REAIeVYUqVY/qDwUVv9/v9foc0iKmNvC2XqkUsZyl4mKhgCmXlouLC4uYbWFhAVel5UqxjMmwrAqexkymaTiWTcpD6Agi5qG1emAy1eP1ej2eYCSiqrimH9Xj8fo8Pl+AXSp0V/VCLMgD/ZA7KaQWZXJy7J2331rI55cWF0tYGWmgrBs6lkSeC0NJIh7ks+DXp6p+ry+WSKheFZP6vD6F7uMPJ5VcldQv4QTHEUn15BUYxeFBwH0QapB4VGECkYzufhB5XCu/eOXHt2/eSrakQpo/EYthdZgK02EOnDGNoNDw9ajBn0PhJjLtwPtY1NocujiM8alq/imIFj1ki+ScFl5xw8lmXkIRXsM6GohfikppYfGrR48aiH3Y2xH9mg+vs8hUZFG2GaRiJv4afbSwNFtw18Zu16JRdPHOqcGiKPE7pA6JBJS4TtjkjoR7AhOOgaUjiHypkiPaSjQaml/Mnzj2Hlx444YN+/bvN3UTtsbDthvzUg0pa3PzOOD/oS+JZuei8fCEWhE9XDY8BO+DM8P/8IN7LFBFJifexWfSERuZ2ZVJrqTb2sbHxxfm84jFTFs7UJUgQ2L+wJ9rQCbXNK6W2FgSk0AQDPJ60qxIShFw06N45gr5//3kk+npqVhT/MF9vU3RqGmZItkKinJkJhDLD6TJOqzQxJFYbHJiAssKhUKplqRpwIIucrlKYdc8AYjslxyTo5Uk+X0aQAG/cHwXksmtSeKKXvrjH353/cql0kLh+uWL//3uW5VKSRHIkBhDdp+06ZenUVGA28BP4fYSYj0/nYOGW9NpX0DDesnzuAeLKzqqace1KwtQ0R/w37h1/T///Ucv/8ePZnM5xCKXCQdC8/r1a5Pj44cefuSb3/7ewUeODA8OXLl0UfV5sQCjStiCoGUQ7wBpMCsUWS6XyGNERxkdGQP04euOzixzTIv8UFaAN1iFZRKqu47NQk/gviUKfq///Pmz7779G+BoLNYEf+K+jEEpuQvOyMhwW3v7rr0PQQU7du+em57OZDvyhfzJ48cH+29B0zt373n48KMLhflfvf6LeCKRm5rqXNP15FeeBvwqdwYHgNjRaDSZarUMHbMDpPL52Zl7M+m2TDgcBk5At4Jj6nAcy2H+JHp9vlMffvg/f3gXq06n2557/sVUMlXRK0wsklyv6PmZ2faODuCKDtuJ8uNPfAUo
Ozo8dPXS+XUbNphV/f1jf1zXvT4SDi/Mz42PjQDot2zbSm7miMrdiUlAdTLVogU0ZAbAYSGff/ON16anpyORSDQWw1etrZm21jSuQoGg16sBfN4//udTJ44DLlZ3rfn6cy/G403IdnXiQSTNtpAVWpBtMCgtxvHIkl4ut6Tb/uH7P1T9/htX+65eu4ZQi0XCSGmJVOq5F16KxxPVagVPK0gvWOCq1atZliVfRmht2rwFTGEunwfu352cvHzxAlA7GAzF483tHZ3INoM3r0bCoc7VXV964umAP1CulJlHCnUoIe8XpPxcnrFEyCkM3erPrsoiWE+dOnXh7GmMjxCBh8JHqtXqtu6NmUx2ebkoMJdR8Ap0kmnvBHMBe4ChgsHAo499ce++/Rcunr907tz8/Bx8GRBfWl5eXFgYHrqdiMcDgeC6det7jzwKtgIGUIMzFrOSCO6jen3pTObqlctT09Pt2Y7rfX0/+8nLTz59NByKfPTBiae+dhRu88Zrr8KJbUaQMD4U69RSA2JByGTaQ7EYkjunDQhGeFsgGDr82Od37Nxz+cLZi+fPF+bzKqVdj6b5fJoPWX/nnr0IJV03a5nDqUWpmw927t594eyZ13/2yrqNm673XUG4tHe0T9+dwhzxZEtuegquvbS05LRwpmQ7bmDRSQ4HA3v27UumW22WrAhOarkEbuFVfWvWrd22fSfgcXRkmOgsY3NwmFRLOpFMQTFiQwLgB4IZqaw5mdQ07erlS4O3bgDgj3zx8Z27HtCCgduD/SeP/enuxDi0AIHWdq9///ixzq6ujZs2WYZFwAOXh/c1J1I2IQGGwj381RKmIzGeTLbN5WbgugAXfOOpViH68PCd7s1buJOLtQzJUIenNsGo6g/1Hly7Zl1uNgcEgU2hHoT2S9/67tCdwVRLC6gYNKF5tSeefKo1mzV004UiTAh+kgJ/SLeCq1KSFGyF0ocEOoZ49nrVeRaYN65di0Sju3bvnZycIE5nGKB4Pdt3qB6lnsh5qcKMYHPtYaWhcATRhxSCzOgwWuf1aW3tHf5AsKkpgUjHZJu2bG1KNCMG6xlCgYyDA7c39/Swddssy8kglu8d+zOUnEymLl28eGdwsDmROPq3z67fsDGXy/XfugHaA1ifmZ7qWr3atgyOsg4htcUZvM1qIQSQyQ5SpeOCMSyjMx5SV+xv3nwVLO2xLz/BQIvoiSJK3nv37oFVR2JRqmRIKnlpcWF+dqZQmL/dfwszAT6e/OrXMplOcNcDhw7fuT2ABRmGPjo01L12XVU0BJff0DSIp2KlpPn8llkF9YXzQUmgjDJxB1E3q4SOMB6yiIE1yHhjaHAAifhxUBrBZEaU5GhTtFIqJRKJTKZVsE2/x1FFOx6Nbt7So2mBufwc6pDe3oMbN/eUKhUYBW4OJU1MjHPGv/1zOzgh41ZEZACEXv/5Ty+cO3Pq5AkACjwaWhkY6P/k41Ojw7ej0VgoHC6VS3B5r+YF9Qf37bt0UVHVXQ/sBa9VVC+kUpKp5vHhkeHb/Tt2bA2oYsAjEUFBDMry3r17i6XimdOnYS8yBHErERrtPXj41s3rYPH3JieR0YLhiGObVDTUKM9CIQ+JEcJ//csHrW2ZLVu3vfPmayhLYLarfVe+84/fn8/P/fzHL7e1Z4DVf/eNb4KLI+X5/L4bn145/dFHGaSsrrVdEOPu3bHiQj7gVWSOQMRobFOv9vRspYpE0+r1IGh+Wya77XO7TMPAWkdHR1BN8PgTaxwDoh/+whde+s7fI8ddvnTOp2lff+HFf/7Xf3v2hZdmQb7uTXpYaT85PtbZ2RkKhjEhoGRqcvLtN39568bVNojVnskAIVFQjQ8NAyTZ7JyqCyjvtGD4qb955oF9D5mG6daBoojk82DvoWA4hNQ0MT7ONcSIqlMvqQN+P5h7pj1bKhZxoVfNd3/76z/9/vdIYiiDEXAYpPfgw9/9px8kW9LAJpRy7/z6DRj3medf2NyzTUI51rkq
q2nq+MQIxXANHC1yP3LSTKYDGm6snU2W2nfs3rt9x879vQfgfJzC14oaopxQJIIrP5cDJ5vNzbz1y1dFRdrU08PLFHgjNJxdtZqyuo3kKMMf7gze7uru3r5rNzGO0uJ8z9ZN1cXZ5aW5fKHQHIvBTwwRtAucFu9SC0Jo4PI8yKuV8sHDn1eJGctwO7HOFlnuQMkEf58YHe27fOngI4+aFqK2uu/AISSv06c+xKiMvPFqndXGAAVJTKVbxoaGRoaHs+1ZaSaXa2nJqP5gWTcnJiZNWa3aIn7NBlmc+675P5tqNWiOQqFRZuIwIPdIf2OjQ+vWb0DKT7ekuzdtfP2Vn7zxXz8F4bl57SoqdBiREyG8ANyBmz//jW/BuB//5QOAFHzPWF4qx1NtY8PD/f0Dazf12IzI4pCYOXgmqeFlPSuLbmFIKOn2nhzOlx0BBffuBx7s6t5AFYAs48mnn3n2Wt/lYDBSmJ8D+0AUJVNpJBEGdRI4Yzja3LG6C5H08Uen9uw7IB96bO3gzdz+3t7u7vWJdFoL+MELVjofhPsSj3tey9cTIAGvS2bqPJ+SB/5M22zLdMRjzbQ8oKllgxplV3WlU6lstjPZ0qJp/p17HkSiZKWDAyRvbc8km1PhcGRiYhS+K/7wX35w8th7lVIZFK85kYw1RXERikX9mt/jVeGgVJizqqxmSMlNyI7DxHQLVFzarIcIl/ZpKiKX2kxUzlAWQqol8OaFl7sqt5DB+x6fBq+Av8qkWgGkUlnfFut69stj4xPDE7nR8btjo8OIUrAoRHi0qQmpEEcsnohEQtRw8PolVhUKvLNUaxkJ9XYlgZZVLutUxjvMHer1JRX8wGqHyg8yAvtKlvGNXikzLUvIIlgEZlGqllf2BrZsX33gSBxpyzCNiYmJ/oGB3Ozs1L3pkaHBi+fOEtz5EemBUCTcBN7enGhujocjsUAg4EUBonhoHmRlx2YBz2KL9ABccb0Q35Pb2azD1lgPU8+MWi9QKi2CdStgdQVmBifVjSqQg4/mUXxbt26Hq+IB1G7HTpxAMcDqMgdPjg7d6fv0sqkbwF5fwB9DDZJMwe7RWFM4CkGDKIoYy2W6o0adZQr1fqYtQVkkrIMLgTcmmLiWQEjGtAh6JShg4rxTwGkcFiI7pkFYRWpH6vUHI/4oFOlk2zOPHzmCVIiUMD8PGoaKC9xmGjXWtb5P6WHZEwgGQLDA9YLhYDQaD0WiwVBI8xHbJjfD2PizRBPrdSwYTWQsEmW/RAyThTPIJwCC/INaNSQ5npMI0JkRWOOFTlQKwfedVCrFO82xSCQei4lr1mAY0B6wwuJyMZ+HoPOg/KjDlhYLMzNTpoEaVYQ/+nz+QDjsDwabIiF4dyAY9gcCCgt4HkZg5xaBLMtgIsoLW2GGtyhUJCLhDnFk6iTIUr0rRHWH6vGmUq2sW2kZrJUrrtTYEuqZSDjame3EGwDYCrUwy4VCAWfqTywVgWTzpeW5qbvsRYn0pwKLfF6Q1IDm0cBeQz6v7KEwRz2Besg0HC6Hs9IW57QRdpb5xgG06lFhUPJGyxZWSh3HcdtCIhFlx0HEsN6vDfqKEBYZEWKbDSbyOs7lSgWFHc5AkIWFgjE7i1fwqur1wFkhI+oqVM2KIFHfiDWWBN6X4SqwmaEdt8clcB+m5jdMThSDbREQVjGvZm1aVqRQsQfHtVjflhcMDgs+n8+HJUejEeImuskWYKMoQObhzVEUB/ADSIxbCk8arDNHfMGkPmK9mKGEwwXmGx2EgTQZqDhFGE+IDC1F3qZntFu0Lctu6N3f177nOYuhHKhEwO9Bdc5b+NBpqUQdg6ViUeFMBpBDVTfvHrJ6m++WsBRDSM/gQqB+gmmIbLfFJtvKHKyFhm0fZ2XThQODvdKzc4R6HuMLxzC6YdcKASkYDIL+9/RsVlifyGJdRkSpTB0xDESSUBUE5dGyEB0U26wpa/OWoit6fS+jYQtoZevFadjduK9DJjTqjr4FLPv9
GuBZVRXDqChwCNZqZYmXBRoBrWVxPie7rVGHL81VC0Wr+JmtqJpArtnc+Vaq//vkqB+QhnYqCIOoci7rVduxZMNUmNYBDgAPkSc45t+sNw39sXTMrMiMyKbFAzbbd+EyiLVmuuviNfU07uY1KoZlwvomDk+vtmHS1HbZypPHVxR4JwUUdynRpZcWyzNun5wYA1yLdTV5HnM4KaABWWfdVRvlNcfttTZu5dWFY+hTdzR3M4kvxHQ3XRwmNAg/Zx+ur7oeILEdAO6kgFMP7iiC5dzH2AV7ZcupttPpNEz22a28mhFXSK7T4Hw0KRnT/fR/AgwASCAyT2PPvv8AAAAASUVORK5CYII=" style="height:50px; width:50px" /></p>
+
+<p><u>This</u> is a <em>pretty</em> <strong>boring</strong>&nbsp;graphic...</p>
diff --git a/tests/rtfs/sample-with-image.rtf b/tests/rtfs/sample-with-image.rtf
new file mode 100644
index 0000000..f1bf87f
--- /dev/null
+++ b/tests/rtfs/sample-with-image.rtf
@@ -0,0 +1,58 @@
+{\rtf1\ansi\deff1
+{\fonttbl{\f0\fswiss Calibri;}{\f1\froman Times New Roman;}{\f2\fnil\fprq0\fcharset128 Symbol;}}
+{\colortbl;\red0\green0\blue0;\red0\green0\blue255;}
+{\stylesheet{\s1 List Paragraph;}}
+{\*\listtable{\list\listid1\listtemplateid1{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}}}
+{\listoverridetable{\listoverride\listid1\listoverridecount0\ls0}}
+\sa150{{\field{\*\fldinst{\f0\fs20\cf0 INCLUDEPICTURE  "cid:image001.png@01CDC656.1C7FFF50" \\* MERGEFORMATINET }}{\fldrslt{\*\shppict{\pict\pngblip\picw20714\pich12143\picwgoal750\pichgoal750\picscalex100\picscaley100
+89504e470d0a1a0a0000000d4948445200000032000000320802000000915d1fe60000001974455874536f6674776172650041646f626520496d616765526561647971c9653c000012224944415478da6c5969705bd7757e1b1ef0b08320008204094a94
+44ad94146d966551926539719cc9d8711cb5ced8aeed643259da69279349ffb43ffa3f3f5a77927192c64be224b6c793269d249225db916359d52e532b49893b259220419004b1bcb5dfb9f701843c7d241f1f1edebbf7dcb37ce73be78a27cf9c910441
+1045894eecc047c711f0e3d88e6d3b9665d9b66d9ab6659938dbf86ff17b0e1ea3671d7e21d446c085543bf0516687a228f5b3c7e369fc88c770c11fe66f2992880f823b80c83e60749a8d4d685b36be820438591686e002f1c36938b8345ca0ff57acfa
+a1b0e33337f124ce5c2c9ce97b41a20f3293890f4a923145088ee230dd08b68d5fb3a6272e13ce5c5b42eda8bf5e9789cfc73f364a561785abcafdc86ee1354526ed89fc8e24d6ecc86cca4c499393392d0bf229dc843569ea76ac1f62c3b1224a4d20d8
+8e4ba4783c7545e28c216860db2e5775c332755dc7430a1789bd4ec3e141a1e66122938c8bc0e573ea2ec56dcd745c3f049a83ffd0b3a669e998c630ad4aa5aaebe50a8e6aa9b45c5a2e4282f9426171610937abe55271b9582a959797972be5b25ed521
+9507e24332a65bb6327789aeab89ae8f99b41cf8bc655574bd5ac1eb98a98a53b158c470cb980da32f2d2d2e154b4b98a558aa94f145b954c2c39849af560d03ba30995e6cb604d206294f9110021e55852a558fea0f0515bfdfeff5fa1cd222a636f0b6
+5ea914b19ca5e262a1802997968b8b0b8b986d61610157a5e54ab18cc9b0ac0a9ec64ca66938964dca43e80822e6a1b57a6032d5e3f57a3d9e6024a2aab8a61fd5e3f1fa3c3e5f805d2a7457f5422cc803fd903b29a416657272ec9db7df5ac8e7971617
+4b581969a0ac1b3a96449e0b4349221ee4b3e0d7a7aa7eaf2f9648a85e1593fabc3e85eee30f27955c95d42fe104c71149f5e41518c5e141c07d106a9078546102918cee7e10795c2bbf78e5c7b76fde4ab6a4429a3f118b6175980ad3610e9c318da0d0
+f0f5a8c19f43e12632edc0fb58d4da1cba388cf1a96afe2988163d648be49c165e71c3c9665e42115ec33a1a885f8a4a6961f1ab478f1a887dd8db11fd9a0fafb3c8546451b619a46226fe1a7db4b0345b70d7c66ed7a25174f1cea9c1a228f13ba40e89
+0494b84ed8e48e847b02138e81a523887ca99223da4a341a9a5fcc9f38f61e5c78e3860dfbf6ef377513b6c6c3b61bf3520d296b73f338e0ffa12f8966e7a2f1f0845a113d5c363c04ef8333c3fff0837b2c50452627dec567d2111b99d99549aea4dbda
+c6c7c717e6f388c54c5b3b5095204362fec09f6b4026d734ae96d858129340100cf27ad2ac484a1170d3a378e60af9fffde493e9e9a95853fcc17dbd4dd1a8699922d90a8a72642610cb0fa4c93aacd0c491586c726202cb0a8542a996a469c0822e72b9
+4a61d73c0188ec971c93a39524f97d1a4001bf707c1792c9ad49e28a5efae31f7e77fdcaa5d242e1fae58bfffdee5b954a4911c8901843769fb4e997a7515180dbc04fe1f612623d3f9d83865bd3695f40c37ac9f3b8078b2b3aaa69c7b52b0b50d11ff0
+dfb875fd3ffffd472fffc78f667339c42297090742f3faf56b93e3e3871e7ee49bdffedec1478e0c0f0e5cb97451f579b100a34ad882a06510ef0069302b14592e97c8634447191d1903f4e1eb8ece2c734c8bfc5056803758856512aabb8ecd424fe0be
+250a7eaffffcf9b3efbefd1be0682cd6047fe2be8c4129b90bcec8c8705b7bfbaebd0f41053b76ef9e9b9ece643bf285fcc9e3c707fb6f41d33b77ef79f8f0a30b85f95fbdfe8b7822919b9aea5cd3f5e4579e06fc2a77060780d8d16834996ab50c1db3
+03a4f2f9d9997b33e9b64c381c064e40b78263ea701ccb61fe247a7dbe531f7ef83f7f7817ab4ea7db9e7bfec5543255d12b4c2c925cafe8f999d9f68e0ee08a0edb89f2e34f7c05283b3a3c74f5d2f9751b369855fdfd637f5cd7bd3e120e2fcccf8d8f
+8d00e8b76cdb4a6ee688cadd8949407532d5a205346406c061219f7ff38dd7a6a7a72391483416c357adad99b6d634ae4281a0d7ab017cde3ffee753278e032e5677adf9fa732fc6e34dc87675e24124cdb690155a906d30282dc6f1c8925e2eb7a4dbfe
+e1fb3f54fdfe1b57fbae5ebb86508b45c248698954eab9175e8ac713d56a054f2b482f58e0aad5ab5996255f46686ddabc054c612e9f07eedf9d9cbc7cf102503b180cc5e3cded1d9dc8368337af46c2a1ced55d5f7ae2e9803f50ae9499470a752821ef
+17a4fc5c9eb144c8290cddeacfaeca22584f9d3a75e1ec698c8f108187c247aad5eab6ee8d994c7679b928309751f00a749269ef0473017b80a182c1c0a38f7d71efbefd172e9ebf74eedcfcfc1c7c19105f5a5e5e5c58181eba9d88c70381e0ba75eb7b
+8f3c0ab6020650833316b39208eea37a7de94ce6ea95cb53d3d3edd98eeb7d7d3ffbc9cb4f3e7d341c8a7cf4c189a7be76146ef3c66bafc2896d4690303e14ebd452036241c864da43b118923ba70d0846785b20183afcd8e777ecdc73f9c2d98be7cf17
+e6f32aa55d8fa6f97c9a0f597fe79ebd08255d376b99c3a945a99b0f76eede7de1ec99d77ff6caba8d9baef75d41b8b477b44fdf9dc21cf1644b6e7a0aaebdb4b4e4b470a6643b6e60d1490e07037bf6ed4ba65b6d96ac084e6ab9046ee1557d6bd6addd
+b67d27e071746498e82c63737098544b3a914c4131624302e0078219a9ac3999d434edeae54b83b76e00e08f7cf1f19dbb1ed08281db83fd278ffde9eec438b40081d676af7ffff8b1ceaeae8d9b36598645c0039787f735275236210186c23dfcd512a6
+23319e4cb6cde566e0ba00177ce3a95621faf0f09deecd5bb8938bb50cc95087a736c1a8ea0ff51e5cbb665d6e360704814da11e84f64bdffaeed09dc1544b0ba81834a179b5279e7caa359b3574d385224c087e92027f48b782ab5292146c85d287043a
+8678f67ad579169837ae5d8b44a3bb76ef9d9c9c204e6718a0783ddb77a81ea59ec879a9c28c6073ed61a5a17004d1871482cce8305ae7f5696ded1dfe40b0a9298148c7649bb66c6d4a342306eb1942818c8303b737f7f4b075db2ccbc92096ef1dfb33
+949c4ca62e5dbc786770b0399138fab7cfaedfb03197cbf5dfba01da03589f999eea5abddab60c8eb20e21b5c519bccd6a210490c90e52a5e382312ca3331e5257ec6fde7c152cedb12f3fc1408be889224ade7bf7ee8155476251aa64482a796971617e
+76a65098bfdd7f0b33013e9efcead732994e70d703870edfb93d800519863e3a34d4bd765d55340497dfd03488a762a5a4f9fc965905f585f34149a08c3271075137ab848e301eb2888135c8786368700089f871501ac1644694e46853b4522a2512894c
+a655b04dbfc751453b1e8d6eded2a36981b9fc1cea90dede831b37f7942a1518056e0e254d4c8c73c6bffd733b3821e356446400845efff94f2f9c3b73eae409000a3c1a5a1918e8ffe4e353a3c3b7a3d158281c2e954b7079afe605f507f7edbb745151
+d55d0fec05af55542fa45292a9e6f1e191e1dbfd3b766c0da862c0231141410ccaf2debd7b8ba5e299d3a7612f3204712b111aed3d78f8d6cdeb60f1f7262791d182e188639b5434d428cf42210f8911c27ffdcb07ad6d992d5bb7bdf3e66b284b60b6ab
+7d57bef38fdf9fcfcffdfcc72fb7b56780d57ff78d6f828b23e5f9fcbe1b9f5e39fdd14719a4acaeb55d10e3eeddb1e2423ee055648e40c4686c53aff6f46ca58a44d3eaf520687e5b26bbed73bb4cc3c05a474747504df0f8136b1c03a21ffec2175efa
+cedf23c75dbe74cea7695f7fe1c57ffed77f7bf685976641beee4d7a58693f393ed6d9d9190a863121a0646a72f2ed377f79ebc6d53688d59ec9002151508d0f0d0324d9ec9caa0b28efb460f8a9bf79e6817d0f9986e9d681a288e4f360efa1603884d4
+34313ece35c488aa532fa9037e3f987ba63d5b2a1671a157cd777ffbeb3ffdfef748622883117018a4f7e0c3dffda71f245bd2c0269472effcfa0d18f799e75fd8dcb34d4239d6b92aab69eaf8c408c5700d1c2d723f72d24ca6031a6eac9d4d96da77ec
+debb7dc7cefdbd07e07c9cc2d78a1aa29c5024822b3f9703279bcdcdbcf5cb574545dad4d3c3cb147823349c5db59ab2ba8de428c31fee0cdeeeeaeedebe6b37318ed2e27ccfd64dd5c5d9e5a5b97ca1d01c8bc14f0c11b40b9c16ef520b4268e0f23cc8
+ab95f2c1c39f578919cb703bb1ce1659ee40c9047f9f181dedbb7ce9e0238f9a16a2b6baefc02124afd3a73ec4a88cbcf16a9dd5c60005494ca55bc68686468687b3ed596926976b69c9a8fe6059372726264d59adda227ecd06599cfbaef93f9b6a3568
+8e42a15166e23020f7487f63a343ebd66f40ca4fb7a4bb376d7cfd959fbcf15f3f05e1b979ed2a2a7418911321bc00dc819b3fff8d6fc1b81fffe50380147ccf585e2ac7536d63c3c3fdfd036b37f5d88cc8e29098397826a9e1653d2b8b6e614828e9f6
+9e1cce971d0105f7ee071eeceade4015802ce3c9a79f79f65adfe5603052989f03fb40142553692411067512386338dadcb1ba0b91f4f147a7f6ec3b201f7a6cede0cddcfededeeeeef589745a0bf8c10b563a1f84fb128f7b5ecbd7132001af4b66ea3c
+9f9207fe4cdb6ccb74c463cdb43ca0a965831a655775a553a96cb633d9d2a269fe9d7b1e44a264a58303246f6dcf249b53e17064626214be2bfef05f7e70f2d87b95521914af39918c354571118a45fd9adfe355e1a05498b3aaac6648c94dc88ec3c474
+0b545cdaac870897f6692a2297da4c54ce501642aa25f0e68597bb2ab790c1fb1e9f06af80bfcaa45a01a45259df16eb7af6cb63e313c313b9d1f1bb63a3c38852b0284478b4a909a910472c9e884442d470f0fa2556150abcb3546b1909f57625819655
+2eeb54c63bcc1deaf52515fcc06a87ca0f3202fb4a96f18d5e29332d4bc82258046651aa9657f606b66c5f7de0481c69cb308d898989fe8181dcececd4bde991a1c18be7ce12dcf911e9815024dc04dede9c686e8e8723b14020e04501a278681e6465c7
+6601cf628bf4005c71bd10df93dbd9acc3d6580f53cf8c5a2f502a2d82752b607505660627d58d2a90838fe6517c5bb76e87abe201d46ec74e9c4031c0ea32074f8e0edde9fbf4b2a91bc05e5fc01f430d924cc1eed15853380a4183288a18cb65baa346
+9d650af57ea62d415924ac830b81372698b8964048c6b4087a252860e2bc53c0691c16223ba64158456a47eaf50723fe2814e964db338f1f3982548894303f0f1a868a0bdc661a35d6b5be4fe961d913080640b0c0f582e160341a0f45a2c15048f311db
+2637c3d8f8b34413eb752c184d642c1265bf440c938533c8270082fc835a3524399e9308d0991158e3854e540ac1f79d542ac53bcdb148241e8b896bd66018d01eb0c2e272319f87a0f3a0fca8c396160b333353a6811a55843ffa7cfe4038ec0f069b22
+2178772018f607020a0b781e4660e716812ccb6022ca0b5b6186b728542422e10e7164ea24c852bd2b447587eaf1a652adac5b6919ac952baed4d812ea994838da99edc41b00d80ab530cb85420167ea4f2c158164f3a5e5b9a9bbec4589f4a7028b7c5e
+90d480e6d1c05e433eafeca130473d817ac8341c2e87b3d216e7b4117696f9c601b4ea516150f246cb16564a1dc771db42221165c741c4b0deaf0dfa8a10161911629b0d26f23acee54a05851dce4090858582313b8b57f0aaeaf5c0592123ea2a54cd8a
+2051df88359604de97e12ab099a11db7c725701fa6e6374c4e14836d11105631af666d5a56a450b107c7b558df96170c0e0b3e9fcf872547a311e226bac91660a32840e6e1cd511407f003488c5b0a4f1aac33477cc1a43e62bd98a184c305e61b1d8481
+3419a83845184f880c2d45dea667b45bb42dcb6ee8dddfd7bee7398ba11ca844c0ef4175ce5bf8d069a9441d83a56251e14c0690435537ef1eb27a9bef96b0144348cfe042a07e8269886cb7c526dbca1cac85866d1f6765d3850383bdd2b373847a1ee3
+0bc730ba61d70a0129180c82fef7f46c56589fc8625d4644a94c1d310c4492501504e5d1b2101d14dbac296bf396a22b7a7d2fa3610b6865ebc569d8ddb8af432634ea8ebe052cfbfd1ae0595515c3a8287008d66a658997051a01ad65713e27bbad5187
+2fcd550b45abf899ada89a40aed9dcf956aafffbe4a81f9086762a0883a8722eeb55dbb164c35498d6010e000f912738e6dfac370dfdb174ccacc88cc8a6c50336db77e13288b566baebe235f534eee6352a8665c2fa260e4fafb661d2d476d9ca93c757
+14782705147729d1a59716cb336e9f9c18035c8b7535791e733829a0015967dd551be535c7edb5366ee5d58563e85377347733892fc474375d1c2634083f671faeafba1e20b11d00eea480530fee2882e5dcc7d8057b65cba9b6d3e9344cf6d9adbc9a11
+5748aed3e07c342919d3fdf47f020c004820324f63cfbeff0000000049454e44ae426082
+}}}}}\par\pard
+\sa150{This is a \i pretty\i0 \b boring\b0 \u160?graphic...}\par\pard
+}
diff --git a/tests/test_readrtf15.py b/tests/test_readrtf15.py
index e29d283..b2dcbc5 100644
--- a/tests/test_readrtf15.py
+++ b/tests/test_readrtf15.py
@@ -30,6 +30,17 @@ class TestRtfFile(unittest.TestCase):
     pass
 
 
+class TestRtfWithImage(unittest.TestCase):
+
+    def test_inline_png(self):
+        sample_with_image = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "sample-with-image.rtf")
+        with open(sample_with_image, 'rb') as rtf:
+                doc = Rtf15Reader.read(rtf)
+                image = next(node.content[0] for node in doc.content if isinstance(node.content[0], pyth.document.Image))
+                expected = {'pngblip': True, 'picw': '20714', 'picwgoal': '750', 'pich': '12143',
+                            'pichgoal': '750', 'picscaley': '100', 'picscalex': '100'}
+                self.assertEquals(expected, image.properties)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_readxhtml.py b/tests/test_readxhtml.py
index 978c277..9e7bb90 100644
--- a/tests/test_readxhtml.py
+++ b/tests/test_readxhtml.py
@@ -80,6 +80,22 @@ def test_url(self):
         text = doc.content[0].content[0]
         assert text['url'] == "http://google.com"
 
+    def test_inline_png(self):
+        pixels = 50
+        twips = pixels * 15
+        height = width = str(twips)  # in retrospect choosing a square image wasn't a great idea :)
+        with open('tests/html/sample-with-image.html', 'rb') as xhtml:
+            doc = XHTMLReader.read(xhtml)
+            image = next(node.content[0] for node in doc.content if isinstance(node.content[0], pyth.document.Image))
+            self.assertEquals(image.content[0][1:4], u'PNG')
+            self.assertEquals(image['pngblip'], True)
+            self.assertEquals(image['pich'], height)
+            self.assertEquals(image['pichgoal'], height)
+            self.assertEquals(image['picw'], width)
+            self.assertEquals(image['picwgoal'], width)
+            self.assertEquals(image['picscaley'], '100')
+            self.assertEquals(image['picscalex'], '100')
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_writertf15.py b/tests/test_writertf15.py
new file mode 100644
index 0000000..c09c33d
--- /dev/null
+++ b/tests/test_writertf15.py
@@ -0,0 +1,19 @@
+import os
+import unittest
+from pyth.plugins.xhtml.reader import XHTMLReader
+from pyth.plugins.rtf15.writer import Rtf15Writer
+
+class TestRtfWithImage(unittest.TestCase):
+
+    def test_inline_png(self):
+        sample_with_image = os.path.join(os.path.abspath(os.path.dirname(__file__)), "html", "sample-with-image.html")
+        with open(sample_with_image, 'rb') as rtf:
+            source = XHTMLReader.read(rtf)
+            doc = Rtf15Writer.write(source).getvalue()
+            self.assertIn('pngblip', doc)
+            self.assertIn('picwgoal750\\', doc)
+            self.assertIn('pichgoal750\\', doc)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_writexhtml.py b/tests/test_writexhtml.py
new file mode 100644
index 0000000..f07418c
--- /dev/null
+++ b/tests/test_writexhtml.py
@@ -0,0 +1,19 @@
+import os
+import unittest
+from pyth.plugins.rtf15.reader import Rtf15Reader
+from pyth.plugins.xhtml.writer import XHTMLWriter
+
+class TestHtmlWithImage(unittest.TestCase):
+
+    def test_inline_png(self):
+        sample_with_image = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "sample-with-image.rtf")
+        with open(sample_with_image, 'rb') as rtf:
+            source = Rtf15Reader.read(rtf)
+            doc = XHTMLWriter.write(source).getvalue()
+            self.assertIn('<img src="data:image/png;base64,', doc)
+            self.assertIn('width:50px', doc)
+            self.assertIn('height:50px', doc)
+
+
+if __name__ == '__main__':
+    unittest.main()

From a3fa2ab4d19c2782f1df3a5b118073d998435e07 Mon Sep 17 00:00:00 2001
From: Kris Powell <krispowell@avocacapital.com>
Date: Fri, 27 Nov 2015 16:49:22 +0000
Subject: [PATCH 03/14] Strip out unsupported control characters from RTF

A comment regarding ~, - and _ in commit c72d4572 suggests that dropping them
was the intended behavior, but instead they are included as text output.

Spec at http://www.biblioscape.com/rtf15_spec.htm:
\~: Nonbreaking space.
\-: Optional hyphen.
\_: Nonbreaking hyphen.

A future extension might be to extend the Document model to represent these,
and then let writers decide whether to include them or not (e.g. as &nbsp;
in XHTML).
---
 pyth/plugins/rtf15/reader.py |  2 +-
 tests/rtfs/control_chars.rtf | 11 +++++++++++
 tests/test_readrtf15.py      | 25 ++++++++++++++++++++-----
 3 files changed, 32 insertions(+), 6 deletions(-)
 create mode 100644 tests/rtfs/control_chars.rtf

diff --git a/pyth/plugins/rtf15/reader.py b/pyth/plugins/rtf15/reader.py
index f569afa..7fe7cc8 100644
--- a/pyth/plugins/rtf15/reader.py
+++ b/pyth/plugins/rtf15/reader.py
@@ -157,7 +157,7 @@ def getControl(self):
             if not next:
                 break
 
-            if first and next in '\\{}':
+            if first and next in '\\{}~_-':
                 chars.extend("control_symbol")
                 digits.append(next)
                 break
diff --git a/tests/rtfs/control_chars.rtf b/tests/rtfs/control_chars.rtf
new file mode 100644
index 0000000..a5cfe6e
--- /dev/null
+++ b/tests/rtfs/control_chars.rtf
@@ -0,0 +1,11 @@
+{\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170
+{\fonttbl\f0\froman\fcharset0 TimesNewRomanPSMT;}
+{\colortbl;\red255\green255\blue255;}
+{\info
+{\author Kris Powell}}\paperw11900\paperh16840\margl1134\margr1134\margb1134\margt1134\vieww10800\viewh8400\viewkind0
+\deftab709
+\pard\pardeftab709
+
+\f0\fs14 \cf0 \~\~\~\~\~\~\~\~\~  NB Spaces
+\fs24 \
+}
diff --git a/tests/test_readrtf15.py b/tests/test_readrtf15.py
index b2dcbc5..b1c6a84 100644
--- a/tests/test_readrtf15.py
+++ b/tests/test_readrtf15.py
@@ -35,11 +35,26 @@ class TestRtfWithImage(unittest.TestCase):
     def test_inline_png(self):
         sample_with_image = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "sample-with-image.rtf")
         with open(sample_with_image, 'rb') as rtf:
-                doc = Rtf15Reader.read(rtf)
-                image = next(node.content[0] for node in doc.content if isinstance(node.content[0], pyth.document.Image))
-                expected = {'pngblip': True, 'picw': '20714', 'picwgoal': '750', 'pich': '12143',
-                            'pichgoal': '750', 'picscaley': '100', 'picscalex': '100'}
-                self.assertEquals(expected, image.properties)
+            doc = Rtf15Reader.read(rtf)
+            image = next(node.content[0] for node in doc.content if isinstance(node.content[0], pyth.document.Image))
+            expected = {'pngblip': True, 'picw': '20714', 'picwgoal': '750', 'pich': '12143',
+                        'pichgoal': '750', 'picscaley': '100', 'picscalex': '100'}
+            self.assertEquals(expected, image.properties)
+
+class TestRtfWithNonbreakingSpaces(unittest.TestCase):
+
+    def test_tildes_are_parsed(self):
+        sample_with_tildes = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "control_chars.rtf")
+        with open(sample_with_tildes, 'rb') as rtf:
+            doc = Rtf15Reader.read(rtf)
+            traverse_text(doc, lambda text: self.assertNotIn('~', text))
+
+def traverse_text(element, function):
+    if element.__class__ == pyth.document.Text:
+        map(function, element.content)
+    else:
+        for child in element.content:
+            traverse_text(child, function)
 
 
 if __name__ == '__main__':

From 6ab83756988fe539317b1b774ac7bcd78789fe79 Mon Sep 17 00:00:00 2001
From: Kris Powell <krispowell@avocacapital.com>
Date: Thu, 3 Dec 2015 17:16:14 +0000
Subject: [PATCH 04/14] Fix nested lists rtf15.reader bug

Previously, in RTF documents containing nested lists that 'ended' on a nested
item, the outermost item would be added to the list above it, but the list
above it would never be added to the list/doc above that, so it would get dropped.
---
 pyth/plugins/rtf15/reader.py | 10 +++++++---
 tests/test_readrtf15.py      | 19 +++++++++++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/pyth/plugins/rtf15/reader.py b/pyth/plugins/rtf15/reader.py
index 7fe7cc8..c9fe4fd 100644
--- a/pyth/plugins/rtf15/reader.py
+++ b/pyth/plugins/rtf15/reader.py
@@ -317,8 +317,12 @@ def handle_Para(self, para):
             self.listStack.append(l)
 
         elif self.listLevel < prevListLevel:
-            l = self.listStack.pop()
-            self.listStack[-1].append(l)
+            times = prevListLevel + 1
+            if self.listLevel is not None:
+                times = times - (self.listLevel + 1)
+            for _ in xrange(times):
+                l = self.listStack.pop()
+                self.listStack[-1].append(l)
 
         self.block = None
 
@@ -603,7 +607,7 @@ def handle_strike(self, onOff=None):
 
     def handle_ilvl(self, level):
         if self.currentParaTag is not None:
-            self.currentParaTag.listLevel = level
+            self.currentParaTag.listLevel = int(level)
         else:
             # Well, now we're in trouble. But I'm pretty sure this
             # isn't supposed to happen anyway.
diff --git a/tests/test_readrtf15.py b/tests/test_readrtf15.py
index b1c6a84..7bef397 100644
--- a/tests/test_readrtf15.py
+++ b/tests/test_readrtf15.py
@@ -49,6 +49,25 @@ def test_tildes_are_parsed(self):
             doc = Rtf15Reader.read(rtf)
             traverse_text(doc, lambda text: self.assertNotIn('~', text))
 
+
+class TestNestedLists(unittest.TestCase):
+
+    def test_when_last_item_sublist_item(self):
+        """ With structures like this, both lists were getting dropped
+        Start
+         * 1
+           * 1.1
+        """
+        list_bug = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "list-bug.rtf")
+        with open(list_bug, 'rb') as rtf:
+            doc = Rtf15Reader.read(rtf)
+            text = []
+            traverse_text(doc, lambda x: text.append(x))
+            self.assertIn('Start', text)
+            self.assertIn('1', text)
+            self.assertIn('1.1', text)
+
+
 def traverse_text(element, function):
     if element.__class__ == pyth.document.Text:
         map(function, element.content)

From ae8c81352136b19373c6f7bf900982a50068cae1 Mon Sep 17 00:00:00 2001
From: Kris Powell <krispowell@avocacapital.com>
Date: Thu, 3 Dec 2015 18:28:14 +0000
Subject: [PATCH 05/14] Add forgotten rtf example for test

---
 tests/rtfs/list-bug.rtf | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 tests/rtfs/list-bug.rtf

diff --git a/tests/rtfs/list-bug.rtf b/tests/rtfs/list-bug.rtf
new file mode 100644
index 0000000..8598069
--- /dev/null
+++ b/tests/rtfs/list-bug.rtf
@@ -0,0 +1,10 @@
+{\rtf1\ansi\deff1
+{\fonttbl{\f0\fswiss Calibri;}{\f1\froman Times New Roman;}{\f2\fnil\fprq0\fcharset128 Symbol;}}
+{\colortbl;\red0\green0\blue0;\red0\green0\blue255;}
+{\stylesheet{\s1 List Paragraph;}}
+{\*\listtable{\list\listid1\listtemplateid1{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}}}
+{\listoverridetable{\listoverride\listid1\listoverridecount0\ls0}}
+\sa150{Start}\par\pard
+\ilvl0\ls0\li720\s1\sa50{1}\par\pard
+\ilvl0\ls0\li720\s1\ilvl1\ls0\li1440\s1\sa50{1.1}\par\pard
+}
\ No newline at end of file

From bc3a87418bf54e1231e15f91deabe8759534f3a4 Mon Sep 17 00:00:00 2001
From: Kris Powell <krispowell@avocacapital.com>
Date: Fri, 4 Dec 2015 08:31:28 +0000
Subject: [PATCH 06/14] Add underline to XHTMLReader

Also confirmed round trip from XHTML to RTF with tests:
 - Checking that RTF reads underlining markup into Document
 - Checking that RTF writes underline formatting
 - Checking that XHTML reads u tags or css underline styling into Document
 - Checking that XHTML writes u tags
---
 pyth/plugins/xhtml/css.py      |  3 +++
 pyth/plugins/xhtml/reader.py   |  6 ++++++
 tests/rtfs/text-attributes.rtf |  8 ++++++++
 tests/test_readrtf15.py        | 10 ++++++++++
 tests/test_readxhtml.py        | 18 ++++++++++++++++++
 tests/test_writertf15.py       |  9 +++++++++
 tests/test_writexhtml.py       |  9 +++++++++
 7 files changed, 63 insertions(+)
 create mode 100644 tests/rtfs/text-attributes.rtf

diff --git a/pyth/plugins/xhtml/css.py b/pyth/plugins/xhtml/css.py
index e2fe5be..e1e44eb 100644
--- a/pyth/plugins/xhtml/css.py
+++ b/pyth/plugins/xhtml/css.py
@@ -135,3 +135,6 @@ def is_super(self, node):
         properties = self.get_properties(node)
         return properties.get('vertical-align') == 'super'
 
+    def is_underline(self, node):
+        properties = self.get_properties(node)
+        return properties.get('text-decoration') == 'underline'
diff --git a/pyth/plugins/xhtml/reader.py b/pyth/plugins/xhtml/reader.py
index 8e9710f..2889251 100644
--- a/pyth/plugins/xhtml/reader.py
+++ b/pyth/plugins/xhtml/reader.py
@@ -85,6 +85,10 @@ def is_italic(self, node):
         return (node.findParent(['em', 'i']) is not None
                 or self.css.is_italic(node))
 
+    def is_underline(self, node):
+        return (node.findParent(['u']) is not None or
+                self.css.is_underline(node))
+
     def is_sub(self, node):
         """
         Return true if the BeautifulSoup node needs to be rendered as
@@ -145,6 +149,8 @@ def process_text(self, node):
             properties['bold'] = True
         if self.is_italic(node):
             properties['italic'] = True
+        if self.is_underline(node):
+            properties['underline'] = True
         if self.url(node):
             properties['url'] = self.url(node)
         if self.is_sub(node):
diff --git a/tests/rtfs/text-attributes.rtf b/tests/rtfs/text-attributes.rtf
new file mode 100644
index 0000000..ae2a60a
--- /dev/null
+++ b/tests/rtfs/text-attributes.rtf
@@ -0,0 +1,8 @@
+{\rtf1\ansi\deff1
+{\fonttbl{\f0\fswiss Calibri;}{\f1\froman Times New Roman;}{\f2\fnil\fprq0\fcharset128 Symbol;}}
+{\colortbl;\red0\green0\blue0;\red0\green0\blue255;}
+{\stylesheet{\s1 List Paragraph;}}
+{\*\listtable{\list\listid1\listtemplateid1{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}}}
+{\listoverridetable{\listoverride\listid1\listoverridecount0\ls0}}
+\sa150{\ul Underlined\ul0 }\par\pard
+}
\ No newline at end of file
diff --git a/tests/test_readrtf15.py b/tests/test_readrtf15.py
index 7bef397..c8b6807 100644
--- a/tests/test_readrtf15.py
+++ b/tests/test_readrtf15.py
@@ -68,6 +68,16 @@ def test_when_last_item_sublist_item(self):
             self.assertIn('1.1', text)
 
 
+class TestTextProperties(unittest.TestCase):
+
+    def test_reads_underline(self):
+        text = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "text-attributes.rtf")
+        with open(text, 'rb') as rtf:
+            doc = Rtf15Reader.read(rtf)
+            underlined = doc.content[0].content[0]
+            self.assertTrue(underlined['underline'])
+
+
 def traverse_text(element, function):
     if element.__class__ == pyth.document.Text:
         map(function, element.content)
diff --git a/tests/test_readxhtml.py b/tests/test_readxhtml.py
index 9e7bb90..9814fd9 100644
--- a/tests/test_readxhtml.py
+++ b/tests/test_readxhtml.py
@@ -53,6 +53,24 @@ def test_italic(self):
         text = doc.content[0].content[0]
         assert text['italic']
 
+    def test_underline(self):
+        """
+        Try to read a paragraph containing underline
+        """
+        xhtml = "<div><p><u>sub</u></p></div>"
+        doc = XHTMLReader.read(xhtml)
+        text = doc.content[0].content[0]
+        assert text['underline']
+
+    def test_underline_styling(self):
+        """
+        Try to read a paragraph containing underline via CSS
+        """
+        xhtml = '<div><p style="text-decoration: underline;">underline</p></div>'
+        doc = XHTMLReader.read(xhtml)
+        text = doc.content[0].content[0]
+        assert text['underline']
+
     def test_sub(self):
         """
         Try to read a paragraph containing subscript
diff --git a/tests/test_writertf15.py b/tests/test_writertf15.py
index c09c33d..d44e1c0 100644
--- a/tests/test_writertf15.py
+++ b/tests/test_writertf15.py
@@ -2,6 +2,7 @@
 import unittest
 from pyth.plugins.xhtml.reader import XHTMLReader
 from pyth.plugins.rtf15.writer import Rtf15Writer
+from pyth.document import Document, Paragraph, Text
 
 class TestRtfWithImage(unittest.TestCase):
 
@@ -15,5 +16,13 @@ def test_inline_png(self):
             self.assertIn('pichgoal750\\', doc)
 
 
+    def test_underline_output(self):
+        text = Text(content=[u'Underlined'], properties={'underline': True})
+        para = Paragraph(content=[text])
+        doc = Document(content=[para])
+        result = Rtf15Writer.write(doc).getvalue()
+        self.assertIn('\\ul Underlined\\ul0', result)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_writexhtml.py b/tests/test_writexhtml.py
index f07418c..cc12f40 100644
--- a/tests/test_writexhtml.py
+++ b/tests/test_writexhtml.py
@@ -2,6 +2,7 @@
 import unittest
 from pyth.plugins.rtf15.reader import Rtf15Reader
 from pyth.plugins.xhtml.writer import XHTMLWriter
+from pyth.document import Document, Paragraph, Text
 
 class TestHtmlWithImage(unittest.TestCase):
 
@@ -15,5 +16,13 @@ def test_inline_png(self):
             self.assertIn('height:50px', doc)
 
 
+    def test_underline(self):
+        text = Text(content=[u'Underlined'], properties={'underline': True})
+        para = Paragraph(content=[text])
+        doc = Document(content=[para])
+        result = XHTMLWriter.write(doc).getvalue()
+        self.assertIn('<u>Underlined</u>', result)
+
+
 if __name__ == '__main__':
     unittest.main()

From 5db24d515c8633be2e47880d92fdaf773247cbe4 Mon Sep 17 00:00:00 2001
From: Kris Powell <krispowell@avocacapital.com>
Date: Fri, 4 Dec 2015 09:27:42 +0000
Subject: [PATCH 07/14] Use sub/sup xhtml tags for sub/superscript text

As per http://www.w3.org/TR/xhtml1/dtds.html#dtdentry_xhtml1-strict.dtd_sub
this is the recommended way.
---
 pyth/plugins/xhtml/writer.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/pyth/plugins/xhtml/writer.py b/pyth/plugins/xhtml/writer.py
index eb0879d..d0ffb40 100644
--- a/pyth/plugins/xhtml/writer.py
+++ b/pyth/plugins/xhtml/writer.py
@@ -15,6 +15,8 @@
     'bold': 'strong',
     'italic': 'em',
     'underline': 'u', # ?
+    'super': 'sup',
+    'sub': 'sub',
 }
 
 
@@ -112,20 +114,12 @@ def _text(self, text):
 
         current = tag
 
-        for prop in ('bold', 'italic', 'underline'):
+        for prop in ('bold', 'italic', 'underline', 'sub', 'super'):
             if prop in text.properties:
                 newTag = Tag(_tagNames[prop])
                 current.content.append(newTag)
                 current = newTag
 
-        for prop in ('sub', 'super'):
-            if prop in text.properties:
-                if current.tag is None:
-                    newTag = Tag("span")
-                    current.content.append(newTag)
-                    current = newTag
-                current.attrs['style'] = "vertical-align: %s; font-size: smaller" % prop
-
         current.content.append(u"".join(text.content))
 
         return tag

From 4171d2997eb3797d5264f4d53d97e452def0a3bc Mon Sep 17 00:00:00 2001
From: Kris Powell <krispowell@avocacapital.com>
Date: Fri, 4 Dec 2015 15:50:37 +0000
Subject: [PATCH 08/14] Treat xhtml ol as ul

For now it's better to parse html ordered lists as unordered lists rather than creating invalid document structures that crash
parsing. (ListItems end up right under Paras because ol is ignored.)
---
 pyth/plugins/xhtml/reader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyth/plugins/xhtml/reader.py b/pyth/plugins/xhtml/reader.py
index 2889251..312845e 100644
--- a/pyth/plugins/xhtml/reader.py
+++ b/pyth/plugins/xhtml/reader.py
@@ -177,7 +177,7 @@ def process_into(self, node, obj):
             new_obj = document.Paragraph()
             obj.append(new_obj)
             obj = new_obj
-        elif node.name == 'ul':
+        elif node.name in ('ul', 'ol'):
             # add a new list
             new_obj = document.List()
             obj.append(new_obj)

From 496657306d9c2b7a491ae105377f93714dab2585 Mon Sep 17 00:00:00 2001
From: Kris Powell <krispowell@avocacapital.com>
Date: Thu, 31 Dec 2015 10:37:22 +0000
Subject: [PATCH 09/14] Fix for RTF documents that open with a list

Found plenty of examples of these in the wild. This fix adds a para up front but doesn't add it to the list stack,
so we also hold off on the last pop of the list stack when unwinding lists, because there is no final holding paragraph.
---
 pyth/plugins/rtf15/reader.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/pyth/plugins/rtf15/reader.py b/pyth/plugins/rtf15/reader.py
index c9fe4fd..a8da8df 100644
--- a/pyth/plugins/rtf15/reader.py
+++ b/pyth/plugins/rtf15/reader.py
@@ -320,7 +320,8 @@ def handle_Para(self, para):
             times = prevListLevel + 1
             if self.listLevel is not None:
                 times = times - (self.listLevel + 1)
-            for _ in xrange(times):
+            depth = len(self.listStack) - 1
+            for _ in xrange(min(times, depth)):
                 l = self.listStack.pop()
                 self.listStack[-1].append(l)
 
@@ -606,12 +607,12 @@ def handle_strike(self, onOff=None):
 
 
     def handle_ilvl(self, level):
-        if self.currentParaTag is not None:
-            self.currentParaTag.listLevel = int(level)
-        else:
-            # Well, now we're in trouble. But I'm pretty sure this
-            # isn't supposed to happen anyway.
-            pass
+        if self.currentParaTag is None:
+            # this can happen where documents open straight with lists rather than a containing Para..
+            p = Para()
+            self.content.append(p)
+            self.currentParaTag = p
+        self.currentParaTag.listLevel = int(level)
 
 
     def handle_up(self, amount):

From b8ab9c320a9425d43ad86fd73253e365d89ae11d Mon Sep 17 00:00:00 2001
From: Kris Powell <krispowell@avocacapital.com>
Date: Thu, 31 Dec 2015 12:18:15 +0000
Subject: [PATCH 10/14] Better sublist handling in xhtml writer

Previously, sublists were always added to their own li element, but this renders as double bullets in HTML:

 * Top level
 * - Sub list item

Now we add the nested ul directly to the prior non-list flow item (Top level para in example above), which gives
expected single-bullet nesting:

 * Top level
   - Sub list item
---
 pyth/plugins/xhtml/writer.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/pyth/plugins/xhtml/writer.py b/pyth/plugins/xhtml/writer.py
index d0ffb40..b5c5e9f 100644
--- a/pyth/plugins/xhtml/writer.py
+++ b/pyth/plugins/xhtml/writer.py
@@ -93,12 +93,23 @@ def _list(self, lst):
         if self.cssClasses:
             ul.attrs['class'] = 'pyth_list_%s' % self.listLevel
 
+        last_li = None
         for entry in lst.content:
             li = Tag("li")
             for element in entry.content:
+                # in practice list elements always have only one content child?
                 handler = self.paragraphDispatch[element.__class__]
-                li.content.extend(handler(element))
-            ul.content.append(li)
+                if handler == self._list:
+                    # this is a sublist, so we shouldn't create an empty li, but rather append ul to prior li.
+                    # Lists can't be immediately sublisted (e.g. there must be at least something at outer level)
+                    # but if that is not the case the last_li will be None and next line will bomb out, which is a
+                    # useful implicit assertion
+                    last_li.content.extend(handler(element))
+                else:
+                    li.content.extend(handler(element))
+                    last_li = li
+            if li.content:  # li might be empty..
+                ul.content.append(li)
 
         self.listLevel -= 1
 

From 8116549efaa76ff0077e5bbabc75ff512b25141f Mon Sep 17 00:00:00 2001
From: Kris Powell <krispowell@avocacapital.com>
Date: Wed, 27 Jan 2016 13:45:16 +0000
Subject: [PATCH 11/14] Workaround for non-utf chars embedded in charBuffer

---
 pyth/plugins/rtf15/reader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyth/plugins/rtf15/reader.py b/pyth/plugins/rtf15/reader.py
index a8da8df..5e8aca0 100644
--- a/pyth/plugins/rtf15/reader.py
+++ b/pyth/plugins/rtf15/reader.py
@@ -391,7 +391,7 @@ def __init__(self, reader, parent=None, charsetTable=None):
 
 
     def flushChars(self):
-        chars = "".join(self.charBuffer).decode(self.charset, self.reader.errors)
+        chars = u"".join(c.decode(self.charset, self.reader.errors) for c in self.charBuffer)
         self.content.append(chars)
         self.charBuffer = []
 

From 5abf0e1ab1906d373285bb95a38d2586e856061c Mon Sep 17 00:00:00 2001
From: Kris Powell <krispowell@avocacapital.com>
Date: Tue, 9 Feb 2016 16:17:51 +0000
Subject: [PATCH 12/14] Encode control chars {} and \ when writing RTF

Currently these characters get written out verbatim to the RTF stream, rendering the result invalid.
Instead they should be escaped with a leading backslash.
---
 pyth/plugins/rtf15/writer.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pyth/plugins/rtf15/writer.py b/pyth/plugins/rtf15/writer.py
index f7bc85a..b33b153 100644
--- a/pyth/plugins/rtf15/writer.py
+++ b/pyth/plugins/rtf15/writer.py
@@ -257,7 +257,10 @@ def _text(self, text):
                 if unichar == '\n':
                     self.target.write(r'\line ')
                     continue
-
+                # Escape control characters
+                if unichar in '\\{}':
+                    self.target.write(r'\%s' % unichar)
+                    continue
                 point = ord(unichar)
                 if point < 128:
                     self.target.write(str(unichar))

From fd95cbfd504f09d6558625f9a4a6f353efff0764 Mon Sep 17 00:00:00 2001
From: Kris Powell <krispowell@avocacapital.com>
Date: Thu, 3 Mar 2016 18:47:37 +0000
Subject: [PATCH 13/14] Don't 'double escape' html entities when writing

If HTML entities were escaped when converting from HTML to whatever other format,
don't escape the ampersands in them again on the way out from whatever format back
to HTML.
---
 pyth/plugins/xhtml/writer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyth/plugins/xhtml/writer.py b/pyth/plugins/xhtml/writer.py
index b5c5e9f..38e907b 100644
--- a/pyth/plugins/xhtml/writer.py
+++ b/pyth/plugins/xhtml/writer.py
@@ -7,6 +7,7 @@
 from pyth import document
 from pyth.format import PythWriter
 import base64
+import re
 
 from cStringIO import StringIO
 
@@ -198,8 +199,8 @@ def __repr__(self):
 
 
 def quoteText(text):
-    return text.replace(
-        u"&", u"&amp;").replace(
+    return re.sub(
+        u'&(?!(amp|lt|gt);)', u'&amp;', text, flags=re.IGNORECASE).replace(
         u"<", u"&lt;").replace(
         u">", u"&gt;")
 

From 79d0dd195c1171fdcf539161d29c3eeba9831212 Mon Sep 17 00:00:00 2001
From: Kris Powell <kris.powell@kkr.com>
Date: Tue, 13 Aug 2019 12:53:06 +0100
Subject: [PATCH 14/14] Ignore images when writing to plaintext

---
 pyth/plugins/plaintext/writer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyth/plugins/plaintext/writer.py b/pyth/plugins/plaintext/writer.py
index 9dd8bfd..a00dc25 100644
--- a/pyth/plugins/plaintext/writer.py
+++ b/pyth/plugins/plaintext/writer.py
@@ -46,7 +46,8 @@ def go(self):
     def paragraph(self, paragraph, prefix=""):
         content = []
         for text in paragraph.content:
-            content.append(u"".join(text.content))
+            if text.__class__ != document.Image:
+                content.append(u"".join(text.content))
         content = u"".join(content).encode("utf-8")
             
         for line in content.split("\n"):