brendonh · kippr · Nov 6, 2015 · Nov 13, 2015 · Nov 27, 2015 · Dec 3, 2015
diff --git a/examples/writing/htmlToRtf15.py b/examples/writing/htmlToRtf15.py
@@ -0,0 +1,12 @@
+from pyth.plugins.xhtml.reader import XHTMLReader
+from pyth.plugins.rtf15.writer import Rtf15Writer
+import sys
+
+# Convert an XHTML file (argv[1], else the default sample path) to RTF on stdout.
+filename = sys.argv[1] if len(sys.argv) > 1 else "tests/html/sample-with-image.html"
+
+# Use a with-block so the input file is closed once read() returns.
+with open(filename, "rb") as source:
+    doc = XHTMLReader.read(source)
+
+print Rtf15Writer.write(doc).getvalue()
diff --git a/examples/writing/rtf15ToXhtml.py b/examples/writing/rtf15ToXhtml.py
@@ -0,0 +1,12 @@
+from pyth.plugins.xhtml.writer  import XHTMLWriter
+from pyth.plugins.rtf15.reader import Rtf15Reader
+import sys
+
+# Convert an RTF file (argv[1], else the default sample path) to XHTML on stdout.
+filename = sys.argv[1] if len(sys.argv) > 1 else "tests/rtfs/sample-with-image.rtf"
+
+# Use a with-block so the input file is closed once read() returns.
+with open(filename, "rb") as source:
+    doc = Rtf15Reader.read(source)
+
+print XHTMLWriter.write(doc).getvalue()
diff --git a/pyth/document.py b/pyth/document.py
@@ -7,7 +7,7 @@ class _PythBase(object):
     def __init__(self, properties={}, content=[]):
         self.properties = {}
         self.content = []
-        
+
         for (k,v) in properties.iteritems():
             self[k] = v
 
@@ -33,7 +33,7 @@ def append(self, item):
 
         If the item is of the wrong type, and if this element has a sub-type,
         then try to create such a sub-type and insert the item into that, instead.
-        
+
         This happens recursively, so (in python-markup):
           L [ u'Foo' ]
         actually creates:
@@ -51,7 +51,7 @@ def append(self, item):
                     okay = False
             else:
                 okay = False
-                
+
         if not okay:
             raise TypeError("Wrong content type for %s: %s (%s)" % (
                 self.__class__.__name__, repr(type(item)), repr(item)))
@@ -94,10 +94,10 @@ class Image(Paragraph):
     """
     An image is stored in bytes. All properties of images from the rtf definition are allowed.
     """
-    
-    validProperties = ('emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', 'dibitmap', 
-                       'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', 'picw', 'pich', 'picwgoal', 
-                       'pichgoal', 'picscalex', 'picscaley', 'picscaled', 'piccropt', 'piccropb', 'piccropr', 
+
+    validProperties = ('emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', 'dibitmap',
+                       'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', 'picw', 'pich', 'picwgoal',
+                       'pichgoal', 'picscalex', 'picscaley', 'picscaled', 'piccropt', 'piccropb', 'piccropr',
                        'piccropl', 'picbmp', 'picbpp', 'bin', 'blipupi', 'blipuid', 'bliptag', 'wbitmap')
     contentType = bytes
 
@@ -122,14 +122,14 @@ class List(Paragraph):
 
     validProperties = ()
     contentType = ListEntry
-    
+
 
 
 class Document(_PythBase):
     """
     Top-level item. One document is exactly one file.
     Documents consist of a list of paragraphs.
     """
-    
+
     validProperties = ('title', 'subject', 'author')
     contentType = Paragraph
diff --git a/pyth/plugins/plaintext/writer.py b/pyth/plugins/plaintext/writer.py
@@ -46,7 +46,8 @@ def go(self):
     def paragraph(self, paragraph, prefix=""):
         content = []
         for text in paragraph.content:
-            content.append(u"".join(text.content))
+            if text.__class__ != document.Image:
+                content.append(u"".join(text.content))
         content = u"".join(content).encode("utf-8")
 
         for line in content.split("\n"):

diff --git a/pyth/plugins/rtf15/reader.py b/pyth/plugins/rtf15/reader.py
@@ -58,9 +58,9 @@
 
 # All the ones named by number in my 2.6 encodings dir, and those listed above
 _CODEPAGES_BY_NUMBER = dict(
-    (x, "cp%s" % x) for x in (37, 424, 437, 500, 737, 775, 850, 852, 855, 856, 
+    (x, "cp%s" % x) for x in (37, 424, 437, 500, 737, 775, 850, 852, 855, 856,
                               857, 860, 861, 862, 863, 864, 865, 866, 869, 874,
-                              875, 932, 936, 949, 950, 1006, 1026, 1140, 1250, 
+                              875, 932, 936, 949, 950, 1006, 1026, 1140, 1250,
                               1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361))
 
 # Miscellaneous, incomplete
@@ -157,7 +157,7 @@ def getControl(self):
             if not next:
                 break
 
-            if first and next in '\\{}':
+            if first and next in '\\{}~_-':
                 chars.extend("control_symbol")
                 digits.append(next)
                 break
@@ -224,7 +224,7 @@ def __init__(self, doc, clean_paragraphs=True):
     def flushRun(self):
         if self.block is None:
             self.block = document.Paragraph()
-        
+
         if self.isImage:
             self.block.content.append(
                 document.Image(self.propStack[-1].copy(),
@@ -317,11 +317,16 @@ def handle_Para(self, para):
             self.listStack.append(l)
 
         elif self.listLevel < prevListLevel:
-            l = self.listStack.pop()
-            self.listStack[-1].append(l)
+            times = prevListLevel + 1
+            if self.listLevel is not None:
+                times = times - (self.listLevel + 1)
+            depth = len(self.listStack) - 1
+            for _ in xrange(min(times, depth)):
+                l = self.listStack.pop()
+                self.listStack[-1].append(l)
 
         self.block = None
-    
+
     def handle_Pict(self, pict):
         self.flushRun()
         self.isImage = True
@@ -354,7 +359,7 @@ def handle_ImageMarker(self, marker):
                 del self.propStack[-1][marker.name]
             else:
                 self.propStack[-1][marker.name] = True
-    
+
 
 
 class Group(object):
@@ -386,7 +391,7 @@ def __init__(self, reader, parent=None, charsetTable=None):
 
 
     def flushChars(self):
-        chars = "".join(self.charBuffer).decode(self.charset, self.reader.errors)
+        chars = u"".join(c.decode(self.charset, self.reader.errors) for c in self.charBuffer)
         self.content.append(chars)
         self.charBuffer = []
 
@@ -398,11 +403,11 @@ def handle(self, control, digits):
         if control == '*':
             self.destination = True
             return
-        
-        if self.image and control in ['emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', 
-                                      'dibitmap', 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', 
-                                      'picw', 'pich', 'picwgoal', 'pichgoal', 'picscalex', 'picscaley', 
-                                      'picscaled', 'piccropt', 'piccropb', 'piccropr', 'piccropl', 'picbmp', 
+
+        if self.image and control in ['emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile',
+                                      'dibitmap', 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes',
+                                      'picw', 'pich', 'picwgoal', 'pichgoal', 'picscalex', 'picscaley',
+                                      'picscaled', 'piccropt', 'piccropb', 'piccropr', 'piccropl', 'picbmp',
                                       'picbpp', 'bin', 'blipupi', 'blipuid', 'bliptag', 'wbitmap']:
             self.content.append(ImageMarker(control, digits))
             return
@@ -602,12 +607,12 @@ def handle_strike(self, onOff=None):
 
 
     def handle_ilvl(self, level):
-        if self.currentParaTag is not None:
-            self.currentParaTag.listLevel = level
-        else:
-            # Well, now we're in trouble. But I'm pretty sure this
-            # isn't supposed to happen anyway.
-            pass
+        para = self.currentParaTag
+        if para is None:
+            # No enclosing Para yet (document opens directly with a list): make one.
+            para = self.currentParaTag = Para()
+            self.content.append(para)
+        para.listLevel = int(level)
 
 
     def handle_up(self, amount):
@@ -650,15 +655,15 @@ def handle_tab(self):
 
     def handle_trowd(self):
         self.content.append(u'\n')
-        
+
     #Handle the image tag
     def handle_pict(self):
         p = Pict()
         self.content.append(p)
         self.image = p
         #Remove the destination control group of the parent, so that the image is preserved
         self.parent.destination = False
-    
+
     def handle_field(self):
         def finalize():
             if len(self.content) != 2:
@@ -745,7 +750,7 @@ def __init__(self):
 
     def __repr__(self):
         return "!Image!"
-            
+
 class Para(ReadableMarker):
     listLevel = None
 

diff --git a/pyth/plugins/rtf15/writer.py b/pyth/plugins/rtf15/writer.py
@@ -4,6 +4,7 @@
 http://www.biblioscape.com/rtf15_spec.htm
 """
 
+import binascii
 from pyth import document
 from pyth.format import PythWriter
 
@@ -55,12 +56,16 @@ def __init__(self, doc, target, family):
             document.List: self._list,
             document.Paragraph: self._paragraph
         }
+        self.paragraphContentDispatch = {
+            document.Text: self._text,
+            document.Image: self._image,
+        }
 
 
     def go(self):
         self.listLevel = -1
         self.addSpacing = None
-        
+
         self.target.write('{')
         self._writeHeader()
         self._writeDocument()
@@ -105,7 +110,7 @@ def _getFontTable(self):
         # We need Symbol for list bullets
         output.append(r'{\f%d\fnil\fprq0\fcharset128 Symbol;}' % (i+1))
         self.symbolFontNumber = i+1
-        
+
         output.append('}')
         return "".join(output)
 
@@ -138,7 +143,7 @@ def _getListTable(self):
 
         output.append('}}')
         return "".join(output)
-    
+
 
     def _getListOverrides(self):
         # I have no idea what the point is of this,
@@ -153,7 +158,7 @@ def _getRevTable(self):
 
     # -----------------------------------------------
     # Document section
-    
+
 
     def _writeDocument(self):
 
@@ -193,14 +198,15 @@ def _paragraph(self, paragraph, spacing=PARAGRAPH_SPACING):
         if self.addSpacing is not None:
             self.target.write(r'\sb%d' % self.addSpacing)
             self.addSpacing = None
-        
+
         # Space after the paragraph,
         # expressed in units of god-knows-what
         self.target.write(r'\sa%d{' % spacing)
-
-        for text in paragraph.content:
-            self._text(text)
-
+
+        for item in paragraph.content:
+            handler = self.paragraphContentDispatch[item.__class__]
+            handler(item)
+
         self.target.write(r'}\par\pard' '\n')
 
 
@@ -241,23 +247,26 @@ def _text(self, text):
         for prop in text.properties:
             if prop in _styleFlags:
                 props.append(_styleFlags[prop])
-        
+
         if props:
             self.target.write("".join(props) + " ")
 
-        
-        for run in text.content:                    
+
+        for run in text.content:
             for unichar in run:
                 if unichar == '\n':
                     self.target.write(r'\line ')
                     continue
-
+                # Escape control characters
+                if unichar in '\\{}':
+                    self.target.write(r'\%s' % unichar)
+                    continue
                 point = ord(unichar)
                 if point < 128:
                     self.target.write(str(unichar))
                 else:
                     self.target.write(r'\u%d?' % point)
-            
+
         if props:
             self.target.write("".join("%s0" % p for p in props) + " ")
 
@@ -266,3 +275,21 @@ def _text(self, text):
 
         if 'url' in text.properties:
             self.target.write('}}')
+
+    def _image(self, image):
+        """Write ``image`` as an INCLUDEPICTURE field whose result is a hex-encoded pict group."""
+        self.target.write(r'{\field{\*\fldinst{\f0\fs20\cf0 INCLUDEPICTURE  "cid:[email protected]" \\* MERGEFORMATINET }}{\fldrslt{\*\shppict{\pict')
+        # Only the literal True marks a bare flag; any other value (str or int) is appended.
+        for prop, val in image.properties.iteritems():
+            self.target.write('\\%s%s' % (prop, '' if val is True else val))
+        self.target.write(' \n')
+        for line in chunk(binascii.hexlify(image.content[0])):
+            self.target.write(line + '\n')
+        self.target.write(r'}}}}')
+
+def chunk(data, size=200):
+    """Yield consecutive slices of ``data`` of at most ``size`` items; ``size`` must be positive."""
+    if size <= 0:  # a non-positive stride would never advance past the end of ``data``
+        raise ValueError("chunk size must be positive, got %r" % (size,))
+    for start in range(0, len(data), size):
+        yield data[start:start + size]
diff --git a/pyth/plugins/xhtml/css.py b/pyth/plugins/xhtml/css.py
@@ -135,3 +135,6 @@ def is_super(self, node):
         properties = self.get_properties(node)
         return properties.get('vertical-align') == 'super'
 
+    def is_underline(self, node):
+        """Return True when the properties found for ``node`` set text-decoration to underline."""
+        return self.get_properties(node).get('text-decoration') == 'underline'