From 4431512dca81043812aecab7e0a0f59f0290e6c6 Mon Sep 17 00:00:00 2001 From: Kris Powell Date: Fri, 6 Nov 2015 10:39:37 +0000 Subject: [PATCH 01/14] Add xhtml inline png image support Images are already present in the Document model when read from RTF. This change converts any PNG images found into tags with inline base64 encoded data elements. Other images (for example WMF alternatives and jpegs) are ignored. Previous behaviour was to write out the hex-encoded image string. --- pyth/plugins/xhtml/writer.py | 40 +++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/pyth/plugins/xhtml/writer.py b/pyth/plugins/xhtml/writer.py index 37bca07..3e40c29 100644 --- a/pyth/plugins/xhtml/writer.py +++ b/pyth/plugins/xhtml/writer.py @@ -6,6 +6,7 @@ from pyth import document from pyth.format import PythWriter +import base64 from cStringIO import StringIO @@ -50,26 +51,31 @@ def __init__(self, doc, target, cssClasses=True, pretty=False): document.List: self._list, document.Paragraph: self._paragraph } - + self.paragraphContentDispatch = { + document.Text: self._text, + document.Image: self._image, + } + def go(self): self.listLevel = -1 - + tag = Tag("div") - + for element in self.document.content: handler = self.paragraphDispatch[element.__class__] tag.content.extend(handler(element)) tag.render(self.target) return self.target - + def _paragraph(self, paragraph): p = Tag("p") - for text in paragraph.content: - p.content.append(self._text(text)) + for item in paragraph.content: + handler = self.paragraphContentDispatch[item.__class__] + p.content.append(handler(item)) if self.pretty: return [_prettyBreak, p, _prettyBreak] @@ -79,12 +85,12 @@ def _paragraph(self, paragraph): def _list(self, lst): self.listLevel += 1 - + ul = Tag("ul") if self.cssClasses: ul.attrs['class'] = 'pyth_list_%s' % self.listLevel - + for entry in lst.content: li = Tag("li") for element in entry.content: @@ -93,7 +99,7 @@ def _list(self, lst): 
ul.content.append(li) self.listLevel -= 1 - + return [ul] @@ -124,13 +130,23 @@ def _text(self, text): return tag + def _image(self, image): + if image.properties.get(u'pngblip'): + tag = Tag("img") + image_data = bytearray.fromhex(image.content[0]) + base64_image = base64.b64encode(image_data) + tag.attrs['src'] = "data:image/png;base64,{}".format(base64_image) + tag.attrs['alt'] = 'Inline image' + return tag + else: + return Tag(None) _prettyBreak = object() class Tag(object): - + def __init__(self, tag, attrs=None, content=None): self.tag = tag self.attrs = attrs or {} @@ -155,13 +171,13 @@ def render(self, target): if self.tag is not None: target.write('' % self.tag) - + def attrString(self): return " ".join( '%s="%s"' % (k, quoteAttr(v)) for (k, v) in self.attrs.iteritems()) - + def __repr__(self): return "T(%s)[%s]" % (self.tag, repr(self.content)) From a1f08cc8e49afd28b6dfad624d30fecccf95d745 Mon Sep 17 00:00:00 2001 From: Kris Powell Date: Fri, 13 Nov 2015 09:03:12 +0000 Subject: [PATCH 02/14] Add image support from html to rtf as well Prior commit took images being read in RTF and added them as inline png images to XHTML. This completes the reverse: inline png images in XHTML are read into the Document model, and writing those Documents to RTF will now include the inline image. Width/ height attributes are also transformed, assuming a standard conversion of 15 twips per pixel. Only PNG images are supported. 
--- examples/writing/htmlToRtf15.py | 12 +++++++ examples/writing/rtf15ToXhtml.py | 12 +++++++ pyth/document.py | 18 +++++----- pyth/plugins/rtf15/reader.py | 26 +++++++------- pyth/plugins/rtf15/writer.py | 50 +++++++++++++++++++------- pyth/plugins/xhtml/reader.py | 49 ++++++++++++++++++++++++++ pyth/plugins/xhtml/writer.py | 19 +++++++++- tests/html/sample-with-image.html | 3 ++ tests/rtfs/sample-with-image.rtf | 58 +++++++++++++++++++++++++++++++ tests/test_readrtf15.py | 11 ++++++ tests/test_readxhtml.py | 16 +++++++++ tests/test_writertf15.py | 19 ++++++++++ tests/test_writexhtml.py | 19 ++++++++++ 13 files changed, 276 insertions(+), 36 deletions(-) create mode 100644 examples/writing/htmlToRtf15.py create mode 100644 examples/writing/rtf15ToXhtml.py create mode 100644 tests/html/sample-with-image.html create mode 100644 tests/rtfs/sample-with-image.rtf create mode 100644 tests/test_writertf15.py create mode 100644 tests/test_writexhtml.py diff --git a/examples/writing/htmlToRtf15.py b/examples/writing/htmlToRtf15.py new file mode 100644 index 0000000..df7e8d5 --- /dev/null +++ b/examples/writing/htmlToRtf15.py @@ -0,0 +1,12 @@ +from pyth.plugins.xhtml.reader import XHTMLReader +from pyth.plugins.rtf15.writer import Rtf15Writer +import sys + +if len(sys.argv) > 1: + filename = sys.argv[1] +else: + filename = "tests/html/sample-with-image.html" +source = open(filename, "rb") +doc = XHTMLReader.read(source) + +print Rtf15Writer.write(doc).getvalue() diff --git a/examples/writing/rtf15ToXhtml.py b/examples/writing/rtf15ToXhtml.py new file mode 100644 index 0000000..d350e9d --- /dev/null +++ b/examples/writing/rtf15ToXhtml.py @@ -0,0 +1,12 @@ +from pyth.plugins.xhtml.writer import XHTMLWriter +from pyth.plugins.rtf15.reader import Rtf15Reader +import sys + +if len(sys.argv) > 1: + filename = sys.argv[1] +else: + filename = "tests/rtfs/sample-with-image.rtf" +source = open(filename, "rb") +doc = Rtf15Reader.read(source) + +print XHTMLWriter.write(doc).getvalue() 
diff --git a/pyth/document.py b/pyth/document.py index 864d519..d0cb6d8 100644 --- a/pyth/document.py +++ b/pyth/document.py @@ -7,7 +7,7 @@ class _PythBase(object): def __init__(self, properties={}, content=[]): self.properties = {} self.content = [] - + for (k,v) in properties.iteritems(): self[k] = v @@ -33,7 +33,7 @@ def append(self, item): If the item is of the wrong type, and if this element has a sub-type, then try to create such a sub-type and insert the item into that, instead. - + This happens recursively, so (in python-markup): L [ u'Foo' ] actually creates: @@ -51,7 +51,7 @@ def append(self, item): okay = False else: okay = False - + if not okay: raise TypeError("Wrong content type for %s: %s (%s)" % ( self.__class__.__name__, repr(type(item)), repr(item))) @@ -94,10 +94,10 @@ class Image(Paragraph): """ An image is stored in bytes. All properties of images from the rtf definition are allowed. """ - - validProperties = ('emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', 'dibitmap', - 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', 'picw', 'pich', 'picwgoal', - 'pichgoal', 'picscalex', 'picscaley', 'picscaled', 'piccropt', 'piccropb', 'piccropr', + + validProperties = ('emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', 'dibitmap', + 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', 'picw', 'pich', 'picwgoal', + 'pichgoal', 'picscalex', 'picscaley', 'picscaled', 'piccropt', 'piccropb', 'piccropr', 'piccropl', 'picbmp', 'picbpp', 'bin', 'blipupi', 'blipuid', 'bliptag', 'wbitmap') contentType = bytes @@ -122,7 +122,7 @@ class List(Paragraph): validProperties = () contentType = ListEntry - + class Document(_PythBase): @@ -130,6 +130,6 @@ class Document(_PythBase): Top-level item. One document is exactly one file. Documents consist of a list of paragraphs. 
""" - + validProperties = ('title', 'subject', 'author') contentType = Paragraph diff --git a/pyth/plugins/rtf15/reader.py b/pyth/plugins/rtf15/reader.py index 7a74162..f569afa 100644 --- a/pyth/plugins/rtf15/reader.py +++ b/pyth/plugins/rtf15/reader.py @@ -58,9 +58,9 @@ # All the ones named by number in my 2.6 encodings dir, and those listed above _CODEPAGES_BY_NUMBER = dict( - (x, "cp%s" % x) for x in (37, 424, 437, 500, 737, 775, 850, 852, 855, 856, + (x, "cp%s" % x) for x in (37, 424, 437, 500, 737, 775, 850, 852, 855, 856, 857, 860, 861, 862, 863, 864, 865, 866, 869, 874, - 875, 932, 936, 949, 950, 1006, 1026, 1140, 1250, + 875, 932, 936, 949, 950, 1006, 1026, 1140, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361)) # Miscellaneous, incomplete @@ -224,7 +224,7 @@ def __init__(self, doc, clean_paragraphs=True): def flushRun(self): if self.block is None: self.block = document.Paragraph() - + if self.isImage: self.block.content.append( document.Image(self.propStack[-1].copy(), @@ -321,7 +321,7 @@ def handle_Para(self, para): self.listStack[-1].append(l) self.block = None - + def handle_Pict(self, pict): self.flushRun() self.isImage = True @@ -354,7 +354,7 @@ def handle_ImageMarker(self, marker): del self.propStack[-1][marker.name] else: self.propStack[-1][marker.name] = True - + class Group(object): @@ -398,11 +398,11 @@ def handle(self, control, digits): if control == '*': self.destination = True return - - if self.image and control in ['emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', - 'dibitmap', 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', - 'picw', 'pich', 'picwgoal', 'pichgoal', 'picscalex', 'picscaley', - 'picscaled', 'piccropt', 'piccropb', 'piccropr', 'piccropl', 'picbmp', + + if self.image and control in ['emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', + 'dibitmap', 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', + 'picw', 'pich', 'picwgoal', 'pichgoal', 'picscalex', 
'picscaley', + 'picscaled', 'piccropt', 'piccropb', 'piccropr', 'piccropl', 'picbmp', 'picbpp', 'bin', 'blipupi', 'blipuid', 'bliptag', 'wbitmap']: self.content.append(ImageMarker(control, digits)) return @@ -650,7 +650,7 @@ def handle_tab(self): def handle_trowd(self): self.content.append(u'\n') - + #Handle the image tag def handle_pict(self): p = Pict() @@ -658,7 +658,7 @@ def handle_pict(self): self.image = p #Remove the destination control group of the parent, so that the image is preserved self.parent.destination = False - + def handle_field(self): def finalize(): if len(self.content) != 2: @@ -745,7 +745,7 @@ def __init__(self): def __repr__(self): return "!Image!" - + class Para(ReadableMarker): listLevel = None diff --git a/pyth/plugins/rtf15/writer.py b/pyth/plugins/rtf15/writer.py index bf00511..f7bc85a 100644 --- a/pyth/plugins/rtf15/writer.py +++ b/pyth/plugins/rtf15/writer.py @@ -4,6 +4,7 @@ http://www.biblioscape.com/rtf15_spec.htm """ +import binascii from pyth import document from pyth.format import PythWriter @@ -55,12 +56,16 @@ def __init__(self, doc, target, family): document.List: self._list, document.Paragraph: self._paragraph } + self.paragraphContentDispatch = { + document.Text: self._text, + document.Image: self._image, + } def go(self): self.listLevel = -1 self.addSpacing = None - + self.target.write('{') self._writeHeader() self._writeDocument() @@ -105,7 +110,7 @@ def _getFontTable(self): # We need Symbol for list bullets output.append(r'{\f%d\fnil\fprq0\fcharset128 Symbol;}' % (i+1)) self.symbolFontNumber = i+1 - + output.append('}') return "".join(output) @@ -138,7 +143,7 @@ def _getListTable(self): output.append('}}') return "".join(output) - + def _getListOverrides(self): # I have no idea what the point is of this, @@ -153,7 +158,7 @@ def _getRevTable(self): # ----------------------------------------------- # Document section - + def _writeDocument(self): @@ -193,14 +198,15 @@ def _paragraph(self, paragraph, 
spacing=PARAGRAPH_SPACING): if self.addSpacing is not None: self.target.write(r'\sb%d' % self.addSpacing) self.addSpacing = None - + # Space after the paragraph, # expressed in units of god-knows-what self.target.write(r'\sa%d{' % spacing) - - for text in paragraph.content: - self._text(text) - + + for item in paragraph.content: + handler = self.paragraphContentDispatch[item.__class__] + handler(item) + self.target.write(r'}\par\pard' '\n') @@ -241,12 +247,12 @@ def _text(self, text): for prop in text.properties: if prop in _styleFlags: props.append(_styleFlags[prop]) - + if props: self.target.write("".join(props) + " ") - - for run in text.content: + + for run in text.content: for unichar in run: if unichar == '\n': self.target.write(r'\line ') @@ -257,7 +263,7 @@ def _text(self, text): self.target.write(str(unichar)) else: self.target.write(r'\u%d?' % point) - + if props: self.target.write("".join("%s0" % p for p in props) + " ") @@ -266,3 +272,21 @@ def _text(self, text): if 'url' in text.properties: self.target.write('}}') + + def _image(self, image): + self.target.write(r'{\field{\*\fldinst{\f0\fs20\cf0 INCLUDEPICTURE "cid:image001.png@01CDC656.1C7FFF50" \\* MERGEFORMATINET }}{\fldrslt{\*\shppict{\pict') + properties = "".join('\\' + prop + (val if val != True else '') for prop, val in image.properties.iteritems()) + self.target.write(properties) + self.target.write(' \n') + image_data = binascii.hexlify(image.content[0]) + for i in chunk(image_data): + self.target.write(i) + self.target.write('\n') + self.target.write(r'}}}}') + +def chunk(data, size=200): + length = len(data) + end = 0 + while length > end: + end = end + size + yield data[end-size:end] diff --git a/pyth/plugins/xhtml/reader.py b/pyth/plugins/xhtml/reader.py index 775bf58..8e9710f 100644 --- a/pyth/plugins/xhtml/reader.py +++ b/pyth/plugins/xhtml/reader.py @@ -2,6 +2,8 @@ Read documents from xhtml """ +import base64 + import BeautifulSoup from pyth import document @@ -9,6 +11,9 @@ from 
pyth.plugins.xhtml.css import CSS +BASE64_PNG_IMG_SRC = 'data:image/png;base64,' + + class XHTMLReader(PythReader): @classmethod @@ -110,6 +115,21 @@ def url(self, node): else: return self.link_callback(a_node.get('href')) + def dimensions(self, node): + """ + return (int(width), int(height)) in pixels if a node has these declared in px in a style attribute, else None for either + or both attributes + """ + try: + style = node['style'] + except KeyError: + return None, None + else: + declarations = self.css.parse_declarations(style) + width = _parse_px(declarations.get('width', None)) + height = _parse_px(declarations.get('height', None)) + return width, height + def process_text(self, node): """ Return a pyth Text object from a BeautifulSoup node or None if @@ -161,5 +181,34 @@ def process_into(self, node, obj): new_obj = document.ListEntry() obj.append(new_obj) obj = new_obj + elif node.name == 'img': + if node.get('src', '').startswith(BASE64_PNG_IMG_SRC): + base64_data = node['src'][len(BASE64_PNG_IMG_SRC):] + new_obj = document.Image() + new_obj.append(base64.b64decode(base64_data)) + new_obj['pngblip'] = True + width, height = self.dimensions(node) + if height: + height = unicode(_px_to_twips(height)) + new_obj['pich'] = height + new_obj['pichgoal'] = height + if width: + width = unicode(_px_to_twips(width)) + new_obj['picw'] = width + new_obj['picwgoal'] = width + new_obj['picscalex'] = '100' + new_obj['picscaley'] = '100' + + obj.content.append(new_obj) + return # img is not allowed to have children as per DTD for child in node: self.process_into(child, obj) + + +def _parse_px(node): + if node and node.lower().endswith('px'): + return int(node[:-2]) + + +def _px_to_twips(px): + return px * 15 diff --git a/pyth/plugins/xhtml/writer.py b/pyth/plugins/xhtml/writer.py index 3e40c29..eb0879d 100644 --- a/pyth/plugins/xhtml/writer.py +++ b/pyth/plugins/xhtml/writer.py @@ -136,7 +136,15 @@ def _image(self, image): image_data = bytearray.fromhex(image.content[0]) 
base64_image = base64.b64encode(image_data) tag.attrs['src'] = "data:image/png;base64,{}".format(base64_image) - tag.attrs['alt'] = 'Inline image' + height = image['pichgoal'] + width = image['picwgoal'] + if width or height: + styles = [] + styles.append(_twips_to_style_px('width', width)) + styles.append(_twips_to_style_px('height', height)) + style = ';'.join(s for s in styles if s) + if style: + tag.attrs['style'] = style return tag else: return Tag(None) @@ -195,3 +203,12 @@ def quoteAttr(text): return quoteText(text).replace( u'"', u""").replace( u"'", u"'") + + +def _twips_to_style_px(tag, twips): + try: + twips = int(twips) + except ValueError: + pass + px = int(round(twips / 15.0)) + return "{}:{}px".format(tag, px) diff --git a/tests/html/sample-with-image.html b/tests/html/sample-with-image.html new file mode 100644 index 0000000..f1eea0e --- /dev/null +++ b/tests/html/sample-with-image.html @@ -0,0 +1,3 @@ +

+ +

This is a pretty boring graphic...

diff --git a/tests/rtfs/sample-with-image.rtf b/tests/rtfs/sample-with-image.rtf new file mode 100644 index 0000000..f1bf87f --- /dev/null +++ b/tests/rtfs/sample-with-image.rtf @@ -0,0 +1,58 @@ +{\rtf1\ansi\deff1 +{\fonttbl{\f0\fswiss Calibri;}{\f1\froman Times New Roman;}{\f2\fnil\fprq0\fcharset128 Symbol;}} +{\colortbl;\red0\green0\blue0;\red0\green0\blue255;} +{\stylesheet{\s1 List Paragraph;}} +{\*\listtable{\list\listid1\listtemplateid1{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}}} +{\listoverridetable{\listoverride\listid1\listoverridecount0\ls0}} +\sa150{{\field{\*\fldinst{\f0\fs20\cf0 INCLUDEPICTURE "cid:image001.png@01CDC656.1C7FFF50" \\* MERGEFORMATINET }}{\fldrslt{\*\shppict{\pict\pngblip\picw20714\pich12143\picwgoal750\pichgoal750\picscalex100\picscaley100 +89504e470d0a1a0a0000000d4948445200000032000000320802000000915d1fe60000001974455874536f6674776172650041646f626520496d616765526561647971c9653c000012224944415478da6c5969705bd7757e1b1ef0b08320008204094a94 
+44ad94146d966551926539719cc9d8711cb5ced8aeed643259da69279349ffb43ffa3f3f5a77927192c64be224b6c793269d249225db916359d52e532b49893b259220419004b1bcb5dfb9f701843c7d241f1f1edebbf7dcb37ce73be78a27cf9c910441 +1045894eecc047c711f0e3d88e6d3b9665d9b66d9ab6659938dbf86ff17b0e1ea3671d7e21d446c085543bf0516687a228f5b3c7e369fc88c770c11fe66f2992880f823b80c83e60749a8d4d685b36be820438591686e002f1c36938b8345ca0ff57acfa +a1b0e33337f124ce5c2c9ce97b41a20f3293890f4a923145088ee230dd08b68d5fb3a6272e13ce5c5b42eda8bf5e9789cfc73f364a561785abcafdc86ee1354526ed89fc8e24d6ecc86cca4c499393392d0bf229dc843569ea76ac1f62c3b1224a4d20d8 +8e4ba4783c7545e28c216860db2e5775c332755dc7430a1789bd4ec3e141a1e66122938c8bc0e573ea2ec56dcd745c3f049a83ffd0b3a669e998c630ad4aa5aaebe50a8e6aa9b45c5a2e4282f9426171610937abe55271b9582a959797972be5b25ed521 +9507e24332a65bb6327789aeab89ae8f99b41cf8bc655574bd5ac1eb98a98a53b158c470cb980da32f2d2d2e154b4b98a558aa94f145b954c2c39849af560d03ba30995e6cb604d206294f9110021e55852a558fea0f0515bfdfeff5fa1cd222a636f0b6 +5ea914b19ca5e262a1802997968b8b0b8b986d61610157a5e54ab18cc9b0ac0a9ec64ca66938964dca43e80822e6a1b57a6032d5e3f57a3d9e6024a2aab8a61fd5e3f1fa3c3e5f805d2a7457f5422cc803fd903b29a416657272ec9db7df5ac8e7971617 +4b581969a0ac1b3a96449e0b4349221ee4b3e0d7a7aa7eaf2f9648a85e1593fabc3e85eee30f27955c95d42fe104c71149f5e41518c5e141c07d106a9078546102918cee7e10795c2bbf78e5c7b76fde4ab6a4429a3f118b6175980ad3610e9c318da0d0 +f0f5a8c19f43e12632edc0fb58d4da1cba388cf1a96afe2988163d648be49c165e71c3c9665e42115ec33a1a885f8a4a6961f1ab478f1a887dd8db11fd9a0fafb3c8546451b619a46226fe1a7db4b0345b70d7c66ed7a25174f1cea9c1a228f13ba40e89 +0494b84ed8e48e847b02138e81a523887ca99223da4a341a9a5fcc9f38f61e5c78e3860dfbf6ef377513b6c6c3b61bf3520d296b73f338e0ffa12f8966e7a2f1f0845a113d5c363c04ef8333c3fff0837b2c50452627dec567d2111b99d99549aea4dbda 
+c6c7c717e6f388c54c5b3b5095204362fec09f6b4026d734ae96d858129340100cf27ad2ac484a1170d3a378e60af9fffde493e9e9a95853fcc17dbd4dd1a8699922d90a8a72642610cb0fa4c93aacd0c491586c726202cb0a8542a996a469c0822e72b9 +4a61d73c0188ec971c93a39524f97d1a4001bf707c1792c9ad49e28a5efae31f7e77fdcaa5d242e1fae58bfffdee5b954a4911c8901843769fb4e997a7515180dbc04fe1f612623d3f9d83865bd3695f40c37ac9f3b8078b2b3aaa69c7b52b0b50d11ff0 +dfb875fd3ffffd472fffc78f667339c42297090742f3faf56b93e3e3871e7ee49bdffedec1478e0c0f0e5cb97451f579b100a34ad882a06510ef0069302b14592e97c8634447191d1903f4e1eb8ece2c734c8bfc5056803758856512aabb8ecd424fe0be +250a7eaffffcf9b3efbefd1be0682cd6047fe2be8c4129b90bcec8c8705b7bfbaebd0f41053b76ef9e9b9ece643bf285fcc9e3c707fb6f41d33b77ef79f8f0a30b85f95fbdfe8b7822919b9aea5cd3f5e4579e06fc2a77060780d8d16834996ab50c1db3 +03a4f2f9d9997b33e9b64c381c064e40b78263ea701ccb61fe247a7dbe531f7ef83f7f7817ab4ea7db9e7bfec5543255d12b4c2c925cafe8f999d9f68e0ee08a0edb89f2e34f7c05283b3a3c74f5d2f9751b369855fdfd637f5cd7bd3e120e2fcccf8d8f +8d00e8b76cdb4a6ee688cadd8949407532d5a205346406c061219f7ff38dd7a6a7a72391483416c357adad99b6d634ae4281a0d7ab017cde3ffee753278e032e5677adf9fa732fc6e34dc87675e24124cdb690155a906d30282dc6f1c8925e2eb7a4dbfe +e1fb3f54fdfe1b57fbae5ebb86508b45c248698954eab9175e8ac713d56a054f2b482f58e0aad5ab5996255f46686ddabc054c612e9f07eedf9d9cbc7cf102503b180cc5e3cded1d9dc8368337af46c2a1ced55d5f7ae2e9803f50ae9499470a752821ef +17a4fc5c9eb144c8290cddeacfaeca22584f9d3a75e1ec698c8f108187c247aad5eab6ee8d994c7679b928309751f00a749269ef0473017b80a182c1c0a38f7d71efbefd172e9ebf74eedcfcfc1c7c19105f5a5e5e5c58181eba9d88c70381e0ba75eb7b +8f3c0ab6020650833316b39208eea37a7de94ce6ea95cb53d3d3edd98eeb7d7d3ffbc9cb4f3e7d341c8a7cf4c189a7be76146ef3c66bafc2896d4690303e14ebd452036241c864da43b118923ba70d0846785b20183afcd8e777ecdc73f9c2d98be7cf17 
+e6f32aa55d8fa6f97c9a0f597fe79ebd08255d376b99c3a945a99b0f76eede7de1ec99d77ff6caba8d9baef75d41b8b477b44fdf9dc21cf1644b6e7a0aaebdb4b4e4b470a6643b6e60d1490e07037bf6ed4ba65b6d96ac084e6ab9046ee1557d6bd6addd +b67d27e071746498e82c63737098544b3a914c4131624302e0078219a9ac3999d434edeae54b83b76e00e08f7cf1f19dbb1ed08281db83fd278ffde9eec438b40081d676af7ffff8b1ceaeae8d9b36598645c0039787f735275236210186c23dfcd512a6 +23319e4cb6cde566e0ba00177ce3a95621faf0f09deecd5bb8938bb50cc95087a736c1a8ea0ff51e5cbb665d6e360704814da11e84f64bdffaeed09dc1544b0ba81834a179b5279e7caa359b3574d385224c087e92027f48b782ab5292146c85d287043a +8678f67ad579169837ae5d8b44a3bb76ef9d9c9c204e6718a0783ddb77a81ea59ec879a9c28c6073ed61a5a17004d1871482cce8305ae7f5696ded1dfe40b0a9298148c7649bb66c6d4a342306eb1942818c8303b737f7f4b075db2ccbc92096ef1dfb33 +949c4ca62e5dbc786770b0399138fab7cfaedfb03197cbf5dfba01da03589f999eea5abddab60c8eb20e21b5c519bccd6a210490c90e52a5e382312ca3331e5257ec6fde7c152cedb12f3fc1408be889224ade7bf7ee8155476251aa64482a796971617e +76a65098bfdd7f0b33013e9efcead732994e70d703870edfb93d800519863e3a34d4bd765d55340497dfd03488a762a5a4f9fc965905f585f34149a08c3271075137ab848e301eb2888135c8786368700089f871501ac1644694e46853b4522a2512894c +a655b04dbfc751453b1e8d6eded2a36981b9fc1cea90dede831b37f7942a1518056e0e254d4c8c73c6bffd733b3821e356446400845efff94f2f9c3b73eae409000a3c1a5a1918e8ffe4e353a3c3b7a3d158281c2e954b7079afe605f507f7edbb745151 +d55d0fec05af55542fa45292a9e6f1e191e1dbfd3b766c0da862c0231141410ccaf2debd7b8ba5e299d3a7612f3204712b111aed3d78f8d6cdeb60f1f7262791d182e188639b5434d428cf42210f8911c27ffdcb07ad6d992d5bb7bdf3e66b284b60b6ab +7d57bef38fdf9fcfcffdfcc72fb7b56780d57ff78d6f828b23e5f9fcbe1b9f5e39fdd14719a4acaeb55d10e3eeddb1e2423ee055648e40c4686c53aff6f46ca58a44d3eaf520687e5b26bbed73bb4cc3c05a474747504df0f8136b1c03a21ffec2175efa 
+cedf23c75dbe74cea7695f7fe1c57ffed77f7bf685976641beee4d7a58693f393ed6d9d9190a863121a0646a72f2ed377f79ebc6d53688d59ec9002151508d0f0d0324d9ec9caa0b28efb460f8a9bf79e6817d0f9986e9d681a288e4f360efa1603884d4 +34313ece35c488aa532fa9037e3f987ba63d5b2a1671a157cd777ffbeb3ffdfef748622883117018a4f7e0c3dffda71f245bd2c0269472effcfa0d18f799e75fd8dcb34d4239d6b92aab69eaf8c408c5700d1c2d723f72d24ca6031a6eac9d4d96da77ec +debb7dc7cefdbd07e07c9cc2d78a1aa29c5024822b3f9703279bcdcdbcf5cb574545dad4d3c3cb147823349c5db59ab2ba8de428c31fee0cdeeeeaeedebe6b37318ed2e27ccfd64dd5c5d9e5a5b97ca1d01c8bc14f0c11b40b9c16ef520b4268e0f23cc8 +ab95f2c1c39f578919cb703bb1ce1659ee40c9047f9f181dedbb7ce9e0238f9a16a2b6baefc02124afd3a73ec4a88cbcf16a9dd5c60005494ca55bc68686468687b3ed596926976b69c9a8fe6059372726264d59adda227ecd06599cfbaef93f9b6a3568 +8e42a15166e23020f7487f63a343ebd66f40ca4fb7a4bb376d7cfd959fbcf15f3f05e1b979ed2a2a7418911321bc00dc819b3fff8d6fc1b81fffe50380147ccf585e2ac7536d63c3c3fdfd036b37f5d88cc8e29098397826a9e1653d2b8b6e614828e9f6 +9e1cce971d0105f7ee071eeceade4015802ce3c9a79f79f65adfe5603052989f03fb40142553692411067512386338dadcb1ba0b91f4f147a7f6ec3b201f7a6cede0cddcfededeeeeef589745a0bf8c10b563a1f84fb128f7b5ecbd7132001af4b66ea3c +9f9207fe4cdb6ccb74c463cdb43ca0a965831a655775a553a96cb633d9d2a269fe9d7b1e44a264a58303246f6dcf249b53e17064626214be2bfef05f7e70f2d87b95521914af39918c354571118a45fd9adfe355e1a05498b3aaac6648c94dc88ec3c474 +0b545cdaac870897f6692a2297da4c54ce501642aa25f0e68597bb2ab790c1fb1e9f06af80bfcaa45a01a45259df16eb7af6cb63e313c313b9d1f1bb63a3c38852b0284478b4a909a910472c9e884442d470f0fa2556150abcb3546b1909f57625819655 +2eeb54c63bcc1deaf52515fcc06a87ca0f3202fb4a96f18d5e29332d4bc82258046651aa9657f606b66c5f7de0481c69cb308d898989fe8181dcececd4bde991a1c18be7ce12dcf911e9815024dc04dede9c686e8e8723b14020e04501a278681e6465c7 
+6601cf628bf4005c71bd10df93dbd9acc3d6580f53cf8c5a2f502a2d82752b607505660627d58d2a90838fe6517c5bb76e87abe201d46ec74e9c4031c0ea32074f8e0edde9fbf4b2a91bc05e5fc01f430d924cc1eed15853380a4183288a18cb65baa346 +9d650af57ea62d415924ac830b81372698b8964048c6b4087a252860e2bc53c0691c16223ba64158456a47eaf50723fe2814e964db338f1f3982548894303f0f1a868a0bdc661a35d6b5be4fe961d913080640b0c0f582e160341a0f45a2c15048f311db +2637c3d8f8b34413eb752c184d642c1265bf440c938533c8270082fc835a3524399e9308d0991158e3854e540ac1f79d542ac53bcdb148241e8b896bd66018d01eb0c2e272319f87a0f3a0fca8c396160b333353a6811a55843ffa7cfe4038ec0f069b22 +2178772018f607020a0b781e4660e716812ccb6022ca0b5b6186b728542422e10e7164ea24c852bd2b447587eaf1a652adac5b6919ac952baed4d812ea994838da99edc41b00d80ab530cb85420167ea4f2c158164f3a5e5b9a9bbec4589f4a7028b7c5e +90d480e6d1c05e433eafeca130473d817ac8341c2e87b3d216e7b4117696f9c601b4ea516150f246cb16564a1dc771db42221165c741c4b0deaf0dfa8a10161911629b0d26f23acee54a05851dce4090858582313b8b57f0aaeaf5c0592123ea2a54cd8a +2051df88359604de97e12ab099a11db7c725701fa6e6374c4e14836d11105631af666d5a56a450b107c7b558df96170c0e0b3e9fcf872547a311e226bac91660a32840e6e1cd511407f003488c5b0a4f1aac33477cc1a43e62bd98a184c305e61b1d8481 +3419a83845184f880c2d45dea667b45bb42dcb6ee8dddfd7bee7398ba11ca844c0ef4175ce5bf8d069a9441d83a56251e14c0690435537ef1eb27a9bef96b0144348cfe042a07e8269886cb7c526dbca1cac85866d1f6765d3850383bdd2b373847a1ee3 +0bc730ba61d70a0129180c82fef7f46c56589fc8625d4644a94c1d310c4492501504e5d1b2101d14dbac296bf396a22b7a7d2fa3610b6865ebc569d8ddb8af432634ea8ebe052cfbfd1ae0595515c3a8287008d66a658997051a01ad65713e27bbad5187 +2fcd550b45abf899ada89a40aed9dcf956aafffbe4a81f9086762a0883a8722eeb55dbb164c35498d6010e000f912738e6dfac370dfdb174ccacc88cc8a6c50336db77e13288b566baebe235f534eee6352a8665c2fa260e4fafb661d2d476d9ca93c757 
+14782705147729d1a59716cb336e9f9c18035c8b7535791e733829a0015967dd551be535c7edb5366ee5d58563e85377347733892fc474375d1c2634083f671faeafba1e20b11d00eea480530fee2882e5dcc7d8057b65cba9b6d3e9344cf6d9adbc9a11 +5748aed3e07c342919d3fdf47f020c004820324f63cfbeff0000000049454e44ae426082 +}}}}}\par\pard +\sa150{This is a \i pretty\i0 \b boring\b0 \u160?graphic...}\par\pard +} diff --git a/tests/test_readrtf15.py b/tests/test_readrtf15.py index e29d283..b2dcbc5 100644 --- a/tests/test_readrtf15.py +++ b/tests/test_readrtf15.py @@ -30,6 +30,17 @@ class TestRtfFile(unittest.TestCase): pass +class TestRtfWithImage(unittest.TestCase): + + def test_inline_png(self): + sample_with_image = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "sample-with-image.rtf") + with open(sample_with_image, 'rb') as rtf: + doc = Rtf15Reader.read(rtf) + image = next(node.content[0] for node in doc.content if isinstance(node.content[0], pyth.document.Image)) + expected = {'pngblip': True, 'picw': '20714', 'picwgoal': '750', 'pich': '12143', + 'pichgoal': '750', 'picscaley': '100', 'picscalex': '100'} + self.assertEquals(expected, image.properties) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_readxhtml.py b/tests/test_readxhtml.py index 978c277..9e7bb90 100644 --- a/tests/test_readxhtml.py +++ b/tests/test_readxhtml.py @@ -80,6 +80,22 @@ def test_url(self): text = doc.content[0].content[0] assert text['url'] == "http://google.com" + def test_inline_png(self): + pixels = 50 + twips = pixels * 15 + height = width = str(twips) # in retrospect choosing a square image wasn't a great idea :) + with open('tests/html/sample-with-image.html', 'rb') as xhtml: + doc = XHTMLReader.read(xhtml) + image = next(node.content[0] for node in doc.content if isinstance(node.content[0], pyth.document.Image)) + self.assertEquals(image.content[0][1:4], u'PNG') + self.assertEquals(image['pngblip'], True) + self.assertEquals(image['pich'], height) + self.assertEquals(image['pichgoal'], 
height) + self.assertEquals(image['picw'], width) + self.assertEquals(image['picwgoal'], width) + self.assertEquals(image['picscaley'], '100') + self.assertEquals(image['picscalex'], '100') + if __name__ == '__main__': unittest.main() diff --git a/tests/test_writertf15.py b/tests/test_writertf15.py new file mode 100644 index 0000000..c09c33d --- /dev/null +++ b/tests/test_writertf15.py @@ -0,0 +1,19 @@ +import os +import unittest +from pyth.plugins.xhtml.reader import XHTMLReader +from pyth.plugins.rtf15.writer import Rtf15Writer + +class TestRtfWithImage(unittest.TestCase): + + def test_inline_png(self): + sample_with_image = os.path.join(os.path.abspath(os.path.dirname(__file__)), "html", "sample-with-image.html") + with open(sample_with_image, 'rb') as rtf: + source = XHTMLReader.read(rtf) + doc = Rtf15Writer.write(source).getvalue() + self.assertIn('pngblip', doc) + self.assertIn('picwgoal750\\', doc) + self.assertIn('pichgoal750\\', doc) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_writexhtml.py b/tests/test_writexhtml.py new file mode 100644 index 0000000..f07418c --- /dev/null +++ b/tests/test_writexhtml.py @@ -0,0 +1,19 @@ +import os +import unittest +from pyth.plugins.rtf15.reader import Rtf15Reader +from pyth.plugins.xhtml.writer import XHTMLWriter + +class TestHtmlWithImage(unittest.TestCase): + + def test_inline_png(self): + sample_with_image = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "sample-with-image.rtf") + with open(sample_with_image, 'rb') as rtf: + source = Rtf15Reader.read(rtf) + doc = XHTMLWriter.write(source).getvalue() + self.assertIn(' Date: Thu, 3 Dec 2015 17:16:14 +0000 Subject: [PATCH 04/14] Fix nested lists rtf15.reader bug Previously in RTF documents containing nested lists that 'ended' on a nested item, the outer most item would be added into the list above it, but the list above it would never be added in the lists/ doc above that, so would get dropped. 
--- pyth/plugins/rtf15/reader.py | 10 +++++++--- tests/test_readrtf15.py | 19 +++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/pyth/plugins/rtf15/reader.py b/pyth/plugins/rtf15/reader.py index 7fe7cc8..c9fe4fd 100644 --- a/pyth/plugins/rtf15/reader.py +++ b/pyth/plugins/rtf15/reader.py @@ -317,8 +317,12 @@ def handle_Para(self, para): self.listStack.append(l) elif self.listLevel < prevListLevel: - l = self.listStack.pop() - self.listStack[-1].append(l) + times = prevListLevel + 1 + if self.listLevel is not None: + times = times - (self.listLevel + 1) + for _ in xrange(times): + l = self.listStack.pop() + self.listStack[-1].append(l) self.block = None @@ -603,7 +607,7 @@ def handle_strike(self, onOff=None): def handle_ilvl(self, level): if self.currentParaTag is not None: - self.currentParaTag.listLevel = level + self.currentParaTag.listLevel = int(level) else: # Well, now we're in trouble. But I'm pretty sure this # isn't supposed to happen anyway. diff --git a/tests/test_readrtf15.py b/tests/test_readrtf15.py index b1c6a84..7bef397 100644 --- a/tests/test_readrtf15.py +++ b/tests/test_readrtf15.py @@ -49,6 +49,25 @@ def test_tildes_are_parsed(self): doc = Rtf15Reader.read(rtf) traverse_text(doc, lambda text: self.assertNotIn('~', text)) + +class TestNestedLists(unittest.TestCase): + + def test_when_last_item_sublist_item(self): + """ With structures like this, both lists were getting dropped + Start + * 1 + * 1.1 + """ + list_bug = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "list-bug.rtf") + with open(list_bug, 'rb') as rtf: + doc = Rtf15Reader.read(rtf) + text = [] + traverse_text(doc, lambda x: text.append(x)) + self.assertIn('Start', text) + self.assertIn('1', text) + self.assertIn('1.1', text) + + def traverse_text(element, function): if element.__class__ == pyth.document.Text: map(function, element.content) From ae8c81352136b19373c6f7bf900982a50068cae1 Mon Sep 17 00:00:00 2001 From: Kris Powell Date: 
Thu, 3 Dec 2015 18:28:14 +0000 Subject: [PATCH 05/14] Add forgotten rtf example for test --- tests/rtfs/list-bug.rtf | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 tests/rtfs/list-bug.rtf diff --git a/tests/rtfs/list-bug.rtf b/tests/rtfs/list-bug.rtf new file mode 100644 index 0000000..8598069 --- /dev/null +++ b/tests/rtfs/list-bug.rtf @@ -0,0 +1,10 @@ +{\rtf1\ansi\deff1 +{\fonttbl{\f0\fswiss Calibri;}{\f1\froman Times New Roman;}{\f2\fnil\fprq0\fcharset128 Symbol;}} +{\colortbl;\red0\green0\blue0;\red0\green0\blue255;} +{\stylesheet{\s1 List Paragraph;}} +{\*\listtable{\list\listid1\listtemplateid1{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}}} +{\listoverridetable{\listoverride\listid1\listoverridecount0\ls0}} +\sa150{Start}\par\pard +\ilvl0\ls0\li720\s1\sa50{1}\par\pard +\ilvl0\ls0\li720\s1\ilvl1\ls0\li1440\s1\sa50{1.1}\par\pard +} \ No newline at end of file From bc3a87418bf54e1231e15f91deabe8759534f3a4 Mon Sep 17 00:00:00 2001 From: Kris Powell Date: Fri, 4 Dec 2015 08:31:28 +0000 Subject: [PATCH 06/14] Add underline to XHTMLReader Also confirmed round trip from XHTML to RTF with tests: - Checking that RTF reads underlining markup into 
Document - Checking that RTF writes underline formatting - Checking that XHTML reads u tags or css underline styling into Document - Checking that XHTML writes u tags --- pyth/plugins/xhtml/css.py | 3 +++ pyth/plugins/xhtml/reader.py | 6 ++++++ tests/rtfs/text-attributes.rtf | 8 ++++++++ tests/test_readrtf15.py | 10 ++++++++++ tests/test_readxhtml.py | 18 ++++++++++++++++++ tests/test_writertf15.py | 9 +++++++++ tests/test_writexhtml.py | 9 +++++++++ 7 files changed, 63 insertions(+) create mode 100644 tests/rtfs/text-attributes.rtf diff --git a/pyth/plugins/xhtml/css.py b/pyth/plugins/xhtml/css.py index e2fe5be..e1e44eb 100644 --- a/pyth/plugins/xhtml/css.py +++ b/pyth/plugins/xhtml/css.py @@ -135,3 +135,6 @@ def is_super(self, node): properties = self.get_properties(node) return properties.get('vertical-align') == 'super' + def is_underline(self, node): + properties = self.get_properties(node) + return properties.get('text-decoration') == 'underline' diff --git a/pyth/plugins/xhtml/reader.py b/pyth/plugins/xhtml/reader.py index 8e9710f..2889251 100644 --- a/pyth/plugins/xhtml/reader.py +++ b/pyth/plugins/xhtml/reader.py @@ -85,6 +85,10 @@ def is_italic(self, node): return (node.findParent(['em', 'i']) is not None or self.css.is_italic(node)) + def is_underline(self, node): + return (node.findParent(['u']) is not None or + self.css.is_underline(node)) + def is_sub(self, node): """ Return true if the BeautifulSoup node needs to be rendered as @@ -145,6 +149,8 @@ def process_text(self, node): properties['bold'] = True if self.is_italic(node): properties['italic'] = True + if self.is_underline(node): + properties['underline'] = True if self.url(node): properties['url'] = self.url(node) if self.is_sub(node): diff --git a/tests/rtfs/text-attributes.rtf b/tests/rtfs/text-attributes.rtf new file mode 100644 index 0000000..ae2a60a --- /dev/null +++ b/tests/rtfs/text-attributes.rtf @@ -0,0 +1,8 @@ +{\rtf1\ansi\deff1 +{\fonttbl{\f0\fswiss Calibri;}{\f1\froman Times New 
Roman;}{\f2\fnil\fprq0\fcharset128 Symbol;}} +{\colortbl;\red0\green0\blue0;\red0\green0\blue255;} +{\stylesheet{\s1 List Paragraph;}} +{\*\listtable{\list\listid1\listtemplateid1{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}}} +{\listoverridetable{\listoverride\listid1\listoverridecount0\ls0}} +\sa150{\ul Underlined\ul0 }\par\pard +} \ No newline at end of file diff --git a/tests/test_readrtf15.py b/tests/test_readrtf15.py index 7bef397..c8b6807 100644 --- a/tests/test_readrtf15.py +++ b/tests/test_readrtf15.py @@ -68,6 +68,16 @@ def test_when_last_item_sublist_item(self): self.assertIn('1.1', text) +class TestTextProperties(unittest.TestCase): + + def test_reads_underline(self): + text = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "text-attributes.rtf") + with open(text, 'rb') as rtf: + doc = Rtf15Reader.read(rtf) + underlined = doc.content[0].content[0] + self.assertTrue(underlined['underline']) + + def traverse_text(element, function): if element.__class__ == pyth.document.Text: map(function, element.content) diff --git a/tests/test_readxhtml.py b/tests/test_readxhtml.py index 9e7bb90..9814fd9 100644 --- a/tests/test_readxhtml.py +++ 
b/tests/test_readxhtml.py @@ -53,6 +53,24 @@ def test_italic(self): text = doc.content[0].content[0] assert text['italic'] + def test_underline(self): + """ + Try to read a paragraph containing underline + """ + xhtml = "

sub

" + doc = XHTMLReader.read(xhtml) + text = doc.content[0].content[0] + assert text['underline'] + + def test_underline_styling(self): + """ + Try to read a paragraph containing underline via CSS + """ + xhtml = '

underline

' + doc = XHTMLReader.read(xhtml) + text = doc.content[0].content[0] + assert text['underline'] + def test_sub(self): """ Try to read a paragraph containing subscript diff --git a/tests/test_writertf15.py b/tests/test_writertf15.py index c09c33d..d44e1c0 100644 --- a/tests/test_writertf15.py +++ b/tests/test_writertf15.py @@ -2,6 +2,7 @@ import unittest from pyth.plugins.xhtml.reader import XHTMLReader from pyth.plugins.rtf15.writer import Rtf15Writer +from pyth.document import Document, Paragraph, Text class TestRtfWithImage(unittest.TestCase): @@ -15,5 +16,13 @@ def test_inline_png(self): self.assertIn('pichgoal750\\', doc) + def test_underline_output(self): + text = Text(content=[u'Underlined'], properties={'underline': True}) + para = Paragraph(content=[text]) + doc = Document(content=[para]) + result = Rtf15Writer.write(doc).getvalue() + self.assertIn('\\ul Underlined\\ul0', result) + + if __name__ == '__main__': unittest.main() diff --git a/tests/test_writexhtml.py b/tests/test_writexhtml.py index f07418c..cc12f40 100644 --- a/tests/test_writexhtml.py +++ b/tests/test_writexhtml.py @@ -2,6 +2,7 @@ import unittest from pyth.plugins.rtf15.reader import Rtf15Reader from pyth.plugins.xhtml.writer import XHTMLWriter +from pyth.document import Document, Paragraph, Text class TestHtmlWithImage(unittest.TestCase): @@ -15,5 +16,13 @@ def test_inline_png(self): self.assertIn('height:50px', doc) + def test_underline(self): + text = Text(content=[u'Underlined'], properties={'underline': True}) + para = Paragraph(content=[text]) + doc = Document(content=[para]) + result = XHTMLWriter.write(doc).getvalue() + self.assertIn('Underlined', result) + + if __name__ == '__main__': unittest.main() From 5db24d515c8633be2e47880d92fdaf773247cbe4 Mon Sep 17 00:00:00 2001 From: Kris Powell Date: Fri, 4 Dec 2015 09:27:42 +0000 Subject: [PATCH 07/14] Use sub/ super xhtml tags for super/subscript text As per http://www.w3.org/TR/xhtml1/dtds.html#dtdentry_xhtml1-strict.dtd_sub this is the 
recommended way. --- pyth/plugins/xhtml/writer.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/pyth/plugins/xhtml/writer.py b/pyth/plugins/xhtml/writer.py index eb0879d..d0ffb40 100644 --- a/pyth/plugins/xhtml/writer.py +++ b/pyth/plugins/xhtml/writer.py @@ -15,6 +15,8 @@ 'bold': 'strong', 'italic': 'em', 'underline': 'u', # ? + 'super': 'sup', + 'sub': 'sub', } @@ -112,20 +114,12 @@ def _text(self, text): current = tag - for prop in ('bold', 'italic', 'underline'): + for prop in ('bold', 'italic', 'underline', 'sub', 'super'): if prop in text.properties: newTag = Tag(_tagNames[prop]) current.content.append(newTag) current = newTag - for prop in ('sub', 'super'): - if prop in text.properties: - if current.tag is None: - newTag = Tag("span") - current.content.append(newTag) - current = newTag - current.attrs['style'] = "vertical-align: %s; font-size: smaller" % prop - current.content.append(u"".join(text.content)) return tag From 4171d2997eb3797d5264f4d53d97e452def0a3bc Mon Sep 17 00:00:00 2001 From: Kris Powell Date: Fri, 4 Dec 2015 15:50:37 +0000 Subject: [PATCH 08/14] Treat xhtml ol as ul For now it's better to parse html ordered lists as unordered lists rather than creating invalid document structures that crash parsing.
(ListItems right under Paras because ol is ignored) --- pyth/plugins/xhtml/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyth/plugins/xhtml/reader.py b/pyth/plugins/xhtml/reader.py index 2889251..312845e 100644 --- a/pyth/plugins/xhtml/reader.py +++ b/pyth/plugins/xhtml/reader.py @@ -177,7 +177,7 @@ def process_into(self, node, obj): new_obj = document.Paragraph() obj.append(new_obj) obj = new_obj - elif node.name == 'ul': + elif node.name in ('ul', 'ol'): # add a new list new_obj = document.List() obj.append(new_obj) From 496657306d9c2b7a491ae105377f93714dab2585 Mon Sep 17 00:00:00 2001 From: Kris Powell Date: Thu, 31 Dec 2015 10:37:22 +0000 Subject: [PATCH 09/14] Fix for RTF documents that open with a list Found plenty of examples of these in the wild.. This fix adds a para up front but doesn't add it to list stack, so we also hold on the last pop of the list stack when unwinding lists because there is no final holding paragraph --- pyth/plugins/rtf15/reader.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pyth/plugins/rtf15/reader.py b/pyth/plugins/rtf15/reader.py index c9fe4fd..a8da8df 100644 --- a/pyth/plugins/rtf15/reader.py +++ b/pyth/plugins/rtf15/reader.py @@ -320,7 +320,8 @@ def handle_Para(self, para): times = prevListLevel + 1 if self.listLevel is not None: times = times - (self.listLevel + 1) - for _ in xrange(times): + depth = len(self.listStack) - 1 + for _ in xrange(min(times, depth)): l = self.listStack.pop() self.listStack[-1].append(l) @@ -606,12 +607,12 @@ def handle_strike(self, onOff=None): def handle_ilvl(self, level): - if self.currentParaTag is not None: - self.currentParaTag.listLevel = int(level) - else: - # Well, now we're in trouble. But I'm pretty sure this - # isn't supposed to happen anyway. - pass + if self.currentParaTag is None: + # this can happen where documents open straight with lists rather than a containing Para.. 
+ p = Para() + self.content.append(p) + self.currentParaTag = p + self.currentParaTag.listLevel = int(level) def handle_up(self, amount): From b8ab9c320a9425d43ad86fd73253e365d89ae11d Mon Sep 17 00:00:00 2001 From: Kris Powell Date: Thu, 31 Dec 2015 12:18:15 +0000 Subject: [PATCH 10/14] Better sublist handling in xhtml writer Previously, sublists were always added to their own li element, but this renders as double bullets in HTML: * Top level * - Sub list item Now we add the nested ul directly to the prior non-list flow item (Top level para in example above), which gives expected single-bullet nesting: * Top level - Sub list item --- pyth/plugins/xhtml/writer.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pyth/plugins/xhtml/writer.py b/pyth/plugins/xhtml/writer.py index d0ffb40..b5c5e9f 100644 --- a/pyth/plugins/xhtml/writer.py +++ b/pyth/plugins/xhtml/writer.py @@ -93,12 +93,23 @@ def _list(self, lst): if self.cssClasses: ul.attrs['class'] = 'pyth_list_%s' % self.listLevel + last_li = None for entry in lst.content: li = Tag("li") for element in entry.content: + # in practice list elements always have only one content child? handler = self.paragraphDispatch[element.__class__] - li.content.extend(handler(element)) - ul.content.append(li) + if handler == self._list: + # this is a sublist, so we shouldn't create an empty li, but rather append ul to prior li. + # Lists can't be immediately sublisted (e.g. there must be at least something at outer level) + # but if that is not the case the last_li will be None and next line will bomb out, which is a + # useful implicit assertion + last_li.content.extend(handler(element)) + else: + li.content.extend(handler(element)) + last_li = li + if li.content: # li might be empty.. 
+ ul.content.append(li) self.listLevel -= 1 From 8116549efaa76ff0077e5bbabc75ff512b25141f Mon Sep 17 00:00:00 2001 From: Kris Powell Date: Wed, 27 Jan 2016 13:45:16 +0000 Subject: [PATCH 11/14] Workaround for non-utf chars embedded in charBuffer --- pyth/plugins/rtf15/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyth/plugins/rtf15/reader.py b/pyth/plugins/rtf15/reader.py index a8da8df..5e8aca0 100644 --- a/pyth/plugins/rtf15/reader.py +++ b/pyth/plugins/rtf15/reader.py @@ -391,7 +391,7 @@ def __init__(self, reader, parent=None, charsetTable=None): def flushChars(self): - chars = "".join(self.charBuffer).decode(self.charset, self.reader.errors) + chars = u"".join(c.decode(self.charset, self.reader.errors) for c in self.charBuffer) self.content.append(chars) self.charBuffer = [] From 5abf0e1ab1906d373285bb95a38d2586e856061c Mon Sep 17 00:00:00 2001 From: Kris Powell Date: Tue, 9 Feb 2016 16:17:51 +0000 Subject: [PATCH 12/14] Encode control chars {} and \ when writing RTF Currently these characters get written out verbatim to RTF stream, rendering the result invalid. Instead they should be escaped with a leading backslash.
--- pyth/plugins/rtf15/writer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyth/plugins/rtf15/writer.py b/pyth/plugins/rtf15/writer.py index f7bc85a..b33b153 100644 --- a/pyth/plugins/rtf15/writer.py +++ b/pyth/plugins/rtf15/writer.py @@ -257,7 +257,10 @@ def _text(self, text): if unichar == '\n': self.target.write(r'\line ') continue - + # Escape control characters + if unichar in '\\{}': + self.target.write(r'\%s' % unichar) + continue point = ord(unichar) if point < 128: self.target.write(str(unichar)) From fd95cbfd504f09d6558625f9a4a6f353efff0764 Mon Sep 17 00:00:00 2001 From: Kris Powell Date: Thu, 3 Mar 2016 18:47:37 +0000 Subject: [PATCH 13/14] Don't 'double escape' html entities when writing If HTML entities were escaped when converting from HTML to whatever other format, don't escape the ampersands in them again on the way out from whatever format back to HTML. --- pyth/plugins/xhtml/writer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyth/plugins/xhtml/writer.py b/pyth/plugins/xhtml/writer.py index b5c5e9f..38e907b 100644 --- a/pyth/plugins/xhtml/writer.py +++ b/pyth/plugins/xhtml/writer.py @@ -7,6 +7,7 @@ from pyth import document from pyth.format import PythWriter import base64 +import re from cStringIO import StringIO @@ -198,8 +199,8 @@ def __repr__(self): def quoteText(text): - return text.replace( - u"&", u"&").replace( + return re.sub( + u'&(?!(amp|lt|gt);)', u'&', text, flags=re.IGNORECASE).replace( u"<", u"<").replace( u">", u">") From 79d0dd195c1171fdcf539161d29c3eeba9831212 Mon Sep 17 00:00:00 2001 From: Kris Powell Date: Tue, 13 Aug 2019 12:53:06 +0100 Subject: [PATCH 14/14] Ignore images when writing to plaintext --- pyth/plugins/plaintext/writer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyth/plugins/plaintext/writer.py b/pyth/plugins/plaintext/writer.py index 9dd8bfd..a00dc25 100644 --- a/pyth/plugins/plaintext/writer.py +++ 
b/pyth/plugins/plaintext/writer.py @@ -46,7 +46,8 @@ def go(self): def paragraph(self, paragraph, prefix=""): content = [] for text in paragraph.content: - content.append(u"".join(text.content)) + if text.__class__ != document.Image: + content.append(u"".join(text.content)) content = u"".join(content).encode("utf-8") for line in content.split("\n"):