Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add xhtml inline png image support #35

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
12 changes: 12 additions & 0 deletions examples/writing/htmlToRtf15.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""
Example: read an XHTML file and print it to stdout as RTF 1.5.

The input path is taken from the first command-line argument; when no
argument is given, a default sample path is used.
"""
from pyth.plugins.xhtml.reader import XHTMLReader
from pyth.plugins.rtf15.writer import Rtf15Writer
import sys

# Use the path given on the command line, otherwise the default sample.
if len(sys.argv) > 1:
    filename = sys.argv[1]
else:
    filename = "tests/html/sample-with-image.html"

# Read inside a context manager so the file handle is closed even when
# parsing raises.
with open(filename, "rb") as source:
    doc = XHTMLReader.read(source)

# A single parenthesized argument prints the same under Python 2's print
# statement as it does with a print function.
print(Rtf15Writer.write(doc).getvalue())
12 changes: 12 additions & 0 deletions examples/writing/rtf15ToXhtml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""
Example: read an RTF 1.5 file and print it to stdout as XHTML.

The input path is taken from the first command-line argument; when no
argument is given, a default sample path is used.
"""
from pyth.plugins.xhtml.writer import XHTMLWriter
from pyth.plugins.rtf15.reader import Rtf15Reader
import sys

# Use the path given on the command line, otherwise the default sample.
if len(sys.argv) > 1:
    filename = sys.argv[1]
else:
    filename = "tests/rtfs/sample-with-image.rtf"

# Read inside a context manager so the file handle is closed even when
# parsing raises.
with open(filename, "rb") as source:
    doc = Rtf15Reader.read(source)

# A single parenthesized argument prints the same under Python 2's print
# statement as it does with a print function.
print(XHTMLWriter.write(doc).getvalue())
18 changes: 9 additions & 9 deletions pyth/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ class _PythBase(object):
def __init__(self, properties={}, content=[]):
self.properties = {}
self.content = []

for (k,v) in properties.iteritems():
self[k] = v

Expand All @@ -33,7 +33,7 @@ def append(self, item):

If the item is of the wrong type, and if this element has a sub-type,
then try to create such a sub-type and insert the item into that, instead.

This happens recursively, so (in python-markup):
L [ u'Foo' ]
actually creates:
Expand All @@ -51,7 +51,7 @@ def append(self, item):
okay = False
else:
okay = False

if not okay:
raise TypeError("Wrong content type for %s: %s (%s)" % (
self.__class__.__name__, repr(type(item)), repr(item)))
Expand Down Expand Up @@ -94,10 +94,10 @@ class Image(Paragraph):
"""
An image is stored in bytes. All properties of images from the rtf definition are allowed.
"""
validProperties = ('emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', 'dibitmap',
'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', 'picw', 'pich', 'picwgoal',
'pichgoal', 'picscalex', 'picscaley', 'picscaled', 'piccropt', 'piccropb', 'piccropr',

validProperties = ('emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', 'dibitmap',
'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', 'picw', 'pich', 'picwgoal',
'pichgoal', 'picscalex', 'picscaley', 'picscaled', 'piccropt', 'piccropb', 'piccropr',
'piccropl', 'picbmp', 'picbpp', 'bin', 'blipupi', 'blipuid', 'bliptag', 'wbitmap')
contentType = bytes

Expand All @@ -122,14 +122,14 @@ class List(Paragraph):

validProperties = ()
contentType = ListEntry



class Document(_PythBase):
    """
    Root element of a document tree: one Document is exactly one file.
    Its content is a list of paragraphs.
    """

    # Property names that may be set on a document.
    validProperties = ('title', 'subject', 'author')

    # Type of the items held in a document's content list.
    contentType = Paragraph
3 changes: 2 additions & 1 deletion pyth/plugins/plaintext/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ def go(self):
def paragraph(self, paragraph, prefix=""):
content = []
for text in paragraph.content:
content.append(u"".join(text.content))
if text.__class__ != document.Image:
content.append(u"".join(text.content))
content = u"".join(content).encode("utf-8")

for line in content.split("\n"):
Expand Down
51 changes: 28 additions & 23 deletions pyth/plugins/rtf15/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@

# All the ones named by number in my 2.6 encodings dir, and those listed above
_CODEPAGES_BY_NUMBER = dict(
(x, "cp%s" % x) for x in (37, 424, 437, 500, 737, 775, 850, 852, 855, 856,
(x, "cp%s" % x) for x in (37, 424, 437, 500, 737, 775, 850, 852, 855, 856,
857, 860, 861, 862, 863, 864, 865, 866, 869, 874,
875, 932, 936, 949, 950, 1006, 1026, 1140, 1250,
875, 932, 936, 949, 950, 1006, 1026, 1140, 1250,
1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361))

# Miscellaneous, incomplete
Expand Down Expand Up @@ -157,7 +157,7 @@ def getControl(self):
if not next:
break

if first and next in '\\{}':
if first and next in '\\{}~_-':
chars.extend("control_symbol")
digits.append(next)
break
Expand Down Expand Up @@ -224,7 +224,7 @@ def __init__(self, doc, clean_paragraphs=True):
def flushRun(self):
if self.block is None:
self.block = document.Paragraph()

if self.isImage:
self.block.content.append(
document.Image(self.propStack[-1].copy(),
Expand Down Expand Up @@ -317,11 +317,16 @@ def handle_Para(self, para):
self.listStack.append(l)

elif self.listLevel < prevListLevel:
l = self.listStack.pop()
self.listStack[-1].append(l)
times = prevListLevel + 1
if self.listLevel is not None:
times = times - (self.listLevel + 1)
depth = len(self.listStack) - 1
for _ in xrange(min(times, depth)):
l = self.listStack.pop()
self.listStack[-1].append(l)

self.block = None

def handle_Pict(self, pict):
self.flushRun()
self.isImage = True
Expand Down Expand Up @@ -354,7 +359,7 @@ def handle_ImageMarker(self, marker):
del self.propStack[-1][marker.name]
else:
self.propStack[-1][marker.name] = True



class Group(object):
Expand Down Expand Up @@ -386,7 +391,7 @@ def __init__(self, reader, parent=None, charsetTable=None):


def flushChars(self):
    """
    Decode the buffered raw characters and move them into the content.

    Each piece in ``self.charBuffer`` is decoded on its own using
    ``self.charset`` and the reader's error policy; the decoded pieces are
    concatenated into one unicode string, which is appended to
    ``self.content``. The buffer is then reset to an empty list.
    """
    chars = u"".join(c.decode(self.charset, self.reader.errors) for c in self.charBuffer)
    self.content.append(chars)
    self.charBuffer = []

Expand All @@ -398,11 +403,11 @@ def handle(self, control, digits):
if control == '*':
self.destination = True
return
if self.image and control in ['emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile',
'dibitmap', 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes',
'picw', 'pich', 'picwgoal', 'pichgoal', 'picscalex', 'picscaley',
'picscaled', 'piccropt', 'piccropb', 'piccropr', 'piccropl', 'picbmp',

if self.image and control in ['emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile',
'dibitmap', 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes',
'picw', 'pich', 'picwgoal', 'pichgoal', 'picscalex', 'picscaley',
'picscaled', 'piccropt', 'piccropb', 'piccropr', 'piccropl', 'picbmp',
'picbpp', 'bin', 'blipupi', 'blipuid', 'bliptag', 'wbitmap']:
self.content.append(ImageMarker(control, digits))
return
Expand Down Expand Up @@ -602,12 +607,12 @@ def handle_strike(self, onOff=None):


def handle_ilvl(self, level):
    """
    Record the list nesting level on the current paragraph marker.

    ``level`` is converted with ``int()`` before it is stored. When there is
    no current paragraph marker yet, a new ``Para`` marker is created,
    appended to the content and made current first.
    """
    if self.currentParaTag is None:
        # this can happen where documents open straight with lists rather than a containing Para..
        p = Para()
        self.content.append(p)
        self.currentParaTag = p
    self.currentParaTag.listLevel = int(level)


def handle_up(self, amount):
Expand Down Expand Up @@ -650,15 +655,15 @@ def handle_tab(self):

def handle_trowd(self):
    """
    Handle the ``trowd`` control word by appending a newline to the content.
    """
    line_break = u'\n'
    self.content.append(line_break)

# Handle the image tag
def handle_pict(self):
    """
    Start an image: append a ``Pict`` marker to the content and remember it
    as this group's current image.

    The parent group's ``destination`` flag is cleared so that the image is
    preserved. A group without a parent has no flag to clear and is left
    alone.
    """
    p = Pict()
    self.content.append(p)
    self.image = p
    # Remove the destination control group of the parent, so that the image is preserved.
    # A group may have no parent, in which case there is nothing to reset.
    if self.parent is not None:
        self.parent.destination = False

def handle_field(self):
def finalize():
if len(self.content) != 2:
Expand Down Expand Up @@ -745,7 +750,7 @@ def __init__(self):

def __repr__(self):
return "!Image!"

class Para(ReadableMarker):
listLevel = None

Expand Down
55 changes: 41 additions & 14 deletions pyth/plugins/rtf15/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
http://www.biblioscape.com/rtf15_spec.htm
"""

import binascii
from pyth import document
from pyth.format import PythWriter

Expand Down Expand Up @@ -55,12 +56,16 @@ def __init__(self, doc, target, family):
document.List: self._list,
document.Paragraph: self._paragraph
}
self.paragraphContentDispatch = {
document.Text: self._text,
document.Image: self._image,
}


def go(self):
self.listLevel = -1
self.addSpacing = None

self.target.write('{')
self._writeHeader()
self._writeDocument()
Expand Down Expand Up @@ -105,7 +110,7 @@ def _getFontTable(self):
# We need Symbol for list bullets
output.append(r'{\f%d\fnil\fprq0\fcharset128 Symbol;}' % (i+1))
self.symbolFontNumber = i+1

output.append('}')
return "".join(output)

Expand Down Expand Up @@ -138,7 +143,7 @@ def _getListTable(self):

output.append('}}')
return "".join(output)


def _getListOverrides(self):
# I have no idea what the point is of this,
Expand All @@ -153,7 +158,7 @@ def _getRevTable(self):

# -----------------------------------------------
# Document section


def _writeDocument(self):

Expand Down Expand Up @@ -193,14 +198,15 @@ def _paragraph(self, paragraph, spacing=PARAGRAPH_SPACING):
if self.addSpacing is not None:
self.target.write(r'\sb%d' % self.addSpacing)
self.addSpacing = None

# Space after the paragraph,
# expressed in units of god-knows-what
self.target.write(r'\sa%d{' % spacing)

for text in paragraph.content:
self._text(text)


for item in paragraph.content:
handler = self.paragraphContentDispatch[item.__class__]
handler(item)

self.target.write(r'}\par\pard' '\n')


Expand Down Expand Up @@ -241,23 +247,26 @@ def _text(self, text):
for prop in text.properties:
if prop in _styleFlags:
props.append(_styleFlags[prop])

if props:
self.target.write("".join(props) + " ")

for run in text.content:

for run in text.content:
for unichar in run:
if unichar == '\n':
self.target.write(r'\line ')
continue

# Escape control characters
if unichar in '\\{}':
self.target.write(r'\%s' % unichar)
continue
point = ord(unichar)
if point < 128:
self.target.write(str(unichar))
else:
self.target.write(r'\u%d?' % point)

if props:
self.target.write("".join("%s0" % p for p in props) + " ")

Expand All @@ -266,3 +275,21 @@ def _text(self, text):

if 'url' in text.properties:
self.target.write('}}')

def _image(self, image):
    """
    Write an image to the target as an INCLUDEPICTURE field whose result is
    a picture group holding the hex-encoded image data.

    Every entry of ``image.properties`` becomes a control word: a value of
    ``True`` emits the bare word, any other value is appended to the word.
    Only the first item of ``image.content`` is written; it is hex-encoded
    and emitted in newline-terminated chunks.
    """
    write = self.target.write
    write(r'{\field{\*\fldinst{\f0\fs20\cf0 INCLUDEPICTURE "cid:[email protected]" \\* MERGEFORMATINET }}{\fldrslt{\*\shppict{\pict')
    # Test identity with True rather than equality: the integer 1 equals True
    # and would otherwise lose its value. '%s' formatting accepts non-string
    # values and leaves string values unchanged.
    properties = "".join(
        '\\' + prop + ('' if val is True else '%s' % (val,))
        for prop, val in image.properties.items())
    write(properties)
    write(' \n')
    image_data = binascii.hexlify(image.content[0])
    for piece in chunk(image_data):
        write(piece)
        write('\n')
    write(r'}}}}')

def chunk(data, size=200):
    """
    Yield consecutive slices of ``data``, each at most ``size`` items long.

    The last slice holds whatever remains and may be shorter. Empty data
    yields nothing.

    Raises ValueError (on first iteration) when ``size`` is not positive,
    since such a size could never advance through the data.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer, got %r" % (size,))
    for start in range(0, len(data), size):
        yield data[start:start + size]
3 changes: 3 additions & 0 deletions pyth/plugins/xhtml/css.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,6 @@ def is_super(self, node):
properties = self.get_properties(node)
return properties.get('vertical-align') == 'super'

def is_underline(self, node):
    """
    Return True when the node's ``text-decoration`` property is exactly
    ``'underline'``.
    """
    decoration = self.get_properties(node).get('text-decoration')
    return decoration == 'underline'
Loading