brendonh · prechelt · Jun 13, 2015 · Jun 28, 2015 · Jul 11, 2015 · Jul 11, 2015
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1 @@
+*.rtf eol=crlf
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
 *~
+*.bak
 *.py[co]
-*.egg-info
+*.egg-info
+tests/currentoutput/
diff --git a/README b/README
diff --git a/README.md b/README.md
@@ -0,0 +1,107 @@
+pyth3 - Python text markup and conversion
+=========================================
+
+Pyth is intended to make it easy to convert marked-up text between different common formats.
+This is a (rather incomplete so far) port of pyth 0.6.0 to Python 3.
+
+*Marked-up text* means text which has:
+
+* Paragraphs
+* Headings
+* Bold, italic, and underlined text
+* Hyperlinks
+* Bullet lists
+* Simple tables
+* Very little else
+
+
+Formats with (widely varying) degrees of support:
+
+* Plain text
+* XHTML
+* RTF (Rich Text Format)
+* PDF (output only)
+
+
+Design principles/goals
+=======================
+
+* Ignore unsupported information in input formats (e.g. page layout)
+* Ignore font issues -- output in a single font.
+* Ignore specific text sizes, but maintain italics, boldface, subscript/superscript
+* Have no dependencies unless they are written in Python and actually work
+* Make it easy to add support for new formats, by using an architecture based on *plugins* and *adapters*.
+
+
+
+Examples
+========
+
+See directory `examples`.
+
+
+
+Python 3 migration
+==================
+
+The code was originally written for Python 2.
+It has been partially(!) upgraded to Python 3 compatibility (starting via 'modernize').
+This does not mean it will actually work!
+
+pyth.plugins.rtf15.reader has been debugged and now appears to work correctly.
+pyth.plugins.xhtml.writer has been debugged and now appears to work correctly.
+pyth.plugins.plaintext.writer has been debugged and now appears to work correctly.
+Everything else is unknown (or definitely broken on Python 3: even many
+of the tests fail).
+See directory py3migration for a bit more detail.
+(If you find something is broken on Python 2 that worked before, please
+either fix it or simply stick to pyth version 0.6.0.)
+
+
+Limitations
+===========
+
+pyth.plugins.rtf15.reader:
+- bulleted or enumerated items will be returned
+  as plain paragraphs (no indentation, no bullets).
+- cannot cope with Symbol font correctly:
+  - from MS Word: lower-coderange characters (greek mostly) work
+  - from MS Word: higher-coderange characters are missing, because
+    Word encodes them in a horribly complicated manner not supported
+    by pyth currently
+  - from Wordpad: lower- and higher-coderange characters come out in
+    the wrong encoding (ANSI, I think)
+
+pyth.plugins.xhtml.writer:
+- very limited functionality
+
+pyth.plugins.plaintext.writer:
+- very very limited functionality
+
+Others: 
+- will not work on Python 3 without some porting love-and-care
+
+
+Tests
+=====
+
+Don't try to run them all, it's frustrating.
+`py.test -v test_readrtf15.py` is a good way to run the least frustrating 
+subset of them.
+It is normal that most others will fail on Python 3.
+`test_readrtf15.py` generates test cases dynamically based on
+existing input files in `tests/rtfs` and
+existing reference output files in `tests/rtf-as-html`.
+The empty or missing output files indicate where functionality is missing,
+which nicely indicates possible places to jump in if you want to help.
+
+
+Dependencies
+============
+
+Only the two most important dependencies
+are actually declared in `setup.py`; the others are large and
+are required only by pyth components not yet ported to Python 3.
+The undeclared dependencies are:
+- `reportlab` for PDFWriter
+- `docutils` for LatexWriter 
diff --git a/examples/reading/rtf15.py b/examples/reading/rtf15.py
@@ -1,17 +1,25 @@
+from __future__ import absolute_import
+from __future__ import print_function
 import sys
 import os.path
 
 from pyth.plugins.rtf15.reader import Rtf15Reader
-from pyth.plugins.xhtml.writer import XHTMLWriter
+from pyth.plugins.xhtml.writer import XHTMLWriter, write_html_file
 
+numargs = len(sys.argv) - 1
 
-if len(sys.argv) > 1:
-    filename = sys.argv[1]
+if numargs not in [1, 2]:
+    print("usage: rtf15 inputfile.rtf [outputdir]")
 else:
-    filename = os.path.normpath(os.path.join(
-        os.path.dirname(__file__), 
-        '../../tests/rtfs/sample.rtf'))
-
-doc = Rtf15Reader.read(open(filename, "rb"))
-
-print XHTMLWriter.write(doc, pretty=True).read()
+    inputfile = sys.argv[1]
+    doc = Rtf15Reader.read(open(inputfile, "rb"))
+    the_output = XHTMLWriter.write(doc, pretty=True).read()
+    if numargs == 1:
+        print("<!-- ##### RTF file" + inputfile + "as XHTML: -->")
+        print(the_output)
+    else:
+        basename = os.path.basename(inputfile)
+        outputdir = sys.argv[2]
+        outputfile = os.path.join(outputdir,
+                                  os.path.splitext(basename)[0] + ".html")
+        write_html_file(outputfile, the_output, print_msg=True)
diff --git a/examples/reading/sampleWithImage.py b/examples/reading/sampleWithImage.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import
+from __future__ import print_function
 from pyth.plugins.rtf15.reader import Rtf15Reader
 import sys
 
@@ -8,4 +10,4 @@
 
 doc = Rtf15Reader.read(open(filename, "rb"))
 
-print [x.content for x in doc.content]
+print([x.content for x in doc.content])
diff --git a/examples/reading/xhtml.py b/examples/reading/xhtml.py
@@ -1,10 +1,12 @@
+from __future__ import absolute_import
+from __future__ import print_function
 # -*- coding: utf-8 -*-
 
 from pyth.plugins.xhtml.reader import XHTMLReader
 from pyth.plugins.xhtml.writer import XHTMLWriter
 import xhtml
 
-from cStringIO import StringIO
+from six import StringIO
 
 # A simple xhtml document with limited features.
 content = StringIO(r"""
@@ -49,4 +51,4 @@
     # Parse the document and then reconstruct it using the xhtml
     # writer.
     doc = XHTMLReader.read(content, css)
-    print XHTMLWriter.write(doc).getvalue()
+    print(XHTMLWriter.write(doc).getvalue())
diff --git a/examples/writing/latex.py b/examples/writing/latex.py
@@ -1,6 +1,8 @@
+from __future__ import absolute_import
+from __future__ import print_function
 from pyth.plugins.latex.writer import LatexWriter
 import pythonDoc
 
 if __name__ == "__main__":
     doc = pythonDoc.buildDoc()
-    print LatexWriter.write(doc).getvalue()
+    print(LatexWriter.write(doc).getvalue())
diff --git a/examples/writing/pdf.py b/examples/writing/pdf.py
@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 # -*- coding: utf-8 -*-
 
 from pyth.plugins.rtf15.reader import Rtf15Reader

diff --git a/examples/writing/plaintext.py b/examples/writing/plaintext.py
@@ -1,6 +1,8 @@
+from __future__ import absolute_import
+from __future__ import print_function
 from pyth.plugins.plaintext.writer import PlaintextWriter
 import pythonDoc
 
 doc = pythonDoc.buildDoc()
 
-print PlaintextWriter.write(doc).getvalue()
+print(PlaintextWriter.write(doc).getvalue())
diff --git a/examples/writing/pythonDoc.py b/examples/writing/pythonDoc.py
@@ -1,6 +1,8 @@
+from __future__ import absolute_import
 # -*- coding: utf-8 -*-
 
 from pyth.plugins.python.reader import *
+import six
 
 def buildDoc():
     return PythonReader.read((        
@@ -9,7 +11,7 @@ def buildDoc():
        u", hee hee hee! ", T(url=u'http://www.google.com') [ u"This seems to work" ]
       ],
       L [
-       [unicode(word) for word in ("One", "Two", "Three", "Four")]
+       [six.text_type(word) for word in ("One", "Two", "Three", "Four")]
       ],
       L [
         u"Introduction",

diff --git a/examples/writing/rst.py b/examples/writing/rst.py
@@ -1,6 +1,8 @@
+from __future__ import absolute_import
+from __future__ import print_function
 from pyth.plugins.rst.writer import RSTWriter
 import pythonDoc
 
 if __name__ == "__main__":
     doc = pythonDoc.buildDoc()
-    print RSTWriter.write(doc).getvalue()
+    print(RSTWriter.write(doc).getvalue())
diff --git a/examples/writing/rtf15.py b/examples/writing/rtf15.py
@@ -1,6 +1,8 @@
+from __future__ import absolute_import
+from __future__ import print_function
 from pyth.plugins.rtf15.writer import Rtf15Writer
 import pythonDoc
 
 doc = pythonDoc.buildDoc()
 
-print Rtf15Writer.write(doc).getvalue()
+print(Rtf15Writer.write(doc).getvalue())
diff --git a/examples/writing/xhtml.py b/examples/writing/xhtml.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import
+from __future__ import print_function
 from pyth.plugins.xhtml.writer import XHTMLWriter
 import pythonDoc
 
@@ -17,4 +19,4 @@
 
 if __name__ == "__main__":
     doc = pythonDoc.buildDoc()
-    print docTemplate % XHTMLWriter.write(doc, pretty=True).getvalue()
+    print(docTemplate % XHTMLWriter.write(doc, pretty=True).getvalue())
diff --git a/py3migration/STATUS.txt b/py3migration/STATUS.txt
@@ -0,0 +1,39 @@
+as of 2015-06-28:
+
+I have made the code nearly python2/python3-duocompatible by calling
+python-modernize.
+Committed.
+
+I have inserted requires = ['six'] into setup.py.
+Not committed.
+
+I have then repaired many of the bytes-vs-str issues in 
+pyth\plugins\rtf15\reader.py.
+Ditto for pyth\plugins\xhtml\writer.py.
+The former in particular was tricky because most strings have to be handled
+as bytestrings -- but not all of them.
+See  http://pythonhosted.org/six/
+
+These two now appear to work correctly for simple RTF files (without
+images, tables, headers etc).
+Complex files remain to be tested.
+
+I have established a set of system-level test cases,
+with various input files with the relevant RTF features
+(paragraphs, line breaks, page breaks, various characters, fonts,
+ bold, italics, underline, hyperlink)
+and coming from MS Word, Wordpad, OpenOffice.
+They are handled correctly (as per comparison with how
+MS Word 2013 shows them) with one exception.
+
+TO DO:
+
+- For tests/rtfs/zh-cn, the conversion produces some additional
+text that is not shown in MS Word 2013.
+The RTF is very complicated, so I am not sure whether this is
+a defect or maybe the RTF is incorrect (but even then...).
+
+- Introduce proper handling of itemized lists (well, that is a
+new feature actually).
+
+- Debug the other plugins.