euske · staccatosound · Sep 1, 2014 · Sep 2, 2014 · Sep 3, 2014 · Sep 3, 2014
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,18 @@
+*.class
+*.pyc
+*.pyo
+.svn
+_svn
+.pythoscope
+.ipynb_checkpoints
+.settings
+_update.bat
+docs/_build
+/Goulib.egg-info/
+/build/
+/dist/
+/pdfminer.six.egg-info/
+tests/*.xml
+tests/*.txt
+.idea/
+.tox/
diff --git a/.travis.yml b/.travis.yml
@@ -2,7 +2,12 @@ language: python
 python:
   - "2.6"
   - "2.7"
+  - "3.4"
+  - "3.5"
+  - "3.6"
 install:
-  - pip install pycrypto
+  - pip install six
+  - pip install pycryptodome
+  - pip install chardet
 script:
-  - make test
+  nosetests --nologcapture
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,9 +1,11 @@
 include Makefile
 include LICENSE
 include *.txt
+include *.md
 include *.py
 graft cmaprsrc
 graft docs
 graft pdfminer
 graft samples
 graft tools
+global-exclude *.pyc
diff --git a/Makefile b/Makefile
@@ -3,7 +3,7 @@
 
 PACKAGE=pdfminer
 
-PYTHON=python2
+PYTHON=python
 GIT=git
 RM=rm -f
 CP=cp -f
@@ -55,12 +55,5 @@ $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
 		$(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt
 
 test: cmap
-	$(PYTHON) -m doctest \
-		pdfminer/arcfour.py \
-		pdfminer/lzw.py \
-		pdfminer/ascii85.py \
-		pdfminer/runlength.py \
-		pdfminer/rijndael.py
-	$(PYTHON) -m pdfminer.ccitt
-	$(PYTHON) -m pdfminer.psparser
+	nosetests
 	cd samples && $(MAKE) test
diff --git a/README.md b/README.md
@@ -1,20 +1,21 @@
-PDFMiner
-========
+PDFMiner.six
+============
 
-[![Build Status](https://travis-ci.org/euske/pdfminer.svg?branch=master)](https://travis-ci.org/euske/pdfminer)
+PDFMiner.six is a fork of PDFMiner using six for Python 2+3 compatibility.
+
+[![Build Status](https://travis-ci.org/pdfminer/pdfminer.six.svg?branch=master)](https://travis-ci.org/pdfminer/pdfminer.six) [![PyPI version](https://img.shields.io/pypi/v/pdfminer.six.svg)](https://pypi.python.org/pypi/pdfminer.six/)
 
 PDFMiner is a tool for extracting information from PDF documents.
-Unlike other PDF-related tools, it focuses entirely on getting 
+Unlike other PDF-related tools, it focuses entirely on getting
 and analyzing text data. PDFMiner allows one to obtain
-the exact location of text in a page, as well as 
+the exact location of text in a page, as well as
 other information such as fonts or lines.
 It includes a PDF converter that can transform PDF files
 into other text formats (such as HTML). It has an extensible
 PDF parser that can be used for other purposes than text analysis.
 
- * Webpage: https://euske.github.io/pdfminer/
- * Download (PyPI): https://pypi.python.org/pypi/pdfminer/
- * Demo WebApp: http://pdf2html.tabesugi.net:8080/
+ * Webpage: https://github.com/pdfminer/
+ * Download (PyPI): https://pypi.python.org/pypi/pdfminer.six/
 
 
 Features
@@ -34,42 +35,16 @@ Features
 How to Install
 --------------
 
- * Install Python 2.6 or newer. (**For Python 3 support have a look at [pdfminer.six](https://github.com/goulu/pdfminer)**).
- * Download the source code.
- * Unpack it.
- * Run `setup.py`:
+ * Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six)
+ * Install pdfminer.six using pip:
 
-    $ python setup.py install
+    $ pip install pdfminer.six
 
- * Do the following test:
+ * Run the following test:
 
     $ pdf2txt.py samples/simple1.pdf
 
 
-For CJK Languages
------------------
-
-In order to process CJK languages, do the following before
-running setup.py install:
-
-    $ make cmap
-    python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt
-    reading 'cmaprsrc/cid2code_Adobe_CNS1.txt'...
-    writing 'CNS1_H.py'...
-    ...
-    $ python setup.py install
-
-On Windows machines which don't have `make` command, 
-paste the following commands on a command line prompt:
-
-    mkdir pdfminer\cmap
-    python tools\conv_cmap.py -c B5=cp950 -c UniCNS-UTF8=utf-8 pdfminer\cmap Adobe-CNS1 cmaprsrc\cid2code_Adobe_CNS1.txt
-    python tools\conv_cmap.py -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 pdfminer\cmap Adobe-GB1 cmaprsrc\cid2code_Adobe_GB1.txt
-    python tools\conv_cmap.py -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 pdfminer\cmap Adobe-Japan1 cmaprsrc\cid2code_Adobe_Japan1.txt
-    python tools\conv_cmap.py -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 pdfminer\cmap Adobe-Korea1 cmaprsrc\cid2code_Adobe_Korea1.txt
-    python setup.py install
-
-
 Command Line Tools
 ------------------
 
@@ -91,53 +66,26 @@ You cannot extract any text from a PDF document which does not have extraction p
 
 **dumppdf.py**
 
-dumppdf.py dumps the internal contents of a PDF file in pseudo-XML format. 
+dumppdf.py dumps the internal contents of a PDF file in pseudo-XML format.
 This program is primarily for debugging purposes,
 but it's also possible to extract some meaningful contents (e.g. images).
 
 (For details, refer to the html document.)
 
 
-API Changes
------------
-
-As of November 2013, there were a few changes made to the PDFMiner API
-prior to October 2013. This is the result of code restructuring.  Here
-is a list of the changes:
-
- * PDFDocument class is moved to pdfdocument.py.
- * PDFDocument class now takes a PDFParser object as an argument.
-   PDFDocument.set_parser() and PDFParser.set_document() is removed.
- * PDFPage class is moved to pdfpage.py
- * process_pdf function is implemented as a class method PDFPage.get_pages.
-
-
 TODO
 ----
 
- * Replace STRICT variable with something better.
- * Use logging module instead of sys.stderr.
- * Proper test cases.
  * PEP-8 and PEP-257 conformance.
  * Better documentation.
- * Crypt stream filter support.
-
-
-Related Projects
-----------------
-
- * <a href="http://pybrary.net/pyPdf/">pyPdf</a>
- * <a href="http://www.foolabs.com/xpdf/">xpdf</a>
- * <a href="http://pdfbox.apache.org/">pdfbox</a>
- * <a href="http://mupdf.com/">mupdf</a>
 
 
 Terms and Conditions
 --------------------
 
 (This is so-called MIT/X License)
 
-Copyright (c) 2004-2016  Yusuke Shinyama <yusuke at shinyama dot jp>
+Copyright (c) 2004-2014  Yusuke Shinyama <yusuke at cs dot nyu dot edu>
 
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation

diff --git a/docs/index.html b/docs/index.html
@@ -9,7 +9,7 @@
 
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Mon Sep 26 09:04:15 UTC 2016
+Last Modified: Wed Jun 25 10:27:52 UTC 2014
 <!-- hhmts end -->
 </div>
 
@@ -82,14 +82,14 @@ <h3>Features</h3>
 <h3><a name="download">Download</a></h3>
 <p>
 <strong>Source distribution:</strong><br>
-<a href="http://pypi.python.org/pypi/pdfminer/">
-http://pypi.python.org/pypi/pdfminer/
+<a href="http://pypi.python.org/pypi/pdfminer_six/">
+http://pypi.python.org/pypi/pdfminer_six/
 </a>
 
 <P>
 <strong>github:</strong><br>
-<a href="https://github.com/euske/pdfminer/">
-https://github.com/euske/pdfminer/
+<a href="https://github.com/goulu/pdfminer/">
+https://github.com/goulu/pdfminer/
 </a>
 
 <h3><a name="wheretoask">Where to Ask</a></h3>
@@ -100,11 +100,9 @@ <h3><a name="wheretoask">Where to Ask</a></h3>
 http://groups.google.com/group/pdfminer-users/
 </a>
 
-
 <h2><a name="install">How to Install</a></h2>
 <ol>
 <li> Install <a href="http://www.python.org/download/">Python</a> 2.6 or newer.
-     (<font color=red><strong>Python 3 is not supported.</strong></font>)
 <li> Download the <a href="#source">PDFMiner source</a>.
 <li> Unpack it.
 <li> Run <code>setup.py</code> to install:<br>
@@ -268,7 +266,6 @@ <h4>Options</h4>
 <dd> Specifies how much a horizontal and vertical position of a text matters
 when determining a text order. The value should be within the range of 
 -1.0 (only horizontal position matters) to +1.0 (only vertical position matters).
-When this value is out of the range (e.g. +2), a simpler ordering rule is used.
 The default value is 0.5.
 <p>
 <dt> <code>-C</code> 
@@ -373,82 +370,10 @@ <h4>Options</h4>
 <dd> Increases the debug level.
 </dl>
 
-<h2><a name="changes">Changes</a></h2>
+<h2><a name="changes">Changes:</a></h2>
 <ul>
-<li> 2014/03/28: Further bugfixes.
-<li> 2014/03/24: Bugfixes and improvements for fauly PDFs.<br>
-API changes:
- <ul>
- <li> <code>PDFDocument.initialize()</code> method is removed and no longer needed.
-  A password is given as an argument of a PDFDocument constructor.
- </ul>
-<li> 2013/11/13: Bugfixes and minor improvements.<br>
-As of November 2013, there were a few changes made to the PDFMiner API
-prior to October 2013. This is the result of code restructuring.  Here
-is a list of the changes:
- <ul>
- <li> <code>PDFDocument</code> class is moved to <code>pdfdocument.py</code>.
- <li> <code>PDFDocument</code> class now takes a <code>PDFParser</code> object as an argument.
- <li> <code>PDFDocument.set_parser()</code> and <code>PDFParser.set_document()</code> is removed.
- <li> <code>PDFPage</code> class is moved to <code>pdfpage.py</code>.
- <li> <code>process_pdf</code> function is implemented as <code>PDFPage.get_pages</code>.
-</ul>
-<li> 2013/10/22: Sudden resurge of interests. API changes.
-Incorporated a lot of patches and robust handling of broken PDFs.
-<li> 2011/05/15: Speed improvements for layout analysis.
-<li> 2011/05/15: API changes. <code>LTText.get_text()</code> is added.
-<li> 2011/04/20: API changes. LTPolygon class was renamed as LTCurve.
-<li> 2011/04/20: LTLine now represents horizontal/vertical lines only. Thanks to Koji Nakagawa.
-<li> 2011/03/07: Documentation improvements by Jakub Wilk. Memory usage patch by Jonathan Hunt.
-<li> 2011/02/27: Bugfixes and layout analysis improvements. Thanks to fujimoto.report.
-<li> 2010/12/26: A couple of bugfixes and minor improvements. Thanks to Kevin Brubeck Unhammer and Daniel Gerber.
-<li> 2010/10/17: A couple of bugfixes and minor improvements. Thanks to standardabweichung and Alastair Irving.
-<li> 2010/09/07: A minor bugfix. Thanks to Alexander Garden.
-<li> 2010/08/29: A couple of bugfixes. Thanks to Sahan Malagi, pk, and Humberto Pereira.
-<li> 2010/07/06: Minor bugfixes. Thanks to Federico Brega.
-<li> 2010/06/13: Bugfixes and improvements on CMap data compression. Thanks to Jakub Wilk.
-<li> 2010/04/24: Bugfixes and improvements on TOC extraction. Thanks to Jose Maria.
-<li> 2010/03/26: Bugfixes. Thanks to Brian Berry and Lubos Pintes.
-<li> 2010/03/22: Improved layout analysis. Added regression tests.
-<li> 2010/03/12: A couple of bugfixes. Thanks to Sean Manefield.
-<li> 2010/02/27: Changed the way of internal layout handling. (LTTextItem -&gt; LTChar)
-<li> 2010/02/15: Several bugfixes. Thanks to Sean.
-<li> 2010/02/13: Bugfix and enhancement. Thanks to Andr&eacute; Auzi.
-<li> 2010/02/07: Several bugfixes. Thanks to Hiroshi Manabe.
-<li> 2010/01/31: JPEG image extraction supported. Page rotation bug fixed. 
-<li> 2010/01/04: Python 2.6 warning removal. More doctest conversion.
-<li> 2010/01/01: CMap bug fix. Thanks to Winfried Plappert.
-<li> 2009/12/24: RunLengthDecode filter added. Thanks to Troy Bollinger.
-<li> 2009/12/20: Experimental polygon shape extraction added. Thanks to Yusuf Dewaswala for reporting.
-<li> 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for open-sourcing them.
-<li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
-<li> 2009/10/31: SGML output format is changed and renamed as XML.
-<li> 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation.
-<li> 2009/10/04: Another matrix operation bug fixed. Thanks to Vitaly Sedelnik.
-<li> 2009/09/12: Fixed rectangle handling. Able to extract image boundaries.
-<li> 2009/08/30: Fixed page rotation handling.
-<li> 2009/08/26: Fixed zlib decoding bug. Thanks to Shon Urbas.
-<li> 2009/08/24: Fixed a bug in character placing. Thanks to Pawan Jain.
-<li> 2009/07/21: Improvement in layout analysis.
-<li> 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes.
-<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
-<li> 2009/03/30: Text output mode added.
-<li> 2009/03/25: Encoding problems fixed. Word splitting option added. 
-<li> 2009/02/28: Robust handling of corrupted PDFs. Thanks to Troy Bollinger.
-<li> 2009/02/01: Various bugfixes. Thanks to Hiroshi Manabe.
-<li> 2009/01/17: Handling a trailer correctly that contains both /XrefStm and /Prev entries.
-<li> 2009/01/10: Handling Type3 font metrics correctly.
-<li> 2008/12/28: Better handling of word spacing. Thanks to Christian Nentwich.
-<li> 2008/09/06: A sample pdf2html webapp added.
-<li> 2008/08/30: ASCII85 encoding filter support.
-<li> 2008/07/27: Tagged contents extraction support.
-<li> 2008/07/10: Outline (TOC) extraction support.
-<li> 2008/06/29: HTML output added. Reorganized the directory structure.
-<li> 2008/04/29: Bugfix for Win32. Thanks to Chris Clark.
-<li> 2008/04/27: Basic encryption and LZW decoding support added.
-<li> 2008/01/07: Several bugfixes. Thanks to Nick Fabry for his vast contribution.
-<li> 2007/12/31: Initial release.
-<li> 2004/12/24: Start writing the code out of boredom...
+<li> 2014/09/15: published on PyPI</li>
+<li> 2014/09/10: pdfminer_six was forked from pdfminer since Yusuke didn't want to merge and pdfminer3k is outdated</li>
 </ul>
 
 <h2><a name="todo">TODO</a></h2>

diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py
@@ -1,5 +1,16 @@
-#!/usr/bin/env python
-__version__ = '20140328'
+# -*- coding: utf-8 -*-
+"""
+Fork of PDFMiner using six for Python 2+3 compatibility
+
+PDFMiner is a tool for extracting information from PDF documents.
+Unlike other PDF-related tools, it focuses entirely on getting and analyzing
+text data. PDFMiner allows one to obtain the exact location of text in a page,
+as well as other information such as fonts or lines.
+It includes a PDF converter that can transform PDF files into other text
+formats (such as HTML). It has an extensible PDF parser that can be used for
+other purposes than text analysis.
+"""
+__version__ = '20170720'
 
 if __name__ == '__main__':
-    print (__version__)
+    print(__version__)