# setup.py for python-pdfextract (forked from misja/python-boilerpipe)
import tarfile
from fnmatch import fnmatch
import shutil
from os.path import basename, exists, dirname, abspath, join
import os
import subprocess
from distutils.core import setup
#from setuptools import setup
# ``urlretrieve`` is importable from ``urllib`` on Python 2 and from
# ``urllib.request`` on Python 3.  Only a failed import should trigger the
# fallback, so catch ImportError rather than using a bare ``except``.
try:
    from urllib import urlretrieve
except ImportError:
    from urllib.request import urlretrieve

__version__ = '3.0.0'

# Absolute path of the package data directory, resolved relative to this file.
DATAPATH = join(abspath(dirname(__file__)), 'src/pdfextract/data')
def download_or_compile_jars(datapath):
    """Make sure ``PDFExtract.jar`` and ``PDFExtract.json`` exist in *datapath*.

    When both files are already present nothing is done.  Otherwise the
    pdf-extract sources are cloned into ``<datapath>/pdf-extract`` (unless
    that directory already exists), the ``cld3-Java`` jar is built with
    ``ant jar`` and registered with ``mvn install:install-file``, the
    checkout and its submodules are updated, ``mvn package`` is run, and the
    two resulting artifacts are moved into *datapath*.

    The external tools ``git``, ``ant`` and ``mvn`` are invoked by name.

    Args:
        datapath: Directory that should end up containing the two artifacts.

    Raises:
        subprocess.CalledProcessError: If an external command exits with a
            non-zero status.
        OSError: If an external command cannot be started.  Errors raised
            by ``shutil.move`` also propagate unchanged.
    """
    jar_target = join(datapath, "PDFExtract.jar")
    json_target = join(datapath, "PDFExtract.json")
    if exists(jar_target) and exists(json_target):
        return

    repo_dir = join(datapath, "pdf-extract")
    cld3_dir = join(repo_dir, "cld3-Java")

    if not exists(repo_dir):
        subprocess.check_call([
            "git", "clone", "https://github.com/bitextor/pdf-extract.git",
            "--recursive", repo_dir,
        ])

    # Each tool is run with ``cwd=`` instead of os.chdir(): the working
    # directory of this process is never touched, so a failing command cannot
    # leave the caller stranded in another directory, and a relative
    # *datapath* keeps resolving against the original working directory.
    subprocess.check_call(["ant", "jar"], cwd=cld3_dir)
    subprocess.check_call(
        [
            "mvn", "install:install-file", "-Dfile=cld3-java.jar",
            "-DgroupId=cld3-java", "-DartifactId=cld3-java",
            "-Dversion=1.0", "-Dpackaging=jar",
        ],
        cwd=cld3_dir,
    )

    # Bring an already existing checkout up to date before packaging.
    subprocess.check_call(["git", "pull"], cwd=repo_dir)
    subprocess.check_call(
        ["git", "submodule", "update", "--init", "--recursive"],
        cwd=repo_dir,
    )
    subprocess.check_call(["mvn", "package"], cwd=repo_dir)

    build_dir = join(repo_dir, "target")
    shutil.move(join(build_dir, "PDFExtract-2.0.jar"), jar_target)
    shutil.move(join(build_dir, "PDFExtract.json"), json_target)
# Fetch or build the PDFExtract artifacts first: they are listed in
# ``package_data`` below.
download_or_compile_jars(datapath=DATAPATH)

setup(
    # --- identity -----------------------------------------------------------
    name='python-pdfextract',
    version=__version__,
    description='Python interface to pdf-extract, HTML Extraction from PDF pages',
    keywords='pdfextract',
    license='Apache 2.0',
    url='https://github.com/bitextor/python-pdfextract/',

    # --- people -------------------------------------------------------------
    author='Misja Hoebe, Leopoldo Pla',
    author_email='[email protected], [email protected]',
    maintainer='Matthew Russell, Leopoldo Pla',
    maintainer_email='[email protected], [email protected]',

    # --- layout: packages live under src/ -----------------------------------
    package_dir={'': 'src'},
    packages=['pdfextract', 'pdfextract.extract'],
    package_data={
        'pdfextract': [
            'data/PDFExtract.jar',
            'data/PDFExtract.json',
        ],
    },

    # --- runtime dependencies -----------------------------------------------
    # NOTE(review): ``setup`` is imported from distutils.core in this file;
    # confirm that ``install_requires`` is honoured by that implementation.
    install_requires=[
        'JPype1',
        'chardet',
    ],

    # --- trove classifiers --------------------------------------------------
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Environment :: Console',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: Apache Software License',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 2.7',
        'Natural Language :: English',
    ],
)