-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_to_text.py
47 lines (42 loc) · 1.46 KB
/
pdf_to_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#coding=utf8
'''
Created on 2017-1-3
@author: xuwei
@summary: Extract the text content of a PDF file into a text file using pdfminer.
'''
import threading
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
# Module-level lock: every call to Extractor holds it for the whole
# extraction, so concurrent callers are processed one at a time.
lockObject = threading.Lock()


def Extractor(sourcefile, outfile):
    """Extract the text of a PDF file and write it to a text file.

    The whole extraction runs while holding the module-level ``lockObject``.
    Any ``Exception`` raised while opening the files or parsing the PDF is
    caught and reported on standard output; it is never propagated to the
    caller.

    Args:
        sourcefile: Path of the PDF file to read.
        outfile: Path of the text file to create or overwrite. It is opened
            only after ``sourcefile`` has been opened successfully.

    Returns:
        None, both on success and on failure.
    """
    # ``with`` releases the lock on every exit path; a lock left held after an
    # error would block all later calls forever.
    with lockObject:
        try:
            # The context managers close both streams even when parsing
            # fails part-way through.
            with open(sourcefile, 'rb') as fp:
                with open(outfile, 'w') as outfp:
                    # Resource manager that stores shared resources;
                    # caching=False turns its cache off.
                    rsrcmgr = PDFResourceManager(caching=False)
                    # Device that renders the extracted text into outfp.
                    laparams = LAParams()
                    device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                                           laparams=laparams,
                                           imagewriter=None)
                    try:
                        # Interpreter that feeds each page to the device.
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        # NOTE(review): per the pdfminer docs an empty
                        # pagenos set and maxpages=0 select every page.
                        for page in PDFPage.get_pages(fp, pagenos=set(),
                                                      maxpages=0,
                                                      password='',
                                                      caching=False,
                                                      check_extractable=True):
                            # Bring the rotation into the 0-359 range.
                            page.rotate = page.rotate % 360
                            interpreter.process_page(page)
                    finally:
                        # Close the device before its output stream goes away.
                        device.close()
        except Exception as e:
            # Best-effort contract: report the failure and return normally.
            print("Exception:%s" % (e,))
if __name__ == "__main__":
    # Manual run: convert "80.pdf" from the current working directory into
    # "test.txt". Extractor has no return value, so nothing is kept.
    Extractor("80.pdf", 'test.txt')