-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpdf2db.py
76 lines (71 loc) · 2.17 KB
/
pdf2db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import multiprocessing, signal
import utils
import pdf
from database import db_session
from models import *
def process_pdfs(rootpath, timeout=5*60):
    """Walk through all directories beneath rootpath and extract text from any pdf files found.

    A Directory is stored for every directory visited. For each file whose
    name ends in '.pdf' and which is not already stored, the text is
    extracted in a child process and saved as a Document with one Page per
    extracted page. Other files are skipped.

    Args:
        rootpath: directory at which the walk starts.
        timeout: seconds to wait for the extraction of a single file before
            abandoning it (defaults to five minutes).

    Returns:
        A dict with two lists, 'timeouts' and 'errors'. Each entry is a
        {'dir': ..., 'file': ...} dict naming a file that was not stored.
    """
    timeouts = []
    errors = []
    for root, dirs, files in os.walk(rootpath):
        directory = Directory(root)
        db_session.add(directory)
        db_session.commit()
        print('Converting files in: %s' % (root))
        for name in files:
            if not name.endswith('.pdf'):
                print('Skipping: %s' % (name))
                continue
            try:
                if already_processed(name, root):
                    print('Already converted: %s' % (name))
                    continue
                print('Converting: %s' % (name))
                doc = Document(name)
                doc.directory_id = directory.id
                timed_out, pages = _extract_with_timeout(
                    os.path.join(root, name), timeout)
                if timed_out:
                    print('Processing timeout: %s' % name)
                    timeouts.append({'dir': root, 'file': name})
                    continue
                if not pages:
                    # extract_pages reports a failure by sending False; an
                    # empty page list is treated the same way.
                    print('Error processing: %s' % name)
                    errors.append({'dir': root, 'file': name})
                    continue
                for i, text in enumerate(pages):
                    doc.pages.append(Page(text, i))
                db_session.add(doc)
                db_session.commit()
            except Exception:
                # Exception rather than a bare except, so that Ctrl-C
                # (KeyboardInterrupt) still stops the run.
                # NOTE(review): db_session is used like a SQLAlchemy session
                # (add/commit); after a failed flush such a session refuses
                # further work until rolled back -- confirm against database.py.
                db_session.rollback()
                print('Error processing: %s' % name)
                errors.append({'dir': root, 'file': name})
        db_session.add(directory)
        db_session.commit()
    return {'timeouts': timeouts, 'errors': errors}


def _extract_with_timeout(path, timeout):
    """Run extract_pages(path, queue) in a child process, waiting at most timeout seconds.

    Returns:
        A (timed_out, pages) tuple. timed_out is True when the child was
        still running at the deadline and had to be terminated. pages is
        whatever the child put on the queue, or False when nothing arrived.
    """
    try:
        from queue import Empty  # Python 3
    except ImportError:
        from Queue import Empty  # Python 2

    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=extract_pages, args=(path, q))
    p.start()
    try:
        # Read the result BEFORE joining: a child that has put a large item
        # on a multiprocessing.Queue cannot exit until the item is consumed,
        # so joining first makes big documents look like timeouts.
        pages = q.get(timeout=timeout)
    except Empty:
        still_running = p.is_alive()
        if still_running:
            p.terminate()
        # Reap the child whether it was terminated or died on its own.
        p.join()
        return still_running, False
    p.join()
    return False, pages
def already_processed(name, directory):
    """Tell whether a document called name is already stored for directory.

    Fetches every Document whose filename equals name and returns True if
    any of them has a `directory.directory` value equal to directory;
    returns False otherwise.
    """
    same_name = Document.query.filter(Document.filename == name).all()
    return any(found.directory.directory == directory for found in same_name)
def extract_pages(filename, q):
    """Extract the pages of the pdf file filename and hand them over through q.

    Puts the value returned by pdf.get_pages(filename) on q. If anything
    raises along the way, puts False on q instead.
    """
    try:
        q.put(pdf.get_pages(filename))
    except:
        # Catch-all: whatever went wrong, still put a value on q.
        q.put(False)
if __name__ == '__main__':
    # Convert everything beneath utils.rootpath, then report what was missed.
    missed = process_pdfs(utils.rootpath)
    print(missed)
    for label, key in (('Timeouts: ', 'timeouts'), ('Errors: ', 'errors')):
        # '%s %d' keeps the separating space that a two-argument print
        # statement inserts between its arguments.
        print('%s %d' % (label, len(missed[key])))