-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocr_parser.py
70 lines (63 loc) · 1.96 KB
/
ocr_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""This file is the main point
of entry for OCR parsing
"""
# Built-in modules
import base64
import io
import os
import sys

# 3rd Party Modules
import pytesseract
from PIL import Image
from wand.color import Color
from wand.image import Image as WandImage

# Custom Modules
import parser
def parse_images(images):
    """
    Runs OCR on every image and hands the structured result to the parser.

    Each image is passed to ``pytesseract.image_to_boxes``. Every usable line
    of that output is split on single spaces and stored as a dict with the
    keys ``contents`` (first field), ``x`` and ``y`` (second and third fields
    converted to int). One list of such dicts is built per image, and the list
    of per-image lists is given to ``parser.parse_tesseract``.

    Input Type: list of Python Image objects
    Output Type: whatever ``parser.parse_tesseract`` returns (described by the
                 original author as a string that is csv compliant)
    Raises: ValueError if the second or third field of a line is not an
            integer.
    """
    pages = []
    for page_number, image in enumerate(images, start=1):
        print('Running OCR on image {0}'.format(page_number))
        raw_boxes = pytesseract.image_to_boxes(image)
        boxes = []
        # Pytesseract output is a big string so we have to break and parse out
        for line in raw_boxes.split('\n'):
            fields = line.split(' ')
            # An empty OCR result or a trailing newline produces a blank
            # entry after the split; skip anything too short to carry the
            # three fields read below instead of failing with IndexError.
            if len(fields) < 3:
                continue
            boxes.append({
                'contents': fields[0],
                'x': int(fields[1]),
                'y': int(fields[2])
            })
        pages.append(boxes)
    return parser.parse_tesseract(pages)
def parse(file_name):
    """
    Runs the OCR pipeline on a PDF or an image file.

    A name ending in ``.pdf`` (any letter case) is opened with Wand at a
    resolution of 300 and every page is converted to a PIL image. Any other
    name is assumed to be an image file and opened directly with PIL. The
    resulting images are passed to ``parse_images``; its result is printed,
    followed by a completion message, and then returned.

    Input Type: string path of the file to process
    Output Type: the value returned by ``parse_images``
    """
    if file_name.lower().endswith('.pdf'):
        pil_pages = []
        with WandImage(filename=file_name, resolution=300) as pdf:
            for page in pdf.sequence:
                # Render each page to PNG bytes in memory. This avoids
                # writing a fixed-name scratch file into the working
                # directory (which could overwrite a user's file) and avoids
                # deleting a file that PIL still holds open.
                with WandImage(image=page) as page_image:
                    png_bytes = page_image.make_blob('png')
                pil_pages.append(Image.open(io.BytesIO(png_bytes)))
    else:
        # Otherwise we assume it's an actual image file
        pil_pages = [Image.open(file_name)]
    csv_data = parse_images(pil_pages)
    print(csv_data)
    print('Done. Cheers!')
    return csv_data
if __name__ == '__main__':
    # Script entry point: the first command-line argument is the file to
    # run OCR on.
    try:
        file_name = sys.argv[1]
    except IndexError:
        # Only a missing argument is expected here, so catch just that and
        # exit with a non-zero status so callers can detect the usage error.
        print('Sorry buddy, but you need to provide a filename.')
        sys.exit(1)
    parse(file_name)