This repository has been archived by the owner on May 11, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 30
/
annotate.py
98 lines (84 loc) · 2.8 KB
/
annotate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import sys
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import matplotlib.patches as patches
def makeBox(bbox):
    """Build a box dictionary from hOCR ``bbox`` values.

    ``bbox`` is a sequence whose first four items are the left, top, right
    and bottom coordinates, given as ints or integer strings (the form
    returned by ``getbbox``). Any further items are ignored.

    Returns a dict with the integer keys ``_left``, ``_top``, ``_right``,
    ``_bottom``, plus ``width`` (right - left) and ``height`` (bottom - top).

    Raises ``ValueError`` when ``bbox`` is ``None``, has fewer than four
    items, or contains a value ``int()`` cannot parse.
    """
    if bbox is None or len(bbox) < 4:
        # Fail with an explicit message instead of an opaque
        # "'NoneType' object is not subscriptable" / "index out of range".
        raise ValueError(
            'expected a bbox with four coordinates, got %r' % (bbox,)
        )

    # Convert each coordinate exactly once and reuse the results below.
    left, top, right, bottom = [int(value) for value in bbox[:4]]
    return {
        '_left': left,
        '_top': top,
        '_right': right,
        '_bottom': bottom,
        'width': right - left,
        'height': bottom - top,
    }
def getbbox(title):
    """Extract the ``bbox`` values from an hOCR ``title`` attribute string.

    ``title`` is treated as a ``;``-separated list of properties, for
    example ``'bbox 0 0 2550 3300; ppageno 0'``.

    Returns the whitespace-separated values that follow the first ``bbox``
    property as a list of strings (``['0', '0', '2550', '3300']`` for the
    example above). Returns ``None`` when ``title`` is ``None`` or empty, or
    when it contains no ``bbox`` property.
    """
    if not title:
        # A missing title attribute arrives here as None; there is nothing
        # to parse.
        return None

    for part in title.split(';'):
        fields = part.split()
        # Compare the complete property name so that a property which merely
        # begins with "bbox" is not mistaken for the bounding box.
        if fields and fields[0] == 'bbox':
            return fields[1:]
    return None
def _hocr_boxes(soup, tag, css_class):
    """Return a box dict for each ``tag`` element of class ``css_class`` that has a bbox."""
    boxes = []
    for element in soup.find_all(tag, css_class):
        title = element.get('title')
        # Elements without a title, or without a bbox in it, cannot be drawn;
        # skip them rather than aborting the whole rendering.
        bbox = getbbox(title) if title else None
        if bbox:
            boxes.append(makeBox(bbox))
    return boxes


def _outline_boxes(ax, boxes, linewidth, edgecolor):
    """Add one unfilled rectangle per box to the axes ``ax``."""
    for box in boxes:
        ax.add_patch(patches.Rectangle(
            (box['_left'], box['_top']),
            box['width'],
            box['height'],
            fill=False,
            linewidth=linewidth,
            edgecolor=edgecolor
        ))


def tess(infile, outfile):
    """Draw the layout boxes of a Tesseract hOCR file and save the picture.

    Reads ``infile``, collects the bounding boxes of its ``ocr_page``,
    ``ocr_carea``, ``ocr_par`` and ``ocrx_word`` elements, outlines them on a
    single matplotlib figure (one colour and line width per element kind)
    and writes that figure to ``outfile``.

    The plot limits are taken from the first page box, and the y axis is
    inverted so that y grows downwards.

    Raises ``ValueError`` when the file contains no ``ocr_page`` element with
    a bbox, since the plot extent cannot be determined without one.
    """
    with open(infile) as hocr:
        soup = BeautifulSoup(hocr.read(), "html.parser")

    # (tag, hOCR class, outline width, outline colour), drawn in this order.
    layer_specs = (
        ('div', 'ocr_page', 1, "#2b8cbe"),
        ('div', 'ocr_carea', 0.75, "#7bccc4"),
        ('p', 'ocr_par', 0.5, "#bae4bc"),
        ('span', 'ocrx_word', 0.1, "#000000"),
    )
    layers = [
        (_hocr_boxes(soup, tag, css_class), linewidth, edgecolor)
        for tag, css_class, linewidth, edgecolor in layer_specs
    ]

    page_boxes = layers[0][0]
    if not page_boxes:
        raise ValueError(
            'no ocr_page element with a bbox found in %s' % (infile,)
        )

    fig = plt.figure()
    try:
        ax = fig.add_subplot(111, aspect='equal')
        for boxes, linewidth, edgecolor in layers:
            _outline_boxes(ax, boxes, linewidth, edgecolor)

        ax.set_ylim(0, page_boxes[0]['_bottom'])
        ax.set_xlim(0, page_boxes[0]['_right'])
        ax.axis('off')
        ax.invert_yaxis()

        # The save call had been commented out, which left ``outfile`` unused
        # and the function without any output.
        fig.savefig(outfile, dpi=400, bbox_inches='tight', pad_inches=0)
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
def main(argv=None):
    """Run the command-line interface.

    ``argv`` is the full argument list including the program name; it
    defaults to ``sys.argv``. With exactly two arguments after the program
    name, ``tess`` is called with them (input hOCR file, output file).
    Otherwise a usage message is printed.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) == 3:
        tess(argv[1], argv[2])
    else:
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3; the bare print statement is a SyntaxError on 3.
        print('Script requires two parameters: an input Tesseract HOCR file and an output file name and location')


# Only act as a script when executed directly, so that importing this module
# to reuse its functions does not trigger the command-line check.
if __name__ == '__main__':
    main()