This repository has been archived by the owner on May 11, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
/
annotate.py
113 lines (99 loc) · 3.31 KB
/
annotate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import sys
from bs4 import BeautifulSoup
from PIL import Image
import matplotlib as mpl
from matplotlib import pyplot
import codecs
# Select matplotlib's 'TkAgg' backend for the figures created below.
# NOTE(review): pyplot has already been imported by the time this runs, and
# the visible code only writes its figure to a file via savefig -- confirm
# that an interactive Tk backend is actually required here.
mpl.use('TkAgg')
def makeBox(bbox):
    """Build a box dictionary from four bbox coordinates.

    Args:
        bbox: Indexable sequence whose first four items are the left, top,
            right and bottom edges, each convertible with ``int()``.

    Returns:
        A dict with the integer edges under ``'_left'``, ``'_top'``,
        ``'_right'`` and ``'_bottom'``, plus the derived ``'width'`` and
        ``'height'``.
    """
    # Parse each edge exactly once, in left/top/right/bottom order.
    x0, y0, x1, y1 = (int(bbox[idx]) for idx in range(4))
    box = {}
    box['_left'] = x0
    box['_top'] = y0
    box['_right'] = x1
    box['_bottom'] = y1
    box['width'] = x1 - x0
    box['height'] = y1 - y0
    return box
def getbbox(title):
    """Extract the bbox coordinates from an hOCR ``title`` attribute.

    The title is treated as a ``;``-separated list of properties, for example
    ``'bbox 10 20 110 70; x_wconf 93'``.

    Args:
        title: The raw title string. May be ``None`` or empty when the
            element carries no title attribute.

    Returns:
        The whitespace-separated tokens that follow the first ``bbox``
        property, as a list of strings, or ``None`` when the title is
        missing or contains no ``bbox`` property.
    """
    # An element without a title has no bbox; report that the same way as a
    # title lacking the property instead of failing on ``None.split``.
    if not title:
        return None
    for part in title.split(';'):
        prop = part.strip()
        if prop.startswith('bbox'):
            # Drop only the leading keyword, then split the coordinates.
            return prop[len('bbox'):].split()
    return None
def _collect_boxes(soup, tag, css_class):
    """Return a makeBox() dict for each ``tag``/``css_class`` element with a bbox."""
    boxes = []
    for element in soup.find_all(tag, css_class):
        title = element.get('title')
        coords = getbbox(title) if title else None
        # Elements without a usable bbox cannot be drawn, so skip them
        # rather than letting makeBox() fail on missing coordinates.
        if coords is None or len(coords) < 4:
            continue
        boxes.append(makeBox(coords))
    return boxes


def _draw_boxes(ax, boxes, linewidth, edgecolor):
    """Add one unfilled rectangle outline to ``ax`` for every box."""
    for box in boxes:
        ax.add_patch(mpl.patches.Rectangle(
            (box['_left'], box['_top']),
            box['width'],
            box['height'],
            fill=False,
            linewidth=linewidth,
            edgecolor=edgecolor,
        ))


def tess(infile, outfile):
    """Render the bounding boxes of a Tesseract hOCR file to an image.

    Pages, content areas, paragraphs, lines and words are outlined in that
    order, each level with its own colour and line width. The plot is sized
    to the first page's bbox, with the y axis inverted so the origin is at
    the top-left.

    Args:
        infile: Path of the UTF-8 encoded hOCR file to read.
        outfile: Path the rendered figure is saved to.

    Raises:
        ValueError: If the file contains no ``ocr_page`` element with a
            bbox, since the plot extent is taken from the first page.
    """
    with codecs.open(infile, "r", "utf-8") as hocr:
        soup = BeautifulSoup(hocr.read(), "html.parser")

    # (tag, class, line width, edge colour), listed in drawing order.
    layers = (
        ('div', 'ocr_page', 0.5, "#FF00FF"),
        ('div', 'ocr_carea', 0.5, "#0000FF"),
        ('p', 'ocr_par', 0.1, "#F0F0F0"),
        ('span', 'ocr_line', 0.1, "#FF0000"),
        ('span', 'ocrx_word', 0.1, "#000000"),
    )
    drawn = [
        (_collect_boxes(soup, tag, css_class), linewidth, edgecolor)
        for tag, css_class, linewidth, edgecolor in layers
    ]

    page_boxes = drawn[0][0]
    if not page_boxes:
        # Fail before any figure is created, with a message that names the
        # actual problem instead of a bare IndexError.
        raise ValueError(
            'No ocr_page element with a bbox found in %s' % infile
        )

    fig = pyplot.figure()
    try:
        ax = fig.add_subplot(111, aspect='equal')
        for boxes, linewidth, edgecolor in drawn:
            _draw_boxes(ax, boxes, linewidth, edgecolor)

        ax.set_xlim(0, page_boxes[0]['_right'])
        ax.set_ylim(0, page_boxes[0]['_bottom'])
        # hOCR coordinates grow downwards, so flip the y axis.
        ax.invert_yaxis()
        ax.axis('off')
        fig.savefig(outfile, dpi=400, bbox_inches='tight', pad_inches=0)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        pyplot.close(fig)
# Command-line entry point: annotate.py <input.hocr> <output image>.
# Guarded so that importing this module does not run the argument check.
if __name__ == '__main__':
    if len(sys.argv) == 3:
        tess(sys.argv[1], sys.argv[2])
    else:
        # sys.exit with a string writes it to stderr and exits with status 1,
        # so a usage error is visible to calling shells and scripts.
        sys.exit('Script requires two parameters: an input Tesseract HOCR file and an output file name and location')