forked from ocropus/hocr-tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhocr-extract-g1000
executable file
·164 lines (135 loc) · 4.65 KB
/
hocr-extract-g1000
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/python
# extract lines from Google 1000 book sample
import string,re,sys,os,shutil,glob
from xml.sax import saxexts, saxlib, saxutils
from PIL import Image
usage = """
... hocr image_pattern output_prefix
Process Google 1000 books volumes and prepares line or word images
for alignment using OCRopus.
Run ocroscript align-... Volume_0000/0000/0000.{png,txt}
Arguments:
hocr: hocr source file
image_pattern: either a glob pattern that results in a list
of image files in order, or @filename for a file containing
a list of image files in order; DON'T FORGET TO QUOTE THIS
output_pattern: output images are of the form
output_pattern%(pageno,lineno)
Environment Variables:
element="ocr_line": which element to extract; ocrx_word and
ocr_cinfo are also useful
regex=".": the text for any transcription must match this pattern
dict=None: a dictionary; if provided, all the words in any line
that's output by the program must occur in the dictionary
min_len=20: minimum length of text for which lines are output
max_len=50: maximum length of text for which lines are output
max_lines=1000000: maximum number of lines output
pad=2: pad the bounding box by this many pixels prior to extraction
output_format=png: format for line image files
"""
if len(sys.argv)!=4:
sys.stderr.write(usage)
print "args:",sys.argv
sys.exit(1)
exe,hocr,image_pattern,output_pattern = sys.argv
if image_pattern[0]=="@":
image_list = open(image_pattern[1:]).readlines()
image_list = [s[:-1] for s in image_list]
image_list.sort()
else:
image_list = glob.glob(image_pattern)
if not os.path.exists(hocr):
sys.stderr.write(hocr+": not found")
sys.exit(1)
element = os.getenv("element","ocr_line")
regex = os.getenv("regex",".")
min_len = int(os.getenv("min_len","20"))
max_len = int(os.getenv("max_len","50"))
dict = None
dictfile = os.getenv("dict")
max_lines = int(os.getenv("max_lines","1000000"))
pad = int(os.getenv("pad","2"))
output_format = os.getenv("output_format","png")
if dictfile:
stream = open(dictfile,"r")
words = stream.read().split()
stream.close()
dict = {}
for word in words: dict[word.lower()] = 1
# print "[read %d words from %s]\n"%(len(words),dictfile)
def check_dict(dict,s):
if not dict: return 1
words = re.split(r'\W+',s)
for word in words:
if word=="": continue
if not dict.get(word.lower()): return 0
return 1
def write_string(file,text):
stream = open(file,"w")
stream.write(text.encode("utf-8"))
stream.close()
def get_prop(title,name):
props = title.split(';')
for prop in props:
(key,args) = prop.split(None,1)
if key==name: return args
return None
class docHandler(saxlib.DocumentHandler):
def __init__(self):
self.element = element
self.regex = regex
def startDocument(self):
self.total = 0
self.pageno = -1
self.text = None
self.depth = 0
self.start = -1
self.copied = {}
def endDocument(self):
pass
def startElement(self,name,attrs):
self.depth += 1
if attrs.get("class","")=="ocr_page":
self.lineno = -1
self.pageno += 1
self.page = image_list[self.pageno]
self.image = Image.open(self.page)
if attrs.get("class","")==self.element:
self.lineno += 1
props = attrs.get("title","")
self.bbox = get_prop(props,"bbox")
self.start = self.depth
self.text = u""
def endElement(self,name):
if self.depth==self.start:
if len(self.text)>=min_len and \
len(self.text)<=max_len and \
re.match(self.regex,self.text) and \
check_dict(dict,self.text):
print self.page,self.bbox,self.text.encode("utf-8")
w,h = self.image.size
x0,y0,x1,y1 = [int(s) for s in self.bbox.split()]
assert y0<y1 and x0<x1 and x1<=w and y1<=h
x0 = max(0,x0-pad)
y0 = max(0,y0-pad)
x1 = min(w,x1+pad)
y1 = min(h,y1+pad)
limage = self.image.crop((x0,y0,x1,y1))
base = output_pattern%(self.pageno,self.lineno)
basedir = os.path.dirname(base)
if not os.path.exists(basedir): os.mkdir(basedir)
limage.save(base+"."+output_format)
write_string(base+".txt",self.text)
write_string(base+".bbox",self.bbox)
self.total += 1
if self.total>=max_lines: sys.exit(0)
self.text = None
self.start = -1
self.depth -= 1
def characters(self,str,start,end):
if self.text!=None:
self.text += str[start:end]
parser = saxexts.make_parser()
parser.setDocumentHandler(docHandler())
stream = os.popen("tidy -q -wrap 9999 -asxhtml < %s 2> /tmp/tidy_errs"%hocr,"r")
parser.parseFile(stream)