-
Notifications
You must be signed in to change notification settings - Fork 1
/
ocr.py
53 lines (42 loc) · 1.91 KB
/
ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import logging

from kraken import binarization
from kraken import blla
from kraken import pageseg
from kraken import rpred
from kraken import serialization
from kraken.lib import models
from kraken.lib import vgsl
from PIL import Image
def to_bw(im, threshold=127):
    """Return a binarized copy of *im* produced by ``binarization.nlbin``.

    Args:
        im: The image to binarize; it is handed to ``binarization.nlbin``
            unchanged.
        threshold: Accepted for backward compatibility but currently unused.
            An earlier implementation converted the image to greyscale and
            applied this value as a fixed 0-255 cutoff; that approach was
            replaced by ``nlbin``.

    Returns:
        Whatever ``binarization.nlbin(im)`` returns.
    """
    binarized = binarization.nlbin(im)
    return binarized
class ImageOCR:
    """Segment an image into text lines and recognize them with kraken.

    Segmentation and recognition are configured for horizontal right-to-left
    text (``text_direction='horizontal-rl'``, ``bidi_reordering='R'``).
    """

    # Shared logger; used to report segmentation fallbacks that were
    # previously swallowed silently.
    _logger = logging.getLogger(__name__)

    def __init__(self, model, baseline_model=None, bw_threshold=127, scale=1):
        """Store the OCR configuration.

        Args:
            model: Recognition model passed to ``rpred.rpred``.
            baseline_model: Optional model for ``blla.segment``. When ``None``
                only ``pageseg.segment`` is used. The baseline model did not
                work well on the author's samples and is slow (on the order
                of seconds per article), hence optional.
            bw_threshold: Threshold forwarded to ``to_bw`` when binarizing.
            scale: Factor applied to both image dimensions before processing
                (needed for some newspapers). ``1`` disables resizing.
        """
        self.model = model
        self.baseline_model = baseline_model
        self.scale = scale
        self.bw_threshold = bw_threshold

    def get_text(self, image):
        """Return the recognized text of every line found in *image*.

        Args:
            image: Image to process. It must expose ``size`` and ``resize``
                (as PIL images do) when ``scale`` is not ``1``.

        Returns:
            A list with the ``prediction`` of each record yielded by
            ``rpred.rpred``, in the order they are yielded.
        """
        if self.scale != 1:
            width, height = image.size
            image = image.resize((int(self.scale * width), int(self.scale * height)))

        bw_im = to_bw(image, threshold=self.bw_threshold)

        seg = None
        if self.baseline_model is not None:
            try:
                seg = blla.segment(bw_im, model=self.baseline_model, text_direction='horizontal-rl')
            except RuntimeError as err:
                # Probably a memory error; seen for single lines of text.
                # Best effort: report it and fall back to the box segmenter.
                self._logger.info(
                    "Baseline segmentation failed (%s); falling back to pageseg.segment",
                    err,
                )

        if seg is None:
            seg = pageseg.segment(bw_im, text_direction='horizontal-rl')

        # NOTE(review): recognition runs on the (possibly rescaled) input
        # image, not on the binarized copy used for segmentation — confirm
        # this is intended.
        pred_it = rpred.rpred(self.model, image, seg, bidi_reordering='R')
        return [record.prediction for record in pred_it]