# -*- coding: utf-8 -*-
import cv2
import numpy as np
from skimage.color import rgb2gray
import skimage.io as io
from skimage.filters import gaussian, threshold_otsu
from skimage.feature import canny
from skimage.transform import probabilistic_hough_line, rotate
from feature_extraction import f_get_dots, f_get_holes, f_ft, f_multi_lbp, get_features
# ========== #
# De-skewing #
# ========== #
def deskew_1(img):
    img_blur = cv2.medianBlur(img, 5).astype('uint8')
    thresh = cv2.threshold(cv2.bitwise_not(img_blur), 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    # estimate the skew angle from the minimum-area rectangle around the text pixels
    coords = np.column_stack(np.where(thresh > 0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    # rotate the image around its center by the estimated angle
    (h, w) = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated
def deskew_2(img):
    image = rgb2gray(img)
    # threshold to get rid of extraneous noise
    thresh = threshold_otsu(image)
    normalize = image > thresh
    # gaussian blur
    blur = gaussian(normalize, 3)
    # canny edges in scikit-image
    edges = canny(blur)
    # hough lines
    hough_lines = probabilistic_hough_line(edges)
    # probabilistic_hough_line returns a list of line segments as point pairs
    # ((x1, y1), (x2, y2)); first compute the slope of each segment
    slopes = [(y2 - y1) / (x2 - x1) if (x2 - x1) else 0 for (x1, y1), (x2, y2) in hough_lines]
    # the slope is tan(theta), where theta is the angle by which the line is offset
    rad_angles = [np.arctan(x) for x in slopes]
    # convert to degrees for the rotation
    deg_angles = [np.degrees(x) for x in rad_angles]
    # take the dominant angle: the left edge of the most populated histogram bin
    histo = np.histogram(deg_angles, bins=180)
    rotation_number = histo[1][np.argmax(histo[0])]
    # correct for 'sideways' alignments
    if rotation_number > 45:
        rotation_number = -(90 - rotation_number)
    elif rotation_number < -45:
        rotation_number = 90 - abs(rotation_number)
    return (rotate(img, rotation_number, cval=1) * 255).astype(np.uint8)


def deskew(img):
    return deskew_2(deskew_1(img))
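

# Example usage (sketch): de-skewing a scanned page before segmentation.
# 'scanned/capr1.png' is only a placeholder path here (the default used by
# get_char_images_pred below).
# page = io.imread('scanned/capr1.png')
# page = deskew(page)   # deskew_1 (minAreaRect estimate) then deskew_2 (Hough refinement)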
# ============ #
# Thresholding #
# ============ #
def thresholding(img, thresh=None):
    binary = np.copy(img)
    thresh = thresh or threshold_otsu(img)
    # ink (dark) pixels -> 1, background pixels -> 0
    binary[img < thresh] = 1
    binary[img >= thresh] = 0
    return binary
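

# Example usage (sketch): binarizing an image with Otsu's threshold; ink pixels
# become 1 and the background becomes 0. 'line_img' is a hypothetical name for one
# deskewed text-line image.
# binary_line = thresholding(line_img)        # threshold chosen by Otsu
# binary_line = thresholding(line_img, 128)   # or an explicit threshold value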
# ============= #
# Fetching Data #
# ============= #
from segmentation import get_lines, extract_words_one_line, get_char_from_word
from time import time
import os
char_to_int = {
    'ا': 1,
    'ب': 2,
    'ت': 3,
    'ث': 4,
    'ج': 5,
    'ح': 6,
    'خ': 7,
    'د': 8,
    'ذ': 9,
    'ر': 10,
    'ز': 11,
    'س': 12,
    'ش': 13,
    'ص': 14,
    'ض': 15,
    'ط': 16,
    'ظ': 17,
    'ع': 18,
    'غ': 19,
    'ف': 20,
    'ق': 21,
    'ك': 22,
    'ل': 23,
    'م': 24,
    'ن': 25,
    'ه': 26,
    'و': 27,
    'ي': 28,
    'لا': 29
}

int_to_char = (
    ' ',
    'ا',
    'ب',
    'ت',
    'ث',
    'ج',
    'ح',
    'خ',
    'د',
    'ذ',
    'ر',
    'ز',
    'س',
    'ش',
    'ص',
    'ض',
    'ط',
    'ظ',
    'ع',
    'غ',
    'ف',
    'ق',
    'ك',
    'ل',
    'م',
    'ن',
    'ه',
    'و',
    'ي',
    'لا'
)
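
# Quick check (sketch): the two mappings are inverses for single letters, with index 0
# of int_to_char reserved for the space character and 29 for the lam-alef ligature 'لا'.
# char_to_int['ب']   # -> 2
# int_to_char[2]     # -> 'ب'
# int_to_char[29]    # -> 'لا'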
def get_char_images(imgs_path='scanned', txt_path='text', start=0, end=1000):
    imgs = os.listdir(imgs_path)
    txts = os.listdir(txt_path)
    imgs.sort()
    txts.sort()
    segErrors = []
    data = []
    labels = []
    word_lengths = []   # accumulated across all pages, like data and labels
    was = time()
    for i in range(start, end):
        # Getting labels
        path = os.path.join(txt_path, txts[i])
        with open(path, 'r') as f:
            labelWords = f.read().split(' ')
        # Getting images
        path = os.path.join(imgs_path, imgs[i])
        original = io.imread(path)
        deskewed = deskew_1(original)
        lines = get_lines(deskewed)
        thresholded_lines = [thresholding(line) for line in lines]
        linesWithWords = []
        lengthOfWords = 0
        for line in thresholded_lines:
            wordsFromLine = extract_words_one_line(line)
            linesWithWords.append(wordsFromLine)
            lengthOfWords += len(wordsFromLine)
        # Check for word segmentation error
        if lengthOfWords != len(labelWords):
            print(f'skipping {path}')
            continue
        currLabelIndex = -1
        for lineIdx in range(len(linesWithWords)):                  # looping on lines
            for wordIdx in range(len(linesWithWords[lineIdx])):     # looping on words in a specific line
                currLabelIndex += 1
                chars = get_char_from_word(linesWithWords[lineIdx][wordIdx], thresholded_lines[lineIdx], True)
                labelWord = labelWords[currLabelIndex]
                # Detect a lam-alef ligature in the label word: it is written as two
                # letters ('ل' followed by 'ا') but segmented as a single glyph
                lamAlefCount = 0
                lamAlefIdx = []
                for l in range(len(labelWord) - 1):
                    if labelWord[l] == 'ل' and labelWord[l + 1] == 'ا':
                        lamAlefCount += 1
                        lamAlefIdx.append(l)
                        break
                # Check for character segmentation error
                if len(chars) != len(labelWord) - lamAlefCount:
                    segErrors.append((path, labelWord, lineIdx, wordIdx))
                    continue
                word_lengths.append(len(chars))
                for k in range(len(chars)):
                    data.append(chars[k])
                    if lamAlefIdx and k == lamAlefIdx[0]:
                        labels.append(char_to_int['لا'])
                    elif lamAlefIdx and k > lamAlefIdx[0]:
                        # the ligature covers two label characters, so shift the index by one
                        labels.append(char_to_int[labelWord[k + 1]])
                    else:
                        labels.append(char_to_int[labelWord[k]])
    print(f'got {end - start} images in: {int(time() - was)} sec')
    # with open('dataset/d')
    return data, labels, segErrors, word_lengths
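

# Example usage (sketch): building a character-level dataset from the first 10
# page/label pairs, using the default 'scanned' and 'text' directories above.
# data, labels, segErrors, word_lengths = get_char_images('scanned', 'text', 0, 10)
# print(len(data), len(labels), len(segErrors))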
def save_predictions(predictions, path):
    text = ''.join(int_to_char[char] for char in predictions)
    with open(path, 'w') as f:
        f.write(text)
Y = [1, 2, 3, 0, 2, 3]
# save_predictions(Y, 'pred/t1.txt')
def get_char_images_pred(img_path='scanned/capr1.png'):
    words = []
    original = io.imread(img_path)
    deskewed = deskew_1(original)
    lines = get_lines(deskewed)
    for i in range(len(lines)):
        lines[i] = thresholding(lines[i])
    linesWithWords = []
    for line in lines:
        linesWithWords.append(extract_words_one_line(line))
    for i in range(len(linesWithWords)):            # looping on lines
        for j in range(len(linesWithWords[i])):     # looping on words in a specific line
            words.append(get_char_from_word(linesWithWords[i][j], lines[i], True))
    return words
# X = get_char_images_pred('scanned/capr2.png')
# for x in X:
# io.imshow(x)
# io.show()
def save_char_imgs(data, path):
    for k, char in enumerate(data):
        char = np.asarray(char * 255).astype(np.uint8)
        io.imsave(path + '/capr' + str(k) + '.png', char)
def save_labels(labels):
    with open('labels.txt', 'w') as f:
        for i in range(len(labels)):
            f.write(str(labels[i]))
            if i < len(labels) - 1:
                f.write('\n')
# Takes the path of a directory containing character images and another containing their labels
# Returns a features array of shape N x M and labels of shape N x 1
# N = number of samples
# M = number of features
def save_features(features):
    features_path = 'features.txt'
    with open(features_path, 'w') as f:
        for i in range(len(features)):
            f.write(' '.join(str(x) for x in features[i]))
            if i < len(features) - 1:
                f.write('\n')
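

# Example usage (sketch): saving one feature vector per segmented character image.
# It is assumed here that get_features (imported from feature_extraction) accepts a
# single binary character image and returns a flat feature vector.
# data, labels, segErrors, word_lengths = get_char_images('scanned', 'text', 0, 10)
# save_features([get_features(char_img) for char_img in data])
# save_labels(labels)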
def load_dataset(features_file, labels_file):
    with open(features_file, 'r') as f:
        features = [line.split(' ') for line in f.read().split('\n')]
    features = np.asarray(features).astype(np.uint8)
    with open(labels_file, 'r') as f:
        labels = np.asarray(f.read().split('\n')).astype(np.uint8)
    return features, labels
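

# Example usage (sketch): reading the dataset back; 'features.txt' is written by
# save_features and 'labels.txt' by save_labels above.
# X, y = load_dataset('features.txt', 'labels.txt')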