from typing import Tuple

import cv2
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.multioutput import MultiOutputRegressor

import hough
import preprocess

HAND_MARKED_LABELS = "./data/labels/labels_hand_marked.csv"


def get_stats(im: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Returns stats and areas of the connected components of the image.

    Args:
        im: np.ndarray - input image

    Returns:
        stats: np.ndarray - cv2 stats of every component
        label_area: np.ndarray - area of every component (background excluded)
    """
    img = cv2.resize(im, (hough.AVG_W, hough.AVG_H))
    img = preprocess.cut_image(img, 30)
    # With THRESH_OTSU the threshold is picked automatically;
    # the passed threshold value is ignored.
    mask = cv2.threshold(
        img[:, :, 0], 255, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]
    # 4-connectivity; row 0 of stats is the background component.
    stats = cv2.connectedComponentsWithStats(mask, 4)[2]
    label_area = stats[1:, cv2.CC_STAT_AREA]
    return stats, label_area
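

# A minimal usage sketch for get_stats (the image id 42 is hypothetical;
# assumes preprocess.read_im returns a BGR np.ndarray):
#   im = preprocess.read_im(42)
#   stats, areas = get_stats(im)
#   print(areas.min(), areas.mean(), areas.max())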


def get_count(im: np.ndarray, l: int = 70, r: int = 600, res: bool = True):
    """
    Returns the approximate number of granules: the mean area of the
    components whose area lies in (l, r] is taken as the area of a single
    granule, and every component's area is divided by it.

    If res is True, im is an image and the component areas are computed
    here; otherwise im already contains the component areas.

    Args:
        im: np.ndarray - image or label_area, depending on res
        l: int - left boundary for computing the mean
        r: int - right boundary
        res: bool - whether to compute the areas from im

    Returns:
        found: int - estimated granule count
        stats: np.ndarray or None - component stats (None when res is False)
    """
    stats = None
    if res:
        stats, label_area = get_stats(im)
    else:
        label_area = im.copy()
    min_area, max_area = l, r
    # Components with area in (min_area, max_area] are assumed to be
    # single granules.
    singular_mask = (min_area < label_area) & (label_area <= max_area)
    circle_area = np.mean(label_area[singular_mask])
    # Larger components count as several granules of circle_area each.
    found = int(np.sum(np.round(label_area / circle_area)))
    return found, stats
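

# A sketch of get_count in both modes (image id 42 is hypothetical):
#   found, stats = get_count(preprocess.read_im(42))         # from an image
#   _, areas = get_stats(preprocess.read_im(42))
#   found, _ = get_count(areas, l=30, r=500, res=False)      # from areas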


def precompute_stats(df: pd.DataFrame) -> Tuple[list, list, list]:
    """
    Precompute the component areas of every image beforehand, to
    save computation time.

    Args:
        df: pd.DataFrame

    Returns:
        images: list - component areas of every image
        labels: list - prop_count of every image
        imageids: list - id of every image
    """
    images = []
    labels = df.prop_count.to_list()
    imageids = df.ImageId.to_list()
    for im in imageids:
        curim = preprocess.read_im(im)
        _, labarea = get_stats(curim)
        images.append(labarea)
    return images, labels, imageids
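

# A sketch of precompute_stats on the hand-marked labels (drop lists here
# are placeholders; get_model below shows the real ones):
#   df = read_data_frame(HAND_MARKED_LABELS, drop_columns=[], drop_images=[])
#   images, labels, imageids = precompute_stats(df)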


def brute_force_bounds(
    images: list, labels: list, imageids: list
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Find the best (l, r) boundaries for each image; a model is then trained
    to predict them on unknown images.

    Args:
        images: list - component areas of every image
        labels: list - prop_count of every image
        imageids: list - ids of the images in the dataframe

    Returns:
        bounds: np.ndarray - best boundaries per image
        train_x: np.ndarray - training features: the distribution of
            component areas of each image
        train_y: np.ndarray - training targets: the best (l, r) per image
    """
    train_y = []
    train_x = []
    bounds = [(1, 1)] * len(images)
    bests = [1] * len(images)
    for id_, (im, lab, imid) in enumerate(zip(images, labels, imageids)):
        min_error = float("inf")
        l = 30
        best_r = l + 10
        # Try all variants of r for the image (l is fixed at 30).
        for r in range(l + 10, 1000, 10):
            try:
                found, _ = get_count(im, l=l, r=r, res=False)
            except ValueError:
                found = 0
            true = lab
            error = abs(found - true) / true
            if min_error > error:
                min_error = error
                best_r = r
        # Save the best boundaries and the error they achieve.
        bounds[id_] = (l, best_r)
        bests[id_] = min_error
        train_y.append((l, best_r))
        found, stats = get_count(preprocess.read_im(imid), l=0, r=1000, res=True)
        # Bin the component areas into a normalized histogram: the features.
        p = pd.DataFrame(
            stats,
            columns=[
                "CC_STAT_LEFT",
                "CC_STAT_TOP",
                "CC_STAT_WIDTH",
                "CC_STAT_HEIGHT",
                "CC_STAT_AREA",
            ],
        )
        out = pd.cut(
            p[p.CC_STAT_AREA < 1000].CC_STAT_AREA,
            bins=np.arange(0, 1000, 10),
            include_lowest=True,
        )
        # Add to training data.
        train_x.append(out.value_counts(normalize=True, sort=False).to_list())
    bounds = np.array(bounds)
    train_y = np.array(train_y)
    train_x = np.array(train_x)
    return bounds, train_x, train_y
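

# A sketch of the full feature/target construction, following directly from
# the two functions above:
#   images, labels, imageids = precompute_stats(df)
#   bounds, train_x, train_y = brute_force_bounds(images, labels, imageids)
#   # train_x[i] is a 99-bin area histogram, train_y[i] the best (l, r)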


def read_data_frame(
    csv_file: str, drop_columns: list, drop_images: list
) -> pd.DataFrame:
    """
    Read the labels csv, dropping excess columns, rows without a prop_count,
    and bad images.
    """
    df = pd.read_csv(csv_file).drop(drop_columns, axis=1)
    df = df[~df.prop_count.isna()]
    df = df[~df.ImageId.isin(drop_images)]
    return df


def get_model() -> MultiOutputRegressor:
    """
    Full pipeline for getting a trained model: a MultiOutputRegressor over
    AdaBoostRegressor that predicts the (l, r) boundaries of an image.

    Returns:
        clf: MultiOutputRegressor
    """
    # Read the dataframe of our hand-marked labels, dropping excess columns
    # and bad images.
    drop_columns = ["Unnamed: 0", "Unnamed: 0.1", "Unnamed: 0.1.1"]
    drop_images = [104, 908, 906, 907, 905, 904] + list(range(905, 1000))
    df_augmented = read_data_frame(HAND_MARKED_LABELS, drop_columns, drop_images)
    # Precompute the component areas and brute-force the (l, r) boundaries.
    images_aug_, labels_aug_, imageids_aug_ = precompute_stats(df_augmented)
    bounds_aug_, train_x_aug_, train_y_aug_ = brute_force_bounds(
        images_aug_, labels_aug_, imageids_aug_
    )
    # Images on which we tested the model.
    test_ids = [776, 675, 42, 3, 714, 312, 127, 653, 592, 205, 179, 191]
    test_indices = np.isin(df_augmented.ImageId.to_numpy(), test_ids).nonzero()[0]
    # Remove the test images from the training data.
    x_train_aug = np.delete(train_x_aug_, test_indices, axis=0)
    y_train_aug = np.delete(train_y_aug_, test_indices, axis=0)
    # Train.
    clf = MultiOutputRegressor(AdaBoostRegressor(random_state=10, n_estimators=5)).fit(
        x_train_aug, y_train_aug
    )
    return clf


if __name__ == "__main__":
    clf = get_model()
    joblib.dump(clf, "counter_model.pkl")
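

# A sketch of how the dumped model might be used downstream (image id 42 is
# hypothetical; predict (l, r), then count granules with those bounds):
#   clf = joblib.load("counter_model.pkl")
#   _, areas = get_stats(preprocess.read_im(42))
#   hist = ...  # same 99-bin normalized area histogram as in brute_force_bounds
#   l, r = clf.predict([hist])[0]
#   found, _ = get_count(areas, l=int(l), r=int(r), res=False)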