-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtrain_distributions.py
326 lines (281 loc) · 10.4 KB
/
train_distributions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
import pandas as pd
import numpy as np
import cv2
import os
import random
import preprocess
from sklearn.multioutput import MultiOutputRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from preprocess import full_pipeline
from hough import draw_hough, AVG_H, AVG_W
from RPCC_metric_utils_for_participants_V2 import *
# Location of the training labels CSV that read_data() loads with pd.read_csv.
LABELS_PATH = "./data/labels/train.csv"
# Factor applied in augment_data() to the mean Hough radius (in pixels) before
# it is compared with the mean bin diameter returned by y_radius().
# NOTE(review): presumably millimetres per pixel, as the name suggests - confirm.
PIXEL_TO_MM_RATIO = 0.1
# Module-level slot for the labels table; get_trained_model() fills it via `global`.
data_df = None
def get_train_radiuses(cur_im: int) -> np.array:
    """
    Build the Hough-circle radius histogram for a single training image.

    The image is loaded with ``preprocess.read_im``, resized to
    (AVG_W, AVG_H) and handed to ``draw_hough``; column 2 of the returned
    circles is used as the radius.

    Args:
        cur_im: int - image id
    Returns:
        list of 29 ints - how many detected circles have radius
        1, 2, ..., 29 (in that order). A plain list is returned on purpose:
        get_radiuses() concatenates it with other list items.
    """
    image = preprocess.read_im(cur_im)
    resized = cv2.resize(image, (AVG_W, AVG_H))
    _, found_circles = draw_hough(resized)  # first returned value is not needed here
    radius_column = found_circles[:, 2]
    histogram = []
    for radius in range(1, 30):
        # count the circles whose radius equals the current value
        histogram.append(int(np.count_nonzero(radius_column == radius)))
    return histogram
def get_radiuses(data_df: pd.DataFrame) -> list:
    """
    Compute the HoughCircles radius histogram for every labelled image.

    Rows whose ``fraction`` is missing are skipped *before* the image is
    processed, so no Hough pass is spent on images that would be discarded.

    Args:
        data_df: pd.DataFrame - labels table with at least the ``fraction``
            and ``ImageId`` columns
    Returns:
        rows: list - one list per kept image:
            [ImageId, fraction, count_r1, ..., count_r29]
    """
    rows = []
    for frac, im in zip(data_df.fraction.to_list(), data_df.ImageId.to_list()):
        # pd.isna is a value test; `frac is not np.nan` was an identity test
        # and let through NaN objects that are not the np.nan singleton.
        if pd.isna(frac):
            continue
        rows.append([im, frac] + get_train_radiuses(im))
    return rows
def y_radius(y: np.array) -> (int, np.array):
    """
    Mean bin diameter of a matrix of target bin distributions.

    The rows of ``y`` are averaged column-wise and the resulting mean
    distribution is combined (inner product) with ``sive_diam_pan``, the
    per-bin diameters defined outside this block.

    Example (assuming ``sive_diam_pan`` holds 1.0 and 0.85 for the two bins
    involved): a mean distribution with 0.3 in the first of these bins and
    0.4 in the second gives 0.3 * 1.0 + 0.4 * 0.85 = 0.64.

    Args:
        y: np.array - target matrix of bin distributions, one row per image
    Returns:
        mean diameter - inner product of the mean distribution and
            ``sive_diam_pan``
        np.array - the column-wise mean distribution
    """
    column_means = y.mean(axis=0)
    weighted_diameter = np.inner(column_means, sive_diam_pan)
    return weighted_diameter, column_means
def x_radius(x: np.array) -> (int, np.array):
    """
    Mean Hough radius of a matrix of radius distributions.

    The rows of ``x`` are averaged column-wise; column ``j`` (0-based) stands
    for a radius of ``j + 1`` pixels, so the mean radius is the inner product
    of the averaged row with 1, 2, ..., 29.

    Example with only radii 5, 6 and 7 populated:
        rows     [0.4, 0.2, 0.5] and [0.2, 0.0, 0.8]
        mean_x = [0.3, 0.1, 0.65]
        radius = 5 * 0.3 + 6 * 0.1 + 7 * 0.65 = 6.65

    Args:
        x: np.array - matrix of shape (n, 29); cell [i, j] is the share of
            Hough circles on image #i that have a radius of j + 1 pixels
    Returns:
        mean radius in pixels
        np.array - column-wise mean distribution of shape (29, )
    """
    column_means = x.mean(axis=0)
    pixel_radii = np.arange(1, 30)
    return np.inner(column_means, pixel_radii), column_means
def augment_data(
    data_x: np.array, data_y: np.array, shifts: int
) -> (np.array, np.array):
    """
    Create synthetic samples by shifting the radius distributions.

    Every row of ``data_x`` is rolled by ``shifts`` positions along the radius
    axis (positive values move mass towards larger radii). The target matrix
    is then rolled one bin at a time in the opposite index direction, and the
    version whose mean bin diameter (``y_radius``) is closest to the mean
    shifted radius (``x_radius`` times ``PIXEL_TO_MM_RATIO``) is kept.
    The search stops as soon as the first or last bin of the mean target
    distribution becomes non-zero.

    Args:
        data_x: np.array - matrix of normalized radius distributions,
            shape (n, 29)
        data_y: np.array - target matrix of bin distributions, shape (n, 20)
        shifts: int - number of positions to roll every row of ``data_x``
    Returns:
        augmented_x: np.array - shifted radius distributions
        best_y: np.array - best matching shifted target distributions
    """
    # axis=-1 rolls each row independently. Without an axis np.roll works on
    # the flattened matrix, so values at a row edge would spill into the
    # neighbouring image's row.
    augmented_x = np.roll(data_x, shifts, axis=-1)
    avg_r = x_radius(augmented_x)[0] * PIXEL_TO_MM_RATIO

    step = -1 if shifts > 0 else 1
    augmented_y = data_y.copy()
    best_y = augmented_y
    avg_r_y, mean_y = y_radius(augmented_y)
    best_err = abs(avg_r - avg_r_y)

    # After shape[-1] single-step rolls the matrix is back where it started,
    # so this bound never cuts a productive search short; it only prevents an
    # endless loop when both edge bins stay at zero (e.g. all-zero targets).
    for _ in range(augmented_y.shape[-1]):
        if mean_y[0] != 0 or mean_y[-1] != 0:
            break
        augmented_y = np.roll(augmented_y, step, axis=-1)
        avg_r_y, mean_y = y_radius(augmented_y)
        err = abs(avg_r - avg_r_y)
        if err < best_err:
            best_y = augmented_y  # np.roll already returned a fresh array
            best_err = err
    return augmented_x, best_y
def augment_class(
    class_name: str,
    labels_df: pd.DataFrame,
    radius_df: pd.DataFrame,
    augment: bool = True,
) -> (np.array, np.array):
    """
    Collect the samples of one fraction and optionally append augmented copies.

    The rows of ``radius_df`` whose ``fraction`` equals ``class_name`` are
    turned into feature/target matrices with ``get_data_labels``. When
    ``augment`` is set, ``augment_data`` is applied to the original matrices
    for the shifts -5, -3, -1, 1 and 3 and the results are stacked after them.

    Args:
        class_name: str - which fraction to augment
        labels_df: pd.DataFrame - table of target bin distributions
        radius_df: pd.DataFrame - table of radius distributions
        augment: bool - whether to append the shifted copies
    Returns:
        train_x: np.array - original (and augmented) features
        train_y: np.array - original (and augmented) targets
    """
    selected = radius_df[radius_df.fraction == class_name]
    base_x, base_y = get_data_labels(selected, labels_df)

    x_parts = [base_x]
    y_parts = [base_y]
    if augment:
        for offset in range(-5, 5, 2):
            shifted_x, shifted_y = augment_data(base_x, base_y, offset)
            x_parts.append(shifted_x)
            y_parts.append(shifted_y)

    # a single concatenate yields new arrays, like the original copies did
    return np.concatenate(x_parts), np.concatenate(y_parts)
def _normalize_rows(matrix: np.ndarray) -> np.ndarray:
    """Divide every row by its sum; rows that sum to zero stay all-zero."""
    values = np.asarray(matrix, dtype=float)
    row_sums = values.sum(axis=1, keepdims=True)
    return np.divide(
        values, row_sums, out=np.zeros_like(values), where=row_sums != 0
    )


def get_data_labels(
    radius_df: pd.DataFrame, labels_df: pd.DataFrame, normalize: bool = True
) -> (np.ndarray, np.ndarray):
    """
    Turn the radius table and the labels table into aligned numpy matrices.

    Target rows are looked up by ``ImageId`` in the order in which the images
    appear in ``radius_df``, so row i of the returned features and row i of
    the returned targets always belong to the same image, regardless of how
    either table has been shuffled. The ``ImageId`` and ``fraction`` columns
    are dropped and every target row is normalized by its sum.

    Args:
        radius_df: pd.DataFrame - radius counts per image; cell [i, j] shows
            how many circles with radius j were found on image #i
        labels_df: pd.DataFrame - table of target bin distributions
        normalize: bool - whether to also normalize each radius row by its sum
    Returns:
        data: np.ndarray - radius matrix (normalized if ``normalize``)
        labels: np.ndarray - normalized target distributions, same row order
    Raises:
        KeyError: if an ``ImageId`` of ``radius_df`` is missing in ``labels_df``
    """
    label_table = labels_df.set_index("ImageId").drop(columns=["fraction"])
    # .loc with the id list keeps radius_df's row order; an isin() mask would
    # return the rows in labels_df order and mispair features and targets.
    ordered_ids = radius_df["ImageId"].to_list()
    labels = _normalize_rows(label_table.loc[ordered_ids].to_numpy())

    data = radius_df.drop(["ImageId", "fraction"], axis=1).to_numpy()
    if normalize:
        data = _normalize_rows(data)
    return data, labels
def read_data() -> (pd.DataFrame, pd.DataFrame):
    """
    Load the labels CSV and build the matching table of Hough radius counts.

    The radius table gets one row per image returned by ``get_radiuses`` and
    is shuffled. The labels table loses the ``prop_count`` and ``pan`` columns
    and the rows without a fraction, is re-ordered to follow the shuffled
    radius table and receives an extra all-zero column named ``"0"``.

    Returns:
        data_df: pd.DataFrame - cleaned labels table in radius_df row order
        radius_df: pd.DataFrame - shuffled radius counts with the columns
            ImageId, fraction, 1, ..., 29
    """
    raw_labels = pd.read_csv(LABELS_PATH)

    radius_columns = ["ImageId", "fraction"] + list(range(1, 30))
    radius_df = pd.DataFrame(get_radiuses(raw_labels), columns=radius_columns)
    radius_df = radius_df.sample(frac=1).reset_index(drop=True)  # shuffle rows

    data_df = (
        raw_labels.drop(["prop_count", "pan"], axis=1)
        .dropna(subset=["fraction"])
        .set_index("ImageId")
        .reindex(index=radius_df["ImageId"])  # follow the shuffled order
        .reset_index()
    )
    data_df["0"] = 0
    return data_df, radius_df
def train_test_split_radius(
    x_df: pd.DataFrame, y_df: pd.DataFrame
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Stratified 80/20 train/test split of the radius table and the labels table.

    Args:
        x_df: pd.DataFrame - radius distributions
        y_df: pd.DataFrame - target bin distributions; its ``fraction`` column
            is used for stratification
    Returns:
        train_df: pd.DataFrame - training part of x_df
        train_labels_df: pd.DataFrame - training part of y_df
        test_df: pd.DataFrame - test part of x_df
        test_labels_df: pd.DataFrame - test part of y_df
    """
    # sklearn returns (x_train, x_test, y_train, y_test); unpack in that order
    # and only then re-arrange into the order documented above.
    train_df, test_df, train_labels_df, test_labels_df = train_test_split(
        x_df, y_df, test_size=0.2, stratify=y_df.fraction.to_list()
    )
    return train_df, train_labels_df, test_df, test_labels_df
def get_training_data(
    labels_df: pd.DataFrame, train_df: pd.DataFrame, test_df: pd.DataFrame
) -> (np.array, np.array, np.array, np.array):
    """
    Assemble the train and test matrices for the three known fractions.

    "16/20" and "20/40" are augmented, "20/40_pdcpd_bash_lab" is used as is.
    The training parts are stacked in the order
    16/20, 20/40_pdcpd_bash_lab, 20/40.

    Args:
        labels_df: pd.DataFrame - table of target bin distributions
        train_df: pd.DataFrame - radius table used for training
        test_df: pd.DataFrame - radius table used for testing
    Returns:
        train_x: np.array
        train_y: np.array
        test_x: np.array
        test_y: np.array
    """
    # (fraction name, augment?) in the order the classes are processed
    class_plan = (
        ("16/20", True),
        ("20/40", True),
        ("20/40_pdcpd_bash_lab", False),
    )
    per_class = {
        fraction: augment_class(
            fraction, augment=use_aug, labels_df=labels_df, radius_df=train_df
        )
        for fraction, use_aug in class_plan
    }

    stacking_order = ("16/20", "20/40_pdcpd_bash_lab", "20/40")
    train_x = np.concatenate([per_class[name][0] for name in stacking_order])
    train_y = np.concatenate([per_class[name][1] for name in stacking_order])

    test_x, test_y = get_data_labels(test_df, labels_df)
    return train_x, train_y, test_x, test_y
def train_model(train_x: np.array, train_y: np.array) -> DecisionTreeRegressor:
    """
    Fit a DecisionTreeRegressor (max_depth=15, random_state=0) on the data.

    Args:
        train_x: np.array - training data
        train_y: np.array - training labels
    Returns:
        DecisionTreeRegressor - the fitted model
    """
    tree = DecisionTreeRegressor(max_depth=15, random_state=0)
    # fit() hands back the estimator itself
    return tree.fit(train_x, train_y)
def get_trained_model() -> DecisionTreeRegressor:
    """
    Run the whole pipeline and return the trained model.

    Seeds ``random`` and ``numpy.random`` with 41, loads the data with
    ``read_data`` (the labels table is also stored in the module-level
    ``data_df``), splits it, builds the training matrices and fits the model
    with ``train_model``.

    Returns:
        regr: DecisionTreeRegressor - trained model
    """
    global data_df
    random.seed(41)
    np.random.seed(41)
    data_df, radius_df = read_data()
    print("Training model.....")
    # sklearn returns (x_train, x_test, y_train, y_test). The previous
    # unpacking order bound the training slice of the labels table to
    # `test_df`, so the "test" features were not radius distributions.
    # The training slice itself is unchanged by this correction.
    train_df, test_df, train_labels_df, test_labels_df = train_test_split(
        radius_df, data_df
    )
    train_x, train_y, test_x, test_y = get_training_data(data_df, train_df, test_df)
    regr = train_model(train_x, train_y)
    return regr