# nonImg_model_wrappers.py
import os
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa: F401, must be imported before IterativeImputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.svm import SVC, SVR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from utils import COG_thresholding, ADD_thresholding
"""
Example Usage:
model = NonImg_Model_Wrapper(
tasks=['COG', 'ADD'], # a list of tasks to train
main_config=read_json('main_config.json'), # main_config is the dict read from json
task_config=read_json('task_config.json'), # task config is the dict read from json
seed=1000) # random seed
model.train() # train the model
thres = model.get_optimal_thres() # get optimal threshold
model.gen_score(['test'], thres) # generate csv files to future evaluation
for more details how this class is called, please see main.py
note: in the tasks argument, need to put COG before ADD since imputer will be calculated based on COG data
and the imputer will be used to transform the ADD data
"""
class NonImg_Model_Wrapper:
def __init__(self, tasks, main_config, task_config, seed):
# --------------------------------------------------------------------------------------------------------------
# some constants
self.seed = seed # random seed number
self.model_name = main_config['model_name'] # user assigned model_name, will create folder using model_name to log
self.csv_dir = main_config['csv_dir'] # data will be loaded from the csv files specified in this directory
self.config = task_config # task_config contains task specific info
self.n_tasks = len(tasks) # number of tasks will be trained
self.tasks = tasks # a list of tasks names to be trained
self.features = task_config['features'] # a list of features
# --------------------------------------------------------------------------------------------------------------
        # folder preparation to save checkpoints of model weights *.pth
        self.checkpoint_dir = './checkpoint_dir/{}/'.format(self.model_name)
        os.makedirs(self.checkpoint_dir, exist_ok=True)  # makedirs also creates the missing parent folder
        # folder preparation to save tensorboard and other logs
        self.tb_log_dir = './tb_log/{}/'.format(self.model_name)
        os.makedirs(self.tb_log_dir, exist_ok=True)
# --------------------------------------------------------------------------------------------------------------
# initialize models
self.models = [] # note: self.models[i] is for the i th task
self.init_models([task_config[t]['name'] for t in tasks])
# --------------------------------------------------------------------------------------------------------------
# initialize data
        self.train_data = []  # note: self.train_data[i] holds the features and labels for the i th task
        self.imputer = None
        self.load_preprocess_data()
def train(self):
for i, task in enumerate(self.tasks):
X, Y = self.train_data[i].drop([task], axis=1), self.train_data[i][task]
self.models[i].fit(X, Y)
print(task + ' model training is done!')
def get_optimal_thres(self, csv_name='valid'):
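        """
        Compute optimal decision thresholds on the `csv_name` split by first
        writing <csv_name>_eval.csv and then calling the thresholding helpers.
        Returns a dict keyed like {'NC': ..., 'DE': ..., 'ADD': ...} (a
        hypothetical result might be {'NC': 0.6, 'DE': 1.4, 'ADD': 0.5}),
        matching the keys that gen_score expects.
        """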
self.gen_score(stages=[csv_name])
thres = {}
for i, task in enumerate(self.tasks):
if task == 'COG' and self.config['COG']['type'] == 'reg':
thres['NC'], thres['DE'] = COG_thresholding(self.tb_log_dir + csv_name + '_eval.csv')
elif task == 'ADD':
thres[task] = ADD_thresholding(self.tb_log_dir + csv_name + '_eval.csv')
else:
print("optimal for the task {} is not supported yet".format(task))
return thres
def gen_score(self, stages=['train', 'valid', 'test', 'OASIS'], thres={'ADD':0.5, 'NC':0.5, 'DE':1.5}):
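        """
        Run the trained models on each csv listed in `stages` and write
        <stage>_eval.csv into the tb_log_dir, one row per case with the label,
        raw score, and thresholded prediction for each task.
        """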
for stage in stages:
data = pd.read_csv(self.csv_dir + stage + '.csv')[self.features + self.tasks + ['filename']]
data = self.drop_cases_without_label(data, 'COG')
COG_data = self.preprocess_pipeline(data[self.features+['COG']], 'COG') # treat it as COG data to do the preprocessing
features = COG_data.drop(['COG'], axis=1)
labels = data[self.tasks]
filenames = data['filename']
            # make sure the features and labels have the same number of rows
            if len(features.index) != len(labels.index):
                raise ValueError('features and labels must have the same number of rows')
predicts = []
for i, task in enumerate(self.tasks):
                if task == 'COG':
                    predicts.append(self.models[i].predict(features))
                    print("the shape of the prediction for the COG task is", predicts[-1].shape)
                if task == 'ADD':
                    predicts.append(self.models[i].predict_proba(features))
                    print("the shape of the prediction for the ADD task is", predicts[-1].shape)
content = []
for i in range(len(features.index)):
                label = labels.iloc[i]  # the label row for the i th subject
filename = filenames.iloc[i]
case = {'filename': filename}
for j, task in enumerate(self.tasks): # j is the task index
case[task] = "" if np.isnan(label[task]) else int(label[task])
if task == 'COG':
case[task+'_score'] = predicts[j][i]
if case[task+'_score'] < thres['NC']:
case[task + '_pred'] = 0
elif thres['NC'] <= case[task+'_score'] <= thres['DE']:
case[task + '_pred'] = 1
else:
case[task + '_pred'] = 2
elif task == 'ADD':
case[task + '_score'] = predicts[j][i, 1]
if case[task+'_score'] < thres['ADD']:
case[task + '_pred'] = 0
else:
case[task + '_pred'] = 1
content.append(case)
            with open(self.tb_log_dir + stage + '_eval.csv', 'w', newline='') as csvfile:  # newline='' avoids blank rows on Windows
writer = csv.DictWriter(csvfile, fieldnames=list(content[0].keys()))
writer.writeheader()
for case in content:
writer.writerow(case)
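    # Each row of <stage>_eval.csv looks like (hypothetical values):
    #   filename, COG, COG_score, COG_pred, ADD, ADD_score, ADD_pred
    #   case_001,   0,      0.31,        0,   0,      0.12,        0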
def shap(self, stage='test'):
"""
This function will generate shap value for a specific stage
if stage is 'test', the shap analysis will be performed on the testing section of the data
"""
# get the data ready
data = pd.read_csv(self.csv_dir + stage + '.csv')
task_data = []
for task in self.tasks:
task_data.append(data[self.features + [task]])
for i, task in enumerate(self.tasks):
task_data[i] = self.preprocess_pipeline(task_data[i], task).drop([task], axis=1)
# get the explainer ready
self.explainer = []
shap_values = []
task_names = [self.config[t]['name'] for t in self.tasks]
for i, task in enumerate(self.tasks):
# background = shap.maskers.Independent(self.train_data[i], max_samples=100) # can we sample background from train_data?
if task_names[i] in ['XGBoostCla', 'XGBoostReg']:
if 'Reg' in task_names[i]:
explainer = shap.TreeExplainer(self.models[i], task_data[i], model_output='raw')
shap_values.append(explainer.shap_values(task_data[i]))
elif 'Cla' in task_names[i]:
explainer = shap.TreeExplainer(self.models[i], task_data[i], model_output='predict_proba')
shap_values.append(explainer.shap_values(task_data[i])[1]) # index 1 means only taking ADD prob
elif task_names[i] in ['CatBoostCla', 'CatBoostReg']:
if 'Reg' in task_names[i]:
explainer = shap.TreeExplainer(self.models[i], model_output='raw')
shap_values.append(explainer.shap_values(task_data[i]))
            elif 'Cla' in task_names[i]:  # use a kernel explainer because shap only supports model_output="raw" for CatBoost
explainer = shap.KernelExplainer(self.models[i].predict_proba, task_data[i])
shap_values.append(explainer.shap_values(task_data[i], nsamples=200)[1])
elif task_names[i] in ['RandomForestCla', 'RandomForestReg', 'DecisionTreeCla', 'DecisionTreeReg']:
if 'Reg' in task_names[i]:
explainer = shap.TreeExplainer(self.models[i], task_data[i], model_output='raw')
shap_values.append(explainer.shap_values(task_data[i]))
elif 'Cla' in task_names[i]:
explainer = shap.TreeExplainer(self.models[i], task_data[i], model_output='probability')
shap_values.append(explainer.shap_values(task_data[i])[1])
elif task_names[i] in ['PerceptronCla', 'PerceptronReg']:
if 'Reg' in task_names[i]:
explainer = shap.KernelExplainer(self.models[i].predict, task_data[i])
shap_values.append(explainer.shap_values(task_data[i], nsamples=500))
elif 'Cla' in task_names[i]:
explainer = shap.KernelExplainer(self.models[i].predict_proba, task_data[i])
shap_values.append(explainer.shap_values(task_data[i], nsamples=500)[1]) # index 1 means only taking ADD prob
elif task_names[i] in ['SupportVectorCla', 'SupportVectorReg', 'NearestNeighborCla', 'NearestNeighborReg']:
if 'Reg' in task_names[i]:
explainer = shap.KernelExplainer(self.models[i].predict, task_data[i])
shap_values.append(explainer.shap_values(task_data[i], nsamples=200))
elif 'Cla' in task_names[i]:
explainer = shap.KernelExplainer(self.models[i].predict_proba, task_data[i])
shap_values.append(explainer.shap_values(task_data[i], nsamples=200)[1]) # index 1 means only taking ADD prob
print(task + "'s shap values in shape: ", shap_values[-1].shape)
# save the shap_values into a csv file for future use
# rows are subjects, columns are the features
            columns = task_data[i].columns
            df = pd.DataFrame(shap_values[-1], columns=columns)
            df.to_csv(self.tb_log_dir + 'shap_' + stage + '_' + task + '.csv', index=False, header=True)
self.shap_beeswarm_plot(shap_values[-1], task_data[i], task, stage)
return shap_values, task_data
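    # Typical call: shap_values, task_data = model.shap('test'); this writes
    # shap_test_<task>.csv and test_shap_beeswarm_<task>.png into the tb_log_dir.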
###############################################################################################################
    # the methods below are internal and are not called from outside the class
def init_models(self, task_models):
"""
each task can have different types of models
for example, we will use regression relevant model for the COG task
and classification relevant model for the ADD task
the task_models parameter should be a python list
where the task_models[i] is the name of the model for the i th task
after model initialization, models will be appended into self.models
where the self.models[i] is for the i th task
"""
        for name in task_models:
            # random forest models tested, function well
            if name == 'RandomForestCla':
                model = RandomForestClassifier()
            elif name == 'RandomForestReg':
                model = RandomForestRegressor()
            # xgboost models tested, function well
            elif name == 'XGBoostCla':
                model = xgb.XGBClassifier(use_label_encoder=False)
            elif name == 'XGBoostReg':
                model = xgb.XGBRegressor()
            # catboost models tested, function well
            elif name == 'CatBoostCla':
                model = CatBoostClassifier()
            elif name == 'CatBoostReg':
                model = CatBoostRegressor()
            # mlp models tested, function well
            elif name == 'PerceptronCla':
                model = MLPClassifier(max_iter=1000)
            elif name == 'PerceptronReg':
                model = MLPRegressor(max_iter=1000)
            # decision tree models tested, function well
            elif name == 'DecisionTreeCla':
                model = DecisionTreeClassifier()
            elif name == 'DecisionTreeReg':
                model = DecisionTreeRegressor()
            # support vector models tested, function well
            elif name == 'SupportVectorCla':
                model = SVC(probability=True)
            elif name == 'SupportVectorReg':
                model = SVR()
            # KNN models tested, function well
            elif name == 'NearestNeighborCla':
                model = KNeighborsClassifier()
            elif name == 'NearestNeighborReg':
                model = KNeighborsRegressor()
            else:
                # fail loudly on unknown names instead of silently reusing a stale model
                raise NameError('model name {} is not supported'.format(name))
            self.models.append(model)
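    # For example, tasks=['COG', 'ADD'] with a regression model for COG and a
    # classification model for ADD corresponds to task_models like
    # ['XGBoostReg', 'XGBoostCla'] (any Reg/Cla pair from the names above works).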
def init_imputer(self, data):
"""
since cases with ADD labels is only a subset of the cases with COG label
in this function, we will initialize a single imputer
and fit the imputer based on the COG cases from the training part
"""
imputation_method = self.config['impute_method']
if imputation_method == 'mean':
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
elif imputation_method == 'median':
imp = SimpleImputer(missing_values=np.nan, strategy='median')
elif imputation_method == 'most_frequent':
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
elif imputation_method == 'constant':
imp = SimpleImputer(missing_values=np.nan, strategy='constant')
elif imputation_method == 'KNN':
imp = KNNImputer(n_neighbors=20)
elif imputation_method == 'Multivariate':
imp = IterativeImputer(max_iter=1000)
else:
raise NameError('method for imputation not supported')
imp.fit(data)
return imp
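    # Toy illustration (hypothetical values): with impute_method == 'mean', a
    # feature column [1.0, nan, 3.0] is fitted with mean 2.0, so transform()
    # will later fill the missing entry with 2.0.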
def load_preprocess_data(self):
data_train = pd.read_csv(self.csv_dir + 'train.csv')
for task in self.tasks:
self.train_data.append(data_train[self.features + [task]])
for i, task in enumerate(self.tasks):
self.train_data[i] = self.preprocess_pipeline(self.train_data[i], task)
            print('after the preprocess pipeline, the data frame for the {} task is:'.format(task))
print(self.train_data[i])
print('\n' * 2)
def preprocess_pipeline(self, data, task):
"""
Cathy, we need to remove cases with too much missing non-imaging features, please consider adding the step
"""
# data contains features + task columns
data = self.drop_cases_without_label(data, task)
data = self.transform_categorical_variables(data)
features = data.drop([task], axis=1) # drop the task columns to get all features
features = self.imputation(features) # do imputation merely on features
features = self.normalize(features) # normalize features
features[task] = data[task] # adding the task column back
return features # return the complete data
def drop_cases_without_label(self, data, label):
        data = data.dropna(axis=0, how='any', subset=[label], inplace=False)  # newer pandas forbids passing both how and thresh
return data.reset_index(drop=True)
def transform_categorical_variables(self, data):
if 'gender' in data:
return data.replace({'male': 0, 'female': 1})
else:
return data
# return pd.get_dummies(data, columns=['gender'])
def imputation(self, data):
columns = data.columns
        if self.imputer is None:
self.imputer = self.init_imputer(data)
data = self.imputer.transform(data)
return pd.DataFrame(data, columns=columns)
def normalize(self, data):
df_std = data.copy()
for column in df_std.columns:
if data[column].std(): # normalize only when std != 0
df_std[column] = (data[column] - data[column].mean()) / data[column].std()
return df_std
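    # e.g., a column [1.0, 2.0, 3.0] has mean 2.0 and (sample) std 1.0, so it is
    # z-scored to [-1.0, 0.0, 1.0]; constant columns are left untouched.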
def shap_beeswarm_plot(self, shap_values, data, task, stage):
from matplotlib import rc, rcParams
rc('axes', linewidth=2)
rc('font', weight='bold')
fig, ax = plt.subplots(figsize=(8, 10))
fig.text(-0.04, 0.87, 'Features', fontsize=15, fontweight='black')
        shap.summary_plot(shap_values, data, show=False)  # show=False so the figure can still be saved below
ax.set_xlabel('SHAP value', fontsize=15, fontweight='black')
plt.savefig(self.tb_log_dir + '{}_shap_beeswarm_{}.png'.format(stage, task), dpi=100, bbox_inches='tight')
plt.close()
class Fusion_Model_Wrapper(NonImg_Model_Wrapper):
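    """
    Same pipeline as NonImg_Model_Wrapper, except that the training data are
    loaded from train_mri.csv (the cases that also have MRI), presumably so the
    non-imaging model can be fused with the imaging model on the same cohort.
    """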
def load_preprocess_data(self):
data_train = pd.read_csv(self.csv_dir + 'train_mri.csv')
for task in self.tasks:
self.train_data.append(data_train[self.features + [task]])
for i, task in enumerate(self.tasks):
self.train_data[i] = self.preprocess_pipeline(self.train_data[i], task)
            print('after the preprocess pipeline, the data frame for the {} task is:'.format(task))
print(self.train_data[i])
print('\n' * 2)