"""Train the model"""
import numpy as np
import pandas as pd
import argparse
import config
import utils
import sys
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import pickle
from preprocess import preprocess
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_score
def parse_searcher(searcher):
    """Extract results from a fitted grid search.

    Args:
        searcher: fitted GridSearchCV object

    Returns:
        best_model, best_params, train_acc, val_acc
    """
    train_accs = searcher.cv_results_['mean_train_score']
    val_accs = searcher.cv_results_['mean_test_score']
    best_idx = searcher.best_index_
    best_params = searcher.best_params_
    train_acc, val_acc = train_accs[best_idx], val_accs[best_idx]
    best_model = searcher.best_estimator_
    return best_model, best_params, train_acc, val_acc

def train(X_train, y_train, estimator, param_grid, seed=1, n_jobs=1):
    """Train the model, tuning hyper-parameters if a grid is given.

    Args:
        X_train (pd.DataFrame): features (train)
        y_train (pd.DataFrame): label (train)
        estimator (sklearn estimator): model
        param_grid (dict): hyper-parameters to tune, or None to skip tuning
        seed (int): seed for training
        n_jobs (int): number of parallel jobs

    Returns:
        best_model and a result dict with best_params, train_acc and val_acc
    """
    np.random.seed(seed)
    if param_grid is not None:
        # tune hyper-parameters with 5-fold cross validation
        # (the `iid` argument was removed in scikit-learn 0.24, so it is not passed here)
        searcher = GridSearchCV(estimator, param_grid, cv=5, n_jobs=n_jobs,
                                return_train_score=True)
        searcher.fit(X_train, y_train)
        best_model, best_params, train_acc, val_acc = parse_searcher(searcher)
    else:
        # if no hyper-parameter grid is given, train directly
        best_model = estimator
        val_acc = cross_val_score(best_model, X_train, y_train, cv=5).mean()
        best_model.fit(X_train, y_train)
        train_acc = best_model.score(X_train, y_train)
        best_params = {}
    result = {"best_params": best_params, "train_acc": train_acc, "val_acc": val_acc}
    return best_model, result

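# Example usage of train() with a small explicit grid. This is an illustrative
# sketch only; the estimator and grid below are assumptions, not part of this
# project's model definitions.
#
#     from sklearn.tree import DecisionTreeClassifier
#     best_model, result = train(X_train, y_train, DecisionTreeClassifier(),
#                                {"max_depth": [2, 4, 8]}, seed=1, n_jobs=1)
#     # result -> {"best_params": {"max_depth": ...}, "train_acc": ..., "val_acc": ...}
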
def evaluate(best_model, X_test_list, y_test_list, test_files):
    """Evaluate the fitted model on each test set, reporting accuracy and F1."""
    result = {}
    for X_test, y_test, file in zip(X_test_list, y_test_list, test_files):
        y_pred = best_model.predict(X_test)
        test_acc = np.mean(y_pred == y_test)
        result[file + "_test_acc"] = test_acc
        if len(set(y_test)) > 2:
            # multi-class: macro-average the per-class F1 scores
            test_f1 = f1_score(y_test, y_pred, average='macro')
        else:
            test_f1 = f1_score(y_test, y_pred)
        result[file + "_test_f1"] = test_f1
    return result

def get_coarse_grid(model, seed, n_jobs, N):
    """Get hyper-parameters for the coarse random search."""
    np.random.seed(seed)
    low, high = model["hyperparams_range"]
    if model["hyperparams_type"] == "real":
        # sample 20 values log-uniformly between 10**low and 10**high
        param_grid = {model['hyperparams']: 10 ** np.random.uniform(low, high, 20)}
    if model["hyperparams_type"] == "int":
        if model["name"] == "knn_classification":
            # with 5-fold CV each training fold holds 4/5 of the data,
            # so k cannot exceed that many samples
            high = min(high, int(N / 5 * 4))
        param_grid = {model['hyperparams']: np.random.randint(low, high, 20)}
    return param_grid

def get_fine_grid(model, best_param_coarse, n_jobs, N):
    """Get hyper-parameters for the fine grid search, centered on the best
    parameter found by the coarse search."""
    if model["hyperparams_type"] == "real":
        # search half a decade on either side of the coarse optimum
        base = np.log10(best_param_coarse)
        param_grid = {model['hyperparams']: np.linspace(10 ** (base - 0.5), 10 ** (base + 0.5), 20)}
    if model["hyperparams_type"] == "int":
        # search a window of 20 integers around the coarse optimum
        low = max(best_param_coarse - 10, 1)
        high = low + 20
        if model["name"] == "knn_classification":
            high = min(high, int(N / 5 * 4))
        param_grid = {model['hyperparams']: np.arange(low, high)}
    return param_grid

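# Worked example of the real-valued case (illustrative numbers; "C" stands in
# for whatever model['hyperparams'] names): if the coarse search picks
# C = 0.03, then base = log10(0.03) ≈ -1.52, and the fine grid spans
# 10**-2.02 ≈ 0.0095 up to 10**-1.02 ≈ 0.095, i.e. 20 linearly spaced values
# covering one decade centered on the coarse optimum.
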
def hyperparam_search(X_train, y_train, model, n_jobs=1, seed=1):
    """Run a coarse random search followed by a fine grid search and keep the
    better of the two models."""
    np.random.seed(seed)
    coarse_param_seed, coarse_train_seed, fine_train_seed = np.random.randint(1000, size=3)
    fixed_params = model["fixed_params"]
    if "parallelable" in model.keys() and model['parallelable']:
        fixed_params["n_jobs"] = n_jobs
    estimator = model["fn"](**fixed_params)
    # hyper-parameter search
    if "hyperparams" not in model.keys():
        # if there is no hyper-parameter to tune, train directly
        best_model, result = train(X_train, y_train, estimator, None, n_jobs=n_jobs, seed=coarse_train_seed)
    else:
        # coarse random search
        param_grid = get_coarse_grid(model, coarse_param_seed, n_jobs, len(y_train))
        best_model_coarse, result_coarse = train(X_train, y_train, estimator, param_grid,
                                                 n_jobs=n_jobs, seed=coarse_train_seed)
        val_acc_coarse = result_coarse['val_acc']
        # fine grid search around the best coarse parameter
        best_param_coarse = result_coarse['best_params'][model['hyperparams']]
        param_grid = get_fine_grid(model, best_param_coarse, n_jobs, len(y_train))
        best_model_fine, result_fine = train(X_train, y_train, estimator, param_grid,
                                             n_jobs=n_jobs, seed=fine_train_seed)
        val_acc_fine = result_fine['val_acc']
        if val_acc_fine > val_acc_coarse:
            result = result_fine
            best_model = best_model_fine
        else:
            result = result_coarse
            best_model = best_model_coarse
        # convert int to float to avoid a json serialization error
        # (kept inside this branch: models without "hyperparams" have no
        # "hyperparams_type" key and an empty best_params dict)
        if model["hyperparams_type"] == "int":
            result['best_params'][model["hyperparams"]] *= 1.0
    return best_model, result

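# A hypothetical sketch of the `model` dict this module expects (the real
# definitions live in model.py, per train_and_evaluate's docstring). The keys
# below are exactly the ones read by hyperparam_search, get_coarse_grid and
# get_fine_grid; the estimator and range values are illustrative assumptions:
#
#     from sklearn.linear_model import LogisticRegression
#     example_model = {
#         "name": "logistic_regression",        # used for model-specific rules
#         "fn": LogisticRegression,             # estimator constructor
#         "fixed_params": {"max_iter": 1000},   # passed to the constructor
#         "parallelable": True,                 # if True, n_jobs is added to fixed_params
#         "hyperparams": "C",                   # name of the single tuned parameter
#         "hyperparams_type": "real",           # "real" or "int"
#         "hyperparams_range": (-5, 5),         # log10 bounds for the coarse search
#     }
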
def train_and_evaluate(X_train, y_train, X_test_list, y_test_list, test_files, model, n_jobs=1, seed=1):
    """Search hyper-parameters, then evaluate on the test sets.

    Args:
        X_train (pd.DataFrame): features (train)
        y_train (pd.DataFrame): label (train)
        X_test_list (list): list of features (test)
        y_test_list (list): list of labels (test)
        test_files (list): list of filenames of the test sets
        model (dict): ml model dict defined in model.py
        n_jobs (int): number of parallel jobs
        seed (int): seed for training
    """
    best_model, result_train = hyperparam_search(X_train, y_train, model, n_jobs, seed)
    result_test = evaluate(best_model, X_test_list, y_test_list, test_files)
    result = {**result_train, **result_test}
    return result
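
if __name__ == "__main__":
    # Minimal smoke test of the full pipeline on synthetic data. This block is
    # an illustrative sketch, not part of the original training entry point;
    # the model dict is a hypothetical example (real ones come from model.py).
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=300, n_features=10, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    example_model = {
        "name": "logistic_regression",
        "fn": LogisticRegression,
        "fixed_params": {"max_iter": 1000},
        "hyperparams": "C",
        "hyperparams_type": "real",
        "hyperparams_range": (-5, 5),
    }
    print(train_and_evaluate(X_train, y_train, [X_test], [y_test],
                             ["synthetic"], example_model))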