from typing import Optional
import os
import shutil
import json
from datetime import datetime
import wandb
from ray import tune
from ray.tune.suggest.bohb import TuneBOHB
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.schedulers import PopulationBasedTraining, HyperBandForBOHB, ASHAScheduler
from ray.tune import CLIReporter
from ray.tune.integration.wandb import WandbLoggerCallback
from ray.tune.logger import DEFAULT_LOGGERS
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import BestRun
from pathlib import Path

BEST_TUNED_MODELS_PATH = "./best_tuned_models"


def get_immediate_child_directory_with_sub_name(parent_dir: str, sub_name: str) -> Optional[str]:
    """Return the name (not the full path) of the first immediate subdirectory of parent_dir whose name contains sub_name."""
    directory_contents = os.listdir(parent_dir)
    for item in directory_contents:
        item_path = parent_dir + "/" + item
        if not os.path.isdir(item_path):
            continue
        if sub_name in item:
            return item
    return None
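

# Illustrative usage of the helper above (hypothetical paths, not part of the original script):
# if "./ray_results/my_tune" contains a trial directory "_objective_abc123_00001_2022-01-01",
# then get_immediate_child_directory_with_sub_name("./ray_results/my_tune", "abc123")
# returns "_objective_abc123_00001_2022-01-01" -- the directory name only, not its full path.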


def save_best_model(best_run: BestRun, tune_name: str, tune_local_dir: str) -> None:
    """Copy the best trial's final checkpoint and its hyperparameters into BEST_TUNED_MODELS_PATH."""
    # Create directories
    Path(BEST_TUNED_MODELS_PATH).mkdir(parents=True, exist_ok=True)
    target_dir = BEST_TUNED_MODELS_PATH + "/" + tune_name
    # Path(target_dir).mkdir(parents=True, exist_ok=True)

    # Save model and hyperparameter configuration
    tune_dir = tune_local_dir + "/" + tune_name

    # Find best run directory
    best_run_dir = get_immediate_child_directory_with_sub_name(tune_dir, best_run.run_id)
    if best_run_dir is None:
        print(f"Could not find best run directory run_id={best_run.run_id}")
        return
    best_run_dir = tune_dir + "/" + best_run_dir

    # Go down two checkpoint levels (the Ray Tune checkpoint dir, then the HF Trainer checkpoint inside it)
    for _ in range(2):
        checkpoint_dir = get_immediate_child_directory_with_sub_name(best_run_dir, "checkpoint")
        if checkpoint_dir is None:
            print(f"Could not find best run checkpoint directory inside: {best_run_dir}")
            return
        best_run_dir += "/" + checkpoint_dir

    target_dir_path = Path(target_dir)
    if target_dir_path.exists() and target_dir_path.is_dir():
        print("Target directory already exists, removing/overriding:", target_dir)
        shutil.rmtree(target_dir_path)
    shutil.copytree(src=best_run_dir, dst=target_dir)

    # Write hyperparameters as json pretty print
    with open(target_dir + "/hyperparameters.json", 'w') as f:
        json.dump(best_run.hyperparameters, f, indent=True)
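

# Assumed checkpoint layout that save_best_model() walks (illustrative, not taken from the original script):
#   <tune_local_dir>/<tune_name>/<trial dir containing run_id>/checkpoint_<ray step>/checkpoint-<hf step>/
# The innermost Hugging Face checkpoint directory is copied to ./best_tuned_models/<tune_name>/,
# alongside a hyperparameters.json file with the winning configuration.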


def hyperparameter_tune(trainer: Trainer, training_args: TrainingArguments, data_args) -> BestRun:
    now = datetime.now()
    dt_string = now.strftime("%Y-%m-%d_%H:%M:%S")

    # Dynamically set eval_steps based on data and batch size for reasonably fine-grained HPS
    train_set_size = len(trainer.train_dataset)
    # num_gpus = torch.cuda.device_count()
    # bs = training_args.per_device_train_batch_size * (num_gpus if num_gpus > 0 else 1)
    bs = training_args.per_device_train_batch_size
    opt_steps_per_epoch = train_set_size / bs
    evals_per_epoch = 4
    eval_steps = int(opt_steps_per_epoch / evals_per_epoch)  # evaluate & checkpoint 4 times per epoch
    org_eval_steps = trainer.args.eval_steps
    org_save_steps = trainer.args.save_steps
    trainer.args.eval_steps = eval_steps
    trainer.args.save_steps = eval_steps

    # This corresponds to a full run to num_train_epochs
    max_training_iterations = int(evals_per_epoch * training_args.num_train_epochs)

    # Set this to True, and Ray will try to resume execution of a cancelled session in the experiment directory
    resume = False

    assert data_args.tune_alg in ['BOHB', 'PBT', 'ASHA']
    tune_config = {
        # "weight_decay": tune.uniform(0.0, 0.3),
        # "learning_rate": tune.uniform(1e-5, 1e-3),
        "weight_decay": tune.uniform(0.0, 0.2),
        "learning_rate": tune.uniform(5e-6, 5e-5),
        "warmup_ratio": tune.uniform(0, 0.12),
        "attention_probs_dropout_prob": tune.uniform(0, 0.2),
        "hidden_dropout_prob": tune.uniform(0, 0.2),
        "per_device_train_batch_size": tune.choice([16, 32, 64]),
        # "max_steps": -1,  # Use 1 for smoke test, else -1.
    }

    # time_attr = "training_iteration"
    time_attr = "epoch"
    if data_args.tune_alg == 'BOHB':
        scheduler = HyperBandForBOHB(
            time_attr=time_attr,
            # number of training_iterations (evaluations) to run for each trial, * 2 to allow for grace period
            # max_t=max_training_iterations * 2,
            max_t=int(training_args.num_train_epochs) * 2,
            reduction_factor=3,
            stop_last_trials=True,
        )
        search = TuneBOHB(
            # space=config_space,  # If you want to set the space manually
            max_concurrent=4
        )
    elif data_args.tune_alg == 'ASHA':
        scheduler = ASHAScheduler(
            time_attr=time_attr,
            # number of training_iterations (evaluations) to run for each trial, * 2 to allow for grace period
            # max_t=max_training_iterations,
            max_t=int(training_args.num_train_epochs),
            grace_period=1,  # Give each trial at least an epoch
            # grace_period=int(evals_per_epoch / 2),  # Give each trial at least half an epoch
            reduction_factor=3,
        )
        search = HyperOptSearch()
    else:  # data_args.tune_alg == 'PBT'
        scheduler = PopulationBasedTraining(
            time_attr=time_attr,
            # metric="eval_f1",
            # mode="max",
            # perturbation_interval=1,  # perturb at every evaluation
            perturbation_interval=1 / evals_per_epoch,  # perturb at every evaluation
            # perturbation_interval=eval_steps,
            hyperparam_mutations=tune_config,
            # synch=torch.cuda.device_count() < 2  # Use non-recommended synchronized implementation if not multi-gpu
        )
        search = None
    reporter = CLIReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "warmup_ratio": "wr",
            "attention_probs_dropout_prob": "att_do",
            "hidden_dropout_prob": "hi_do",
            "per_device_train_batch_size": "bs",
            # "per_device_train_batch_size": "train_bs/gpu",
            # "num_train_epochs": "num_epochs"
        },
        metric_columns=[
            "eval_accuracy", "eval_loss", "eval_precision", "eval_recall", "eval_f1",
            "epoch", "training_iteration"
        ])
    best_run = trainer.hyperparameter_search(
        hp_space=lambda _: tune_config,
        metric="eval_f1",
        mode="max",
        direction="maximize",
        backend="ray",
        n_trials=data_args.tune_trials,
        resources_per_trial={
            "cpu": 2,
            "gpu": 1
        },
        scheduler=scheduler,
        search_alg=search,
        keep_checkpoints_num=1,
        # checkpoint_score_attr="training_iteration",
        checkpoint_score_attr="epoch",
        stop=None,
        progress_reporter=reporter,
        # local_dir="/home/joey/extra_space/ray_results/",
        # local_dir="/home/joey/code/kb/ner_kram/ray_results/",
        local_dir=data_args.tune_local_dir,
        name=data_args.tune,
        log_to_file=True,
        fail_fast=True,
        resume=resume,
        callbacks=[WandbLoggerCallback(project="ner_kram",
                                       entity="joeyohman",
                                       group=data_args.tune + "_" + dt_string,
                                       # api_key_file="wandb_key_file.txt",
                                       # log_config=data_args.tune_alg == 'PBT',
                                       # start_method="fork",
                                       settings=wandb.Settings(start_method="fork"),
                                       )],
    )

    # Restore the original evaluation/save intervals on the trainer
    trainer.args.eval_steps = org_eval_steps
    trainer.args.save_steps = org_save_steps

    save_best_model(best_run, data_args.tune, data_args.tune_local_dir)
    return best_run
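

# Minimal usage sketch (illustrative only; assumes a Trainer built with model_init and a data_args
# object carrying the fields used above: tune, tune_alg, tune_trials and tune_local_dir):
#
#   training_args = TrainingArguments(
#       output_dir="./out",
#       evaluation_strategy="steps",
#       num_train_epochs=3,
#       per_device_train_batch_size=32,
#   )
#   trainer = Trainer(
#       model_init=model_init,            # hypothetical factory returning a fresh token-classification model
#       args=training_args,
#       train_dataset=train_dataset,      # hypothetical tokenized NER datasets
#       eval_dataset=eval_dataset,
#       compute_metrics=compute_metrics,  # must return an f1 metric (logged as eval_f1, which the search maximizes)
#   )
#   best_run = hyperparameter_tune(trainer, training_args, data_args)
#   print(best_run.hyperparameters)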