train_lit_model.py

"""
Runs a model on a single node across multiple gpus.
"""
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
from pathlib import Path
from argparse import ArgumentParser
from LitModel import *
import torch
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import LearningRateLogger
from pytorch_lightning.utilities.distributed import rank_zero_only
from pytorch_lightning.callbacks import Callback
seed_everything(1994)
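
# NOTE: this script is written against an older pytorch-lightning API (circa
# 1.0): LearningRateLogger, distributed_backend, amp_backend and the
# row/log_save_interval flags used below were renamed or removed in later
# releases (e.g. LearningRateLogger became LearningRateMonitor), so pin a
# matching version to run it unmodified.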

def setup_callbacks_loggers(args):
    """Build the TensorBoard logger, learning-rate logger and checkpoint callback."""
    log_path = Path('/home/yyousfi1/LogFiles/comma/')
    name = args.backbone
    version = args.version
    tb_logger = TensorBoardLogger(log_path, name=name, version=version)
    lr_logger = LearningRateLogger(logging_interval='epoch')
    ckpt_callback = ModelCheckpoint(filepath=Path(tb_logger.log_dir)/'checkpoints/{epoch:02d}_{val_loss:.4f}',
                                    save_top_k=10, save_last=True)

    return ckpt_callback, tb_logger, lr_logger

def main(args):
    """Main training routine specific for this project."""
    # Optionally initialise ("seed") the weights from an existing checkpoint
    # instead of training from scratch.
    if args.seed_from_checkpoint:
        print('model seeded')
        model = LitModel.load_from_checkpoint(args.seed_from_checkpoint, **vars(args))
    else:
        model = LitModel(**vars(args))

    ckpt_callback, tb_logger, lr_logger = setup_callbacks_loggers(args)

    trainer = Trainer(checkpoint_callback=ckpt_callback,
                      logger=tb_logger,
                      callbacks=[lr_logger],
                      gpus=args.gpus,
                      min_epochs=args.epochs,
                      max_epochs=args.epochs,
                      precision=16,
                      amp_backend='native',
                      row_log_interval=100,
                      log_save_interval=100,
                      distributed_backend='ddp',
                      benchmark=True,
                      sync_batchnorm=True,
                      resume_from_checkpoint=args.resume_from_checkpoint)

    trainer.logger.log_hyperparams(model.hparams)

    trainer.fit(model)
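
# With distributed_backend='ddp', Lightning launches one training process per
# GPU, so main() runs once per rank; the global seed_everything(1994) call
# above keeps weight initialisation consistent across those processes.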

def run_cli():
    root_dir = os.path.dirname(os.path.realpath(__file__))

    parent_parser = ArgumentParser(add_help=False)

    parser = LitModel.add_model_specific_args(parent_parser)

    parser.add_argument('--version',
                        default=None,
                        type=str,
                        metavar='V',
                        help='version or id of the net')
    parser.add_argument('--resume-from-checkpoint',
                        default=None,
                        type=str,
                        metavar='RFC',
                        help='path to checkpoint from which to resume training')
    parser.add_argument('--seed-from-checkpoint',
                        default=None,
                        type=str,
                        metavar='SFC',
                        help='path to checkpoint from which to seed the model weights')

    args = parser.parse_args()

    main(args)


if __name__ == '__main__':
    run_cli()
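
# Example invocation (illustrative only: --backbone, --gpus and --epochs are
# assumed to be defined by LitModel.add_model_specific_args, which lives in
# LitModel.py and is not shown in this file):
#
#   python train_lit_model.py --backbone efficientnet-b0 --gpus 4 --epochs 30 --version baseline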