# train.py (forked from anosorae/IRRA)
import os
import os.path as op
import random
import time

import numpy as np
import torch

from irra.datasets import build_dataloader
from irra.model import build_model
from irra.processor.processor import do_train
from irra.solver import build_optimizer, build_lr_scheduler
from irra.utils.checkpoint import Checkpointer
from irra.utils.comm import get_rank, synchronize
from irra.utils.iotools import save_train_configs
from irra.utils.logger import setup_logger
from irra.utils.metrics import Evaluator
from irra.utils.options import get_args

def set_seed(seed=0):
    """Seed every RNG in use (Python, NumPy, PyTorch CPU and CUDA)."""
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN benchmarking picks the fastest (possibly nondeterministic) kernels,
    # which would defeat deterministic=True, so it is disabled here.
    torch.backends.cudnn.benchmark = False
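
# Quick reproducibility check (sketch only, run separately, not part of this script):
#   set_seed(0); a = torch.randn(3)
#   set_seed(0); b = torch.randn(3)
#   assert torch.equal(a, b)
# In the entry point below, each DDP rank is seeded with 1 + get_rank(), so ranks
# draw different augmentation streams while each run stays repeatable.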

if __name__ == '__main__':
    args = get_args()
    set_seed(1 + get_rank())  # distinct seed per DDP rank
    name = args.name

    # WORLD_SIZE is set by the distributed launcher; default to a single GPU
    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        synchronize()

    device = "cuda"
    cur_time = time.strftime("%Y%m%d_%H%M%S", time.localtime())
    args.output_dir = op.join(args.output_dir, args.dataset_name, f'{cur_time}_{name}')
    logger = setup_logger('IRRA', save_dir=args.output_dir, if_train=args.training, distributed_rank=get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(str(args).replace(',', '\n'))
    save_train_configs(args.output_dir, args)

    # build dataloaders for the image-text pair dataset
    train_loader, val_img_loader, val_txt_loader, num_classes = build_dataloader(args)
    model = build_model(args, num_classes)
    logger.info('Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))
    model.to(device)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    optimizer = build_optimizer(args, model)
    scheduler = build_lr_scheduler(args, optimizer)

    is_master = get_rank() == 0
    checkpointer = Checkpointer(model, optimizer, scheduler, args.output_dir, is_master)
    evaluator = Evaluator(val_img_loader, val_txt_loader)

    start_epoch = 1
    if args.resume:
        checkpoint = checkpointer.resume(args.resume_ckpt_file)
        start_epoch = checkpoint['epoch']

    do_train(start_epoch, args, model, train_loader, evaluator, optimizer, scheduler, checkpointer)
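
# Launch sketch (hedged): the exact CLI flags live in irra.utils.options.get_args,
# which is not shown here; the flag names below are assumptions inferred from the
# attributes accessed above (args.name, args.output_dir, args.dataset_name, ...).
#
# Single GPU:
#   python train.py --name irra --output_dir logs --dataset_name CUHK-PEDES
#
# Multi-GPU DDP: init_method="env://" reads WORLD_SIZE/RANK from the launcher,
# and torch.distributed.launch also passes --local_rank to the script:
#   python -m torch.distributed.launch --nproc_per_node=4 train.py \
#       --name irra --output_dir logs --dataset_name CUHK-PEDES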