# Copyright (C) 2021 Samsung Electronics Co. LTD
# This software is a property of Samsung Electronics.
# No part of this software, either material or conceptual may be copied or distributed, transmitted,
# transcribed, stored in a retrieval system or translated into any human or computer language in any form by any means,
# electronic, mechanical, manual or otherwise, or disclosed
# to third parties without the express written permission of Samsung Electronics.
# The following items were modified and can be claimed as property of Samsung Electronics.
# (1) Add get_optimizer() interface
# (2) Add a batch sampler (SplitRandomSampler) for 4-bin splitting training data
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.distributed as dist
import os
import time
import math
import h5py
import torch.cuda.nvtx as nvtx
from apex.optimizers import FusedLAMB, FusedAdam, FusedSGD
from optim import ACClip, MADGRAD
from apex.contrib.optimizers.distributed_fused_lamb import DistributedFusedLAMB
from apex.contrib.optimizers.distributed_fused_adam_v3 import DistributedFusedAdamV3
from typing import Callable, Optional, Tuple
try:
from torch.utils.tensorboard import SummaryWriter
has_tensorboard = True
except ImportError:
has_tensorboard = False
try:
from apex import amp
has_apex = True
except ImportError:
amp = None
has_apex = False
from contextlib import contextmanager
from functools import partial
import logging.config
import random
_TENSOR_PARALLEL_GROUP = None # ProcessGroup
_DATA_PARALLEL_GROUP = None # ProcessGroup
_PIPELINE_PARALLEL_GROUP = None # ProcessGroup
_TENSOR_PARALLEL_WORLD_SIZE = None
_PIPELINE_PARALLEL_WORLD_SIZE = None
_DATA_PARALLEL_RANKS = None # rank list
_TENSOR_PARALLEL_RANKS = None # rank list
_PIPELINE_PARALLEL_RANKS = None # rank list
_DATA_PARALLEL_RANK = None  # rank of this process (GPU)
_TENSOR_PARALLEL_RANK = None  # rank of this process (GPU)
_PIPELINE_PARALLEL_RANK = None  # rank of this process (GPU)
def generate_seeds(rng, size):
"""
Generate list of random seeds
:param rng: random number generator
:param size: length of the returned list
"""
seeds = [rng.randint(0, 2 ** 32 - 1) for _ in range(size)]
return seeds
def broadcast_seeds(seeds, device):
"""
Broadcasts random seeds to all distributed workers.
Returns list of random seeds (broadcasted from workers with rank 0).
:param seeds: list of seeds (integers)
:param device: torch.device
"""
if torch.distributed.is_available() and torch.distributed.is_initialized():
seeds_tensor = torch.LongTensor(seeds).to(device)
torch.distributed.broadcast(seeds_tensor, 0)
seeds = seeds_tensor.tolist()
return seeds
def setup_seeds(master_seed, epochs, device):
"""
Generates seeds from one master_seed.
    Function returns (worker_seeds, shuffling_seeds): worker_seeds are later
    used to initialize per-worker random number generators (mostly for
    dropouts), and shuffling_seeds seed the RNGs responsible for reshuffling
    the dataset before each epoch.
Seeds are generated on worker with rank 0 and broadcasted to all other
workers.
:param master_seed: master RNG seed used to initialize other generators
:param epochs: number of epochs
:param device: torch.device (used for distributed.broadcast)
"""
if master_seed is None:
# random master seed, random.SystemRandom() uses /dev/urandom on Unix
master_seed = random.SystemRandom().randint(0, 2 ** 32 - 1)
if get_rank() == 0:
# master seed is reported only from rank=0 worker, it's to avoid
# confusion, seeds from rank=0 are later broadcasted to other
# workers
logging.info(f'Using random master seed: {master_seed}')
else:
# master seed was specified from command line
logging.info(f'Using master seed from command line: {master_seed}')
# initialize seeding RNG
seeding_rng = random.Random(master_seed)
# generate worker seeds, one seed for every distributed worker
worker_seeds = generate_seeds(seeding_rng, get_world_size())
# generate seeds for data shuffling, one seed for every epoch
shuffling_seeds = generate_seeds(seeding_rng, epochs)
# broadcast seeds from rank=0 to other workers
worker_seeds = broadcast_seeds(worker_seeds, device)
shuffling_seeds = broadcast_seeds(shuffling_seeds, device)
return worker_seeds, shuffling_seeds
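# A minimal sketch of how the seeding helpers are typically wired into a
# training script; the `args` fields and the epoch loop are placeholder
# assumptions, not part of this module:
#
#   device = set_device(cuda=True, local_rank=args.local_rank)
#   worker_seeds, shuffling_seeds = setup_seeds(args.seed, args.epochs, device)
#   torch.manual_seed(worker_seeds[get_rank()])  # per-worker RNG (e.g. dropout)
#   for epoch in range(args.epochs):
#       epoch_generator = torch.Generator().manual_seed(shuffling_seeds[epoch])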
def barrier():
"""
    Acts as a temporary distributed barrier; at the time this was written,
    PyTorch did not implement barrier() for the NCCL backend.
    Calls all_reduce on a dummy tensor and synchronizes with the GPU.
"""
if torch.distributed.is_available() and torch.distributed.is_initialized():
torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
torch.cuda.synchronize()
def get_rank():
"""
Gets distributed rank or returns zero if distributed is not initialized.
"""
if torch.distributed.is_available() and torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
else:
rank = 0
return rank
def get_world_size():
"""
Gets total number of distributed workers or returns one if distributed is
not initialized.
"""
if torch.distributed.is_available() and torch.distributed.is_initialized():
world_size = torch.distributed.get_world_size()
else:
world_size = 1
return world_size
def set_device(cuda, local_rank):
"""
Sets device based on local_rank and returns instance of torch.device.
:param cuda: if True: use cuda
:param local_rank: local rank of the worker
"""
if cuda:
torch.cuda.set_device(local_rank)
device = torch.device('cuda')
else:
device = torch.device('cpu')
return device
@contextmanager
def sync_workers():
"""
Yields distributed rank and synchronizes all workers on exit.
"""
rank = get_rank()
yield rank
barrier()
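# Typical use (sketch): rank-0-only work, such as saving a checkpoint, without
# letting other workers race ahead; `model` and the path are placeholders:
#
#   with sync_workers() as rank:
#       if rank == 0:
#           torch.save(model.state_dict(), "checkpoint.pt")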
def is_main_process():
return get_rank() == 0
def format_step(step):
if isinstance(step, str):
return step
s = ""
if len(step) > 0:
s += "Training Epoch: {} ".format(step[0])
if len(step) > 1:
s += "Training Iteration: {} ".format(step[1])
if len(step) > 2:
s += "Validation Iteration: {} ".format(step[2])
return s
def has_hooks(module: torch.nn.Module):
""" Returns True if the module uses hooks. """
for hooks in (module._forward_pre_hooks, # pylint: disable=protected-access
module._forward_hooks, module._backward_hooks): # pylint: disable=protected-access
if hooks:
return True
return False
def get_optimizer(name, parameters, lr=0.1, betas=(0.9, 0.999), wd=1e-4, eps=1e-6):
    """Build an optimizer by (case-insensitive) name. Supported names: fusedlamb,
    fusedsgd, fusedadam, fusedacclip, madgrad, distributedfusedlamb."""
if name.lower() == "fusedlamb":
optimizer = FusedLAMB(parameters, lr=lr, betas=betas, eps=eps, weight_decay=wd, max_grad_norm=1.0)
elif name.lower() == 'fusedsgd':
optimizer = FusedSGD(parameters, lr=lr, momentum=betas[0], weight_decay=wd)
elif name.lower() == 'fusedadam':
optimizer = FusedAdam(parameters, lr=lr, betas=betas, eps=eps, weight_decay=wd)
elif name.lower() == "fusedacclip":
optimizer = ACClip(parameters, lr=lr, betas=betas, eps=eps, weight_decay=wd, max_grad_norm=1.0)
elif name.lower() == 'madgrad':
optimizer = MADGRAD(parameters, lr=lr, momentum=betas[0], eps=eps, weight_decay=wd, max_grad_norm=1.0)
elif name.lower() == 'distributedfusedlamb':
optimizer = DistributedFusedLAMB(parameters, lr=lr, betas=betas, eps=eps, weight_decay=wd, max_grad_norm=1.0,
overlap_reductions=True, clip_after_ar=True,
dwu_num_blocks=4, dwu_num_chunks=1, dwu_num_rs_pg=1,
dwu_num_ar_pg=1, dwu_num_ag_pg=1, use_nvlamb=False)
    else:
        raise ValueError("Unsupported optimizer type: {}".format(name))
return optimizer
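# Example (sketch): building the FusedLAMB optimizer for a model; `model` and
# the hyperparameter values are illustrative placeholders:
#
#   optimizer = get_optimizer("fusedlamb", model.parameters(),
#                             lr=6e-3, betas=(0.9, 0.999), wd=0.01, eps=1e-6)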
def has_children(module):
try:
next(module.children())
return True
except StopIteration:
return False
def hasNaN(parameters):
    """Return True if any parameter's gradient contains a NaN."""
    for p in parameters:
        if p.grad is not None and torch.any(p.grad.isnan()):
            return True
    return False
def ema(avg, beta, yi, i):
    """Exponential moving average with bias correction.
    Returns (raw_avg, corrected_avg); the correction divides by
    (1 - beta ** (i + 1)) to compensate for the zero initialization, as in Adam."""
    if avg is None:
        avg = 0
    avg = beta * avg + (1 - beta) * yi
    return avg, avg / (1 - beta ** (i + 1))
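# Worked example of the bias correction: with beta=0.99 and a constant signal
# yi=1.0, the raw average after the first update (i=0) is only 0.01, but the
# corrected value 0.01 / (1 - 0.99 ** 1) = 1.0 already matches the signal:
#
#   avg, corrected = ema(None, 0.99, 1.0, 0)  # -> (0.01, 1.0)
#   avg, corrected = ema(avg, 0.99, 1.0, 1)   # -> (0.0199, 1.0)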
class GradientNoiseScale:
"""
    A class to measure the gradient noise scale of a model while training
    (cf. "An Empirical Model of Large-Batch Training", https://arxiv.org/abs/1812.06162).
The core thesis of the paper is that, if our batch size is small, there will be a lot of noise present in the gradients, and we might update our weights only on noise.
After several updates the optimizer may still push us in the right direction, but we would be better off having used a larger batch size, which is more computationally
efficient and directly averages out the noise in the gradients.
But there's a limit to the gains large batch sizes can give you - if, after a certain batch size, your gradient is already accurate, there's no point in increasing the
batch size further, as we'll just be wasting compute for little to no gain in accuracy.
This means there is some theoretically optimal batch size for a given model, which measuring the gradient noise scale can help us to estimate.
To estimate the 'simple' noise scale (Bsimple), we need to have a measure of the gradients using a large batch size (Bbig) and a small
batch size (Bsmall).
    Once we have both measurements:
Bsimple ≈ (tr(Σ) / |G|^2)
tr(Σ) can be approximated by:
tr(Σ) ≈ (1 / ((1/Bsmall) - (1/Bbig))) * (|Gsmall|^2 - |Gbig|^2)
and |G|^2 by:
|G|^2 ≈ (1 / (Bbig - Bsmall)) * (Bbig*|Gbig|^2 - Bsmall*|Gsmall|^2)
- With multi-gpu training, we can do this by taking the gradients of the microbatch_size_per_gpu for Bsmall,
and the gradients of the entire batch for Bbig.
- Alternatively, we can just take Bsmall as a single batch, and Bbig as several sequential batches in a row.
This is the option we've opted for in this implementation because a) it's easier to implement and b) also works in
single-gpu environments. Unfortunately it does come with some memory overhead.
"""
def __init__(self, batch_size_small, n_batches=20, beta=0.99, args=None):
self.batch_size_small = batch_size_small
self.batch_size_large = batch_size_small * n_batches
self.n_batches = n_batches
self.beta = beta
self.buffer = None
self.ema_scale = None
self.ema_noise = None
self.noise_scale = None
self.n_updates = 0
self.args = args
def _update(self, master_grads):
grad = torch._utils._flatten_dense_tensors(master_grads)
is_overflow = grad is None
if is_overflow:
return
if self.buffer is None:
self.buffer = grad
else:
self.buffer += grad
if self.n_updates % self.n_batches == self.n_batches - 1:
            # average the accumulated grads every n_batches iterations to simulate Bbig
self.buffer /= self.n_batches
grads = self.buffer
self.buffer = None
# calculate Gbig and Gsmall
# this needs to be done in fp32 or it overflows
g_big = torch.square(torch.norm(grads))
g_small = torch.square(torch.norm(grad))
            # skip this update if either norm overflowed to inf/NaN
is_overflow = (g_small.isinf().any() or g_small.isnan().any() or g_big.isinf().any() or g_big.isnan().any())
if is_overflow:
return
            # calculate noise / scale: `noise` estimates |G|^2 and `scale`
            # estimates tr(Σ), matching the docstring formulas; their ratio is Bsimple
            noise = 1 / (self.batch_size_large - self.batch_size_small) * (
                self.batch_size_large * g_big - self.batch_size_small * g_small)
            scale = 1 / (1 / self.batch_size_small - 1 / self.batch_size_large) * (g_small - g_big)
# calculate running average
self.ema_noise, noise = ema(self.ema_noise, self.beta, noise, self.n_updates)
self.ema_scale, scale = ema(self.ema_scale, self.beta, scale, self.n_updates)
# calculate noise scale
self.noise_scale = (scale / noise)
self.n_updates += 1
def update(self, master_grads):
self._update(master_grads)
def get_noise_scale_logger(args):
noise_scale_logger = GradientNoiseScale(batch_size_small=args.train_batch_size, n_batches=10)
return noise_scale_logger
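# Sketch of feeding the logger during training; the loop and gradient
# extraction below are assumptions based on the class docstring, not code
# from this module:
#
#   noise_scale_logger = get_noise_scale_logger(args)
#   for batch in dataloader:
#       loss = model(batch)
#       loss.backward()
#       noise_scale_logger.update(
#           [p.grad for p in model.parameters() if p.grad is not None])
#       if noise_scale_logger.noise_scale is not None:
#           logging.info("B_simple estimate: %s", noise_scale_logger.noise_scale)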
# Notes on related optimizers:
# - APOLLO: an adaptive parameter-wise diagonal quasi-Newton method for nonconvex optimization.
# - Parameter-wise clipping performs slightly better.
# - Momentum extension of clipped SGD ("Stability and Convergence of Stochastic
#   Gradient Clipping"): the key is to clip the momentum, not the gradients.
def model_parallel_is_initialized():
"""Check if model and data parallel groups are initialized."""
if _TENSOR_PARALLEL_GROUP is None or \
_PIPELINE_PARALLEL_GROUP is None or \
_DATA_PARALLEL_GROUP is None:
return False
return True
def get_tensor_parallel_group():
"""Get the tensor model parallel group the caller rank belongs to."""
assert _TENSOR_PARALLEL_GROUP is not None, \
        'tensor model parallel group is not initialized'
return _TENSOR_PARALLEL_GROUP
def get_pipeline_parallel_group():
"""Get the pipeline model parallel group the caller rank belongs to."""
assert _PIPELINE_PARALLEL_GROUP is not None, \
        'pipeline model parallel group is not initialized'
return _PIPELINE_PARALLEL_GROUP
def get_data_parallel_group():
"""Get the data parallel group the caller rank belongs to."""
assert _DATA_PARALLEL_GROUP is not None, \
'data parallel group is not initialized'
return _DATA_PARALLEL_GROUP
def set_data_parallel_group(group):
global _DATA_PARALLEL_GROUP
_DATA_PARALLEL_GROUP = group
def set_data_parallel_ranks(ranks):
global _DATA_PARALLEL_RANKS
_DATA_PARALLEL_RANKS = ranks
def set_tensor_parallel_group(group):
global _TENSOR_PARALLEL_GROUP
_TENSOR_PARALLEL_GROUP = group
def set_tensor_parallel_ranks(ranks):
global _TENSOR_PARALLEL_RANKS
_TENSOR_PARALLEL_RANKS = ranks
def set_pipeline_parallel_group(group):
global _PIPELINE_PARALLEL_GROUP
_PIPELINE_PARALLEL_GROUP = group
def set_pipeline_parallel_ranks(ranks):
global _PIPELINE_PARALLEL_RANKS
_PIPELINE_PARALLEL_RANKS = ranks
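# Sketch of how an initializer elsewhere is expected to populate these globals;
# the contiguous-rank grouping and `tp_size` are assumptions for illustration
# (torch.distributed.new_group must be called on every rank for every group):
#
#   world_size = get_world_size()
#   for start in range(0, world_size, tp_size):
#       ranks = list(range(start, start + tp_size))
#       group = torch.distributed.new_group(ranks)
#       if get_rank() in ranks:
#           set_tensor_parallel_group(group)
#           set_tensor_parallel_ranks(ranks)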
def get_tensor_parallel_world_size():
"""Return world size for the tensor model parallel group."""
global _TENSOR_PARALLEL_WORLD_SIZE
if _TENSOR_PARALLEL_WORLD_SIZE is not None:
return _TENSOR_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(group=get_tensor_parallel_group())
def get_pipeline_parallel_world_size():
"""Return world size for the pipeline model parallel group."""
global _PIPELINE_PARALLEL_WORLD_SIZE
if _PIPELINE_PARALLEL_WORLD_SIZE is not None:
return _PIPELINE_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(group=get_pipeline_parallel_group())
def get_tensor_parallel_rank():
"""Return my rank for the tensor model parallel group."""
global _TENSOR_PARALLEL_RANK
if _TENSOR_PARALLEL_RANK is not None:
return _TENSOR_PARALLEL_RANK
return torch.distributed.get_rank(group=get_tensor_parallel_group())
def get_pipeline_parallel_rank():
"""Return my rank for the pipeline model parallel group."""
global _PIPELINE_PARALLEL_RANK
if _PIPELINE_PARALLEL_RANK is not None:
return _PIPELINE_PARALLEL_RANK
return torch.distributed.get_rank(group=get_pipeline_parallel_group())
def get_tensor_parallel_src_rank():
"""Calculate the global rank corresponding to the first local rank
in the tensor model parallel group."""
global_rank = torch.distributed.get_rank()
local_world_size = get_tensor_parallel_world_size()
return (global_rank // local_world_size) * local_world_size
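# E.g., with a tensor parallel world size of 4, global ranks 4..7 form one
# group; rank 5 maps to source rank (5 // 4) * 4 == 4.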
def get_data_parallel_world_size():
"""Return world size for the data parallel group."""
return torch.distributed.get_world_size(group=get_data_parallel_group())
def get_data_parallel_rank():
"""Return my rank for the data parallel group."""
return torch.distributed.get_rank(group=get_data_parallel_group())
def get_data_parallel_ranks():
return _DATA_PARALLEL_RANKS
def get_tensor_parallel_ranks():
return _TENSOR_PARALLEL_RANKS
def get_pipeline_parallel_ranks():
return _PIPELINE_PARALLEL_RANKS
def destroy_parallel():
"""Set the groups to none."""
global _TENSOR_PARALLEL_GROUP
_TENSOR_PARALLEL_GROUP = None
global _PIPELINE_PARALLEL_GROUP
_PIPELINE_PARALLEL_GROUP = None
global _DATA_PARALLEL_GROUP
_DATA_PARALLEL_GROUP = None
def split_tensor_along_last_dim(
tensor: torch.Tensor, num_partitions: int, contiguous_split_chunks: bool = False
) -> Tuple[torch.Tensor, ...]:
"""Split a tensor along its last dimension.
Arguments:
tensor: input tensor.
        num_partitions: number of partitions to split the tensor into.
contiguous_split_chunks: If True, make each chunk contiguous
in memory.
"""
# Get the size and dimension.
last_dim = tensor.dim() - 1
last_dim_size = tensor.size()[last_dim] // num_partitions
# Split.
tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
# Note: torch.split does not create contiguous tensors by default.
if contiguous_split_chunks:
return tuple(chunk.contiguous() for chunk in tensor_list)
return tensor_list
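# Example (sketch): splitting a (2, 8) activation across 4 tensor parallel
# ranks yields four (2, 2) chunks:
#
#   x = torch.randn(2, 8)
#   chunks = split_tensor_along_last_dim(x, 4, contiguous_split_chunks=True)
#   assert len(chunks) == 4 and chunks[0].shape == (2, 2)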
def _initialize_affine_weight(
weight: torch.Tensor,
out_features: int,
in_features: int,
per_partition_size: int,
partition_dim: int,
init_method: Callable[[torch.Tensor], torch.Tensor],
stride: int = 1,
return_master_weight: bool = False,
) -> Optional[torch.Tensor]:
"""Initialize affine weight for model parallel.
Build the master weight on all processes and scatter
the relevant chunk."""
# If we only use 1 process for model parallelism, bypass scatter.
world_size = get_tensor_parallel_world_size()
if world_size == 1:
init_method(weight)
if return_master_weight:
return weight
return None
# Initialize master weight
master_weight = torch.empty(out_features, in_features, dtype=weight.dtype, requires_grad=False)
init_method(master_weight)
# Split and copy
per_partition_per_stride_size = per_partition_size // stride
weight_list = torch.split(master_weight, per_partition_per_stride_size, dim=partition_dim)
rank = get_tensor_parallel_rank()
my_weight_list = weight_list[rank::world_size]
with torch.no_grad():
torch.cat(my_weight_list, dim=partition_dim, out=weight)
if return_master_weight:
return master_weight
return None
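# Illustration of the stride interleaving above: with world_size=2 and stride=2
# the master weight splits into 4 chunks [c0, c1, c2, c3]; rank 0 takes
# [c0, c2] and rank 1 takes [c1, c3], i.e. one chunk per stride group.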
def is_pipeline_master():
return get_pipeline_parallel_rank() == 0
class SplitRandomSampler:
    """Sampler that interleaves samples from several dataset splits (bins) at a
    fixed per-batch ratio. With the default ratio (6, 3, 2, 5), every run of 16
    consecutive indices draws 6/3/2/5 samples from bins 0-3; indices are global
    offsets into the concatenation of the input files."""
    def __init__(self, input_files, batch_ratio=(6, 3, 2, 5), generator=None) -> None:
        # Read the number of samples in each split from its HDF5 file.
        self.num_samples = []
        for input_file in input_files:
            with h5py.File(input_file, 'r') as h5_ifile:
                self.num_samples.append(h5_ifile['next_sentence_labels'].shape[0])
        self.generator = generator
        self.batch_ratio = batch_ratio
    def __iter__(self):
        if self.generator is None:
            generator = torch.Generator()
            generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item()))
        else:
            generator = self.generator
        # The smallest split (relative to its ratio count) limits how many
        # ratio-sized groups every split can contribute.
        max_len = min(self.num_samples[i] // cnt for i, cnt in enumerate(self.batch_ratio))
        rand_idxs = []
        offset = 0
        for num_sample, batch_cnt in zip(self.num_samples, self.batch_ratio):
            # Shuffle the split, truncate to a multiple of batch_cnt, reshape to
            # (groups, batch_cnt), and shift local indices to global offsets.
            perm = torch.randperm(num_sample, generator=generator)
            perm = perm[:(num_sample // batch_cnt) * batch_cnt].view(-1, batch_cnt)
            rand_idxs.append(perm[:max_len, :] + offset)
            offset += num_sample
        # (max_len, sum(batch_ratio)) -> flat list of global sample indices.
        rand_idxs = torch.flatten(torch.cat(rand_idxs, dim=-1))
        yield from rand_idxs.tolist()
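# Example (sketch): pairing the sampler with a DataLoader; the file paths and
# `dataset` object are placeholders:
#
#   sampler = SplitRandomSampler(["bin0.hdf5", "bin1.hdf5", "bin2.hdf5", "bin3.hdf5"])
#   loader = torch.utils.data.DataLoader(dataset, batch_size=16, sampler=sampler)
#   # each batch of 16 then draws 6/3/2/5 samples from bins 0-3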