From ca480eb338f8986b718e402965e99b3dc3c0ad82 Mon Sep 17 00:00:00 2001 From: tango4j Date: Fri, 22 Nov 2024 01:49:02 +0000 Subject: [PATCH 1/6] Apply isort and black reformatting Signed-off-by: tango4j --- nemo/collections/asr/metrics/der.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/metrics/der.py b/nemo/collections/asr/metrics/der.py index ee9e9b36424f..c8dec24eaaca 100644 --- a/nemo/collections/asr/metrics/der.py +++ b/nemo/collections/asr/metrics/der.py @@ -147,14 +147,14 @@ def score_labels( AUDIO_RTTM_MAP (dict): Dictionary containing information provided from manifestpath all_reference (list[uniq_name,Annotation]): reference annotations for score calculation all_hypothesis (list[uniq_name,Annotation]): hypothesis annotations for score calculation - all_uem (list[list[float]]): List of UEM segments for each audio file. If UEM file is not provided, + all_uem (list[list[float]]): List of UEM segments for each audio file. If UEM file is not provided, it will be read from manifestpath collar (float): Length of collar (in seconds) for diarization error rate calculation ignore_overlap (bool): If True, overlapping segments in reference and hypothesis will be ignored verbose (bool): If True, warning messages will be printed Returns: - metric (pyannote.DiarizationErrorRate): Pyannote Diarization Error Rate metric object. + metric (pyannote.DiarizationErrorRate): Pyannote Diarization Error Rate metric object. This object contains detailed scores of each audiofile. mapping (dict): Mapping dict containing the mapping speaker label for each audio input itemized_errors (tuple): Tuple containing (DER, CER, FA, MISS) for each audio file. From e731d4818895ed179e1e0bf0949005f48e38e0f7 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Thu, 21 Nov 2024 20:03:10 -0800 Subject: [PATCH 2/6] Revert "update hypothesis when passed through cfg (#11366)" (#11373) This reverts commit 8ab46ffcb0fc8b892d2e35b166772076fb630e8f. 
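
The reverted assignment mixes `or` with a conditional expression without parentheses; in Python the conditional binds more loosely than `or`, so the whole `or` clause becomes its true branch. A minimal, self-contained sketch of the two parses (using illustrative values for `timestamps` and `override_config`), consistent with the explicit-bracket fix re-landed in patch 6/6 below:

    # The reverted line parses as:
    #   (timestamps or override_config.timestamps) if override_config is not None else None
    # so when no override_config is supplied, the caller's `timestamps` value is discarded.
    timestamps = True
    override_config = None

    reverted = timestamps or override_config.timestamps if override_config is not None else None
    print(reverted)   # None -- the passed-in timestamps flag is lost

    # The intended grouping needs explicit parentheses (see patch 6/6):
    intended = timestamps or (override_config.timestamps if override_config is not None else None)
    print(intended)   # True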
--- examples/asr/transcribe_speech.py | 2 -- nemo/collections/asr/models/ctc_models.py | 1 - nemo/collections/asr/models/rnnt_models.py | 2 +- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index b3264e7e6d90..f1d61edc990e 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -275,8 +275,6 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis # we will adjust this flag if the model does not support it compute_langs = cfg.compute_langs - if cfg.timestamps: - cfg.return_hypotheses = True # Check whether model and decoder type match if isinstance(asr_model, EncDecCTCModel): diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 962c7f2902eb..3df6a7352c4d 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -160,7 +160,6 @@ def transcribe( A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files """ - timestamps = timestamps or override_config.timestamps if override_config is not None else None if timestamps is not None: # else retain the decoder state (users can set it using change_decoding_strategy) if timestamps or (override_config is not None and override_config.timestamps): diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index 4facd59a8c14..a6408b5e935e 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -285,7 +285,7 @@ def transcribe( * A list of greedy transcript texts / Hypothesis * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis. 
""" - timestamps = timestamps or override_config.timestamps if override_config is not None else None + if timestamps is not None: if timestamps or (override_config is not None and override_config.timestamps): logging.info( From 94dbd50ad644efb5ac5da16d360d3df0ef3da5ff Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Fri, 22 Nov 2024 08:44:02 +0300 Subject: [PATCH 3/6] add fix to recipe (#11368) * add fix to recipe * Apply isort and black reformatting Signed-off-by: JRD971000 --------- Signed-off-by: JRD971000 Co-authored-by: Ali Taghibakhshi Co-authored-by: JRD971000 --- nemo/collections/llm/gpt/model/ssm.py | 1 + nemo/collections/llm/recipes/mamba2_130m.py | 64 ++++++++++++----- nemo/collections/llm/recipes/mamba2_1_3b.py | 68 +++++++++++++------ nemo/collections/llm/recipes/mamba2_2_7b.py | 64 ++++++++++++----- nemo/collections/llm/recipes/mamba2_370m.py | 64 ++++++++++++----- nemo/collections/llm/recipes/mamba2_780m.py | 64 ++++++++++++----- nemo/collections/llm/recipes/mamba2_8b.py | 64 ++++++++++++----- .../llm/recipes/mamba2_hybrid_8b.py | 64 ++++++++++++----- 8 files changed, 332 insertions(+), 121 deletions(-) diff --git a/nemo/collections/llm/gpt/model/ssm.py b/nemo/collections/llm/gpt/model/ssm.py index e828d85f2814..f4190114042e 100644 --- a/nemo/collections/llm/gpt/model/ssm.py +++ b/nemo/collections/llm/gpt/model/ssm.py @@ -290,6 +290,7 @@ class BaseMambaConfig2_7B(SSMConfig): @dataclass class NVIDIAMambaConfig8B(SSMConfig): hybrid_override_pattern: str = "M" * 56 + num_attention_heads: int = 32 num_layers: int = 56 seq_length: int = 4096 hidden_size: int = 4096 diff --git a/nemo/collections/llm/recipes/mamba2_130m.py b/nemo/collections/llm/recipes/mamba2_130m.py index 3f13f91f6609..e70fec03b3fb 100644 --- a/nemo/collections/llm/recipes/mamba2_130m.py +++ b/nemo/collections/llm/recipes/mamba2_130m.py @@ -67,6 +67,7 @@ def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: ) +@run.cli.factory(target=finetune, name=NAME) def trainer( tensor_parallelism: int = 1, pipeline_parallelism: int = 1, @@ -76,7 +77,11 @@ def trainer( sequence_parallelism: bool = False, num_nodes: int = 1, num_gpus_per_node: int = 8, - max_steps: int = 1168251, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, callbacks: Optional[list[run.Config[Callback]]] = None, ) -> run.Config[nl.Trainer]: """ @@ -137,15 +142,15 @@ def trainer( accumulate_grad_batches=1, callbacks=callbacks, devices=num_gpus_per_node, - limit_test_batches=50, - limit_val_batches=32, - log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, - val_check_interval=2000, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, ) return trainer @@ -158,6 +163,16 @@ def pretrain_recipe( tokenizer_model: str = None, num_nodes: int = 1, num_gpus_per_node: int = 8, + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, + seq_length: int = 4096, + gbs: int = 8, + mbs: int = 1, fn=pretrain, ) -> run.Partial: """ @@ -193,16 +208,23 @@ def pretrain_recipe( fn, model=model(), trainer=trainer( + max_steps=max_steps, num_nodes=num_nodes, + 
tensor_parallelism=tensor_parallelism, + pipeline_parallelism=pipeline_parallelism, num_gpus_per_node=num_gpus_per_node, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, callbacks=[run.Config(TimingCallback)], ), data=run.Config( MockDataModule, - seq_length=4096, - global_batch_size=8, - micro_batch_size=1, - tokenizer=tokenizer(tokenizer_model=tokenizer_model), + seq_length=seq_length, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(), ), log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), @@ -218,6 +240,14 @@ def finetune_recipe( tokenizer_model: str = None, num_nodes: int = 1, num_gpus_per_node: int = 8, + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + seq_length: int = 4096, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, gbs: int = 8, mbs: int = 1, peft_scheme: Optional[str] = 'none', @@ -266,8 +296,8 @@ def finetune_recipe( ) strategy = run.Config( nl.MegatronStrategy, - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, gradient_as_bucket_view=True, ckpt_load_optimizer=False, ckpt_save_optimizer=False, @@ -283,10 +313,11 @@ def finetune_recipe( accelerator="gpu", accumulate_grad_batches=1, devices=num_gpus_per_node, - limit_test_batches=10, - limit_val_batches=10, - log_every_n_steps=20, - max_steps=100, + max_steps=max_steps, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, num_nodes=num_nodes, plugins=run.Config( nl.MegatronMixedPrecision, @@ -296,7 +327,6 @@ def finetune_recipe( callbacks=[checkpoint_callback], strategy=strategy, use_distributed_sampler=False, - val_check_interval=20, ) recipe = run.Partial( llm.finetune, @@ -304,7 +334,7 @@ def finetune_recipe( trainer=trainer, data=run.Config( llm.SquadDataModule, - seq_length=2048, + seq_length=seq_length, global_batch_size=gbs, micro_batch_size=mbs, tokenizer=tokenizer(tokenizer_model=tokenizer_model), diff --git a/nemo/collections/llm/recipes/mamba2_1_3b.py b/nemo/collections/llm/recipes/mamba2_1_3b.py index 1a280b8b92a1..aaa263078686 100644 --- a/nemo/collections/llm/recipes/mamba2_1_3b.py +++ b/nemo/collections/llm/recipes/mamba2_1_3b.py @@ -67,6 +67,7 @@ def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: ) +@run.cli.factory(target=finetune, name=NAME) def trainer( tensor_parallelism: int = 1, pipeline_parallelism: int = 1, @@ -76,7 +77,11 @@ def trainer( sequence_parallelism: bool = False, num_nodes: int = 1, num_gpus_per_node: int = 8, - max_steps: int = 1168251, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, callbacks: Optional[list[run.Config[Callback]]] = None, ) -> run.Config[nl.Trainer]: """ @@ -137,15 +142,15 @@ def trainer( accumulate_grad_batches=1, callbacks=callbacks, devices=num_gpus_per_node, - limit_test_batches=50, - limit_val_batches=32, - log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, - 
val_check_interval=2000, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, ) return trainer @@ -157,7 +162,17 @@ def pretrain_recipe( name: str = "default", tokenizer_model: str = None, num_nodes: int = 1, - num_gpus_per_node: int = 8, + num_gpus_per_node: int = 1, + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, + seq_length: int = 4096, + gbs: int = 8, + mbs: int = 1, fn=pretrain, ) -> run.Partial: """ @@ -191,17 +206,24 @@ def pretrain_recipe( """ return run.Partial( fn, - model=model(), + model=model(tokenizer_model=tokenizer_model), trainer=trainer( + max_steps=max_steps, num_nodes=num_nodes, + tensor_parallelism=tensor_parallelism, + pipeline_parallelism=pipeline_parallelism, num_gpus_per_node=num_gpus_per_node, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, callbacks=[run.Config(TimingCallback)], ), data=run.Config( MockDataModule, - seq_length=4096, - global_batch_size=8, - micro_batch_size=1, + seq_length=seq_length, + global_batch_size=gbs, + micro_batch_size=mbs, tokenizer=tokenizer(tokenizer_model=tokenizer_model), ), log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), @@ -217,7 +239,15 @@ def finetune_recipe( resume_path: str = None, tokenizer_model: str = None, num_nodes: int = 1, - num_gpus_per_node: int = 8, + num_gpus_per_node: int = 1, + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + seq_length: int = 4096, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, gbs: int = 8, mbs: int = 1, peft_scheme: Optional[str] = 'none', @@ -266,8 +296,8 @@ def finetune_recipe( ) strategy = run.Config( nl.MegatronStrategy, - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, gradient_as_bucket_view=True, ckpt_load_optimizer=False, ckpt_save_optimizer=False, @@ -283,10 +313,11 @@ def finetune_recipe( accelerator="gpu", accumulate_grad_batches=1, devices=num_gpus_per_node, - limit_test_batches=10, - limit_val_batches=10, - log_every_n_steps=20, - max_steps=100, + max_steps=max_steps, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, num_nodes=num_nodes, plugins=run.Config( nl.MegatronMixedPrecision, @@ -296,7 +327,6 @@ def finetune_recipe( callbacks=[checkpoint_callback], strategy=strategy, use_distributed_sampler=False, - val_check_interval=20, ) recipe = run.Partial( llm.finetune, @@ -304,7 +334,7 @@ def finetune_recipe( trainer=trainer, data=run.Config( llm.SquadDataModule, - seq_length=2048, + seq_length=seq_length, global_batch_size=gbs, micro_batch_size=mbs, tokenizer=tokenizer(tokenizer_model=tokenizer_model), diff --git a/nemo/collections/llm/recipes/mamba2_2_7b.py b/nemo/collections/llm/recipes/mamba2_2_7b.py index 0915cec748dd..b4fd5b487b6a 100644 --- a/nemo/collections/llm/recipes/mamba2_2_7b.py +++ b/nemo/collections/llm/recipes/mamba2_2_7b.py @@ -67,6 +67,7 @@ def model(tokenizer_model: str = 
None) -> run.Config[pl.LightningModule]: ) +@run.cli.factory(target=finetune, name=NAME) def trainer( tensor_parallelism: int = 1, pipeline_parallelism: int = 1, @@ -76,7 +77,11 @@ def trainer( sequence_parallelism: bool = False, num_nodes: int = 1, num_gpus_per_node: int = 8, - max_steps: int = 1168251, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, callbacks: Optional[list[run.Config[Callback]]] = None, ) -> run.Config[nl.Trainer]: """ @@ -137,15 +142,15 @@ def trainer( accumulate_grad_batches=1, callbacks=callbacks, devices=num_gpus_per_node, - limit_test_batches=50, - limit_val_batches=32, - log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, - val_check_interval=2000, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, ) return trainer @@ -158,6 +163,16 @@ def pretrain_recipe( tokenizer_model: str = None, num_nodes: int = 1, num_gpus_per_node: int = 8, + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, + seq_length: int = 4096, + gbs: int = 8, + mbs: int = 1, fn=pretrain, ) -> run.Partial: """ @@ -193,16 +208,23 @@ def pretrain_recipe( fn, model=model(), trainer=trainer( + max_steps=max_steps, num_nodes=num_nodes, + tensor_parallelism=tensor_parallelism, + pipeline_parallelism=pipeline_parallelism, num_gpus_per_node=num_gpus_per_node, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, callbacks=[run.Config(TimingCallback)], ), data=run.Config( MockDataModule, - seq_length=4096, - global_batch_size=8, - micro_batch_size=1, - tokenizer=tokenizer(tokenizer_model=tokenizer_model), + seq_length=seq_length, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(), ), log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), @@ -218,6 +240,14 @@ def finetune_recipe( tokenizer_model: str = None, num_nodes: int = 1, num_gpus_per_node: int = 8, + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + seq_length: int = 4096, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, gbs: int = 8, mbs: int = 1, peft_scheme: Optional[str] = 'none', @@ -266,8 +296,8 @@ def finetune_recipe( ) strategy = run.Config( nl.MegatronStrategy, - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, gradient_as_bucket_view=True, ckpt_load_optimizer=False, ckpt_save_optimizer=False, @@ -283,10 +313,11 @@ def finetune_recipe( accelerator="gpu", accumulate_grad_batches=1, devices=num_gpus_per_node, - limit_test_batches=10, - limit_val_batches=10, - log_every_n_steps=20, - max_steps=100, + max_steps=max_steps, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, num_nodes=num_nodes, 
plugins=run.Config( nl.MegatronMixedPrecision, @@ -296,7 +327,6 @@ def finetune_recipe( callbacks=[checkpoint_callback], strategy=strategy, use_distributed_sampler=False, - val_check_interval=20, ) recipe = run.Partial( llm.finetune, @@ -304,7 +334,7 @@ def finetune_recipe( trainer=trainer, data=run.Config( llm.SquadDataModule, - seq_length=2048, + seq_length=seq_length, global_batch_size=gbs, micro_batch_size=mbs, tokenizer=tokenizer(tokenizer_model=tokenizer_model), diff --git a/nemo/collections/llm/recipes/mamba2_370m.py b/nemo/collections/llm/recipes/mamba2_370m.py index bb063dfcfc3f..6fa619b33486 100644 --- a/nemo/collections/llm/recipes/mamba2_370m.py +++ b/nemo/collections/llm/recipes/mamba2_370m.py @@ -67,6 +67,7 @@ def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: ) +@run.cli.factory(target=finetune, name=NAME) def trainer( tensor_parallelism: int = 1, pipeline_parallelism: int = 1, @@ -76,7 +77,11 @@ def trainer( sequence_parallelism: bool = False, num_nodes: int = 1, num_gpus_per_node: int = 8, - max_steps: int = 1168251, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, callbacks: Optional[list[run.Config[Callback]]] = None, ) -> run.Config[nl.Trainer]: """ @@ -137,15 +142,15 @@ def trainer( accumulate_grad_batches=1, callbacks=callbacks, devices=num_gpus_per_node, - limit_test_batches=50, - limit_val_batches=32, - log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, - val_check_interval=2000, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, ) return trainer @@ -158,6 +163,16 @@ def pretrain_recipe( tokenizer_model: str = None, num_nodes: int = 1, num_gpus_per_node: int = 8, + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, + seq_length: int = 4096, + gbs: int = 8, + mbs: int = 1, fn=pretrain, ) -> run.Partial: """ @@ -193,16 +208,23 @@ def pretrain_recipe( fn, model=model(), trainer=trainer( + max_steps=max_steps, num_nodes=num_nodes, + tensor_parallelism=tensor_parallelism, + pipeline_parallelism=pipeline_parallelism, num_gpus_per_node=num_gpus_per_node, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, callbacks=[run.Config(TimingCallback)], ), data=run.Config( MockDataModule, - seq_length=4096, - global_batch_size=8, - micro_batch_size=1, - tokenizer=tokenizer(tokenizer_model=tokenizer_model), + seq_length=seq_length, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(), ), log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), @@ -218,6 +240,14 @@ def finetune_recipe( tokenizer_model: str = None, num_nodes: int = 1, num_gpus_per_node: int = 8, + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + seq_length: int = 4096, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, gbs: int = 8, mbs: int = 1, peft_scheme: Optional[str] = 'none', @@ -266,8 +296,8 
@@ def finetune_recipe( ) strategy = run.Config( nl.MegatronStrategy, - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, gradient_as_bucket_view=True, ckpt_load_optimizer=False, ckpt_save_optimizer=False, @@ -283,10 +313,11 @@ def finetune_recipe( accelerator="gpu", accumulate_grad_batches=1, devices=num_gpus_per_node, - limit_test_batches=10, - limit_val_batches=10, - log_every_n_steps=20, - max_steps=100, + max_steps=max_steps, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, num_nodes=num_nodes, plugins=run.Config( nl.MegatronMixedPrecision, @@ -296,7 +327,6 @@ def finetune_recipe( callbacks=[checkpoint_callback], strategy=strategy, use_distributed_sampler=False, - val_check_interval=20, ) recipe = run.Partial( llm.finetune, @@ -304,7 +334,7 @@ def finetune_recipe( trainer=trainer, data=run.Config( llm.SquadDataModule, - seq_length=2048, + seq_length=seq_length, global_batch_size=gbs, micro_batch_size=mbs, tokenizer=tokenizer(tokenizer_model=tokenizer_model), diff --git a/nemo/collections/llm/recipes/mamba2_780m.py b/nemo/collections/llm/recipes/mamba2_780m.py index e89905b2269a..45d28f82f779 100644 --- a/nemo/collections/llm/recipes/mamba2_780m.py +++ b/nemo/collections/llm/recipes/mamba2_780m.py @@ -67,6 +67,7 @@ def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: ) +@run.cli.factory(target=finetune, name=NAME) def trainer( tensor_parallelism: int = 1, pipeline_parallelism: int = 1, @@ -76,7 +77,11 @@ def trainer( sequence_parallelism: bool = False, num_nodes: int = 1, num_gpus_per_node: int = 8, - max_steps: int = 1168251, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, callbacks: Optional[list[run.Config[Callback]]] = None, ) -> run.Config[nl.Trainer]: """ @@ -137,15 +142,15 @@ def trainer( accumulate_grad_batches=1, callbacks=callbacks, devices=num_gpus_per_node, - limit_test_batches=50, - limit_val_batches=32, - log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, - val_check_interval=2000, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, ) return trainer @@ -158,6 +163,16 @@ def pretrain_recipe( tokenizer_model: str = None, num_nodes: int = 1, num_gpus_per_node: int = 8, + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, + seq_length: int = 4096, + gbs: int = 8, + mbs: int = 1, fn=pretrain, ) -> run.Partial: """ @@ -193,16 +208,23 @@ def pretrain_recipe( fn, model=model(), trainer=trainer( + max_steps=max_steps, num_nodes=num_nodes, + tensor_parallelism=tensor_parallelism, + pipeline_parallelism=pipeline_parallelism, num_gpus_per_node=num_gpus_per_node, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, callbacks=[run.Config(TimingCallback)], ), data=run.Config( MockDataModule, - seq_length=4096, - global_batch_size=8, - micro_batch_size=1, - 
tokenizer=tokenizer(tokenizer_model=tokenizer_model), + seq_length=seq_length, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(), ), log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), @@ -218,6 +240,14 @@ def finetune_recipe( tokenizer_model: str = None, num_nodes: int = 1, num_gpus_per_node: int = 8, + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + seq_length: int = 4096, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, gbs: int = 8, mbs: int = 1, peft_scheme: Optional[str] = 'none', @@ -266,8 +296,8 @@ def finetune_recipe( ) strategy = run.Config( nl.MegatronStrategy, - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, gradient_as_bucket_view=True, ckpt_load_optimizer=False, ckpt_save_optimizer=False, @@ -283,10 +313,11 @@ def finetune_recipe( accelerator="gpu", accumulate_grad_batches=1, devices=num_gpus_per_node, - limit_test_batches=10, - limit_val_batches=10, - log_every_n_steps=20, - max_steps=100, + max_steps=max_steps, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, num_nodes=num_nodes, plugins=run.Config( nl.MegatronMixedPrecision, @@ -296,7 +327,6 @@ def finetune_recipe( callbacks=[checkpoint_callback], strategy=strategy, use_distributed_sampler=False, - val_check_interval=20, ) recipe = run.Partial( llm.finetune, @@ -304,7 +334,7 @@ def finetune_recipe( trainer=trainer, data=run.Config( llm.SquadDataModule, - seq_length=2048, + seq_length=seq_length, global_batch_size=gbs, micro_batch_size=mbs, tokenizer=tokenizer(tokenizer_model=tokenizer_model), diff --git a/nemo/collections/llm/recipes/mamba2_8b.py b/nemo/collections/llm/recipes/mamba2_8b.py index 873d79fcb0f0..8f8384b45059 100644 --- a/nemo/collections/llm/recipes/mamba2_8b.py +++ b/nemo/collections/llm/recipes/mamba2_8b.py @@ -67,6 +67,7 @@ def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: ) +@run.cli.factory(name=NAME) def trainer( tensor_parallelism: int = 8, pipeline_parallelism: int = 1, @@ -76,7 +77,11 @@ def trainer( sequence_parallelism: bool = False, num_nodes: int = 1, num_gpus_per_node: int = 8, - max_steps: int = 1168251, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, callbacks: Optional[list[run.Config[Callback]]] = None, ) -> run.Config[nl.Trainer]: """ @@ -137,15 +142,15 @@ def trainer( accumulate_grad_batches=1, callbacks=callbacks, devices=num_gpus_per_node, - limit_test_batches=50, - limit_val_batches=32, - log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, - val_check_interval=2000, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, ) return trainer @@ -158,6 +163,16 @@ def pretrain_recipe( tokenizer_model: str = None, num_nodes: int = 1, num_gpus_per_node: int = 8, + tensor_parallelism: int = 8, + pipeline_parallelism: int = 1, + max_steps: int = 100, + val_check_interval: int = 100, + 
limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, + seq_length: int = 4096, + gbs: int = 8, + mbs: int = 1, fn=pretrain, ) -> run.Partial: """ @@ -191,17 +206,24 @@ def pretrain_recipe( """ return run.Partial( fn, - model=model(), + model=model(tokenizer_model=tokenizer_model), trainer=trainer( + max_steps=max_steps, num_nodes=num_nodes, + tensor_parallelism=tensor_parallelism, + pipeline_parallelism=pipeline_parallelism, num_gpus_per_node=num_gpus_per_node, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, callbacks=[run.Config(TimingCallback)], ), data=run.Config( MockDataModule, - seq_length=4096, - global_batch_size=8, - micro_batch_size=1, + seq_length=seq_length, + global_batch_size=gbs, + micro_batch_size=mbs, tokenizer=tokenizer(tokenizer_model=tokenizer_model), ), log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), @@ -218,6 +240,14 @@ def finetune_recipe( name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, + tensor_model_parallel_size: int = 8, + pipeline_model_parallel_size: int = 1, + seq_length: int = 4096, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, gbs: int = 8, mbs: int = 1, peft_scheme: Optional[str] = 'none', @@ -266,8 +296,8 @@ def finetune_recipe( ) strategy = run.Config( nl.MegatronStrategy, - tensor_model_parallel_size=8, - pipeline_model_parallel_size=1, + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, gradient_as_bucket_view=True, ckpt_load_optimizer=False, ckpt_save_optimizer=False, @@ -283,10 +313,11 @@ def finetune_recipe( accelerator="gpu", accumulate_grad_batches=1, devices=num_gpus_per_node, - limit_test_batches=10, - limit_val_batches=10, - log_every_n_steps=20, - max_steps=100, + max_steps=max_steps, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, num_nodes=num_nodes, plugins=run.Config( nl.MegatronMixedPrecision, @@ -296,7 +327,6 @@ def finetune_recipe( callbacks=[checkpoint_callback], strategy=strategy, use_distributed_sampler=False, - val_check_interval=20, ) recipe = run.Partial( llm.finetune, @@ -304,7 +334,7 @@ def finetune_recipe( trainer=trainer, data=run.Config( llm.SquadDataModule, - seq_length=2048, + seq_length=seq_length, global_batch_size=gbs, micro_batch_size=mbs, tokenizer=tokenizer(tokenizer_model=tokenizer_model), diff --git a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py index 5a557de46066..b91c8e228bc9 100644 --- a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py +++ b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py @@ -69,6 +69,7 @@ def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: ) +@run.cli.factory(target=finetune, name=NAME) def trainer( tensor_parallelism: int = 8, pipeline_parallelism: int = 1, @@ -78,7 +79,11 @@ def trainer( sequence_parallelism: bool = False, num_nodes: int = 1, num_gpus_per_node: int = 8, - max_steps: int = 1168251, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, callbacks: Optional[list[run.Config[Callback]]] = None, ) -> run.Config[nl.Trainer]: 
""" @@ -139,15 +144,15 @@ def trainer( accumulate_grad_batches=1, callbacks=callbacks, devices=num_gpus_per_node, - limit_test_batches=50, - limit_val_batches=32, - log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, - val_check_interval=2000, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, ) return trainer @@ -160,6 +165,16 @@ def pretrain_recipe( tokenizer_model: str = None, num_nodes: int = 1, num_gpus_per_node: int = 8, + tensor_parallelism: int = 8, + pipeline_parallelism: int = 1, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, + seq_length: int = 4096, + gbs: int = 8, + mbs: int = 1, fn=pretrain, ) -> run.Partial: """ @@ -193,17 +208,24 @@ def pretrain_recipe( """ return run.Partial( fn, - model=model(), + model=model(tokenizer_model=tokenizer_model), trainer=trainer( + max_steps=max_steps, num_nodes=num_nodes, + tensor_parallelism=tensor_parallelism, + pipeline_parallelism=pipeline_parallelism, num_gpus_per_node=num_gpus_per_node, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, callbacks=[run.Config(TimingCallback)], ), data=run.Config( MockDataModule, - seq_length=4096, - global_batch_size=8, - micro_batch_size=1, + seq_length=seq_length, + global_batch_size=gbs, + micro_batch_size=mbs, tokenizer=tokenizer(tokenizer_model=tokenizer_model), ), log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), @@ -220,6 +242,14 @@ def finetune_recipe( name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, + tensor_model_parallel_size: int = 8, + pipeline_model_parallel_size: int = 1, + seq_length: int = 4096, + max_steps: int = 100, + val_check_interval: int = 100, + limit_test_batches: int = 50, + limit_val_batches: int = 32, + log_every_n_steps: int = 10, gbs: int = 8, mbs: int = 1, peft_scheme: Optional[str] = 'none', @@ -268,8 +298,8 @@ def finetune_recipe( ) strategy = run.Config( nl.MegatronStrategy, - tensor_model_parallel_size=8, - pipeline_model_parallel_size=1, + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, gradient_as_bucket_view=True, ckpt_load_optimizer=False, ckpt_save_optimizer=False, @@ -285,10 +315,11 @@ def finetune_recipe( accelerator="gpu", accumulate_grad_batches=1, devices=num_gpus_per_node, - limit_test_batches=10, - limit_val_batches=10, - log_every_n_steps=20, - max_steps=100, + max_steps=max_steps, + val_check_interval=val_check_interval, + limit_test_batches=limit_test_batches, + limit_val_batches=limit_val_batches, + log_every_n_steps=log_every_n_steps, num_nodes=num_nodes, plugins=run.Config( nl.MegatronMixedPrecision, @@ -298,7 +329,6 @@ def finetune_recipe( callbacks=[checkpoint_callback], strategy=strategy, use_distributed_sampler=False, - val_check_interval=20, ) recipe = run.Partial( llm.finetune, @@ -306,7 +336,7 @@ def finetune_recipe( trainer=trainer, data=run.Config( llm.SquadDataModule, - seq_length=2048, + seq_length=seq_length, global_batch_size=gbs, micro_batch_size=mbs, tokenizer=tokenizer(tokenizer_model=tokenizer_model), From 92aacd56a4539eb55ffd64a84fc238a272cddf63 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Thu, 21 Nov 2024 
22:11:13 -0800 Subject: [PATCH 4/6] add missing test to CICD needed list (#11376) --- .github/workflows/cicd-main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 1e148312a49e..49c6c55ca778 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4493,6 +4493,7 @@ jobs: - L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1 - L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1 - L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1 + - L2_NEMO_2_LoRA_MERGE - L2_NeMo_2_Mixtral_Pretraining - L2_PTQ_Llama2_FP8 - L2_Community_LLM_Checkpoints_tests_Llama3 From 400dd8402b4b5384f7a06336d763af4138252cab Mon Sep 17 00:00:00 2001 From: Huy Vu <86480512+huvunvidia@users.noreply.github.com> Date: Fri, 22 Nov 2024 12:59:49 -0400 Subject: [PATCH 5/6] update SquadDataModule to use run.config (#11358) * update SquadDataModule to use run.config * workable code --------- Co-authored-by: Huy Vu2 --- nemo/collections/llm/recipes/t5_220m.py | 11 +++++++---- nemo/collections/llm/t5/data/squad.py | 3 ++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/nemo/collections/llm/recipes/t5_220m.py b/nemo/collections/llm/recipes/t5_220m.py index edc9fdba62d7..975ac5519859 100644 --- a/nemo/collections/llm/recipes/t5_220m.py +++ b/nemo/collections/llm/recipes/t5_220m.py @@ -248,15 +248,17 @@ def finetune_recipe( on fine-tuning LLMs with NeMo, see the fine-tuning guide in the `examples/llm/finetune/` directory. """ - opt_config = OptimizerConfig( + opt_config = run.Config( + OptimizerConfig, optimizer='adam', - lr=1e-4, + lr=0.0001, use_distributed_optimizer=True, bf16=True, weight_decay=0.01, ) - lr_scheduler = WarmupAnnealingScheduler( + lr_scheduler = run.Config( + WarmupAnnealingScheduler, warmup_steps=50, max_steps=2000, min_lr=0.00001, @@ -273,7 +275,7 @@ def finetune_recipe( SquadDataModule, seq_length=512, seq_length_dec=128, global_batch_size=128, micro_batch_size=1 ), log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), - optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler), + optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler), resume=nemo_resume(checkpoint_path), ) @@ -285,4 +287,5 @@ def finetune_recipe( recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/t5/data/squad.py b/nemo/collections/llm/t5/data/squad.py index 3e413919211c..4e90b09e622e 100644 --- a/nemo/collections/llm/t5/data/squad.py +++ b/nemo/collections/llm/t5/data/squad.py @@ -42,6 +42,7 @@ class SquadDataModule(FineTuningDataModule, IOMixin): def __init__( self, + dataset_root: str = None, seq_length: int = 512, seq_length_dec: int = 128, tokenizer: Optional["TokenizerSpec"] = None, @@ -60,7 +61,7 @@ def __init__( self.delete_raw = delete_raw super().__init__( - dataset_root=get_dataset_root("squad"), + dataset_root=get_dataset_root("squad") if dataset_root is None else dataset_root, seq_length=seq_length, seq_length_dec=seq_length_dec, tokenizer=tokenizer, From a153b8c58d56aa930749587017ee70d56f75445e Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Fri, 22 Nov 2024 13:04:32 -0500 Subject: [PATCH 6/6] Fix transcribe speech (#11379) * add explicit bracket for or operation Signed-off-by: Nithin Rao Koluguri * add cfg arg Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- examples/asr/transcribe_speech.py | 3 +++ 
nemo/collections/asr/models/ctc_models.py | 1 + nemo/collections/asr/models/rnnt_models.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index f1d61edc990e..5c4a636e8b1c 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -276,6 +276,9 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis # we will adjust this flag if the model does not support it compute_langs = cfg.compute_langs + if cfg.timestamps: + cfg.return_hypotheses = True + # Check whether model and decoder type match if isinstance(asr_model, EncDecCTCModel): if cfg.decoder_type and cfg.decoder_type != 'ctc': diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 3df6a7352c4d..76dcd13cca50 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -160,6 +160,7 @@ def transcribe( A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files """ + timestamps = timestamps or (override_config.timestamps if override_config is not None else None) if timestamps is not None: # else retain the decoder state (users can set it using change_decoding_strategy) if timestamps or (override_config is not None and override_config.timestamps): diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index a6408b5e935e..e4d1abd0b50c 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -285,7 +285,7 @@ def transcribe( * A list of greedy transcript texts / Hypothesis * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis. """ - + timestamps = timestamps or (override_config.timestamps if override_config is not None else None) if timestamps is not None: if timestamps or (override_config is not None and override_config.timestamps): logging.info(