
Commit

Merge branch 'sortformer/pr_01' of https://github.com/tango4j/NeMo into sortformer/pr_01
tango4j committed Nov 22, 2024
2 parents af04832 + 1b091c8 commit a4367a3
Showing 15 changed files with 347 additions and 130 deletions.
1 change: 1 addition & 0 deletions .github/workflows/cicd-main.yml
@@ -4493,6 +4493,7 @@ jobs:
- L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1
- L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1
- L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1
+ - L2_NEMO_2_LoRA_MERGE
- L2_NeMo_2_Mixtral_Pretraining
- L2_PTQ_Llama2_FP8
- L2_Community_LLM_Checkpoints_tests_Llama3
1 change: 1 addition & 0 deletions examples/asr/transcribe_speech.py
@@ -275,6 +275,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis]]:

# we will adjust this flag if the model does not support it
compute_langs = cfg.compute_langs

if cfg.timestamps:
cfg.return_hypotheses = True

4 changes: 2 additions & 2 deletions nemo/collections/asr/metrics/der.py
@@ -147,14 +147,14 @@ def score_labels(
AUDIO_RTTM_MAP (dict): Dictionary containing information provided from manifestpath
all_reference (list[uniq_name,Annotation]): reference annotations for score calculation
all_hypothesis (list[uniq_name,Annotation]): hypothesis annotations for score calculation
- all_uem (list[list[float]]): List of UEM segments for each audio file. If UEM file is not provided,
+ all_uem (list[list[float]]): List of UEM segments for each audio file. If UEM file is not provided,
it will be read from manifestpath
collar (float): Length of collar (in seconds) for diarization error rate calculation
ignore_overlap (bool): If True, overlapping segments in reference and hypothesis will be ignored
verbose (bool): If True, warning messages will be printed
Returns:
- metric (pyannote.DiarizationErrorRate): Pyannote Diarization Error Rate metric object.
+ metric (pyannote.DiarizationErrorRate): Pyannote Diarization Error Rate metric object.
This object contains detailed scores of each audiofile.
mapping (dict): Mapping dict containing the mapping speaker label for each audio input
itemized_errors (tuple): Tuple containing (DER, CER, FA, MISS) for each audio file.
2 changes: 1 addition & 1 deletion nemo/collections/asr/models/ctc_models.py
@@ -160,7 +160,7 @@ def transcribe(
A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as
paths2audio_files
"""
- timestamps = timestamps or override_config.timestamps if override_config is not None else None
+ timestamps = timestamps or (override_config.timestamps if override_config is not None else None)
if timestamps is not None:
# else retain the decoder state (users can set it using change_decoding_strategy)
if timestamps or (override_config is not None and override_config.timestamps):
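For context on the one-line change above: Python's conditional expression binds more loosely than "or", so without the added parentheses the whole "timestamps or override_config.timestamps" became the true-branch of the conditional, and a caller-supplied timestamps=True was dropped whenever override_config was None. A minimal standalone sketch of the difference (illustrative names only, not NeMo code); the same parenthesization fix appears again in rnnt_models.py below:

# Stand-in config object; only the attribute matters for this illustration.
class _Cfg:
    timestamps = False

def old_behavior(timestamps, override):
    # Parses as: (timestamps or override.timestamps) if override is not None else None
    return timestamps or override.timestamps if override is not None else None

def new_behavior(timestamps, override):
    # The parentheses keep the conditional on the right-hand operand only.
    return timestamps or (override.timestamps if override is not None else None)

assert old_behavior(True, None) is None    # caller-requested timestamps silently lost
assert new_behavior(True, None) is True    # fix preserves the caller's value
assert old_behavior(False, _Cfg()) is False
assert new_behavior(False, _Cfg()) is False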
2 changes: 1 addition & 1 deletion nemo/collections/asr/models/rnnt_models.py
@@ -285,7 +285,7 @@ def transcribe(
* A list of greedy transcript texts / Hypothesis
* An optional list of beam search transcript texts / Hypothesis / NBestHypothesis.
"""
- timestamps = timestamps or override_config.timestamps if override_config is not None else None
+ timestamps = timestamps or (override_config.timestamps if override_config is not None else None)
if timestamps is not None:
if timestamps or (override_config is not None and override_config.timestamps):
logging.info(
1 change: 1 addition & 0 deletions nemo/collections/llm/gpt/model/ssm.py
@@ -290,6 +290,7 @@ class BaseMambaConfig2_7B(SSMConfig):
@dataclass
class NVIDIAMambaConfig8B(SSMConfig):
hybrid_override_pattern: str = "M" * 56
+ num_attention_heads: int = 32
num_layers: int = 56
seq_length: int = 4096
hidden_size: int = 4096
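The new field is just another dataclass default on the 8B Mamba config. A minimal sketch of what it changes, assuming the config can be instantiated from defaults alone (this diff does not show SSMConfig's remaining fields, so treat the construction as hypothetical):

from nemo.collections.llm.gpt.model.ssm import NVIDIAMambaConfig8B

cfg = NVIDIAMambaConfig8B()              # assumes no required constructor arguments
print(cfg.num_attention_heads)           # 32, now set explicitly by this change
print(len(cfg.hybrid_override_pattern))  # 56, one "M" per Mamba layer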
64 changes: 47 additions & 17 deletions nemo/collections/llm/recipes/mamba2_130m.py
@@ -67,6 +67,7 @@ def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]:
)


+ @run.cli.factory(target=finetune, name=NAME)
def trainer(
tensor_parallelism: int = 1,
pipeline_parallelism: int = 1,
@@ -76,7 +77,11 @@ def trainer(
sequence_parallelism: bool = False,
num_nodes: int = 1,
num_gpus_per_node: int = 8,
- max_steps: int = 1168251,
+ max_steps: int = 100,
+ val_check_interval: int = 100,
+ limit_test_batches: int = 50,
+ limit_val_batches: int = 32,
+ log_every_n_steps: int = 10,
callbacks: Optional[list[run.Config[Callback]]] = None,
) -> run.Config[nl.Trainer]:
"""
@@ -137,15 +142,15 @@ def trainer(
accumulate_grad_batches=1,
callbacks=callbacks,
devices=num_gpus_per_node,
- limit_test_batches=50,
- limit_val_batches=32,
- log_every_n_steps=10,
max_steps=max_steps,
num_nodes=num_nodes,
plugins=bf16_mixed(),
strategy=strategy,
use_distributed_sampler=False,
- val_check_interval=2000,
+ val_check_interval=val_check_interval,
+ limit_test_batches=limit_test_batches,
+ limit_val_batches=limit_val_batches,
+ log_every_n_steps=log_every_n_steps,
)

return trainer
@@ -158,6 +163,16 @@ def pretrain_recipe(
tokenizer_model: str = None,
num_nodes: int = 1,
num_gpus_per_node: int = 8,
+ tensor_parallelism: int = 1,
+ pipeline_parallelism: int = 1,
+ max_steps: int = 100,
+ val_check_interval: int = 100,
+ limit_test_batches: int = 50,
+ limit_val_batches: int = 32,
+ log_every_n_steps: int = 10,
+ seq_length: int = 4096,
+ gbs: int = 8,
+ mbs: int = 1,
fn=pretrain,
) -> run.Partial:
"""
@@ -193,16 +208,23 @@
fn,
model=model(),
trainer=trainer(
+ max_steps=max_steps,
num_nodes=num_nodes,
+ tensor_parallelism=tensor_parallelism,
+ pipeline_parallelism=pipeline_parallelism,
num_gpus_per_node=num_gpus_per_node,
+ val_check_interval=val_check_interval,
+ limit_test_batches=limit_test_batches,
+ limit_val_batches=limit_val_batches,
+ log_every_n_steps=log_every_n_steps,
callbacks=[run.Config(TimingCallback)],
),
data=run.Config(
MockDataModule,
- seq_length=4096,
- global_batch_size=8,
- micro_batch_size=1,
- tokenizer=tokenizer(tokenizer_model=tokenizer_model),
+ seq_length=seq_length,
+ global_batch_size=gbs,
+ micro_batch_size=mbs,
+ tokenizer=tokenizer(),
),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
@@ -218,6 +240,14 @@ def finetune_recipe(
tokenizer_model: str = None,
num_nodes: int = 1,
num_gpus_per_node: int = 8,
+ tensor_model_parallel_size: int = 1,
+ pipeline_model_parallel_size: int = 1,
+ seq_length: int = 4096,
+ max_steps: int = 100,
+ val_check_interval: int = 100,
+ limit_test_batches: int = 50,
+ limit_val_batches: int = 32,
+ log_every_n_steps: int = 10,
gbs: int = 8,
mbs: int = 1,
peft_scheme: Optional[str] = 'none',
@@ -266,8 +296,8 @@
)
strategy = run.Config(
nl.MegatronStrategy,
- tensor_model_parallel_size=1,
- pipeline_model_parallel_size=1,
+ tensor_model_parallel_size=tensor_model_parallel_size,
+ pipeline_model_parallel_size=pipeline_model_parallel_size,
gradient_as_bucket_view=True,
ckpt_load_optimizer=False,
ckpt_save_optimizer=False,
@@ -283,10 +313,11 @@
accelerator="gpu",
accumulate_grad_batches=1,
devices=num_gpus_per_node,
- limit_test_batches=10,
- limit_val_batches=10,
- log_every_n_steps=20,
- max_steps=100,
+ max_steps=max_steps,
+ val_check_interval=val_check_interval,
+ limit_test_batches=limit_test_batches,
+ limit_val_batches=limit_val_batches,
+ log_every_n_steps=log_every_n_steps,
num_nodes=num_nodes,
plugins=run.Config(
nl.MegatronMixedPrecision,
@@ -296,15 +327,14 @@
callbacks=[checkpoint_callback],
strategy=strategy,
use_distributed_sampler=False,
- val_check_interval=20,
)
recipe = run.Partial(
llm.finetune,
model=model(tokenizer_model=tokenizer_model),
trainer=trainer,
data=run.Config(
llm.SquadDataModule,
- seq_length=2048,
+ seq_length=seq_length,
global_batch_size=gbs,
micro_batch_size=mbs,
tokenizer=tokenizer(tokenizer_model=tokenizer_model),
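Taken together, the mamba2_130m changes above expose the previously hard-coded trainer and data settings as recipe arguments. A hedged usage sketch follows; the argument names come from the diff, but the import path resolving this way, the tokenizer path, and running the recipe through NeMo-Run are assumptions rather than something this commit shows:

from nemo.collections.llm.recipes import mamba2_130m

# Knobs that used to be fixed (max_steps, val_check_interval, limit_*_batches,
# log_every_n_steps, seq_length, gbs, mbs, parallelism) are now caller-controlled.
recipe = mamba2_130m.pretrain_recipe(
    name="mamba2_130m_pretrain",
    tokenizer_model="/path/to/tokenizer.model",  # placeholder path
    num_nodes=1,
    num_gpus_per_node=8,
    tensor_parallelism=1,
    pipeline_parallelism=1,
    max_steps=100,
    val_check_interval=100,
    seq_length=4096,
    gbs=8,
    mbs=1,
)

The call returns a run.Partial, so it still has to be handed to NeMo-Run (or an equivalent launcher) to actually execute.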
68 changes: 49 additions & 19 deletions nemo/collections/llm/recipes/mamba2_1_3b.py
@@ -67,6 +67,7 @@ def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]:
)


+ @run.cli.factory(target=finetune, name=NAME)
def trainer(
tensor_parallelism: int = 1,
pipeline_parallelism: int = 1,
@@ -76,7 +77,11 @@ def trainer(
sequence_parallelism: bool = False,
num_nodes: int = 1,
num_gpus_per_node: int = 8,
- max_steps: int = 1168251,
+ max_steps: int = 100,
+ val_check_interval: int = 100,
+ limit_test_batches: int = 50,
+ limit_val_batches: int = 32,
+ log_every_n_steps: int = 10,
callbacks: Optional[list[run.Config[Callback]]] = None,
) -> run.Config[nl.Trainer]:
"""
@@ -137,15 +142,15 @@ def trainer(
accumulate_grad_batches=1,
callbacks=callbacks,
devices=num_gpus_per_node,
- limit_test_batches=50,
- limit_val_batches=32,
- log_every_n_steps=10,
max_steps=max_steps,
num_nodes=num_nodes,
plugins=bf16_mixed(),
strategy=strategy,
use_distributed_sampler=False,
- val_check_interval=2000,
+ val_check_interval=val_check_interval,
+ limit_test_batches=limit_test_batches,
+ limit_val_batches=limit_val_batches,
+ log_every_n_steps=log_every_n_steps,
)

return trainer
@@ -157,7 +162,17 @@ def pretrain_recipe(
name: str = "default",
tokenizer_model: str = None,
num_nodes: int = 1,
- num_gpus_per_node: int = 8,
+ num_gpus_per_node: int = 1,
+ tensor_parallelism: int = 1,
+ pipeline_parallelism: int = 1,
+ max_steps: int = 100,
+ val_check_interval: int = 100,
+ limit_test_batches: int = 50,
+ limit_val_batches: int = 32,
+ log_every_n_steps: int = 10,
+ seq_length: int = 4096,
+ gbs: int = 8,
+ mbs: int = 1,
fn=pretrain,
) -> run.Partial:
"""
@@ -191,17 +206,24 @@
"""
return run.Partial(
fn,
- model=model(),
+ model=model(tokenizer_model=tokenizer_model),
trainer=trainer(
+ max_steps=max_steps,
num_nodes=num_nodes,
+ tensor_parallelism=tensor_parallelism,
+ pipeline_parallelism=pipeline_parallelism,
num_gpus_per_node=num_gpus_per_node,
+ val_check_interval=val_check_interval,
+ limit_test_batches=limit_test_batches,
+ limit_val_batches=limit_val_batches,
+ log_every_n_steps=log_every_n_steps,
callbacks=[run.Config(TimingCallback)],
),
data=run.Config(
MockDataModule,
- seq_length=4096,
- global_batch_size=8,
- micro_batch_size=1,
+ seq_length=seq_length,
+ global_batch_size=gbs,
+ micro_batch_size=mbs,
tokenizer=tokenizer(tokenizer_model=tokenizer_model),
),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
@@ -217,7 +239,15 @@ def finetune_recipe(
resume_path: str = None,
tokenizer_model: str = None,
num_nodes: int = 1,
- num_gpus_per_node: int = 8,
+ num_gpus_per_node: int = 1,
+ tensor_model_parallel_size: int = 1,
+ pipeline_model_parallel_size: int = 1,
+ seq_length: int = 4096,
+ max_steps: int = 100,
+ val_check_interval: int = 100,
+ limit_test_batches: int = 50,
+ limit_val_batches: int = 32,
+ log_every_n_steps: int = 10,
gbs: int = 8,
mbs: int = 1,
peft_scheme: Optional[str] = 'none',
@@ -266,8 +296,8 @@
)
strategy = run.Config(
nl.MegatronStrategy,
- tensor_model_parallel_size=1,
- pipeline_model_parallel_size=1,
+ tensor_model_parallel_size=tensor_model_parallel_size,
+ pipeline_model_parallel_size=pipeline_model_parallel_size,
gradient_as_bucket_view=True,
ckpt_load_optimizer=False,
ckpt_save_optimizer=False,
@@ -283,10 +313,11 @@
accelerator="gpu",
accumulate_grad_batches=1,
devices=num_gpus_per_node,
- limit_test_batches=10,
- limit_val_batches=10,
- log_every_n_steps=20,
- max_steps=100,
+ max_steps=max_steps,
+ val_check_interval=val_check_interval,
+ limit_test_batches=limit_test_batches,
+ limit_val_batches=limit_val_batches,
+ log_every_n_steps=log_every_n_steps,
num_nodes=num_nodes,
plugins=run.Config(
nl.MegatronMixedPrecision,
@@ -296,15 +327,14 @@
callbacks=[checkpoint_callback],
strategy=strategy,
use_distributed_sampler=False,
- val_check_interval=20,
)
recipe = run.Partial(
llm.finetune,
model=model(tokenizer_model=tokenizer_model),
trainer=trainer,
data=run.Config(
llm.SquadDataModule,
- seq_length=2048,
+ seq_length=seq_length,
global_batch_size=gbs,
micro_batch_size=mbs,
tokenizer=tokenizer(tokenizer_model=tokenizer_model),
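The mamba2_1_3b recipe gains the same parameterization; a sketch of the finetuning side under the same caveats (argument names from the diff above, paths and launch mechanics assumed):

from nemo.collections.llm.recipes import mamba2_1_3b

# Parallel sizes and trainer limits are now arguments instead of constants.
recipe = mamba2_1_3b.finetune_recipe(
    resume_path="/path/to/pretrained/checkpoint",  # placeholder path
    tokenizer_model="/path/to/tokenizer.model",    # placeholder path
    num_nodes=1,
    num_gpus_per_node=1,
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
    seq_length=4096,
    max_steps=100,
    val_check_interval=100,
    gbs=8,
    mbs=1,
    peft_scheme="none",
)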
(Diffs for the remaining 7 changed files were not loaded on this page.)
