From 19caeefe17f6fb796ab4f3570de7f95a32a970fe Mon Sep 17 00:00:00 2001 From: eharper Date: Tue, 30 Jan 2024 11:51:04 -0700 Subject: [PATCH 01/92] update minor version Signed-off-by: eharper --- megatron/core/package_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/package_info.py b/megatron/core/package_info.py index 55c49b1785..07de3fba41 100644 --- a/megatron/core/package_info.py +++ b/megatron/core/package_info.py @@ -2,7 +2,7 @@ MAJOR = 0 -MINOR = 4 +MINOR = 5 PATCH = 0 PRE_RELEASE = 'rc0' From 543dcaf46b2d25a2879452123676c2a37a39f128 Mon Sep 17 00:00:00 2001 From: eharper Date: Thu, 21 Mar 2024 11:44:43 -0600 Subject: [PATCH 02/92] update package info Signed-off-by: eharper --- megatron/core/package_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/package_info.py b/megatron/core/package_info.py index 07de3fba41..2a4f9897b7 100644 --- a/megatron/core/package_info.py +++ b/megatron/core/package_info.py @@ -2,7 +2,7 @@ MAJOR = 0 -MINOR = 5 +MINOR = 6 PATCH = 0 PRE_RELEASE = 'rc0' From cbf81447cc554b8763c38ef41e993ab2d353f94a Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Thu, 4 Apr 2024 20:44:08 -0700 Subject: [PATCH 03/92] Do RS for embeddings instead of AR Signed-off-by: Selvaraj Anandaraj --- .../common/embeddings/language_model_embedding.py | 10 +++++++--- megatron/core/tensor_parallel/layers.py | 13 +++++++++++-- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/megatron/core/models/common/embeddings/language_model_embedding.py b/megatron/core/models/common/embeddings/language_model_embedding.py index d525a30fb9..22ebd21154 100644 --- a/megatron/core/models/common/embeddings/language_model_embedding.py +++ b/megatron/core/models/common/embeddings/language_model_embedding.py @@ -38,12 +38,14 @@ def __init__( self.max_sequence_length: int = max_sequence_length self.add_position_embedding: bool = position_embedding_type == 'learned_absolute' self.num_tokentypes = num_tokentypes + self.reduce_scatter_embeddings = (not self.add_position_embedding) and self.num_tokentypes <= 0 and self.config.sequence_parallel # Word embeddings (parallel). self.word_embeddings = tensor_parallel.VocabParallelEmbedding( num_embeddings=self.vocab_size, embedding_dim=self.config.hidden_size, init_method=self.config.init_method, + reduce_scatter_embeddings=self.reduce_scatter_embeddings, config=self.config, ) @@ -98,8 +100,9 @@ def forward(self, input_ids: Tensor, position_ids: Tensor, tokentype_ids: int = else: embeddings = word_embeddings - # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. - embeddings = embeddings.transpose(0, 1).contiguous() + if not self.reduce_scatter_embeddings: + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() if tokentype_ids is not None: assert self.tokentype_embeddings is not None @@ -115,7 +118,8 @@ def forward(self, input_ids: Tensor, position_ids: Tensor, tokentype_ids: int = # Dropout. if self.config.sequence_parallel: - embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings) + if not self.reduce_scatter_embeddings: + embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings) # `scatter_to_sequence_parallel_region` returns a view, which prevents # the original tensor from being garbage collected. Clone to facilitate GC. # Has a small runtime cost (~0.5%). 
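For context on the embedding change above: with sequence parallelism enabled, the embedding output no longer needs a full tensor-parallel all-reduce followed by a scatter along the sequence dimension; a single reduce-scatter yields the same per-rank chunk with roughly half the communication volume and without materializing the full [s, b, h] tensor. The following is a minimal illustrative sketch of that equivalence using plain torch.distributed collectives rather than the Megatron-LM helpers referenced in the patch; the function names, the assumption that the default process group is the tensor-parallel group, and the [s, b, h] layout with s divisible by the world size are assumptions for illustration only.

import torch
import torch.distributed as dist

def embed_allreduce_then_scatter(partial_emb: torch.Tensor) -> torch.Tensor:
    # Previous path: sum the partial embeddings over tensor-parallel ranks,
    # then keep only this rank's chunk of the leading (sequence) dimension,
    # which is what scatter_to_sequence_parallel_region does.
    full = partial_emb.clone()
    dist.all_reduce(full)
    chunks = torch.chunk(full, dist.get_world_size(), dim=0)
    return chunks[dist.get_rank()].contiguous()

def embed_reduce_scatter(partial_emb: torch.Tensor) -> torch.Tensor:
    # New path: a single reduce-scatter along the leading dimension produces
    # the same per-rank tensor directly (assumes seq length % world size == 0).
    out_shape = (partial_emb.shape[0] // dist.get_world_size(), *partial_emb.shape[1:])
    out = torch.empty(out_shape, dtype=partial_emb.dtype, device=partial_emb.device)
    dist.reduce_scatter_tensor(out, partial_emb.contiguous())
    return out

Given an initialized process group, both functions return identical tensors on every rank, which is why the patch can skip the later scatter_to_sequence_parallel_region call when reduce_scatter_embeddings is set.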
diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index e02a41ab95..2587fa769e 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -152,6 +152,7 @@ class VocabParallelEmbedding(torch.nn.Module): Args: num_embeddings: vocabulary size. embedding_dim: size of hidden state. + reduce_scatter_embeddings: Decides whether to perform ReduceScatter after embedding lookup Keyword Args: config: A megatron.core.ModelParallelConfig object @@ -163,12 +164,14 @@ def __init__( embedding_dim: int, *, init_method: Callable, + reduce_scatter_embeddings: bool, config: ModelParallelConfig, ): super(VocabParallelEmbedding, self).__init__() # Keep the input dimensions. self.num_embeddings = num_embeddings self.embedding_dim = embedding_dim + self.reduce_scatter_embeddings = reduce_scatter_embeddings self.tensor_model_parallel_size = get_tensor_model_parallel_world_size() # Divide the weight matrix along the vocaburaly dimension. ( @@ -222,8 +225,14 @@ def forward(self, input_): # Mask the output embedding. if self.tensor_model_parallel_size > 1: output_parallel[input_mask, :] = 0.0 - # Reduce across all the model parallel GPUs. - output = reduce_from_tensor_model_parallel_region(output_parallel) + + if self.reduce_scatter_embeddings: + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + output_parallel = output_parallel.transpose(0, 1).contiguous() + output = reduce_scatter_to_sequence_parallel_region(output_parallel) + else: + # Reduce across all the model parallel GPUs. + output = reduce_from_tensor_model_parallel_region(output_parallel) return output def sharded_state_dict( From 6c72f7b7a6cf38f1c98272bf84aec7b5bae4bb6c Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Thu, 4 Apr 2024 20:50:25 -0700 Subject: [PATCH 04/92] Fixed formatting Signed-off-by: Selvaraj Anandaraj --- .../models/common/embeddings/language_model_embedding.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/megatron/core/models/common/embeddings/language_model_embedding.py b/megatron/core/models/common/embeddings/language_model_embedding.py index 22ebd21154..3744eab7b8 100644 --- a/megatron/core/models/common/embeddings/language_model_embedding.py +++ b/megatron/core/models/common/embeddings/language_model_embedding.py @@ -38,7 +38,11 @@ def __init__( self.max_sequence_length: int = max_sequence_length self.add_position_embedding: bool = position_embedding_type == 'learned_absolute' self.num_tokentypes = num_tokentypes - self.reduce_scatter_embeddings = (not self.add_position_embedding) and self.num_tokentypes <= 0 and self.config.sequence_parallel + self.reduce_scatter_embeddings = ( + (not self.add_position_embedding) + and self.num_tokentypes <= 0 + and self.config.sequence_parallel + ) # Word embeddings (parallel). 
self.word_embeddings = tensor_parallel.VocabParallelEmbedding( From c4674b26bb9b4fbae4c490db692c8191d83673ea Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Fri, 5 Apr 2024 11:58:03 -0700 Subject: [PATCH 05/92] Added defaults for reduce_scatter_embeddings switch Signed-off-by: Selvaraj Anandaraj --- megatron/core/tensor_parallel/layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index 2587fa769e..e443c305a9 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -164,7 +164,7 @@ def __init__( embedding_dim: int, *, init_method: Callable, - reduce_scatter_embeddings: bool, + reduce_scatter_embeddings: bool = False, config: ModelParallelConfig, ): super(VocabParallelEmbedding, self).__init__() From da6109ec852e9db61afdecb4ca6fa213f9b5c2a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20B=C5=82a=C5=BC?= Date: Tue, 9 Apr 2024 11:22:01 +0200 Subject: [PATCH 06/92] Enable debug logging --- megatron/training/arguments.py | 3 +++ megatron/training/initialize.py | 31 ++++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 85c5821a9e..45d352fec2 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -5,6 +5,7 @@ import argparse import dataclasses import json +import logging import os import torch import types @@ -861,6 +862,8 @@ def _add_logging_args(parser): group.add_argument('--one-logger-run-name', type=str, default=None, help='The one-logger run name displayed. Will ignore if ' '--enable-one-logger is not set') + group.add_argument('--logging-level', type=int, default=None, + help='Set default logging level') return parser diff --git a/megatron/training/initialize.py b/megatron/training/initialize.py index a49d4ee09c..ed69b63aae 100644 --- a/megatron/training/initialize.py +++ b/megatron/training/initialize.py @@ -1,7 +1,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. """Megatron initialization.""" - +import logging import random import os import time @@ -22,6 +22,9 @@ from megatron.legacy.model.transformer import bias_dropout_add_fused_train from megatron.legacy.model.fused_bias_gelu import bias_gelu +logger = logging.getLogger(__name__) + + def initialize_megatron( extra_args_provider=None, args_defaults={}, @@ -58,6 +61,9 @@ def initialize_megatron( # tensorboard-writer, and timers. set_global_variables(args) + # set logging level + setup_logging() + # torch.distributed initialization def finish_mpu_init(): args = get_args() @@ -392,3 +398,26 @@ def _warmup_jit_function(): output = bias_dropout_add_fused_train(input, bias, residual, dropout_rate) del bias, input, residual, output torch.cuda.empty_cache() + + +def setup_logging() -> None: + """ Sets the default logging level based on cmdline args and env vars. + + Precedence: + 1. Command line argument `--logging-level` + 2. Env var `MEGATRON_LOGGING_LEVEL` + 3. 
Default logging level (INFO) + + Returns: None + """ + args = get_args() + logging_level = None + env_logging_level = os.getenv('MEGATRON_LOGGING_LEVEL', None) + if env_logging_level is not None: + logging_level = int(env_logging_level) + if args.logging_level is not None: + logging_level = args.logging_level + + if logging_level is not None: + logger.info(f'Setting logging level to {logging_level}') + logging.getLogger().setLevel(logging_level) From f1c97ee2b79a45c49f0fc2dea62aba9e4ebd58fc Mon Sep 17 00:00:00 2001 From: Deepak Narayanan Date: Fri, 3 May 2024 09:21:29 -0700 Subject: [PATCH 07/92] README fixes re: parallelism and distributed optimizer --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1c7e134bd8..ea2f01f8b3 100644 --- a/README.md +++ b/README.md @@ -187,15 +187,13 @@ All of the other arguments remain as they were for BERT and GPT pretraining. Run The `examples/pretrain_{bert,gpt,t5}_distributed.sh` scripts use the PyTorch distributed launcher for distributed training. As such, multi-node training can be achieved by properly setting environment variables. See the official PyTorch [documentation](https://pytorch.org/docs/stable/elastic/run.html#launcher-api) for further description of these [environment variables](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization). By default, multi-node training uses the [nccl](https://developer.nvidia.com/nccl) distributed backend. A simple set of additional arguments and the use of the PyTorch distributed module with the `torchrun` elastic launcher (equivalent to `python -m torch.distributed.run`) are the only additional requirements to adopt distributed training. See any of `examples/pretrain_{bert,gpt,t5}_distributed.sh` for more details. -We use two types of parallelism: data and model parallelism. We facilitate two distributed data parallel implementations: a simple one of our own that performs gradient all-reduce at the end of back propagation step, and Torch's distributed data parallel wrapper that overlaps gradient reduction with back propagation computation. To switch between these two options use `--DDP-impl local` or `--DDP-impl torch`, respectively. As expected, Torch distributed data parallelism is more efficient at larger model sizes. For example, for the 8.3 billion parameters model running on 512 GPUs, the scaling increases from 60% to 76% when Torch's distributed data parallel is used. However, the overlapping method requires more memory and for some configurations (e.g., 2.5 billion parameters using 2-way model parallel and 1.2 billion parameters with no model parallel) can make the overall training slower as a result. We empirically found that using a smaller model in those cases improves the training time. +We use two types of parallelism: data and model parallelism. Our data parallelism implementation is in `megatron/core/distributed`, and supports overlapping of the gradient reduction with the backward pass when the `--overlap-grad-reduce` command-line option is used. Second, we developed a simple and efficient two-dimensional model-parallel approach. 
To use the first dimension, tensor model parallelism (splitting execution of a single transformer module over multiple GPUs, see Section 3 of [our paper](https://arxiv.org/pdf/1909.08053.pdf)), add the `--tensor-model-parallel-size` flag to specify the number of GPUs among which to split the model, along with the arguments passed to the distributed launcher as mentioned above. To use the second dimension, sequence parallelism, specify `--sequence-parallel`, which also requires tensor model parallelism to be enabled because it splits across the same GPUs (more details in Section 4.2.2 of [our paper](https://arxiv.org/pdf/2205.05198.pdf)). To use pipeline model parallelism (sharding the transformer modules into stages with an equal number of transformer modules on each stage, and then pipelining execution by breaking the batch into smaller microbatches, see Section 2.2 of [our paper](https://arxiv.org/pdf/2104.04473.pdf)), use the `--pipeline-model-parallel-size` flag to specify the number of stages to split the model into (e.g., splitting a model with 24 transformer layers across 4 stages would mean each stage gets 6 transformer layers each). - - -We have examples of how to use these two different forms of model parallelism the example scripts ending in `distributed_with_mp.sh`: +We have examples of how to use these two different forms of model parallelism the example scripts ending in `distributed_with_mp.sh`. Other than these minor changes, the distributed training is identical to the training on a single GPU. @@ -228,6 +226,8 @@ Theoretical memory savings vary depending on the combination of the model's para | bf16 param, fp32 grads | 18 | 6 + 12/d | | fp32 param, fp32 grads | 16 | 8 + 8/d | +As with regular data parallelism, overlapping of the gradient reduction (in this case, a reduce-scatter) with the backward pass can be facilitated using the `--overlap-grad-reduce` flag. Additionally, overlapping of the parameter all-gather can be overlapped with the forward pass using `--overlap-param-gather`. + ## FlashAttention Usage: `--use-flash-attn`. Support attention head dimensions at most 128. From cac8d1a3f08cc6291b3abb8b01bc2e97fbac3a0a Mon Sep 17 00:00:00 2001 From: Zijie Yan Date: Fri, 3 May 2024 15:38:33 -0700 Subject: [PATCH 08/92] Fix Aux Loss Scaling when TP>1 --- megatron/core/transformer/moe/router.py | 15 +++++++++++---- ...e-tp2-pp1-te-8experts2parallel-top2router.json | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index d3c2e4de70..d5d20426ab 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -4,6 +4,7 @@ import torch +from megatron.core import parallel_state from megatron.core.tensor_parallel import ( gather_from_sequence_parallel_region, get_cuda_rng_tracker, @@ -172,12 +173,15 @@ def apply_load_balancing_loss( Returns: torch.Tensor: The activation tensor with the attached gradient function. 
""" + moe_aux_loss_coeff = ( + self.config.moe_aux_loss_coeff / parallel_state.get_tensor_model_parallel_world_size() + ) aux_loss = switch_load_balancing_loss_func( - probs, num_local_tokens_per_expert, self.topk, self.config.moe_aux_loss_coeff + probs, num_local_tokens_per_expert, self.topk, moe_aux_loss_coeff ) save_to_aux_losses_tracker( "load_balancing_loss", - aux_loss / self.config.moe_aux_loss_coeff, + aux_loss / moe_aux_loss_coeff, self.layer_number, self.config.num_layers, ) @@ -195,7 +199,10 @@ def apply_z_loss(self, logits): torch.Tensor: The logits after applying the z-loss. """ if self.config.moe_z_loss_coeff is not None: - z_loss = z_loss_func(logits, self.config.moe_z_loss_coeff) + moe_z_loss_coeff = ( + self.config.moe_z_loss_coeff / parallel_state.get_tensor_model_parallel_world_size() + ) + z_loss = z_loss_func(logits, moe_z_loss_coeff) logits = MoEAuxLossAutoScaler.apply(logits, z_loss) save_to_aux_losses_tracker( "z_loss", @@ -242,7 +249,7 @@ def routing(self, logits: torch.Tensor): logits = self.apply_z_loss(logits) if ( - self.config.tensor_model_parallel_size > 1 + parallel_state.get_tensor_model_parallel_world_size() > 1 and self.config.moe_token_dispatcher_type == "alltoall" ): # Gather the logits from the TP region diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json index 101dae9a14..38b989333f 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80682, 10.86737, 10.8798, 10.79313, 10.66654, 10.57606, 10.05465, 10.17642, 10.09523, 9.75051]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [13204.0, 16599.0, 16953.0, 16250.0, 14858.0, 15929.0, 14720.0, 17220.0, 17630.0, 18561.0]}, "iteration_timing_avg": 0.3051714705882352} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80682, 10.86734, 10.87997, 10.79306, 10.66584, 10.57572, 10.05454, 10.17682, 10.09527, 9.75032]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [13454.0, 16317.0, 16781.0, 16315.0, 14876.0, 15877.0, 14704.0, 17095.0, 17749.0, 18463.0]}, "iteration_timing_avg": 0.2969329411764706} \ No newline at end of file From 71371b4e14975a62cd584dd79920f9426cc93c18 Mon Sep 17 00:00:00 2001 From: Jack Chang Date: Fri, 3 May 2024 15:43:01 -0700 Subject: [PATCH 09/92] Add state in ChainedOptimizer --- megatron/core/optimizer/optimizer.py | 64 ++++++++++++++++++++++----- tests/unit_tests/test_optimizer.py | 66 ++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 12 deletions(-) create mode 100644 tests/unit_tests/test_optimizer.py diff --git a/megatron/core/optimizer/optimizer.py b/megatron/core/optimizer/optimizer.py index 4419e0c0ae..e224470fc6 100644 --- a/megatron/core/optimizer/optimizer.py +++ b/megatron/core/optimizer/optimizer.py @@ -6,7 +6,7 @@ from abc import ABC, abstractmethod from itertools import chain from logging import getLogger -from typing import Callable, List, Optional +from typing import Any, Callable, List, Optional, Tuple import 
amp_C import torch @@ -691,6 +691,43 @@ def load_state_dict(self, state_dict): self.optimizer.load_state_dict(state_dict) +class ProxyDict: + """ + A dictionary-like object that proxies to a list of dictionaries. + + e.g., ProxyDict([{'a': 1}, {'b': 2}]) behaves like: + { + (0, 'a'): 1, + (1, 'b'): 2, + } + We use tuples as keys to avoid ambiguity with the keys of the inner dicts. + """ + + def __init__(self, inner_dicts: List[dict]): + self._inner_dicts = inner_dicts + + def __getitem__(self, key: Tuple[int, str]): + idx, inner_key = key + return self._inner_dicts[idx].get(inner_key) + + def __setitem__(self, key: Tuple[int, str], value: Any): + idx, inner_key = key + self._inner_dicts[idx][inner_key] = value + + def __len__(self) -> int: + return sum([len(inner_dict) for inner_dict in self._inner_dicts]) + + def __iter__(self): + for idx, inner_dict in enumerate(self._inner_dicts): + for inner_key in inner_dict: + yield (idx, inner_key) + + def items(self): + for idx, inner_dict in enumerate(self._inner_dicts): + for inner_key, value in inner_dict.items(): + yield (idx, inner_key), value + + class ChainedOptimizer(MegatronOptimizer): """ChainedOptimizer is designed for a collection of optimizers. @@ -701,15 +738,23 @@ class ChainedOptimizer(MegatronOptimizer): chained_optimizers: a list of optimizers. """ - # Remove these attributes which inherits from MegatronOptimizer. - state = None - param_groups = None - def __init__(self, chained_optimizers: List[MegatronOptimizer]): self.chained_optimizers = chained_optimizers - self.param_groups = [] + + @property + def param_groups(self) -> List[dict]: + param_groups = [] for optimizer in self.chained_optimizers: - self.param_groups += optimizer.param_groups + param_groups += optimizer.param_groups + return param_groups + + @property + def state(self) -> ProxyDict: + """ + Return optimizer state with tuple keys, where the first element is the + index of the optimizer in the list of chained optimizers. + """ + return ProxyDict([opt.state for opt in self.chained_optimizers]) def zero_grad(self, set_to_none=True): for optimizer in self.chained_optimizers: @@ -748,11 +793,6 @@ def load_state_dict(self, state_dict): for optimizer, state in zip(self.chained_optimizers, state_dict): optimizer.load_state_dict(state) - # Reset param_groups as load_state_dict reset chained optimizers's attribute. 
- self.param_groups = [] - for optimizer in self.chained_optimizers: - self.param_groups += optimizer.param_groups - def disable_pre_hook(self): for optimizer in self.chained_optimizers: if ( diff --git a/tests/unit_tests/test_optimizer.py b/tests/unit_tests/test_optimizer.py new file mode 100644 index 0000000000..247da4aeb9 --- /dev/null +++ b/tests/unit_tests/test_optimizer.py @@ -0,0 +1,66 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.optim import SGD, Adam + +from megatron.core.optimizer import ChainedOptimizer + + +class Net(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = torch.flatten(x, 1) # flatten all dimensions except batch + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +def test_chained_optimizer(): + net = Net() + optimizer_1 = Adam(list(net.parameters())[:2], lr=0.01,) + optimizer_2 = SGD(list(net.parameters())[2:], lr=0.1, momentum=0.9,) + chained_optimizer = ChainedOptimizer([optimizer_1, optimizer_2]) + + # Test the chained optimizer's param groups is a reference of the underlying optimizers' param groups + assert optimizer_1.param_groups[0]["lr"] == 0.01 + chained_optimizer.param_groups[0]["lr"] = 0.02 + assert optimizer_1.param_groups[0]["lr"] == 0.02 + + # Test the chained optimizer's state is a reference of the underlying optimizers' state + # 1. run step on optimizers, make sure there is state + assert len(chained_optimizer.state) == 0 + input = torch.randn(1, 3, 32, 32) + output = net(input) + output.sum().backward() + optimizer_1.step() + optimizer_2.step() + assert len(chained_optimizer.state) != 0 + + # 2. 
check the state is a reference + assert not list(optimizer_1.state.values())[0]["exp_avg"].is_cuda + assert not list(optimizer_2.state.values())[0]["momentum_buffer"].is_cuda + + def to_cuda(d): + for k, v in d.items(): + if isinstance(v, torch.Tensor): + d[k] = v.to("cuda") + elif isinstance(v, dict): + to_cuda(v) + return d + + for k, v in chained_optimizer.state.items(): + chained_optimizer.state[k] = to_cuda(v) + + assert list(optimizer_1.state.values())[0]["exp_avg"].is_cuda + assert list(optimizer_2.state.values())[0]["momentum_buffer"].is_cuda From cbb9c05c06b5fa32a8f5b47902751a7bc6d9f112 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Sat, 4 May 2024 08:46:49 -0700 Subject: [PATCH 10/92] Implement asynchronous checkpoint saving (with `--dist-ckpt-format torch_dist`) --- .../core/dist_checkpointing/serialization.py | 45 +++- .../strategies/async_utils.py | 214 ++++++++++++++++++ .../dist_checkpointing/strategies/base.py | 32 ++- .../strategies/filesystem_async.py | 4 +- .../strategies/fully_parallel.py | 11 +- .../dist_checkpointing/strategies/torch.py | 51 +++-- megatron/training/arguments.py | 4 +- megatron/training/async_utils.py | 43 ++++ megatron/training/checkpointing.py | 50 ++-- megatron/training/training.py | 30 +-- megatron/training/utils.py | 19 +- .../functional_tests/jet_recipes/MR-gpt.yaml | 6 +- .../dist_checkpointing/test_async_save.py | 41 ++++ 13 files changed, 477 insertions(+), 73 deletions(-) create mode 100644 megatron/core/dist_checkpointing/strategies/async_utils.py create mode 100644 megatron/training/async_utils.py create mode 100644 tests/unit_tests/dist_checkpointing/test_async_save.py diff --git a/megatron/core/dist_checkpointing/serialization.py b/megatron/core/dist_checkpointing/serialization.py index a6f9466ffe..d19aa71908 100644 --- a/megatron/core/dist_checkpointing/serialization.py +++ b/megatron/core/dist_checkpointing/serialization.py @@ -37,7 +37,9 @@ apply_factory_merges, is_main_replica, ) +from .strategies.async_utils import AsyncRequest from .strategies.base import ( + AsyncSaveShardedStrategy, LoadCommonStrategy, LoadShardedStrategy, SaveCommonStrategy, @@ -260,7 +262,8 @@ def save( sharded_strategy: Union[SaveShardedStrategy, Tuple[str, int], None] = None, common_strategy: Union[SaveCommonStrategy, Tuple[str, int], None] = None, validate_access_integrity: bool = True, -) -> None: + async_sharded_save: bool = False, +) -> Optional[AsyncRequest]: """Saving entrypoint. Extracts ShardedTensors from the given state dict. Rank 0 saves the @@ -275,6 +278,13 @@ def save( 4. Save all other objects to common.pt 5. (optional) Extract and save ShardedObjects 6. Save all ShardedBase objects + 7. Write metadata.json file with backend and version metadata. + + Step (6) can be performed asynchronously (see `async_sharded_save`), in this + case the actual save is embodied in the returned async request and can be + scheduled by the external caller. For async request, step (7) is added as + one of the finalization functions, so that metadata.json is written only + if the checkpoint is complete. 
Args: sharded_state_dict (ShardedStateDict): state dict of the populated with @@ -285,6 +295,15 @@ def save( common_strategy (SaveCommonStrategy, Tuple[str, int], optional): configures common data saving behavior and backend validate_access_integrity (bool default = True): checks if each tensor shard is accessed exactly once (as main replica) by some process + async_sharded_save (bool, optional): if True, for the sharded state dict part + an async save implementation will be called, with the AsyncRequest + being returned to the caller. Note that it is the caller responsibility to + actually schedule the async save. Defaults to False. + + Returns: + AsyncRequest (optional): if `async_sharded_save` is True, returns + async request that should be scheduled by the caller of this function. + None otherwise. """ checkpoint_dir = Path(checkpoint_dir) @@ -322,12 +341,26 @@ def save( sharded_state_dict, checkpoint_dir, validate_access_integrity ) - sharded_strategy.save(sharded_state_dict, checkpoint_dir) - if torch.distributed.get_rank() == 0: - save_config( - CheckpointingConfig(sharded_strategy.backend, sharded_strategy.version), checkpoint_dir + def metadata_finalize_fn(): + if torch.distributed.get_rank() == 0: + save_config( + CheckpointingConfig(sharded_strategy.backend, sharded_strategy.version), + checkpoint_dir, + ) + torch.distributed.barrier() + + if not async_sharded_save: + sharded_strategy.save(sharded_state_dict, checkpoint_dir) + metadata_finalize_fn() + return + + if not isinstance(sharded_strategy, AsyncSaveShardedStrategy): + raise CheckpointingException( + f'Cannot apply async_save to non-async strategy {sharded_strategy}' ) - torch.distributed.barrier() + async_request = sharded_strategy.async_save(sharded_state_dict, checkpoint_dir) + async_request.finalize_fns.append(metadata_finalize_fn) + return async_request def get_default_save_sharded_strategy( diff --git a/megatron/core/dist_checkpointing/strategies/async_utils.py b/megatron/core/dist_checkpointing/strategies/async_utils.py new file mode 100644 index 0000000000..ac9ba1a35a --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/async_utils.py @@ -0,0 +1,214 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +This module provides an async utilities which allow to start +a checkpoint save process in the background. +""" +import logging +from collections import deque +from time import time +from typing import Callable, List, NamedTuple, Optional, Tuple + +import torch +from torch import multiprocessing as mp + +logger = logging.getLogger(__name__) + + +class AsyncRequest(NamedTuple): + """ Represents an async request that needs to be scheduled for execution. + + Args: + async_fn (Callable, optional): async function to call. None represents noop. + async_fn_args (Tuple): args to pass to `async_fn`. + finalize_fns (List[Callable]): list of functions to call to finalize the request. + These functions will be called synchronously after `async_fn` is done + *on all ranks*. + """ + + async_fn: Optional[Callable] + async_fn_args: Tuple + finalize_fns: List[Callable] + is_frozen: bool = False + + def add_finalize_fn(self, fn: Callable) -> None: + """ Adds a new finalize function to the request. + + Args: + fn (Callable): function to add to the async request. This function + will be called *after* existing finalization functions. 
+ + Returns: + None + """ + if self.is_frozen: + raise RuntimeError('Cannot add finalization functions to a frozen AsyncRequest') + self.finalize_fns.append(fn) + + def execute_sync(self) -> None: + """ Helper to synchronously execute the request. + + This logic is equivalent to what should happen in case of the async call. + """ + if self.async_fn is not None: + self.async_fn(*self.async_fn_args) + torch.distributed.barrier() + for finalize_fn in self.finalize_fns: + finalize_fn() + + def freeze(self) -> 'AsyncRequest': + """ Freezes the async request, disallowing adding new finalization functions. + + Returns: + AsyncRequest: new async request with all same fields except for the + `is_frozen` flag. + """ + return self._replace(is_frozen=True) + + +class DistributedAsyncCaller: + """ Wrapper around mp.Process that ensures correct semantic of distributed finalization. + + Starts process asynchronously and allows checking if all processes on all ranks are done. + """ + + def __init__(self): + self.process: Optional[mp.Process] = None + self.start_time: Optional[float] = None + + def schedule_async_call(self, async_fn: Optional[Callable], save_args: Tuple,) -> None: + """ Spawn a process with `async_fn` as the target. + + This method must be called on all ranks. + + Args: + async_fn (Callable, optional): async function to call. If None, + no process will be started. + save_args (Tuple): async function args. + """ + if async_fn is None: + return # nothing to do + torch.cuda.synchronize() + ctx = mp.get_context('fork') + self.start_time = time() + self.process = ctx.Process(target=async_fn, args=save_args,) + self.process.start() + + def is_current_async_call_done(self, blocking=False) -> bool: + """ Check if async save is finished on all ranks. + + For semantic correctness, requires rank synchronization in each check. + This method must be called on all ranks. + + Args: + blocking (bool, optional): if True, will wait until the call is done + on all ranks. Otherwise, returns immediately if at least one rank + is still active. Defaults to False. + + Returns: + bool: True if all ranks are done (immediately of after active wait + if `blocking` is True), False if at least one rank is still active. + """ + # The following takes the same overhead as torch.distributed.barrier (single integer all-reduce) + is_alive = int(self.process.is_alive()) if self.process is not None else 0 + ten = torch.tensor([is_alive], dtype=torch.int, device=torch.cuda.current_device()) + logger.debug( + f"rank: {torch.distributed.get_rank()}, DistributedAsyncCaller is_alive: {is_alive}" + ) + torch.distributed.all_reduce(ten) + if ten[0] > 0 and not blocking: + return False + else: + if self.process is not None: + logger.debug(f"rank: {torch.distributed.get_rank()}, joining self.process") + self.process.join() + self.process = None + + logger.debug( + f"DistributedAsyncCaller: Async process join finished after {time() - self.start_time:.2f}s from forking" + ) + self.start_time = None + return True + + +class _ActiveAsyncRequest(NamedTuple): + """ Helper to represent an active async call. + + Args: + idx (int): index of the call (starting from 0) + async_caller (DistributedAsyncCaller): async caller instance that represents + the async process handling the async request + async_request (AsyncRequest): async request that is being called + """ + + idx: int + async_caller: DistributedAsyncCaller + async_request: AsyncRequest + + +class AsyncCallsQueue: + """ Manages a queue of async calls. 
+ + Allows adding a new async call with `schedule_async_request` and finalizing + active calls with `maybe_finalize_async_calls`. + """ + + def __init__(self): + self.async_calls: deque[_ActiveAsyncRequest] = deque([]) + self.call_idx: int = -1 + + def schedule_async_request(self, async_request: AsyncRequest) -> int: + """ Start a new async call and add it to a queue of active async calls. + + This method must be called on all ranks. + + Args: + async_request (AsyncRequest): async request to start. + + Returns: + int: index of the async call that was started. + This can help the user keep track of the async calls. + """ + self.call_idx += 1 + async_caller = DistributedAsyncCaller() + async_request = async_request.freeze() + async_caller.schedule_async_call(async_request.async_fn, async_request.async_fn_args) + self.async_calls.append(_ActiveAsyncRequest(self.call_idx, async_caller, async_request)) + return self.call_idx + + def maybe_finalize_async_calls(self, blocking=False) -> List[int]: + """ Finalizes all available calls. + + This method must be called on all ranks. + + Args: + blocking (bool, optional): if True, will wait until all active requests + are done. Otherwise, finalizes only the async request that already + finished. Defaults to False. + Returns: + List[int]: list of indices (as returned by `schedule_async_request`) + of async calls that have been successfully finalized. + """ + call_idx_finalized = [] + while self.async_calls: + next_async_done = self.async_calls[0].async_caller.is_current_async_call_done(blocking) + if not next_async_done: + break + call_idx, _, async_request = self.async_calls.popleft() + for finalize_fn in async_request.finalize_fns: + finalize_fn() + ten = torch.tensor([call_idx], dtype=torch.int, device=torch.cuda.current_device()) + torch.distributed.all_reduce(ten, op=torch.distributed.ReduceOp.MAX) + assert ( + ten.item() == call_idx + ), 'Unmatched async calls. That probably means not all ranks are participating in async finalization' + call_idx_finalized.append(call_idx) + return call_idx_finalized + + def get_num_unfinalized_calls(self): + """ Get the number of active async calls. """ + return len(self.async_calls) + + def close(self): + """ Finalize all calls upon closing. """ + self.maybe_finalize_async_calls(blocking=True) diff --git a/megatron/core/dist_checkpointing/strategies/base.py b/megatron/core/dist_checkpointing/strategies/base.py index 3cba5345f1..97a033a443 100644 --- a/megatron/core/dist_checkpointing/strategies/base.py +++ b/megatron/core/dist_checkpointing/strategies/base.py @@ -6,9 +6,9 @@ from collections import defaultdict from enum import Enum from pathlib import Path -from typing import Dict, List, Optional -from ..mapping import CheckpointingException, ShardedStateDict, ShardedTensor, StateDict +from ..mapping import CheckpointingException, ShardedStateDict, StateDict +from .async_utils import AsyncRequest class StrategyAction(Enum): @@ -72,6 +72,9 @@ def can_handle_sharded_objects(self): """ Returns whether or not this strategy can handle saving ShardedObjects. 
""" return False + def __str__(self): + return f'{self.__class__.__name__}({self.backend}, {self.version})' + class LoadCommonStrategy(LoadStrategyBase): """ Load strategy for common (non-sharded) objects """ @@ -118,3 +121,28 @@ class SaveShardedStrategy(SaveStrategyBase): @abstractmethod def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): raise NotImplementedError + + +class AsyncSaveShardedStrategy(SaveShardedStrategy): + """ Save strategy suitable for async save. """ + + @abstractmethod + def async_save( + self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path + ) -> AsyncRequest: + """ Perform preparation and return an AsyncRequest to the external caller. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to save + checkpoint_dir (Path): checkpoint target directory + + Returns: + AsyncRequest: represents the async save function and finalization function. + It is the caller responsibility to actually schedule the async save. + """ + raise NotImplementedError + + def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + """ Each async strategy can be trivially used as a sync strategy. """ + async_request = self.async_save(sharded_state_dict, checkpoint_dir) + async_request.execute_sync() diff --git a/megatron/core/dist_checkpointing/strategies/filesystem_async.py b/megatron/core/dist_checkpointing/strategies/filesystem_async.py index ea502f198e..7a838c2366 100644 --- a/megatron/core/dist_checkpointing/strategies/filesystem_async.py +++ b/megatron/core/dist_checkpointing/strategies/filesystem_async.py @@ -113,7 +113,7 @@ def gen_file(): self.write_results = {} logger.debug(f"D2H and push, time: {time() - start}") - def get_save_function_and_args(self) -> Optional[Tuple[Callable, Tuple]]: + def get_save_function_and_args(self) -> Tuple[Optional[Callable], Tuple]: """ Get function that saves the data to storage along with its arguments. Allows the external caller to apply the save function synchronously or asynchronously. @@ -123,7 +123,7 @@ def get_save_function_and_args(self) -> Optional[Tuple[Callable, Tuple]]: - arguments to that function """ if not self.write_buckets: - return None + return None, () return (self.write_preloaded_data_multiproc, (self.write_buckets, self.write_results)) @staticmethod diff --git a/megatron/core/dist_checkpointing/strategies/fully_parallel.py b/megatron/core/dist_checkpointing/strategies/fully_parallel.py index 7068062e45..1fafcf4b86 100644 --- a/megatron/core/dist_checkpointing/strategies/fully_parallel.py +++ b/megatron/core/dist_checkpointing/strategies/fully_parallel.py @@ -21,6 +21,7 @@ from megatron.core.dist_checkpointing.mapping import ShardedStateDict, StateDict, is_main_replica from megatron.core.dist_checkpointing.serialization import validate_sharding_integrity from megatron.core.dist_checkpointing.strategies.base import ( + AsyncSaveShardedStrategy, LoadShardedStrategy, SaveShardedStrategy, ) @@ -54,7 +55,7 @@ class SaveLoadDistribution(NamedTuple): shard_to_metadata: Dict[_ShardId, ShardedTensor] -class FullyParallelSaveStrategyWrapper(SaveShardedStrategy): +class FullyParallelSaveStrategyWrapper(AsyncSaveShardedStrategy): """ Wraps arbitrary strategy and distributes the save during `save`. The save distribution happens without any *data* communication. 
@@ -92,6 +93,14 @@ def __init__( self.cached_distribution: Optional[SaveLoadDistribution] = None + def async_save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + if not isinstance(self.base_strategy, AsyncSaveShardedStrategy): + raise CheckpointingException( + f'Cannot apply async_save to non-async base strategy {self.base_strategy}' + ) + self.apply_saving_parallelization(sharded_state_dict) + return self.base_strategy.async_save(sharded_state_dict, checkpoint_dir) + def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): self.apply_saving_parallelization(sharded_state_dict) return self.base_strategy.save(sharded_state_dict, checkpoint_dir) diff --git a/megatron/core/dist_checkpointing/strategies/torch.py b/megatron/core/dist_checkpointing/strategies/torch.py index 7199b7e3ed..5cddfc5cd6 100644 --- a/megatron/core/dist_checkpointing/strategies/torch.py +++ b/megatron/core/dist_checkpointing/strategies/torch.py @@ -38,7 +38,14 @@ StateDict, is_main_replica, ) -from .base import LoadShardedStrategy, SaveShardedStrategy, StrategyAction, default_strategies +from .async_utils import AsyncRequest +from .base import ( + AsyncSaveShardedStrategy, + LoadShardedStrategy, + SaveShardedStrategy, + StrategyAction, + default_strategies, +) from .filesystem_async import FileSystemWriterAsync from .state_dict_saver import save_state_dict_async_finalize, save_state_dict_async_plan @@ -369,11 +376,12 @@ def create_local_plan(self) -> LoadPlan: return super().create_local_plan() -class TorchDistSaveShardedStrategy(SaveShardedStrategy): - """Basic save strategy for the PyT Distributed format. +class TorchDistSaveShardedStrategy(AsyncSaveShardedStrategy): + """Async save strategy for the PyT Distributed format. The idea is to translate MCore ShardedTensors into PyT ShardedTensors - and reuse the default torch.distributed.checkpoint saving mechanism. + and use the async-adjusted torch.distributed.checkpoint saving mechanism + provided by the FileSystemWriterAsync writer. """ def __init__( @@ -393,10 +401,9 @@ def __init__( self.keep_only_main_replica = keep_only_main_replica self.thread_count = thread_count - # Intermediate state - self.save_state_dict_ret: Optional[Tuple[Any, ...]] = None - - def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + def async_save( + self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path + ) -> AsyncRequest: """ Translates MCore ShardedTensors to PyT ShardedTensors and saves in PyT Distributed format. Args: @@ -414,32 +421,26 @@ def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): sharded_state_dict, self.keep_only_main_replica ) pyt_state_dict = mcore_to_pyt_state_dict(sharded_state_dict, False) - - # Using async infrastructure for sync save + # Use PyT saving mechanism writer = FileSystemWriterAsync(checkpoint_dir, thread_count=self.thread_count) - self.save_state_dict_ret = save_state_dict_async_plan( + + save_state_dict_ret = save_state_dict_async_plan( pyt_state_dict, writer, None, planner=MCoreSavePlanner(dedup_replicated_tensors=not self.keep_only_main_replica), ) - fun_args = writer.get_save_function_and_args() - if fun_args is not None: - fun, args = fun_args - fun(*args) - self._finalize_save() + return self._get_save_and_finalize_callbacks(writer, save_state_dict_ret) - def _finalize_save(self) -> None: - """ Perform save finalization. 
+ def _get_save_and_finalize_callbacks(self, writer, save_state_dict_ret) -> AsyncRequest: + save_fn_args = writer.get_save_function_and_args() + save_fn, save_args = save_fn_args - Breakdown into `save` and `save_finalize` cn be useful for async saving. - """ - if self.save_state_dict_ret is None: - raise CheckpointingException('finalize_save called, but no ckpt save in progress') + def finalize_fn(): + save_state_dict_async_finalize(*save_state_dict_ret) + torch.distributed.barrier() - save_state_dict_async_finalize(*self.save_state_dict_ret) - self.save_state_dict_ret = None - torch.distributed.barrier() + return AsyncRequest(save_fn, save_args, [finalize_fn]) def can_handle_sharded_objects(self): return True diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index c6206496f7..422a2854ed 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1234,6 +1234,9 @@ def _add_checkpointing_args(parser): help='Apply full save parallelization across DP for' ' distributed checkpoints. Depending on ckpt format' ' might increase number of files in the checkpoint.') + group.add_argument('--async-save', action='store_true', default=None, + help='Apply async checkpointing save. Currently works only with' + '`torch_dist` distributed checkpoint format.') group.add_argument('--ckpt-fully-parallel-load', action='store_true', help='Apply full load parallelization across DP for' ' distributed checkpoints.') @@ -1241,7 +1244,6 @@ def _add_checkpointing_args(parser): help='If the model and optimizer state dict structure is' 'constant throughout a *single training job*, it allows for' 'different checkpointing performance optimizations.') - return parser diff --git a/megatron/training/async_utils.py b/megatron/training/async_utils.py new file mode 100644 index 0000000000..44530ad9d9 --- /dev/null +++ b/megatron/training/async_utils.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +This module provides a singleton instance of AsyncCallsQueue which manages +the async checkpoint save calls. +""" +import logging + +from megatron.core.dist_checkpointing.strategies.async_utils import AsyncCallsQueue, AsyncRequest +from megatron.training import get_args +from megatron.training.utils import print_rank_0 + +logger = logging.getLogger(__name__) + +# Singleton manager of async calls +_async_calls_queue = AsyncCallsQueue() + + +def schedule_async_save(async_request: AsyncRequest): + """ Schedule the async save request. + + Args: + async_request (AsyncRequest): the async save request. + """ + _async_calls_queue.schedule_async_request(async_request) + + +def maybe_finalize_async_save(blocking: bool = False): + """ Finalizes active async save calls. + + Args: + blocking (bool, optional): if True, will wait until all active requests + are done. Otherwise, finalizes only the async request that already + finished. Defaults to False. + """ + args = get_args() + if not args.async_save: + return + + if blocking and _async_calls_queue.get_num_unfinalized_calls() > 0: + print_rank_0('Unfinalized async checkpoint saves. 
Finalizing them synchronously now.') + + _async_calls_queue.maybe_finalize_async_calls(blocking) diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index 380037b4fa..d5cc881fc8 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -15,9 +15,9 @@ from megatron.core.dist_checkpointing.serialization import get_default_load_sharded_strategy from megatron.core.dist_checkpointing.strategies.fully_parallel import \ FullyParallelSaveStrategyWrapper, FullyParallelLoadStrategyWrapper +from .async_utils import schedule_async_save from .global_vars import get_args -from .utils import (unwrap_model, - print_rank_0) +from .utils import unwrap_model, print_rank_0, append_to_progress_log from ..core.dist_checkpointing.serialization import \ get_default_save_sharded_strategy @@ -298,6 +298,13 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, ensure_directory_exists(optim_checkpoint_name) optimizer.save_parameter_state(optim_checkpoint_name) + async_save_request = None + if args.async_save: + if not args.use_dist_ckpt: + raise NotImplementedError('Async checkpoint save not implemented for legacy checkpoints') + elif args.dist_ckpt_format != 'torch_dist': + raise NotImplementedError(f'Async checkpoint save not implemented for {args.dist_ckpt_format} distributed checkpoint format') + # Collect args, model, RNG. if not torch.distributed.is_initialized() \ or mpu.get_data_modulo_expert_parallel_rank() == 0 \ @@ -329,28 +336,43 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, # Store save strategy for future checkpoint saves if checkpointing_context is not None: checkpointing_context['save_strategy'] = save_strategy - - dist_checkpointing.save(state_dict, checkpoint_name, save_strategy, - validate_access_integrity=validate_sharding_integrity) - + async_save_request = dist_checkpointing.save(state_dict, checkpoint_name, save_strategy, + async_sharded_save=args.async_save) else: # Save. 
ensure_directory_exists(checkpoint_name) torch.save(state_dict, checkpoint_name) - # Wait so everyone is done (necessary) - if torch.distributed.is_initialized(): - torch.distributed.barrier() - - print_rank_0(' successfully saved checkpoint at iteration {:7d} to {}' \ - .format(iteration, args.save)) + if not args.async_save: + assert async_save_request is None + # Wait so everyone is done (necessary) + if torch.distributed.is_initialized(): + torch.distributed.barrier() # And update the latest iteration if not torch.distributed.is_initialized() \ or torch.distributed.get_rank() == 0: tracker_filename = get_checkpoint_tracker_filename(args.save) - with open(tracker_filename, 'w') as f: - f.write(str(iteration)) + + def iter_finalize_fn(): + with open(tracker_filename, 'w') as f: + f.write(str(iteration)) + print_rank_0(' successfully saved checkpoint from iteration {:7d} to {}' + .format(iteration, args.save)) + if args.log_progress and args.async_save: + append_to_progress_log(f'Saved async checkpoint\tIteration: {iteration}', + barrier=False) + + if args.async_save: + assert async_save_request is not None + async_save_request.add_finalize_fn(iter_finalize_fn) + else: + iter_finalize_fn() + + if args.async_save: + schedule_async_save(async_save_request) + print_rank_0(' scheduled an async checkpoint save at iteration {:7d} to {}' \ + .format(iteration, args.save)) # Wait so everyone is done (not necessary) if torch.distributed.is_initialized(): diff --git a/megatron/training/training.py b/megatron/training/training.py index e2128896af..b33b85eab2 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -35,7 +35,7 @@ from megatron.legacy.data.data_samplers import build_pretraining_data_loader from megatron.core.transformer.moe.moe_utils import track_moe_metrics from megatron.core.pipeline_parallel import get_forward_backward_func - +from .async_utils import maybe_finalize_async_save from .utils import ( calc_params_l2_norm, check_adlr_autoresume_termination, @@ -43,7 +43,9 @@ print_rank_0, print_rank_last, report_memory, - unwrap_model) + unwrap_model, + append_to_progress_log, +) from .global_vars import ( get_args, get_signal_handler, @@ -103,20 +105,6 @@ def num_floating_point_operations(args, batch_size): ) -def append_to_progress_log(string): - args = get_args() - if args.save is None: - return - progress_log_filename = os.path.join(args.save, "progress.txt") - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - with open(progress_log_filename, 'a') as f: - job_id = os.getenv('SLURM_JOB_ID', '') - num_gpus = args.world_size - f.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\tJob ID: {job_id}\t" - f"# GPUs: {num_gpus}\t{string}\n") - - def get_start_time_from_progress_log(): """ Gets start time of earliest job with same world size. 
Also returns the number @@ -313,6 +301,8 @@ def pretrain(train_valid_test_dataset_provider, iteration, process_non_loss_data_func, config, verbose=True, write_to_tensorboard=not args.skip_train) + maybe_finalize_async_save(blocking=True) + def update_train_iters(args): @@ -881,8 +871,8 @@ def compute_throughputs_and_append_to_progress_log(iteration, elapsed_time * 10**12 * args.world_size) tokens_so_far = args.consumed_train_samples * args.seq_length - - append_to_progress_log(f"Saved checkpoint\tIteration: {iteration}\t" + saved_ckpt_prefix = 'Saving async checkpoint' if args.async_save else 'Saved checkpoint' + append_to_progress_log(f"{saved_ckpt_prefix}\tIteration: {iteration}\t" f"Job throughput: {job_throughput:.1f} TFLOP/s/GPU\t" f"Cumulative throughput: {cumulative_throughput:.1f} TFLOP/s/GPU\t" f"Floating-point operations: {num_floating_point_operations_so_far:.2e}\t" @@ -1015,6 +1005,8 @@ def track_e2e_metrics(): torch.cuda.cudart().cudaProfilerStart() torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__() + maybe_finalize_async_save(False) + # Update number of microbatches first without consistency check to decide if a # checkpoint should be saved. If the number of microbatches is different # from the previous iteration, save a checkpoint. Then run consistency check @@ -1193,6 +1185,8 @@ def track_e2e_metrics(): if args.use_distributed_optimizer and args.overlap_param_gather: optimizer.disable_pre_hook() + maybe_finalize_async_save(True) + # If any exit conditions (signal handler, duration, iterations) have been reached, exit. if exit: sys.exit() diff --git a/megatron/training/utils.py b/megatron/training/utils.py index ef2ec1cd37..61117576e6 100644 --- a/megatron/training/utils.py +++ b/megatron/training/utils.py @@ -1,8 +1,9 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. """General utilities.""" - +import os import sys +from datetime import datetime import torch @@ -273,6 +274,22 @@ def print_rank_last(message): print(message, flush=True) +def append_to_progress_log(string, barrier=True): + """ Append given string to progress log. 
""" + args = get_args() + if args.save is None: + return + progress_log_filename = os.path.join(args.save, "progress.txt") + if barrier: + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + with open(progress_log_filename, 'a') as f: + job_id = os.getenv('SLURM_JOB_ID', '') + num_gpus = args.world_size + f.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\tJob ID: {job_id}\t" + f"# GPUs: {num_gpus}\t{string}\n") + + def get_batch_on_this_tp_rank(data_iterator): args = get_args() diff --git a/tests/functional_tests/jet_recipes/MR-gpt.yaml b/tests/functional_tests/jet_recipes/MR-gpt.yaml index 59f05140c6..29080265fb 100644 --- a/tests/functional_tests/jet_recipes/MR-gpt.yaml +++ b/tests/functional_tests/jet_recipes/MR-gpt.yaml @@ -62,8 +62,8 @@ products: - {tp_size: [4], pp_size: [1], ckpt_resume: [0, 1], extra_args: ["--qk-layernorm --test-mode"]} - {tp_size: [1], pp_size: [2], extra_args: ['"--position-embedding-type rope"'], args_meta: ["rope_embeddings"]} - {tp_size: [1], pp_size: [2], extra_args: ['"--position-embedding-type rope --rotary-interleaved --no-rope-fusion"'], args_meta: ["rope_embeddings_interleaved_no_fusion"]} - - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ['"--swiglu --ckpt-fully-parallel-save --ckpt-fully-parallel-load"'], args_meta: ["swiglu"]} - - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ["--disable-bias-linear"], args_meta: ["disable_bias_linear"]} + - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ['"--disable-bias-linear --async-save"'], args_meta: ["disable_bias_linear"]} + - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ['"--swiglu --ckpt-fully-parallel-save --ckpt-fully-parallel-load --async-save"'], args_meta: ["swiglu"]} - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ["--untie-embeddings-and-output-weights"], args_meta: ["untie_embeddings_and_outputs"]} - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --ckpt-fully-parallel-save --ckpt-fully-parallel-load"'], args_meta: ["sequence_parallel"]} - {tp_size: [1], pp_size: [1], extra_args: ['"--recompute-granularity full --recompute-method uniform --recompute-num-layers 1"'], args_meta: ["uniform_full_recompute"]} @@ -74,7 +74,7 @@ products: - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch], extra_args: ['"--moe-grouped-gemm --disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --use-distributed-optimizer --moe-router-load-balancing-type sinkhorn --moe-router-topk 1 --overlap-grad-reduce --overlap-param-gather"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM"]} - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch], extra_args: ['"--moe-grouped-gemm --disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_groupedGEMM"]} - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type aux_loss --moe-router-topk 2 --moe-aux-loss-coeff 1e-2"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_top2router"]} - - {tp_size: [1], pp_size: [1], ckpt_resume: [0, 1], extra_args: ["--use-distributed-optimizer"], args_meta: ["dist_optimizer"]} + - {tp_size: [1], pp_size: 
[1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --async-save"'], args_meta: ["dist_optimizer"]} - {tp_size: [1], pp_size: [1], extra_args: ['"--use-distributed-optimizer --no-mmap-bin-files"'], args_meta: ["dist_optimizer_no_mmap_bin_files"]} - {tp_size: [4], pp_size: [1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce"'], args_meta: ["dist_optimizer_overlap_grad_reduce"]} - {tp_size: [4], pp_size: [1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"'], args_meta: ["dist_optimizer_overlap_grad_reduce_param_gather"]} diff --git a/tests/unit_tests/dist_checkpointing/test_async_save.py b/tests/unit_tests/dist_checkpointing/test_async_save.py new file mode 100644 index 0000000000..3b74161b37 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_async_save.py @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import torch + +from megatron.core.dist_checkpointing import ShardedTensor, save, load +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.strategies.async_utils import \ + AsyncCallsQueue +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +class TestAsyncSave: + def test_async_is_equivalent_to_sync(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2, 4) + + sharded_state_dict = { + 'sd_keyA': ShardedTensor.from_rank_offsets('keyA', torch.ones(2, 4), replica_id=Utils.rank), + 'sd_keyB': ShardedTensor.from_rank_offsets('keyB', torch.ones(3, 5, 7), replica_id=Utils.world_size - Utils.rank - 1), + } + + with TempNamedDir(tmp_path_dist_ckpt / 'test_equivalence_async') as async_ckpt_dir, \ + TempNamedDir(tmp_path_dist_ckpt / 'test_equivalence_sync') as sync_ckpt_dir: + # async + async_calls = AsyncCallsQueue() + async_request = save(sharded_state_dict, async_ckpt_dir, async_sharded_save=True) + async_calls.schedule_async_request(async_request) + + # sync + save(sharded_state_dict, sync_ckpt_dir, async_sharded_save=False) + + # finalize async + async_calls.maybe_finalize_async_calls(blocking=True) + + # load and compare + loaded_async_state_dict = load(sharded_state_dict, async_ckpt_dir) + loaded_sync_state_dict = load(sharded_state_dict, sync_ckpt_dir) + diffs = diff(loaded_async_state_dict, loaded_sync_state_dict) + assert not any(map(bool, diffs)), diffs + + Utils.destroy_model_parallel() From 900b7c7d6002b724e8b7090528a46a785b226c4e Mon Sep 17 00:00:00 2001 From: "Hao Wang (OV Infra)" Date: Sat, 4 May 2024 08:49:23 -0700 Subject: [PATCH 11/92] Use multiple threads for dataset index reading --- .../blended_megatron_dataset_builder.py | 141 +++++++++++++++--- .../blended_megatron_dataset_config.py | 3 + megatron/training/arguments.py | 6 +- pretrain_gpt.py | 1 + 4 files changed, 127 insertions(+), 24 deletions(-) diff --git a/megatron/core/datasets/blended_megatron_dataset_builder.py b/megatron/core/datasets/blended_megatron_dataset_builder.py index 8b39948f39..1fdb749be7 100644 --- a/megatron/core/datasets/blended_megatron_dataset_builder.py +++ b/megatron/core/datasets/blended_megatron_dataset_builder.py @@ -2,6 +2,7 @@ import logging import math +from concurrent.futures import ThreadPoolExecutor from typing import Any, Callable, Iterable, List, Optional, Type, Union import numpy @@ -79,9 +80,9 @@ def __init__( def build(self) -> List[Optional[TopLevelDataset]]: """Build all dataset splits according to the provided blend(s) - + This method is 
distributed-aware and must be called on all ranks. - + The dataset splits returned can vary according to the config. Supply config.blend and config.split to build BlendedDataset and/or MegatronDataset splits from the same distribution. Supply config.blend_per_split to build BlendedDataset and/or MegatronDataset @@ -94,7 +95,7 @@ def build(self) -> List[Optional[TopLevelDataset]]: (2) The split has one contributing dataset, and... (a) 'size' is not None - - Build a mid-level dataset with low-level dataset sampling in proportion to the size + - Build a mid-level dataset with low-level dataset sampling in proportion to the size (b) 'size' is None - Build mid-level datasets with no excess low-level dataset sampling @@ -111,7 +112,8 @@ def build(self) -> List[Optional[TopLevelDataset]]: (c) 'weights' is None and 'size' is not None - Build mid-level datasets with no excess low-level dataset sampling - Build a top-level dataset of length 'size' with mid-level dataset sampling in proportion to their lengths and the size - - The 'size' of the top-level dataset is capped at the sum of the mid-level dataset lengths + + - The 'size' of the top-level dataset is capped at the sum of the mid-level dataset lengths (d) 'weights' is None and 'size' is None - Build mid-level datasets with no excess low-level dataset sampling @@ -139,7 +141,7 @@ def build(self) -> List[Optional[TopLevelDataset]]: def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: """Build all dataset splits according to the provided blend(s) - + See the BlendedMegatronDatasetBuilder.build alias for more information. Returns: @@ -176,13 +178,11 @@ def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: sizes_per_dataset = [[None for split in Split] for prefix in prefixes] else: sizes_per_dataset = _get_size_per_split_per_dataset(weights, self.sizes) - megatron_datasets = [[] for _ in range(len(Split))] - for i in range(len(prefixes)): - megatron_datasets_split = self._build_megatron_dataset_splits( - prefixes[i], split, sizes_per_dataset[i] - ) - for j in range(len(megatron_datasets_split)): - megatron_datasets[j].append(megatron_datasets_split[j]) + + # build each dataset in parallel + megatron_datasets = self._build_megatron_datasets_parallel( + prefixes, split, sizes_per_dataset + ) # Build the top-level datasets blended_datasets = [None] * len(Split) @@ -207,6 +207,7 @@ def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: blended_datasets[i] = self.build_generic_dataset( BlendedDataset, self.is_built_on_rank, + True, # synchronize_ranks, default behavior to build on rank-0 first megatron_datasets[i], weights_i, size_i, @@ -245,13 +246,11 @@ def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: sizes_per_dataset = [[None for split in Split] for prefix in prefixes] else: sizes_per_dataset = _get_size_per_split_per_dataset(weights, sizes_spoof) - megatron_datasets = [] - for j in range(len(prefixes)): - megatron_datasets.append( - self._build_megatron_dataset_splits( - prefixes[j], split_spoof, sizes_per_dataset[j], - )[i] - ) + + # build each dataset in parallel + megatron_datasets = self._build_megatron_datasets_parallel( + prefixes, split_spoof, sizes_per_dataset + )[i] # Build top-level dataset if weights is not None and self.sizes[i] is not None: @@ -272,6 +271,7 @@ def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: blended_datasets[i] = self.build_generic_dataset( BlendedDataset, self.is_built_on_rank, + True, # 
synchronize_ranks, default behavior to build on rank-0 first
                    megatron_datasets,
                    weights,
                    size,
@@ -280,8 +280,94 @@ def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]:
         return blended_datasets
 
+    def _build_megatron_datasets_parallel(
+        self, prefixes: List[str], split: List[float], sizes_per_dataset: List[List[int]],
+    ) -> List[List[Optional[MegatronDataset]]]:
+        """Build the megatron datasets for a list of prefixes in parallel
+
+        Args:
+            prefixes (List[str]): The list of prefix strings
+
+            split (List[float]): The dataset split ratios (must sum to 1.00)
+
+            sizes_per_dataset (List[List[int]]): The number of samples to request
+            per MegatronDataset per split
+
+        Returns:
+            List[List[Optional[MegatronDataset]]]: For each split, a list of
+            MegatronDataset per prefix
+        """
+        # Helper function to wrap the threading logic
+        def _threading_helper(
+            megatron_datasets: List[List[Optional[MegatronDataset]]],
+            num_workers: int,
+            prefixes: List[str],
+            split: List[float],
+            sizes_per_dataset: List[List[int]],
+        ) -> None:
+            with ThreadPoolExecutor(max_workers=num_workers) as executor:
+                all_futures = []
+                for i in range(len(prefixes)):
+                    all_futures.append(
+                        executor.submit(
+                            self._build_megatron_dataset_splits,
+                            prefixes[i],
+                            split,
+                            sizes_per_dataset[i],
+                            False,  # synchronize_ranks, barrier is called in this function
+                        )
+                    )
+                for future in all_futures:
+                    try:
+                        megatron_datasets_split = future.result()
+                        for j in range(len(megatron_datasets_split)):
+                            megatron_datasets[j].append(megatron_datasets_split[j])
+                    except Exception as err:
+                        raise err
+            return megatron_datasets
+
+        megatron_datasets = [[] for _ in range(len(Split))]
+        num_dataset_builder_threads = self.config.num_dataset_builder_threads
+
+        if torch.distributed.is_initialized():
+            rank = torch.distributed.get_rank()
+            # First, build on rank 0
+            if rank == 0:
+                num_workers = num_dataset_builder_threads
+                if num_workers > 1:
+                    # since only rank 0 is running, scale up the thread count
+                    # but not too much to avoid overloading storage on the miss path.
+                    # if the user set num_dataset_builder_threads to 1,
+                    # i.e. meant for a serial build, do not scale up.
+                    num_workers *= min(2, max(1, torch.cuda.device_count()))
+                _threading_helper(
+                    megatron_datasets, num_workers, prefixes, split, sizes_per_dataset,
+                )
+
+            torch.distributed.barrier()
+
+            # Then, build on other ranks; guaranteed to be a data_cache hit
+            if rank != 0:
+                _threading_helper(
+                    megatron_datasets,
+                    num_dataset_builder_threads,
+                    prefixes,
+                    split,
+                    sizes_per_dataset,
+                )
+        else:
+            _threading_helper(
+                megatron_datasets, num_dataset_builder_threads, prefixes, split, sizes_per_dataset,
+            )
+
+        return megatron_datasets
+
     def _build_megatron_dataset_splits(
-        self, dataset_path: Optional[str], split: List[float], sizes: List[int],
+        self,
+        dataset_path: Optional[str],
+        split: List[float],
+        sizes: List[int],
+        synchronize_ranks: bool = True,
     ) -> List[Optional[MidLevelDataset]]:
         """Build each MidLevelDataset split from a single LowLevelDataset
 
@@ -292,6 +378,8 @@ def _build_megatron_dataset_splits(
 
             sizes (List[int]): The number of total samples to draw from each split
 
+            synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level.
+ Returns: List[Optional[MidLevelDataset]]: The MidLevelDataset (or None) per split """ @@ -319,6 +407,7 @@ def _build_megatron_dataset_splits( self.build_generic_dataset( self.cls, self.is_built_on_rank, + synchronize_ranks, low_level_dataset, dataset_path, split_indices[i], @@ -332,7 +421,10 @@ def _build_megatron_dataset_splits( @staticmethod def build_generic_dataset( - cls: Union[Type[DistributedDataset], Callable], is_built_on_rank: Callable, *args: Any + cls: Union[Type[DistributedDataset], Callable], + is_built_on_rank: Callable, + synchronize_ranks: bool, + *args: Any, ) -> Optional[Union[DistributedDataset, Iterable]]: """Build the DistributedDataset @@ -342,6 +434,8 @@ def build_generic_dataset( Args: cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be built. In special cases, e.g. when we are building the low level dataset for a RawMegatronDataset instance, we can accept a Callable which returns an Iterable. + synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level. + args (Tuple[Any]): The positional arguments used to build the provided DistributedDataset class Raises: @@ -368,7 +462,8 @@ def build_generic_dataset( ) raise Exception(log) from err - torch.distributed.barrier() + if synchronize_ranks: + torch.distributed.barrier() # After, build on other ranks if rank != 0 and is_built_on_rank(): @@ -383,7 +478,7 @@ def _get_size_per_split_per_dataset( normalized_weights: List[float], target_size_per_split: List[int] ) -> List[List[int]]: """Determine the contribution of the MegatronDataset splits to the BlendedDataset splits - + Args: normalized_weights (List[float]): e.g. [0.3, 0.7] diff --git a/megatron/core/datasets/blended_megatron_dataset_config.py b/megatron/core/datasets/blended_megatron_dataset_config.py index 871fff55f5..a4dd1b46d6 100644 --- a/megatron/core/datasets/blended_megatron_dataset_config.py +++ b/megatron/core/datasets/blended_megatron_dataset_config.py @@ -45,6 +45,9 @@ class BlendedMegatronDatasetConfig: 'split'. Not to be passed in to the constructor. """ + num_dataset_builder_threads: int = 1 + """The number of threads to use for dataset building.""" + path_to_cache: Optional[str] = None """Where all re-useable dataset indices are to be cached.""" diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 03bf635356..f8f7f9440c 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -295,6 +295,9 @@ def validate_args(args, defaults={}): if args.dataloader_type is None: args.dataloader_type = 'single' + # data + assert args.num_dataset_builder_threads > 0 + # Consumed tokens. 
args.consumed_train_samples = 0
     args.consumed_valid_samples = 0
 
@@ -1459,7 +1462,8 @@ def _add_data_args(parser):
     group.add_argument('--no-create-attention-mask-in-dataloader', action='store_false',
                        help='If set, do not create attention_masks in dataloader.',
                        dest='create_attention_mask_in_dataloader')
-
+    group.add_argument('--num-dataset-builder-threads', type=int, default=1,
+                       help='Number of parallel threads per rank for dataset builder')
     return parser
 
 
diff --git a/pretrain_gpt.py b/pretrain_gpt.py
index 3b5593de0c..7f2ad3ed4e 100644
--- a/pretrain_gpt.py
+++ b/pretrain_gpt.py
@@ -198,6 +198,7 @@ def core_gpt_dataset_config_from_args(args):
             get_blend_from_list(args.test_data_path)
         ],
         split=args.split,
+        num_dataset_builder_threads=args.num_dataset_builder_threads,
         path_to_cache=args.data_cache_path,
         mmap_bin_files=args.mmap_bin_files,
         tokenizer=tokenizer,

From d484aebc90ad3b0a3d7483bc61c6e13b47e8562c Mon Sep 17 00:00:00 2001
From: Dennis Liu
Date: Sat, 4 May 2024 08:50:35 -0700
Subject: [PATCH 12/92] Optimize the implementation of aux loss.

---
 megatron/core/transformer/moe/moe_utils.py | 23 ++++++++++++-----------
 megatron/core/transformer/moe/router.py    |  8 ++++----
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py
index 55afb75d69..ef6a64661b 100644
--- a/megatron/core/transformer/moe/moe_utils.py
+++ b/megatron/core/transformer/moe/moe_utils.py
@@ -7,23 +7,26 @@
 from megatron.core import parallel_state
 
 
-def switch_load_balancing_loss_func(gates, tokens_per_expert, topk, moe_aux_loss_coeff):
-    """Calculate the auxiliary loss for better load balancing.
+def switch_load_balancing_loss_func(
+    probs: torch.Tensor, tokens_per_expert: torch.Tensor, topk: int, moe_aux_loss_coeff: float
+):
+    """Calculate the auxiliary loss for better load balancing.
     Please refer to the Switch Transformer paper (https://arxiv.org/abs/2101.03961) for details.
 
     Args:
-        gates (torch.Tensor): The gates tensor representing the routing probabilities for each expert.
-        mask (torch.Tensor): The 2D mask tensor indicating which experts are selected.
+        probs (torch.Tensor): The softmax probs output by the router for each token. [num_tokens, num_experts]
+        tokens_per_expert (torch.Tensor): The number of assigned tokens for each expert. [num_experts]
 
     Returns:
         torch.Tensor: The auxiliary loss for load balancing.
     """
-    num_experts = gates.size(1)
-    num_tokens = gates.size(0) * topk
-    gates_mean = gates.mean(dim=0)
-    selection_mean = tokens_per_expert.float() / num_tokens
-    aux_loss = torch.sum(gates_mean * selection_mean) * num_experts
-    aux_loss *= moe_aux_loss_coeff
+    num_tokens = probs.shape[0] * topk
+    num_experts = probs.shape[1]
+
+    probs_mean_per_expert = probs.mean(dim=0)
+    aux_loss = torch.sum(probs_mean_per_expert * tokens_per_expert) * (
+        num_experts / num_tokens * moe_aux_loss_coeff
+    )
     return aux_loss
 
 
diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py
index d3c2e4de70..d2378a1f4d 100644
--- a/megatron/core/transformer/moe/router.py
+++ b/megatron/core/transformer/moe/router.py
@@ -165,8 +165,8 @@ def apply_load_balancing_loss(
         """Applies auxiliary loss to the MoE layer.
 
         Args:
-            probs (torch.Tensor): The probabilities output by the MoE layer.
-            num_local_tokens_per_expert (torch.Tensor): The number of tokens per expert.
+            probs (torch.Tensor): The probs output by the router for each token. 
[num_tokens, num_experts] + num_local_tokens_per_expert (torch.Tensor): The number of tokens per expert. [num_experts] activation (torch.Tensor): The activation tensor to attach the gradient function to. Returns: @@ -187,10 +187,10 @@ def apply_load_balancing_loss( def apply_z_loss(self, logits): """Encourages the router's logits to remain small to enhance stability. Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details. - + Args: logits (torch.Tensor): The logits of the router. - + Returns: torch.Tensor: The logits after applying the z-loss. """ From b7b98ba28db132f064b4cef3f8e0ba598dc3404b Mon Sep 17 00:00:00 2001 From: eharper Date: Sat, 4 May 2024 10:01:13 -0600 Subject: [PATCH 13/92] update version Signed-off-by: eharper --- megatron/core/package_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/package_info.py b/megatron/core/package_info.py index c48a2adbfc..74fd91e0ca 100644 --- a/megatron/core/package_info.py +++ b/megatron/core/package_info.py @@ -4,7 +4,7 @@ MAJOR = 0 MINOR = 7 PATCH = 0 -PRE_RELEASE = 'rc0' +PRE_RELEASE = '' # Use the following formatting: (major, minor, patch, pre-release) VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) From efe22f79fe3f0c640057e9bb8a17d61d7361b2c6 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Sat, 4 May 2024 09:33:30 -0700 Subject: [PATCH 14/92] Update minor version to 0.8 --- megatron/core/package_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/package_info.py b/megatron/core/package_info.py index c48a2adbfc..4e7f4b2180 100644 --- a/megatron/core/package_info.py +++ b/megatron/core/package_info.py @@ -2,7 +2,7 @@ MAJOR = 0 -MINOR = 7 +MINOR = 8 PATCH = 0 PRE_RELEASE = 'rc0' From ac08742c968db1f47a806f0df2e892ad518f82bf Mon Sep 17 00:00:00 2001 From: Jared Casper Date: Sat, 4 May 2024 20:59:09 -0700 Subject: [PATCH 15/92] Add a "deterministic mode". 
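This patch gates nondeterministic code paths behind a single
`--deterministic-mode` flag (plumbed through
`ModelParallelConfig.deterministic_mode`). As a rough illustration of the
environment contract the flag enforces, the sketch below is not part of the
patch and the helper name `check_deterministic_env` is made up; it simply
mirrors the checks added to `arguments.py` and to the Transformer Engine
attention wrapper:

    import os

    def check_deterministic_env(use_flash_attn: bool) -> None:
        # Flash attention kernels are nondeterministic, so deterministic mode forbids them.
        assert not use_flash_attn, "--use-flash-attn is incompatible with --deterministic-mode"
        # NCCL must be pinned to an all-reduce algorithm with a stable reduction order.
        allowed = ["Tree", "Ring", "CollnetDirect", "CollnetChain", "^NVLS"]
        assert os.getenv("NCCL_ALGO") in allowed, f"NCCL_ALGO must be one of {allowed}"
        # Transformer Engine must also be told to avoid nondeterministic kernels.
        assert os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO", "1") == "0", \
            "set NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 when Transformer Engine is used"

With those environment variables set, two runs of the same config on the same
hardware and software stack are expected to produce bitwise-identical losses
and checkpoints.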
--- .gitlab-ci.yml | 14 +++++----- Dockerfile.test | 26 +++++++++---------- README.md | 25 +++++++++--------- megatron/core/model_parallel_config.py | 4 +++ .../custom_layers/transformer_engine.py | 8 ++++++ megatron/legacy/model/fused_layer_norm.py | 2 +- megatron/training/arguments.py | 12 +++++++++ megatron/training/training.py | 4 +++ .../functional_tests/jet_recipes/MR-gpt.yaml | 24 ++++++++--------- .../jet_recipes/build-pyt.yaml | 2 +- .../test_resume_checkpoint_pipeline.py | 11 +++----- ...gx-a100-1n8g-mcore-tp2-pp2-local-spec.json | 2 +- ...e-request-dgx-a100-1n8g-mcore-tp2-pp2.json | 2 +- ...rge-request-dgx-a100-1n8g-tp1-pp4-vp2.json | 2 +- ...m-merge-request-dgx-a100-1n8g-tp2-pp2.json | 2 +- ...-pp1-dist-optimizer-no-mmap-bin-files.json | 2 +- ...100-1n8g-mcore-tp1-pp1-dist-optimizer.json | 2 +- ...-mcore-tp1-pp1-uniform-full-recompute.json | 2 +- ...rope-embeddings-interleaved-no-fusion.json | 2 +- ...00-1n8g-mcore-tp1-pp2-rope-embeddings.json | 2 +- ...n8g-mcore-tp1-pp4-disable-bias-linear.json | 2 +- ...-1n8g-mcore-tp1-pp4-sequence-parallel.json | 2 +- ...st-dgx-a100-1n8g-mcore-tp1-pp4-swiglu.json | 2 +- ...-tp1-pp4-untie-embeddings-and-outputs.json | 2 +- ...0-1n8g-mcore-tp1-pp4-vp1-decoupled-lr.json | 2 +- ...izer-overlap-grad-reduce-param-gather.json | 2 +- ...-optimizer-overlap-grad-reduce-untied.json | 2 +- ...p1-dist-optimizer-overlap-grad-reduce.json | 2 +- ...quest-dgx-a100-1n8g-mcore-tp1-pp4-vp1.json | 2 +- ...1-te-8experts2parallel-dist-optimizer.json | 2 +- ...-pp1-te-8experts2parallel-groupedgemm.json | 2 +- ...-grad-reduce-param-gather-groupedgemm.json | 2 +- ...2-pp1-te-8experts2parallel-top2router.json | 2 +- ...8g-mcore-tp2-pp1-te-8experts2parallel.json | 2 +- ...o-create-attention-mask-in-dataloader.json | 2 +- ...-1n8g-mcore-tp2-pp2-no-mmap-bin-files.json | 2 +- ...e-request-dgx-a100-1n8g-mcore-tp2-pp2.json | 2 +- ...izer-overlap-grad-reduce-param-gather.json | 2 +- ...p1-dist-optimizer-overlap-grad-reduce.json | 2 +- ...-mcore-tp4-pp1-qk-layernorm-test-mode.json | 1 + ...e-request-dgx-a100-1n8g-mcore-tp4-pp1.json | 1 - ...erge-request-dgx-a100-1n8g-te-tp2-pp2.json | 2 +- ...rge-request-dgx-a100-1n8g-tp1-pp4-vp1.json | 2 +- ...m-merge-request-dgx-a100-1n8g-tp2-pp2.json | 2 +- ...equest-dgx-a100-1n8g-mcore-te-tp1-pp1.json | 2 +- ...st-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1.json | 2 +- .../bert/pretrain_bert_distributed_test.sh | 9 ++++++- .../gpt3/pretrain_gpt3_distributed_test.sh | 12 ++++++--- .../pretrain_llava_distributed_test.sh | 13 +++++++--- .../t5/pretrain_t5_distributed_test.sh | 13 +++++++--- 50 files changed, 147 insertions(+), 100 deletions(-) create mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-qk-layernorm-test-mode.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1.json diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 73b9fa9ee1..53c23cd098 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -79,7 +79,7 @@ unit_tests-dist-checkpointing: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always - + unit_tests-fusions: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 tags: @@ -93,7 +93,7 @@ unit_tests-fusions: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always - + unit_tests-models: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 tags: @@ -107,7 +107,7 @@ unit_tests-models: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: 
never - when: always - + unit_tests-pipeline-parallel: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 tags: @@ -121,7 +121,7 @@ unit_tests-pipeline-parallel: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always - + unit_tests-tensor-parallel: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 tags: @@ -135,7 +135,7 @@ unit_tests-tensor-parallel: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always - + unit_tests-transformer: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 tags: @@ -149,7 +149,7 @@ unit_tests-transformer: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always - + unit_tests-top-py: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 tags: @@ -163,7 +163,7 @@ unit_tests-top-py: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always - + docs_build_test: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1 stage: test diff --git a/Dockerfile.test b/Dockerfile.test index 5de0167f41..9abefbf327 100644 --- a/Dockerfile.test +++ b/Dockerfile.test @@ -1,14 +1,12 @@ -ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:23.04-py3 -FROM ${FROM_IMAGE_NAME} - -RUN pip install --no-cache-dir \ - "pytest-cov" \ - "pytest_mock" \ - "nltk" \ - "wrapt" \ - "zarr" \ - "tensorstore==0.1.45" \ - "git+https://github.com/fanshiqing/grouped_gemm@v1.0" \ - "black==19.10b0" \ - "isort" \ - "click==8.0.2" +# syntax=docker/dockerfile:experimental + +FROM nvcr.io/nvidia/pytorch:24.01-py3 +ENV DEBIAN_FRONTEND=noninteractive + +RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ + /etc/apt/apt.conf.d/docker-clean + +RUN apt-get update && apt-get install -y --no-install-recommends + +RUN pip3 install sentencepiece einops flask-restful pytest wandb +RUN pip3 install git+https://github.com/fanshiqing/grouped_gemm@v1.1.1 \ No newline at end of file diff --git a/README.md b/README.md index ea2f01f8b3..f2e4fe84b1 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Megatron-LM & Megatron-Core This repository comprises two essential components: **Megatron-LM** and **Megatron-Core**. Megatron-LM serves as a ressearch-oriented framework leveraging Megatron-Core for large language model (LLM) training. Megatron-Core, on the other hand, is a library of GPU optimized training techniques that comes with formal product support including versioned APIs and regular releases. You can use Megatron-Core alongside Megatron-LM or [Nvidia NeMo Framework](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/nemo_megatron/mcore_customization.html) for an end-to-end and cloud-native solution. Alternatively, you can integrate Megatron-Core's building blocks into your preferred training framework. ## Megatron-LM -First introduced in 2019, Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf), [2](https://arxiv.org/pdf/2104.04473.pdf), and [3](https://arxiv.org/pdf/2205.05198)) sparked a wave of innovation in the AI community, enabling researchers and developers to utilize the underpinnings of this library to further LLM advancements. Today, many of the most popular LLM developer frameworks have been inspired by and built directly leveraging the open-source Megatron-LM library, spurring a wave of foundation models and AI startups. 
Some of the most popular LLM frameworks built on top of Megatron-LM include [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [HuggingFace Accelerate](https://github.com/huggingface/accelerate), and [NVIDIA NeMo Framework](https://www.nvidia.com/en-us/ai-data-science/generative-ai/nemo-framework/). A list of projects that have directly used Megatron can be found [here](#projects-using-megatron). +First introduced in 2019, Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf), [2](https://arxiv.org/pdf/2104.04473.pdf), and [3](https://arxiv.org/pdf/2205.05198)) sparked a wave of innovation in the AI community, enabling researchers and developers to utilize the underpinnings of this library to further LLM advancements. Today, many of the most popular LLM developer frameworks have been inspired by and built directly leveraging the open-source Megatron-LM library, spurring a wave of foundation models and AI startups. Some of the most popular LLM frameworks built on top of Megatron-LM include [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [HuggingFace Accelerate](https://github.com/huggingface/accelerate), and [NVIDIA NeMo Framework](https://www.nvidia.com/en-us/ai-data-science/generative-ai/nemo-framework/). A list of projects that have directly used Megatron can be found [here](#projects-using-megatron). ## Megatron-Core Megatron-Core is a newly released open-source PyTorch-based library that further expands the collections of GPU optimized techniques inherited from Megatron-LM with more cutting-edge innovations on system-level optimizations. It abstracts them into composable and modular APIs, allowing full flexibility for developers and model researchers to train custom transformers at-scale on NVIDIA accelerated computing infrastructure. This library is compatible with all NVIDIA Tensor Core GPUs, including FP8 acceleration support for NVIDIA Hopper architectures. @@ -72,7 +72,7 @@ The following table shows both model (MFU) and hardware (HFU) FLOPs utilization | 22B | 41.5% | 43.7% | | 175B | 51.4% | 52.8% | | 530B | 56.0% | 57.0% | -| 1T | 56.3% | 57.0% | +| 1T | 56.3% | 57.0% | # Setup We strongly recommend using the latest release of [NGC's PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) with DGX nodes. If you can't use this for some reason, use the latest pytorch, cuda, nccl, and NVIDIA [APEX](https://github.com/NVIDIA/apex#quick-start) releases. Data preprocessing requires [NLTK](https://www.nltk.org/install.html), though this is not required for training, evaluation, or downstream tasks. @@ -251,20 +251,20 @@ With full global batch size of 1536 on 1024 A100 GPUs, each iteration takes arou ## Retro and InstructRetro -Retro [(Borgeaud et al., 2022)](https://arxiv.org/abs/2112.04426) is an autoregressive decoder-only language model (LM) pretrained with retrieval-augmentation. +Retro [(Borgeaud et al., 2022)](https://arxiv.org/abs/2112.04426) is an autoregressive decoder-only language model (LM) pretrained with retrieval-augmentation. Retro features practical scalability to support large-scale pretraining from scratch by retrieving from trillions of tokens. -Pretraining with retrieval provides a more efficient storage mechanism of factual knowledge, when compared to storing factual knowledge implicitly within the network's parameters, thus largely reducing model parameters while achieving lower perplexity than standard GPT. 
+Pretraining with retrieval provides a more efficient storage mechanism of factual knowledge, when compared to storing factual knowledge implicitly within the network's parameters, thus largely reducing model parameters while achieving lower perplexity than standard GPT. Retro also provides the flexibility to update the knowledge stored in LMs [(Wang et al., 2023a)](https://arxiv.org/abs/2304.06762) by updating the retrieval database without training LMs again. -InstructRetro [(Wang et al., 2023b)](https://arxiv.org/abs/2310.07713) further scales up the size of Retro to 48B, featuring the largest LLM pretrained with retrieval (as of December 2023). +InstructRetro [(Wang et al., 2023b)](https://arxiv.org/abs/2310.07713) further scales up the size of Retro to 48B, featuring the largest LLM pretrained with retrieval (as of December 2023). The obtained foundation model, Retro 48B, largely outperforms the GPT counterpart in terms of perplexity. With instruction tuning on Retro, InstructRetro demonstrates significant improvement over the instruction tuned GPT on downstream tasks in the zero-shot setting. Specifically, the average improvement of InstructRetro is 7% over its GPT counterpart across 8 short-form QA tasks, and 10% over GPT across 4 challenging long-form QA tasks. We also find that one can ablate the encoder from InstructRetro architecture and directly use the InstructRetro decoder backbone as GPT, while achieving comparable results. In this repo, we provide an end-to-end reproduction guide to implement Retro and InstructRetro, covering -- **Retrieval database construction**, which supports billions or even trillions of tokens as a large-scale retrieval database. -- **Pretraining with retrieval**, which supports pretraining from scratch and pretraining from a pretrained GPT model (Retro-fitting). +- **Retrieval database construction**, which supports billions or even trillions of tokens as a large-scale retrieval database. +- **Pretraining with retrieval**, which supports pretraining from scratch and pretraining from a pretrained GPT model (Retro-fitting). - **Instruction tuning**, where we provide an open-source instruction tuning dataset and the training recipe for instruction tuning on Retro. - **Downstream task evaluation**, where we provide the text generation and evaluation scripts for zero-shot question answering tasks. @@ -548,13 +548,14 @@ We recommend using the `--json` argument when using WikiExtractor, which will du We utilize the publicly available [OpenWebText](https://github.com/eukaryote31/openwebtext) library from [jcpeterson](https://github.com/jcpeterson/openwebtext) and [eukaryote31's](https://github.com/eukaryote31/openwebtext) work to download urls. We then filter, clean, and deduplicate all downloaded content according to the procedure described in our [openwebtext](./tools/openwebtext) directory. For reddit URLs corresponding to content up to October 2018 we arrived at approximately 37GB of content. # Reproducibility -Megatron training is intended to be bitwise reproducible. This means that the same training config run twice in the same HW and SW environment should produce identical model checkpoints, losses and accuracy metric values (iteration time metrics may vary). +Megatron training can be bitwise reproducible; to enable this mode use `--deterministic-mode`. This means that the same training config run twice in the same HW and SW environment should produce identical model checkpoints, losses and accuracy metric values (iteration time metrics may vary). 
-There are currently two known Megatron optimizations that break reproducibility whilst still producing almost identical training runs. The following workarounds should be applied in cases where reproducibility is required:
-1. When training using `--bf16`, reproducbility is only obtained when the checkpointing and resume schedule of training is identical. If the checkpointing schedule will change, i.e. checkpointing and resume will occur at different iterations, the option `--no-bias-gelu-fusion` should be used.
-2. Flash attention is nondeterministic. If reproducibility is required do not use `--use-flash-attn`.
+There are currently three known Megatron optimizations that break reproducibility whilst still producing almost identical training runs:
+1. The specific NCCL algorithm that is used during an all-reduce (as specified by the environment variable `NCCL_ALGO`) is important. We have tested the following: `^NVLS`, `Tree`, `Ring`, `CollnetDirect`, `CollnetChain`. The code admits the use of `^NVLS`, which allows NCCL the choice of non-NVLS algorithms; its choice seems to be stable.
+2. Flash attention is non-deterministic; do not use `--use-flash-attn`.
+3. If using Transformer Engine, you must also set the environment variable `NVTE_ALLOW_NONDETERMINISTIC_ALGO=0`.
 
-These sources of nondeterminism are under active investigation. If you observe nondeterminism in Megatron training under other circumstances please open an issue.
+In addition, determinism has only been verified in NGC PyTorch containers 23.12 and newer. If you observe nondeterminism in Megatron training under other circumstances please open an issue.
 
 ## Projects Using Megatron
 Below are some of the projects where we have directly used Megatron:
diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py
index 43ad28dcd8..9be7cccedf 100644
--- a/megatron/core/model_parallel_config.py
+++ b/megatron/core/model_parallel_config.py
@@ -107,6 +107,10 @@ class ModelParallelConfig:
     be synchronized.
     """
 
+    deterministic_mode: bool = False
+    """If true, code that has deterministic execution will be chosen. This usually
+    means slower execution, but is good for debugging and testing. Defaults to False."""
+
     enable_autocast: bool = False
     """If true runs the forward step function inside torch.autocast context."""
 
diff --git a/megatron/core/transformer/custom_layers/transformer_engine.py b/megatron/core/transformer/custom_layers/transformer_engine.py
index a36c424fba..80de615204 100644
--- a/megatron/core/transformer/custom_layers/transformer_engine.py
+++ b/megatron/core/transformer/custom_layers/transformer_engine.py
@@ -451,6 +451,14 @@ def __init__(
             self.config.context_parallel_size == 1
         ), "Only Transformer-Engine version >= 1.0.0 supports context parallelism!"
 
+        if self.config.deterministic_mode:
+            if int(os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO", "1")) != 0:
+                raise RuntimeError(
+                    "deterministic_mode is on and we are using DotProductAttention from "
+                    "Transformer Engine, but NVTE_ALLOW_NONDETERMINISTIC_ALGO is not 0. "
+                    f"Currently set to: {os.getenv('NVTE_ALLOW_NONDETERMINISTIC_ALGO', 'not set')}."
+ ) + if config.window_size is not None: # Check version assert _te_version >= packaging.version.Version( diff --git a/megatron/legacy/model/fused_layer_norm.py b/megatron/legacy/model/fused_layer_norm.py index f076302e4e..acf98f5ba0 100644 --- a/megatron/legacy/model/fused_layer_norm.py +++ b/megatron/legacy/model/fused_layer_norm.py @@ -83,7 +83,7 @@ def forward(self, input): "fused_layer_norm_affine is not available, please install apex from https://github.com/NVIDIA/apex" return fused_layer_norm_affine(input, weight, self.bias, self.normalized_shape, eps=self.eps) else: - output = FastLayerNormFN.apply(input, weight, self.bias, self.eps) + output = FastLayerNormFN.apply(input, weight, self.bias, self.eps, False) # Apex's fast layer norm function outputs a 'view' tensor (i.e., has # a populated '_base' field). This will result in schedule.py's diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 6b637adc6a..ea49b879f4 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -481,6 +481,7 @@ def validate_args(args, defaults={}): if args.decoupled_lr is not None or args.decoupled_min_lr is not None: assert args.use_mcore_models, \ '--decoupled-lr and --decoupled-min-lr only supported by Megatron Core, please add --use-mcore-models.' + assert not args.use_dist_ckpt, "Distributed checkpointing does not work with decoupled LR yet." # Legacy RoPE arguments if args.use_rotary_position_embeddings: @@ -524,6 +525,14 @@ def validate_args(args, defaults={}): assert args.context_parallel_size * args.expert_model_parallel_size <= 1, \ "context_parallel and expert_model_parallel can't be used with tp-pp-dp mapping." + # Deterministic mode + if args.deterministic_mode: + assert not args.use_flash_attn, 'Flash attention can not be used in deterministic mode.' + + all_reduce_choices = ["Tree", "Ring", "CollnetDirect", "CollnetChain", "^NVLS"] + assert os.getenv("NCCL_ALGO", -1) != -1 and os.getenv("NCCL_ALGO") in all_reduce_choices, \ + f"NCCL_ALGO must be one of {all_reduce_choices}." + # Print arguments. _print_args("arguments", args) @@ -1016,6 +1025,9 @@ def _add_training_args(parser): help='Call torch.cuda.empty_cache() each iteration ' '(training and eval), to reduce fragmentation.' '0=off, 1=moderate, 2=aggressive.') + group.add_argument('--deterministic-mode', action='store_true', + help='Choose code that has deterministic execution. This usually ' + 'means slower execution, but is good for debugging and testing.') group.add_argument('--check-weight-hash-across-dp-replicas-interval', type=int, default=None, help='Interval to check weight hashes are same across DP replicas. If not specified, weight hashes not checked.') diff --git a/megatron/training/training.py b/megatron/training/training.py index b33b85eab2..67361d6b89 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -885,8 +885,12 @@ def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler, timers = get_timers() # Extra barrier is added to make sure all ranks report the max time. 
timers('save-checkpoint', log_level=0).start(barrier=True) + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.disable_pre_hook() save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context) + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.enable_pre_hook() timers('save-checkpoint').stop(barrier=True) timers.log(['save-checkpoint']) diff --git a/tests/functional_tests/jet_recipes/MR-gpt.yaml b/tests/functional_tests/jet_recipes/MR-gpt.yaml index b7181fbca0..db0fb855d1 100644 --- a/tests/functional_tests/jet_recipes/MR-gpt.yaml +++ b/tests/functional_tests/jet_recipes/MR-gpt.yaml @@ -10,7 +10,7 @@ spec: {'_'+args_meta if args_meta else ''}" model: gpt3 variant: 345m - build: mcore-pyt + build: mcore-pyt scope: merge-request nodes: 1 gpus: 8 @@ -56,17 +56,17 @@ spec: products: # MCore - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1]} - - {tp_size: [2], pp_size: [2], extra_args: ["--no-create-attention-mask-in-dataloader"], args_meta: ["no_create_attention_mask_in_dataloader"]} - - {tp_size: [2], pp_size: [2], extra_args: ["--no-mmap-bin-files"], args_meta: ["no_mmap_bin_files"]} + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--no-create-attention-mask-in-dataloader"], args_meta: ["no_create_attention_mask_in_dataloader"]} + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--no-mmap-bin-files"], args_meta: ["no_mmap_bin_files"]} - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1]} - - {tp_size: [4], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--qk-layernorm --test-mode"']} - - {tp_size: [1], pp_size: [2], extra_args: ['"--position-embedding-type rope"'], args_meta: ["rope_embeddings"]} - - {tp_size: [1], pp_size: [2], extra_args: ['"--position-embedding-type rope --rotary-interleaved --no-rope-fusion"'], args_meta: ["rope_embeddings_interleaved_no_fusion"]} + - {tp_size: [4], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--qk-layernorm --test-mode"'], args_meta: ["qk_layernorm_test_mode"]} + - {tp_size: [1], pp_size: [2], ckpt_resume: [0, 1], extra_args: ['"--position-embedding-type rope"'], args_meta: ["rope_embeddings"]} + - {tp_size: [1], pp_size: [2], ckpt_resume: [0, 1], extra_args: ['"--position-embedding-type rope --rotary-interleaved --no-rope-fusion"'], args_meta: ["rope_embeddings_interleaved_no_fusion"]} - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ['"--disable-bias-linear --async-save"'], args_meta: ["disable_bias_linear"]} - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ['"--swiglu --ckpt-fully-parallel-save --ckpt-fully-parallel-load --async-save"'], args_meta: ["swiglu"]} - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ["--untie-embeddings-and-output-weights"], args_meta: ["untie_embeddings_and_outputs"]} - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --ckpt-fully-parallel-save --ckpt-fully-parallel-load"'], args_meta: ["sequence_parallel"]} - - {tp_size: [1], pp_size: [1], extra_args: ['"--recompute-granularity full --recompute-method uniform --recompute-num-layers 1"'], args_meta: ["uniform_full_recompute"]} + - {tp_size: [1], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--recompute-granularity full --recompute-method uniform --recompute-num-layers 1"'], args_meta: ["uniform_full_recompute"]} # - {tp_size: [2], pp_size: [1,2], extra_args: ['"--context-parallel-size 2 --sequence-parallel 
--hidden-dropout 0.0 --attention-dropout 0.0"']} # TODO: need updated container with TE > 1.0.0 - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1 --ckpt-fully-parallel-save --ckpt-fully-parallel-load"'], args_meta: ["te_8experts2parallel"]} - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --use-distributed-optimizer --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["te_8experts2parallel_dist_optimizer"]} @@ -75,11 +75,11 @@ products: - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch], extra_args: ['"--moe-grouped-gemm --disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_groupedGEMM"]} - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type aux_loss --moe-router-topk 2 --moe-aux-loss-coeff 1e-2"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_top2router"]} - {tp_size: [1], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --async-save"'], args_meta: ["dist_optimizer"]} - - {tp_size: [1], pp_size: [1], extra_args: ['"--use-distributed-optimizer --no-mmap-bin-files"'], args_meta: ["dist_optimizer_no_mmap_bin_files"]} - - {tp_size: [4], pp_size: [1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce"'], args_meta: ["dist_optimizer_overlap_grad_reduce"]} - - {tp_size: [4], pp_size: [1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"'], args_meta: ["dist_optimizer_overlap_grad_reduce_param_gather"]} - - {tp_size: [1], pp_size: [4], vp_size: [1], extra_args: ['"--decoupled-lr 0.0002"'], args_meta: ["decoupled_lr"]} - - {tp_size: [1], pp_size: [4], vp_size: [1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce"'], args_meta: ["dist_optimizer_overlap_grad_reduce"]} + - {tp_size: [1], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --no-mmap-bin-files"'], args_meta: ["dist_optimizer_no_mmap_bin_files"]} + - {tp_size: [4], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce"'], args_meta: ["dist_optimizer_overlap_grad_reduce"]} + - {tp_size: [4], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"'], args_meta: ["dist_optimizer_overlap_grad_reduce_param_gather"]} + - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch], extra_args: ['"--decoupled-lr 0.0002"'], args_meta: ["decoupled_lr"]} + - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce"'], args_meta: ["dist_optimizer_overlap_grad_reduce"]} - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce --untie-embeddings-and-output-weights"'], args_meta: ["dist_optimizer_overlap_grad_reduce_untied"]} - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce 
--overlap-param-gather --check-weight-hash-across-dp-replicas-interval 10 --ckpt-fully-parallel-save --ckpt-fully-parallel-load"'], args_meta: ["dist_optimizer_overlap_grad_reduce_param_gather"]} # Non-MCore, only legacy checkpoints supported diff --git a/tests/functional_tests/jet_recipes/build-pyt.yaml b/tests/functional_tests/jet_recipes/build-pyt.yaml index bc1eeb9cc9..c63edd78af 100644 --- a/tests/functional_tests/jet_recipes/build-pyt.yaml +++ b/tests/functional_tests/jet_recipes/build-pyt.yaml @@ -5,7 +5,7 @@ spec: name: pyt platforms: [linux/amd64] source: - image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 + image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci:24.01v2 --- type: build diff --git a/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py b/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py index 8eb497dc6c..f540dc3c4c 100644 --- a/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py +++ b/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py @@ -26,14 +26,14 @@ def read_tb_logs_as_list(path, summary_name, index): summary_list = [round(x.value, 5) for x in summary] print(summary_list) return summary_list - raise FileNotFoundError(f"File not found matching: {path}/events*") + raise FileNotFoundError(f"File not found matching: {path}/events*") def collect_train_test_metrics(logs_dir, index): train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss", index) train_loss_list = [round(elem,3) for elem in train_loss_list] train_metrics = { "lm loss": train_loss_list[0:len(train_loss_list):STEP_INTERVAL], - } + } str_train_metrics = str(train_metrics).replace("'", "\"") print(f"\n ----------- The following are the metrics for ----------") print(f"\n {str_train_metrics}", flush=True) @@ -64,8 +64,5 @@ def _test_helper(self, loss_type, test_type): else: assert actual_val == expected_val, f"The value at step {step} should be {expected_val} but it is {actual_val}." 
- # def test_lm_loss_deterministic(self): - # self._test_helper("lm loss", TypeOfTest.DETERMINISTIC) - - def test_lm_loss_approx(self): - self._test_helper("lm loss", TypeOfTest.APPROX) + def test_lm_loss_deterministic(self): + self._test_helper("lm loss", TypeOfTest.DETERMINISTIC) diff --git a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-local-spec.json b/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-local-spec.json index 9afb0ee0df..887f5e86fc 100644 --- a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-local-spec.json +++ b/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-local-spec.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49849, 10.48909, 10.48383, 10.45052, 10.4396, 10.34793, 10.13229, 10.03818, 9.86253, 9.67165]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2210.0, 2505.0, 2330.0, 2235.0, 2290.0, 2400.0, 2866.0, 3249.0, 3522.0, 2958.0]}, "iteration_timing_avg": 0.7140176470588235} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49566, 10.48166, 10.48045, 10.45348, 10.44393, 10.35605, 10.13787, 10.04034, 9.86836, 9.6732]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2183.0, 2469.0, 2115.0, 2126.0, 2322.0, 2411.0, 2892.0, 3234.0, 3637.0, 2992.0]}, "iteration_timing_avg": 0.7140176470588235} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json b/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json index d411d8c1a7..474cdd87a1 100644 --- a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json +++ b/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49838, 10.48932, 10.4839, 10.45043, 10.43933, 10.34765, 10.1322, 10.03809, 9.86242, 9.67174]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2309.0, 2556.0, 2286.0, 2336.0, 2345.0, 2428.0, 2974.0, 3161.0, 3625.0, 2918.0]}, "iteration_timing_avg": 0.8110379411764704} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49574, 10.48174, 10.4804, 10.45344, 10.44389, 10.35605, 10.13777, 10.04004, 9.86833, 9.67303]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2182.0, 2462.0, 2158.0, 2112.0, 2398.0, 2539.0, 2945.0, 3162.0, 3457.0, 3125.0]}, "iteration_timing_avg": 0.8110379411764704} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp2.json b/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp2.json index 4235b31fee..abf6da1c26 100644 --- a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp2.json +++ b/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp2.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54837, 10.54636, 10.55694, 10.54151, 10.5315, 10.48776, 10.46238, 10.31421, 10.17038, 9.97219]}, "num-zeros": {"start_step": 0, 
"end_step": 34, "step_interval": 5, "values": [22539.0, 23012.0, 26350.0, 23699.0, 21775.0, 21356.0, 23232.0]}, "iteration_timing_avg": 0.7692817647058824} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54308, 10.53881, 10.55633, 10.53805, 10.52649, 10.49841, 10.45926, 10.32763, 10.17142, 9.96795]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22775.0, 23916.0, 27495.0, 22901.0, 22718.0, 20518.0, 23379.0]}, "iteration_timing_avg": 0.7692817647058824} diff --git a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json b/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json index dcf1a79143..f6a0f47fa8 100644 --- a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json +++ b/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44877, 10.43852, 10.44018, 10.44113, 10.45683, 10.44131, 10.39016, 10.25639, 10.13221, 9.95659]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [24798.0, 25690.0, 28527.0, 26577.0, 24018.0, 20924.0, 21488.0]}, "iteration_timing_avg": 0.7523635294117648} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.43755, 10.43587, 10.44704, 10.44395, 10.45023, 10.44561, 10.38646, 10.25229, 10.12594, 9.95549]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [25037.0, 25599.0, 28336.0, 25502.0, 24023.0, 19471.0, 22109.0]}, "iteration_timing_avg": 0.7523635294117648} diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer-no-mmap-bin-files.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer-no-mmap-bin-files.json index 633847bc15..87e9341e6a 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer-no-mmap-bin-files.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer-no-mmap-bin-files.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83721, 10.87648, 10.85327, 10.79634, 10.67874, 10.60491, 10.12636, 10.22252, 10.13977, 9.82346]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1640.0, 1873.0, 1930.0, 1910.0, 1936.0, 1807.0, 1630.0, 1962.0, 2317.0, 2314.0]}, "iteration_timing_avg": 0.06904588235294119} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.06904588235294119} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer.json index 633847bc15..87e9341e6a 100644 --- 
a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83721, 10.87648, 10.85327, 10.79634, 10.67874, 10.60491, 10.12636, 10.22252, 10.13977, 9.82346]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1640.0, 1873.0, 1930.0, 1910.0, 1936.0, 1807.0, 1630.0, 1962.0, 2317.0, 2314.0]}, "iteration_timing_avg": 0.06904588235294119} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.06904588235294119} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-uniform-full-recompute.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-uniform-full-recompute.json index 2b29a51a27..94554bb448 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-uniform-full-recompute.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-uniform-full-recompute.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83721, 10.87648, 10.85329, 10.79637, 10.67873, 10.60491, 10.12635, 10.22253, 10.13979, 9.82348]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1589.0, 1913.0, 1924.0, 1876.0, 2005.0, 1749.0, 1631.0, 1981.0, 2346.0, 2380.0]}, "iteration_timing_avg": 0.09164500000000002} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85023, 10.79645, 10.68149, 10.60617, 10.1277, 10.22183, 10.13794, 9.8231]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1923.0, 1922.0, 2020.0, 1815.0, 1713.0, 1963.0, 2266.0, 2324.0]}, "iteration_timing_avg": 0.09164500000000002} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings-interleaved-no-fusion.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings-interleaved-no-fusion.json index 4357d8badf..33a65cca16 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings-interleaved-no-fusion.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings-interleaved-no-fusion.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84407, 10.87551, 10.90356, 10.81577, 10.67451, 10.60208, 10.06584, 10.19215, 10.11381, 9.76133]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1717.0, 2136.0, 2046.0, 1923.0, 2052.0, 1910.0, 1717.0, 2008.0, 2269.0, 2231.0]}, "iteration_timing_avg": 0.11052176470588236} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": 
[10.84474, 10.87687, 10.90254, 10.81872, 10.67848, 10.60075, 10.06363, 10.19268, 10.11342, 9.75986]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1776.0, 2161.0, 2052.0, 1892.0, 1971.0, 1946.0, 1701.0, 1985.0, 2295.0, 2293.0]}, "iteration_timing_avg": 0.11052176470588236} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings.json index b4db7bde9b..2778958a4b 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84608, 10.87634, 10.90424, 10.81754, 10.67579, 10.60283, 10.06667, 10.19261, 10.11413, 9.7617]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1709.0, 2192.0, 2059.0, 1960.0, 2164.0, 1846.0, 1614.0, 2074.0, 2176.0, 2249.0]}, "iteration_timing_avg": 0.11051617647058823} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8468, 10.87772, 10.90302, 10.82024, 10.67979, 10.60157, 10.06448, 10.19311, 10.1141, 9.76008]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 2086.0, 2030.0, 2000.0, 1910.0, 1894.0, 1744.0, 2071.0, 2344.0, 2377.0]}, "iteration_timing_avg": 0.11051617647058823} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-disable-bias-linear.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-disable-bias-linear.json index eedf2baa8b..cdabc8e9d3 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-disable-bias-linear.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-disable-bias-linear.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79374, 10.86745, 10.89179, 10.78304, 10.66262, 10.58362, 10.08688, 10.19342, 10.13764, 9.81438]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1567.0, 1904.0, 1912.0, 1931.0, 1799.0, 1722.0, 1591.0, 1950.0, 2428.0, 2378.0]}, "iteration_timing_avg": 0.12243558823529416} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79205, 10.86789, 10.89149, 10.78328, 10.66126, 10.58275, 10.08467, 10.19448, 10.13785, 9.81454]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1580.0, 1778.0, 1849.0, 1841.0, 1884.0, 1679.0, 1544.0, 1953.0, 2449.0, 2335.0]}, "iteration_timing_avg": 0.12243558823529416} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-sequence-parallel.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-sequence-parallel.json index ac3c1f57f2..6123f3ca4f 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-sequence-parallel.json +++ 
b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-sequence-parallel.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79373, 10.86651, 10.89091, 10.78164, 10.66101, 10.58089, 10.08413, 10.19034, 10.13461, 9.81138]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1864.0, 1826.0, 1965.0, 1861.0, 1605.0, 1609.0, 1931.0, 2343.0, 2347.0]}, "iteration_timing_avg": 0.12348235294117646} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0]}, "iteration_timing_avg": 0.12348235294117646} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-swiglu.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-swiglu.json index a2d5ed7952..02520951bb 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-swiglu.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-swiglu.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.73353, 10.81676, 10.83941, 10.7586, 10.70146, 10.62786, 10.20836, 10.36754, 10.26496, 9.94346]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2536.0, 2988.0, 2925.0, 2895.0, 2617.0, 2603.0, 2325.0, 2704.0, 2592.0, 2406.0]}, "iteration_timing_avg": 0.12725500000000006} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.74049, 10.81937, 10.84178, 10.75551, 10.69818, 10.63091, 10.20265, 10.36288, 10.25632, 9.94256]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2527.0, 2937.0, 2975.0, 2749.0, 2580.0, 2593.0, 2320.0, 2616.0, 2541.0, 2393.0]}, "iteration_timing_avg": 0.12725500000000006} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-untie-embeddings-and-outputs.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-untie-embeddings-and-outputs.json index e294c75c0f..2039e2f498 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-untie-embeddings-and-outputs.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-untie-embeddings-and-outputs.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8968, 10.90735, 10.91688, 10.84693, 10.70699, 10.63243, 10.15516, 10.26078, 10.15949, 9.83311]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727844.0, 23021590.0, 22500488.0, 22830910.0, 22739472.0, 22546526.0, 22955764.0, 22588942.0, 22658932.0, 22884080.0]}, "iteration_timing_avg": 0.1246464705882353} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.90105, 10.91104, 10.91635, 10.84822, 10.70727, 10.63018, 10.15241, 10.26052, 10.15994, 9.83162]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": 
[22727086.0, 23021732.0, 22500940.0, 22830674.0, 22739332.0, 22547236.0, 22955516.0, 22590012.0, 22659588.0, 22884630.0]}, "iteration_timing_avg": 0.1246464705882353} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-decoupled-lr.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-decoupled-lr.json index 27683bd7bf..460f463a0a 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-decoupled-lr.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-decoupled-lr.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82096, 10.8727, 10.8819, 10.79671, 10.68623, 10.59545, 10.09721, 10.21007, 10.13688, 9.7981]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1501.0, 1801.0, 1872.0, 1844.0, 1939.0, 1785.0, 1514.0, 1865.0, 2240.0, 2398.0]}, "iteration_timing_avg": 0.12273676470588235} +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87448, 10.87794, 10.79507, 10.68154, 10.59412, 10.09987, 10.20952, 10.13639, 9.80012]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1734.0, 1884.0, 1684.0, 1815.0, 1766.0, 1601.0, 1904.0, 2361.0, 2347.0]}, "iteration_timing_avg": 0.12273676470588235} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-param-gather.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-param-gather.json index cd7044ddda..f23c85a133 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-param-gather.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-param-gather.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82096, 10.87269, 10.88192, 10.79677, 10.68633, 10.59654, 10.09782, 10.21295, 10.13917, 9.80682]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1501.0, 1749.0, 1845.0, 1786.0, 1912.0, 1741.0, 1567.0, 1927.0, 2280.0, 2405.0]}, "iteration_timing_avg": 0.12873676470588236} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12873676470588236} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-untied.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-untied.json index d8ea1345ac..64f030d4bc 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-untied.json +++ 
b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-untied.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.9362, 10.93543, 10.9456, 10.87817, 10.75688, 10.66385, 10.16947, 10.27156, 10.19469, 9.85867]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727572.0, 23021722.0, 22500652.0, 22830476.0, 22739252.0, 22547046.0, 22954704.0, 22589164.0, 22659710.0, 22883876.0]}, "iteration_timing_avg": 0.12799705882352944} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.9359, 10.93547, 10.94238, 10.88073, 10.75653, 10.66332, 10.1672, 10.27241, 10.19577, 9.86006]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727686.0, 23020980.0, 22501260.0, 22830024.0, 22739772.0, 22548148.0, 22955712.0, 22589816.0, 22660000.0, 22884332.0]}, "iteration_timing_avg": 0.12799705882352944} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce.json index c9e2aa6032..2d807f5ac2 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82096, 10.87269, 10.88192, 10.79677, 10.68633, 10.59654, 10.09782, 10.21295, 10.13917, 9.80682]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1501.0, 1749.0, 1845.0, 1786.0, 1912.0, 1741.0, 1567.0, 1927.0, 2280.0, 2405.0]}, "iteration_timing_avg": 0.12168999999999999} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12168999999999999} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1.json index 3da54b9c18..939863d9d8 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82096, 10.87269, 10.88192, 10.79677, 10.68633, 10.59654, 10.09776, 10.21294, 10.13909, 9.80679]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1501.0, 1749.0, 1794.0, 1829.0, 1913.0, 1793.0, 1585.0, 1815.0, 2296.0, 2266.0]}, "iteration_timing_avg": 0.12502588235294115} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 
10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-dist-optimizer.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-dist-optimizer.json index 1818cb41de..12df0ef48c 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-dist-optimizer.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-dist-optimizer.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79896, 10.8594, 10.87122, 10.79881, 10.71717, 10.6354, 10.19743, 10.30887, 10.2168, 9.90751]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [30665.0, 37001.0, 37644.0, 35953.0, 33382.0, 35191.0, 30525.0, 35253.0, 36653.0, 37931.0]}, "iteration_timing_avg": 0.2890776470588235} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79756, 10.86453, 10.87233, 10.80777, 10.71193, 10.63878, 10.19208, 10.3079, 10.21681, 9.90869]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31137.0, 36902.0, 37803.0, 36259.0, 33529.0, 35091.0, 30918.0, 35455.0, 36584.0, 37538.0]}, "iteration_timing_avg": 0.2890776470588235} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-groupedgemm.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-groupedgemm.json index f45f321721..b1e031706b 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-groupedgemm.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-groupedgemm.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80961, 10.86075, 10.86755, 10.80331, 10.71906, 10.64746, 10.21053, 10.32037, 10.22013, 9.92387]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [16604.0, 19509.0, 19801.0, 18644.0, 17084.0, 17721.0, 14980.0, 17754.0, 18357.0, 18520.0]}, "iteration_timing_avg": 0.19267441176470584} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80355, 10.86535, 10.86435, 10.80257, 10.71679, 10.64491, 10.21076, 10.31975, 10.2191, 9.92009]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [16395.0, 19716.0, 19656.0, 18538.0, 17152.0, 17399.0, 15327.0, 17720.0, 18390.0, 18684.0]}, "iteration_timing_avg": 0.19267441176470584} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-overlap-grad-reduce-param-gather-groupedgemm.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-overlap-grad-reduce-param-gather-groupedgemm.json index f9faeec1b9..7e169607b0 100644 --- 
a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-overlap-grad-reduce-param-gather-groupedgemm.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-overlap-grad-reduce-param-gather-groupedgemm.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80961, 10.86088, 10.86703, 10.80386, 10.71988, 10.64698, 10.21161, 10.32003, 10.22052, 9.92363]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31228.0, 37860.0, 38327.0, 36135.0, 33138.0, 34687.0, 30217.0, 34984.0, 35952.0, 37036.0]}, "iteration_timing_avg": 0.17911029411764712} +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80355, 10.86512, 10.86334, 10.80317, 10.71694, 10.64429, 10.21025, 10.31925, 10.21976, 9.92004]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31271.0, 37837.0, 38276.0, 36315.0, 33331.0, 34715.0, 30485.0, 34571.0, 36189.0, 36953.0]}, "iteration_timing_avg": 0.17911029411764712} diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json index 38b989333f..3ad535db01 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80682, 10.86734, 10.87997, 10.79306, 10.66584, 10.57572, 10.05454, 10.17682, 10.09527, 9.75032]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [13454.0, 16317.0, 16781.0, 16315.0, 14876.0, 15877.0, 14704.0, 17095.0, 17749.0, 18463.0]}, "iteration_timing_avg": 0.2969329411764706} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80324, 10.86865, 10.87469, 10.79787, 10.66376, 10.57925, 10.05295, 10.18001, 10.09173, 9.74805]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [13563.0, 16221.0, 16838.0, 16335.0, 14835.0, 15726.0, 14714.0, 17118.0, 17526.0, 18766.0]}, "iteration_timing_avg": 0.3051714705882352} diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel.json index 8f14311c51..7e0b0a6092 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79896, 10.8601, 10.87152, 10.79856, 10.71624, 10.6355, 10.19683, 10.30917, 10.21632, 9.90782]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [16152.0, 19202.0, 19645.0, 18594.0, 17375.0, 17768.0, 15576.0, 17888.0, 18387.0, 18810.0]}, "iteration_timing_avg": 0.29991823529411765} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, 
"step_interval": 5, "values": [10.79756, 10.86447, 10.87277, 10.80684, 10.71251, 10.63895, 10.19317, 10.30823, 10.21751, 9.90833]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [16117.0, 19202.0, 19572.0, 18615.0, 17501.0, 17675.0, 15669.0, 18087.0, 18717.0, 19010.0]}, "iteration_timing_avg": 0.29991823529411765} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-create-attention-mask-in-dataloader.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-create-attention-mask-in-dataloader.json index e5c571448d..265ad7c9b9 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-create-attention-mask-in-dataloader.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-create-attention-mask-in-dataloader.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92392, 10.93645, 10.89657, 10.86919, 10.74782, 10.658, 10.15864, 10.24906, 10.15088, 9.83933]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1735.0, 1861.0, 2111.0, 1844.0, 1762.0, 1858.0, 1554.0, 2031.0, 2309.0, 2225.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-mmap-bin-files.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-mmap-bin-files.json index e5c571448d..265ad7c9b9 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-mmap-bin-files.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-mmap-bin-files.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92392, 10.93645, 10.89657, 10.86919, 10.74782, 10.658, 10.15864, 10.24906, 10.15088, 9.83933]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1735.0, 1861.0, 2111.0, 1844.0, 1762.0, 1858.0, 1554.0, 2031.0, 2309.0, 2225.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json index e5c571448d..265ad7c9b9 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json +++ 
b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92392, 10.93645, 10.89657, 10.86919, 10.74782, 10.658, 10.15864, 10.24906, 10.15088, 9.83933]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1735.0, 1861.0, 2111.0, 1844.0, 1762.0, 1858.0, 1554.0, 2031.0, 2309.0, 2225.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce-param-gather.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce-param-gather.json index ef3ee44978..49917fe78d 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce-param-gather.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce-param-gather.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86174, 10.88685, 10.8766, 10.83063, 10.71362, 10.60782, 10.13037, 10.2308, 10.15865, 9.83394]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1747.0, 2204.0, 2128.0, 2098.0, 2033.0, 1943.0, 1761.0, 2152.0, 2427.0, 2590.0]}, "iteration_timing_avg": 0.22043823529411763} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.22043823529411763} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce.json index 447f6efaf8..196e4b2905 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86174, 10.88685, 10.8766, 10.83063, 10.71362, 10.60782, 10.13037, 10.2308, 10.15865, 9.83394]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1747.0, 2204.0, 2128.0, 2098.0, 2033.0, 1943.0, 1761.0, 2152.0, 2427.0, 2590.0]}, "iteration_timing_avg": 0.2256223529411765} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 
10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.2256223529411765} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-qk-layernorm-test-mode.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-qk-layernorm-test-mode.json new file mode 100644 index 0000000000..203663187b --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-qk-layernorm-test-mode.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86172, 10.88732, 10.87796, 10.83292, 10.71829, 10.60962, 10.13562, 10.23129, 10.16333, 9.83853]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1947.0, 2356.0, 2266.0, 2292.0, 2241.0, 2141.0, 1951.0, 2486.0, 2714.0, 2755.0]}, "iteration_timing_avg": 0.2256223529411765} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1.json deleted file mode 100644 index 3ac3145032..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86134, 10.88772, 10.87691, 10.83223, 10.71584, 10.61182, 10.13429, 10.23398, 10.1625, 9.83778]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1940.0, 2389.0, 2366.0, 2311.0, 2331.0, 2090.0, 1920.0, 2439.0, 2710.0, 2811.0]}, "iteration_timing_avg": 0.2256223529411765} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-te-tp2-pp2.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-te-tp2-pp2.json index ddd7132a35..5c516f0562 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-te-tp2-pp2.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-te-tp2-pp2.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85632, 10.88791, 10.86527, 10.81439, 10.69842, 10.61079, 10.109, 10.21405, 10.12865, 9.80275]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1714.0, 1877.0, 1928.0, 1863.0, 1960.0, 1646.0, 1648.0, 2023.0, 2318.0, 2333.0]}, "iteration_timing_avg": 0.14203264705882354} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86208, 10.89137, 10.86731, 10.81652, 10.70126, 10.60816, 10.11007, 10.21889, 10.1294, 9.80326]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1659.0, 1944.0, 1974.0, 1920.0, 1918.0, 1855.0, 1621.0, 2018.0, 2436.0, 2304.0]}, "iteration_timing_avg": 0.14203264705882354} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp1.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp1.json index e79ac5e576..474abd4ef0 100644 --- 
a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp1.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp1.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.7951, 10.84939, 10.87411, 10.83459, 10.8304, 10.81894, 10.74686, 10.80731, 10.80557, 10.63597]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [29527.0, 26879.0, 26865.0, 28093.0]}, "iteration_timing_avg": 0.1211408823529412} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.83137, 10.81979, 10.74667, 10.80852, 10.8044, 10.6368]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [28515.0, 27094.0, 26111.0, 29819.0]}, "iteration_timing_avg": 0.1211408823529412} diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json index 012834b1c2..3a4e85afcc 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85543, 10.89355, 10.87608, 10.87365, 10.88231, 10.86963, 10.82616, 10.85069, 10.83875, 10.70229]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [29373.0, 30031.0, 29845.0, 30013.0]}, "iteration_timing_avg": 0.14292588235294112} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88381, 10.86694, 10.82041, 10.84998, 10.83732, 10.70774]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [29453.0, 30329.0, 28824.0, 29477.0]}, "iteration_timing_avg": 0.14292588235294112} diff --git a/tests/functional_tests/test_results/jet/multimodal-llava-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1.json b/tests/functional_tests/test_results/jet/multimodal-llava-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1.json index f416c67697..dcdf8cd82d 100644 --- a/tests/functional_tests/test_results/jet/multimodal-llava-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1.json +++ b/tests/functional_tests/test_results/jet/multimodal-llava-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13518, 9.14056, 9.13428, 9.12654, 9.09548, 9.07751, 9.02899, 8.99955, 8.96916, 8.93077]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2594449.0, 2527269.0, 2601851.0, 2496920.0, 2554324.0, 2677927.0, 2491921.0, 2610337.0, 2656049.0, 2684012.0]}, "iteration_timing_avg": 0.12631823529411765} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13273, 9.13911, 9.13383, 9.12657, 9.09489, 9.07765, 9.02826, 9.00005, 8.96948, 8.92915]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2594526.0, 2527198.0, 2601909.0, 2496960.0, 2554383.0, 2678214.0, 2491802.0, 2610525.0, 2656421.0, 2684195.0]}, "iteration_timing_avg": 0.1316635294117647} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/t5-220m-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1.json 
b/tests/functional_tests/test_results/jet/t5-220m-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1.json index 9716d97c9f..7d87869c71 100644 --- a/tests/functional_tests/test_results/jet/t5-220m-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1.json +++ b/tests/functional_tests/test_results/jet/t5-220m-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.32918, 9.4263, 8.86291, 8.56362, 8.28553, 8.10995, 7.85275, 7.53944, 7.41758, 7.30235, 7.38565, 7.22824, 7.10889, 7.05923, 6.91261, 6.95823, 6.97764, 7.04028, 6.71005, 6.97552]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [43306.0, 40965.0, 44041.0, 41715.0, 44784.0, 43950.0, 41291.0, 42533.0, 44720.0, 43953.0, 41217.0, 43278.0, 39742.0, 45393.0, 43328.0, 43941.0, 45398.0, 45721.0, 46281.0, 44705.0]}, "iteration_timing_avg": 0.17640776119402987} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.33692, 9.42684, 8.86347, 8.56218, 8.28402, 8.10585, 7.84893, 7.53544, 7.41091, 7.29556, 7.39322, 7.21918, 7.103, 7.04859, 6.90381, 6.96025, 6.96467, 7.03545, 6.70046, 6.96655]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [43335.0, 41016.0, 44013.0, 41737.0, 44813.0, 43943.0, 41248.0, 42538.0, 44705.0, 43912.0, 41141.0, 43279.0, 39762.0, 45412.0, 43319.0, 43922.0, 45387.0, 45708.0, 46322.0, 44694.0]}, "iteration_timing_avg": 0.17640776119402987} diff --git a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh index de8ebf45d6..97a9d1695b 100755 --- a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh +++ b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh @@ -16,6 +16,7 @@ set -exo pipefail if [[ -z $MBS ]]; then MBS=4; fi if [[ -z $GBS ]]; then GBS=128; fi if [[ -z $VOCAB_FILE ]]; then VOCAB_FILE="/workspace/data/bert_data/vocab.txt" ; fi +if [[ -z $ALLOW_NONDETERMINISTIC ]]; then ALLOW_NONDETERMINISTIC=0; fi # Change for multinode config GPUS_PER_NODE=8 @@ -28,11 +29,17 @@ command="export CUDA_DEVICE_MAX_CONNECTIONS=1;" TRAINING_DTYPE=fp16 TRANSFORMER_IMPL=local +if [[ $ALLOW_NONDETERMINISTIC -eq 1 ]]; then + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=1;" +else + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0; export NCCL_ALGO=^NVLS;" + ADDITIONAL_PARAMS+=" --deterministic-mode" +fi + if [[ $USE_CORE -eq 1 ]]; then echo "Running using megatron core" TRANSFORMER_IMPL=local TRAINING_DTYPE=bf16 - command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0;" USE_MCORE=1 fi if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then diff --git a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh index f358dfccd0..0925c223d6 100755 --- a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh +++ b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh @@ -29,14 +29,20 @@ WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) command="export CUDA_DEVICE_MAX_CONNECTIONS=1;" -TRANSFORMER_IMPL=local TRAINING_DTYPE=fp16 +TRANSFORMER_IMPL=local + +if [[ $ALLOW_NONDETERMINISTIC -eq 1 ]]; then + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=1;" +else + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0; export NCCL_ALGO=Tree;" + 
ADDITIONAL_PARAMS+=" --deterministic-mode" +fi if [[ $USE_CORE -eq 1 ]]; then echo "Running using megatron core" TRANSFORMER_IMPL=transformer_engine TRAINING_DTYPE=bf16 - command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=$ALLOW_NONDETERMINISTIC;" USE_MCORE=1 fi @@ -118,8 +124,6 @@ build_torch_run_cmd() { --transformer-impl $TRANSFORMER_IMPL \ --tensor-model-parallel-size $TP_SIZE \ --pipeline-model-parallel-size $PP_SIZE \ - --no-bias-swiglu-fusion \ - --no-rope-fusion \ ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS} \ ${USE_MCORE:+--use-mcore-models} \ diff --git a/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh b/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh index 3961f2c225..1b7bedb582 100755 --- a/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh +++ b/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh @@ -16,6 +16,7 @@ set -exo pipefail if [[ -z $MBS ]]; then MBS=4; fi if [[ -z $GBS ]]; then GBS=32; fi if [[ -z $MOE_GROUPED_GEMM ]]; then MOE_GROUPED_GEMM=0; fi +if [[ -z $ALLOW_NONDETERMINISTIC ]]; then ALLOW_NONDETERMINISTIC=0; fi GPUS_PER_NODE=8 # Change for multinode config @@ -26,14 +27,20 @@ WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) command="export CUDA_DEVICE_MAX_CONNECTIONS=1;" -TRANSFORMER_IMPL=local TRAINING_DTYPE=fp16 +TRANSFORMER_IMPL=local + +if [[ $ALLOW_NONDETERMINISTIC -eq 1 ]]; then + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=1;" +else + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0; export NCCL_ALGO=^NVLS;" + ADDITIONAL_PARAMS+=" --deterministic-mode" +fi if [[ $USE_CORE -eq 1 ]]; then echo "Running using megatron core" TRANSFORMER_IMPL=local TRAINING_DTYPE=bf16 - command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0;" USE_MCORE=1 fi @@ -107,8 +114,6 @@ build_torch_run_cmd() { --transformer-impl $TRANSFORMER_IMPL \ --tensor-model-parallel-size $TP_SIZE \ --pipeline-model-parallel-size $PP_SIZE \ - --no-bias-swiglu-fusion \ - --no-rope-fusion \ ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS} \ ${USE_MCORE:+--use-mcore-models} \ diff --git a/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh b/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh index ec5bceb599..7ad640bb77 100755 --- a/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh +++ b/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh @@ -16,6 +16,7 @@ set -exo pipefail if [[ -z $MBS ]]; then MBS=4; fi if [[ -z $GBS ]]; then GBS=32; fi if [[ -z $VOCAB_PATH ]]; then VOCAB_PATH="/workspace/data/t5_data/bert-large-cased-vocab.txt"; fi +if [[ -z $ALLOW_NONDETERMINISTIC ]]; then ALLOW_NONDETERMINISTIC=0; fi GPUS_PER_NODE=8 # Change for multinode config @@ -26,16 +27,21 @@ WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) command="export CUDA_DEVICE_MAX_CONNECTIONS=1;" -TRANSFORMER_IMPL=local TRAINING_DTYPE=fp16 +TRANSFORMER_IMPL=local + +if [[ $ALLOW_NONDETERMINISTIC -eq 1 ]]; then + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=1;" +else + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0; export NCCL_ALGO=^NVLS;" + ADDITIONAL_PARAMS+=" --deterministic-mode" +fi if [[ $USE_CORE -eq 1 ]]; then echo "Running using megatron core" TRANSFORMER_IMPL=local TRAINING_DTYPE=bf16 - command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0;" 
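+       # Determinism for this script is now configured once above via ALLOW_NONDETERMINISTIC
+       # (NVTE_ALLOW_NONDETERMINISTIC_ALGO, NCCL_ALGO and --deterministic-mode), so no
+       # per-implementation override is needed inside this block.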
USE_MCORE=1 - export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 fi if [[ $NO_FA -eq 1 ]]; then @@ -116,6 +122,7 @@ torch_run_cmd="torchrun $DISTRIBUTED_ARGS \ --eval-iters 10 \ --distributed-backend nccl \ ${DATA_CACHE:+--data-cache-path "$DATA_CACHE"} \ + ${USE_MCORE:+--use-mcore-models} \ ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS}" command="$command $torch_run_cmd" From f0a43ac8833f9ca66c08d6d365d1b89bc0513264 Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Mon, 6 May 2024 13:58:24 -0700 Subject: [PATCH 16/92] Configurable VIT ln_pre --- megatron/core/models/vision/clip_vit_model.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/megatron/core/models/vision/clip_vit_model.py b/megatron/core/models/vision/clip_vit_model.py index 56e017ddfc..e5b005c0a9 100644 --- a/megatron/core/models/vision/clip_vit_model.py +++ b/megatron/core/models/vision/clip_vit_model.py @@ -1,14 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -from typing import Optional +from typing import Optional, Union import torch -from megatron.core import tensor_parallel from megatron.core.models.common.vision_module.vision_module import VisionModule from megatron.core.transformer.custom_layers.transformer_engine import TENorm from megatron.core.transformer.enums import ModelType -from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_block import TransformerBlock from megatron.core.transformer.transformer_config import TransformerConfig @@ -18,8 +17,9 @@ class CLIPViTModel(VisionModule): """CLIP ViT vision model. Args: - transformer_config (TransformerConfig): Transformer config - transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers + transformer_config (TransformerConfig): Transformer config. + transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers. + ln_pre_impl (ModuleSpec or type): Specifies the layer norm type to use for ln_pre. patch_dim (int): Image patch size. img_h (int): Input image height. img_w (int): Input image width. @@ -31,6 +31,7 @@ def __init__( self, transformer_config: TransformerConfig, transformer_layer_spec: ModuleSpec, + ln_pre_impl: Union[ModuleSpec, type] = TENorm, patch_dim: int = 14, img_h: int = 336, img_w: int = 336, @@ -72,10 +73,11 @@ def __init__( torch.randn(1, self.class_token_len, self.visual_hidden_size) ) - self.ln_pre = TENorm( - config=self.config, + self.ln_pre = build_module( + ln_pre_impl, + config=transformer_config, hidden_size=self.visual_hidden_size, - eps=self.config.layernorm_epsilon, + eps=transformer_config.layernorm_epsilon, ) self.model_type = ModelType.encoder_or_decoder From 4857b1fd5d0f3eea151ab9c213a9339d947af409 Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Mon, 6 May 2024 15:32:48 -0700 Subject: [PATCH 17/92] Quick GELU activation for multimodal --- megatron/training/activations.py | 18 ++++++++++++++++++ megatron/training/arguments.py | 8 +------- 2 files changed, 19 insertions(+), 7 deletions(-) create mode 100644 megatron/training/activations.py diff --git a/megatron/training/activations.py b/megatron/training/activations.py new file mode 100644 index 0000000000..e3f9a407fc --- /dev/null +++ b/megatron/training/activations.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
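+# Activation functions wrapped with a JIT fuser: `jit_fuser` prefers torch.compile
+# and falls back to torch.jit.script when it is unavailable. `squared_relu`
+# computes relu(x)**2; `quick_gelu` is the sigmoid-based GELU approximation
+# x * sigmoid(1.702 * x), commonly used by CLIP-style vision encoders.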
+import torch +import torch.nn.functional as F + +try: + jit_fuser = torch.compile +except: + jit_fuser = torch.jit.script + + +@jit_fuser +def squared_relu(x: torch.Tensor) -> torch.Tensor: + return torch.pow(F.relu(x), 2) + + +@jit_fuser +def quick_gelu(x: torch.Tensor) -> torch.Tensor: + return x * torch.sigmoid(1.702 * x) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index c6206496f7..b711b8a0e4 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -15,6 +15,7 @@ get_gpt_data_dir as get_retro_data_dir, ) from megatron.core.transformer import TransformerConfig +from megatron.training.activations import squared_relu def parse_args(extra_args_provider=None, ignore_unknown_args=False): @@ -566,13 +567,6 @@ def core_transformer_config_from_args(args, config_class=None): kw_args['bias_activation_fusion'] = args.bias_gelu_fusion if args.squared_relu: assert not args.swiglu - try: - jit_fuser = torch.compile - except: - jit_fuser = torch.jit.script - @jit_fuser - def squared_relu(x): - return torch.pow(F.relu(x), 2) kw_args['activation_func'] = squared_relu if args.init_method_xavier_uniform: kw_args['init_method'] = torch.nn.init.xavier_uniform_ From 307dcf37f03d44da131ba21999278abe4112d2ad Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Tue, 7 May 2024 16:59:47 -0700 Subject: [PATCH 18/92] Fix mixed messaging for `attention_softmax_in_fp32` and `apply_query_key_layer_scaling` --- docs/llama2.md | 1 + megatron/training/arguments.py | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/llama2.md b/docs/llama2.md index e382d6b167..286a29c06f 100644 --- a/docs/llama2.md +++ b/docs/llama2.md @@ -98,6 +98,7 @@ If loading for either inference or finetuning, use the following arguments: --normalization RMSNorm \ --no-position-embedding \ --no-masked-softmax-fusion \ +--attention-softmax-in-fp32 ``` ### Launch Meta diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 6cf2ef05e1..8cc265d7e6 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -534,6 +534,10 @@ def validate_args(args, defaults={}): assert os.getenv("NCCL_ALGO", -1) != -1 and os.getenv("NCCL_ALGO") in all_reduce_choices, \ f"NCCL_ALGO must be one of {all_reduce_choices}." + # Update the printed args to reflect that `apply_query_key_layer_scaling` also controls `attention_softmax_in_fp32` + if args.apply_query_key_layer_scaling: + args.attention_softmax_in_fp32 = True + # Print arguments. _print_args("arguments", args) @@ -1285,11 +1289,9 @@ def _add_mixed_precision_args(parser): help='Move residual connections to fp32.') group.add_argument('--apply-query-key-layer-scaling', action='store_true', help='Scale Q * K^T by 1 / layer-number. ' - 'Useful for fp16 training.') + 'Useful for fp16 training. Also sets `attention_softmax_in_fp32` to True.') group.add_argument('--attention-softmax-in-fp32', action='store_true', - help='Run attention masking and softmax in fp32. 
' - 'This flag is ignored unless ' - '--no-query-key-layer-scaling is specified.') + help='Run attention masking and softmax in fp32.') group.add_argument('--accumulate-allreduce-grads-in-fp32', action='store_true', help='Gradient accumulation and all-reduce in fp32.') From 748f6ac28bc16f910e5e0e2f9a0c7b89a2180073 Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Wed, 8 May 2024 14:02:58 -0700 Subject: [PATCH 19/92] Fix for data blend check --- megatron/training/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 6cf2ef05e1..476daea36a 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -520,7 +520,7 @@ def validate_args(args, defaults={}): assert args.mock_data + \ bool(args.data_path) + \ any([args.train_data_path, args.valid_data_path, args.test_data_path]) \ - == 1, "A single data source must be provided" + <= 1, "A single data source must be provided in training mode, else None" if args.use_tp_pp_dp_mapping: assert args.context_parallel_size * args.expert_model_parallel_size <= 1, \ From 227bfb1a4c0c3a52231490cac972952a3cd65ec3 Mon Sep 17 00:00:00 2001 From: Mike Chrzanowski Date: Wed, 8 May 2024 23:06:22 -0700 Subject: [PATCH 20/92] Update Docker Container to Contain Testing Modules --- .gitlab-ci.yml | 19 +++++++++---------- Dockerfile.test | 3 +-- examples/bert/README.md | 6 +++--- examples/gpt3/README.md | 8 ++++---- examples/pretrain_gpt3_175B.sh | 2 +- examples/retro/README.md | 4 ++-- megatron/legacy/model/fused_layer_norm.py | 7 +++++-- .../jet_recipes/build-pyt.yaml | 2 +- .../jet_recipes/local-generator.py | 2 +- .../python_test_utils/jet_test_pipeline.py | 4 ++-- tests/unit_tests/__init__.py | 2 ++ .../moe/test_a2a_token_dispatcher.py | 3 +++ tools/retro/README.md | 2 +- 13 files changed, 35 insertions(+), 29 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 53c23cd098..6227c4928e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -17,7 +17,6 @@ stages: variables: &VARS SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron" DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data" - PYTORCH_IMAGE: /lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/nvcr_pytorch_23.04.sqsh # This is the image that is run by all nodes on selene for tests PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate TESTS_TO_RUN_AFTER_MERGING: "MR_TESTS NIGHTLY_TESTS" # Can specify levels TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests @@ -37,7 +36,7 @@ include: - jet-tests.yml unit_tests: - image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 tags: - 8xL40S stage: test @@ -53,7 +52,7 @@ unit_tests: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH unit_tests-data: - image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 tags: - 8xL40S stage: test @@ -67,7 +66,7 @@ unit_tests-data: - when: always unit_tests-dist-checkpointing: - image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 tags: - 8xL40S stage: test @@ -81,7 +80,7 @@ unit_tests-dist-checkpointing: - when: always unit_tests-fusions: - image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 tags: - 8xL40S 
stage: test @@ -95,7 +94,7 @@ unit_tests-fusions: - when: always unit_tests-models: - image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 tags: - 8xL40S stage: test @@ -109,7 +108,7 @@ unit_tests-models: - when: always unit_tests-pipeline-parallel: - image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 tags: - 8xL40S stage: test @@ -123,7 +122,7 @@ unit_tests-pipeline-parallel: - when: always unit_tests-tensor-parallel: - image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 tags: - 8xL40S stage: test @@ -137,7 +136,7 @@ unit_tests-tensor-parallel: - when: always unit_tests-transformer: - image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 tags: - 8xL40S stage: test @@ -151,7 +150,7 @@ unit_tests-transformer: - when: always unit_tests-top-py: - image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1 + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 tags: - 8xL40S stage: test diff --git a/Dockerfile.test b/Dockerfile.test index 9abefbf327..dd7638ae6d 100644 --- a/Dockerfile.test +++ b/Dockerfile.test @@ -8,5 +8,4 @@ RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ RUN apt-get update && apt-get install -y --no-install-recommends -RUN pip3 install sentencepiece einops flask-restful pytest wandb -RUN pip3 install git+https://github.com/fanshiqing/grouped_gemm@v1.1.1 \ No newline at end of file +RUN pip3 install --no-cache-dir einops flask-restful nltk pytest pytest-cov pytest_mock sentencepiece wrapt git+https://github.com/fanshiqing/grouped_gemm@v1.1.1 \ No newline at end of file diff --git a/examples/bert/README.md b/examples/bert/README.md index 9b8ba3652a..6c1fe95bf0 100644 --- a/examples/bert/README.md +++ b/examples/bert/README.md @@ -9,7 +9,7 @@ To run the model using a docker container run it as follows ``` -PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.09-py3 +PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3 CHECKPOINT_PATH="" # TENSORBOARD_LOGS_PATH=""# VOCAB_FILE="" #//bert-vocab.txt @@ -21,7 +21,7 @@ docker run \ --workdir /workspace/megatron-lm \ -v /path/to/data:/path/to/data \ -v /path/to/megatron-lm:/workspace/megatron-lm \ - megatron-lm nvcr.io/nvidia/pytorch:23.04-py3 \ + megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \ bash examples/bert/train_bert_340m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH " ``` @@ -42,7 +42,7 @@ The example in this folder shows you how to run 340m large model. 
There are othe ``` -### 20B +### 20B ``` --num-layers 48 \ --hidden-size 6144 \ diff --git a/examples/gpt3/README.md b/examples/gpt3/README.md index 2b442b69e1..8d6f267416 100644 --- a/examples/gpt3/README.md +++ b/examples/gpt3/README.md @@ -10,7 +10,7 @@ To run the model using a docker container run it as follows ``` -PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.09-py3 +PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3 CHECKPOINT_PATH="" # TENSORBOARD_LOGS_PATH=""# VOCAB_FILE="" #/gpt2-vocab.json @@ -23,7 +23,7 @@ docker run \ --workdir /workspace/megatron-lm \ -v /path/to/data:/path/to/data \ -v /path/to/megatron-lm:/workspace/megatron-lm \ - megatron-lm nvcr.io/nvidia/pytorch:23.04-py3 \ + megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \ bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH " ``` @@ -34,7 +34,7 @@ NOTE: Depending on the environment you are running it the above command might li The example in this folder shows you how to run 175B model. There are other configs you could run as well -### 345M +### 345M ``` --num-layers 12 \ --hidden-size 512 \ @@ -45,7 +45,7 @@ The example in this folder shows you how to run 175B model. There are other conf ``` -### 857M +### 857M ``` --num-layers 24 \ --hidden-size 1024 \ diff --git a/examples/pretrain_gpt3_175B.sh b/examples/pretrain_gpt3_175B.sh index c26b8ee6c8..98886e1f19 100755 --- a/examples/pretrain_gpt3_175B.sh +++ b/examples/pretrain_gpt3_175B.sh @@ -55,7 +55,7 @@ run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}" srun -l \ - --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \ + --container-image "nvcr.io/nvidia/pytorch:24.01-py3" \ --container-mounts "" \ --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}" diff --git a/examples/retro/README.md b/examples/retro/README.md index f015c0b611..f78bcdeb56 100644 --- a/examples/retro/README.md +++ b/examples/retro/README.md @@ -20,7 +20,7 @@ docker run \ --workdir /workspace/megatron-lm \ -v /path/to/data:/path/to/data \ -v /path/to/megatron-lm:/workspace/megatron-lm \ - megatron-lm nvcr.io/nvidia/pytorch:23.04-py3 \ + megatron-lm nvcr.io/nvidia/pytorch:23.09-py3 \ bash examples/retro/train_retro_2b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH" ``` @@ -52,7 +52,7 @@ Retro preprocesses and caches data prior to pretraining, to greatly speed up pre The example in this folder shows you how to run a 2B model. Below are a few other example configurations. -### 857M +### 857M ``` --num-layers 24 \ --hidden-size 1024 \ diff --git a/megatron/legacy/model/fused_layer_norm.py b/megatron/legacy/model/fused_layer_norm.py index acf98f5ba0..fcec35a56f 100644 --- a/megatron/legacy/model/fused_layer_norm.py +++ b/megatron/legacy/model/fused_layer_norm.py @@ -4,6 +4,7 @@ https://github.com/NVIDIA/apex with some changes. 
""" +import inspect import numbers import torch from torch.nn.parameter import Parameter @@ -83,8 +84,10 @@ def forward(self, input): "fused_layer_norm_affine is not available, please install apex from https://github.com/NVIDIA/apex" return fused_layer_norm_affine(input, weight, self.bias, self.normalized_shape, eps=self.eps) else: - output = FastLayerNormFN.apply(input, weight, self.bias, self.eps, False) - + if 'memory_efficient' in inspect.getfullargspec(FastLayerNormFN.forward).args: + output = FastLayerNormFN.apply(input, weight, self.bias, self.eps, False) + else: + output = FastLayerNormFN.apply(input, weight, self.bias, self.eps) # Apex's fast layer norm function outputs a 'view' tensor (i.e., has # a populated '_base' field). This will result in schedule.py's # deallocate_output_tensor() throwing an error, so a viewless tensor is diff --git a/tests/functional_tests/jet_recipes/build-pyt.yaml b/tests/functional_tests/jet_recipes/build-pyt.yaml index c63edd78af..e5184d7b11 100644 --- a/tests/functional_tests/jet_recipes/build-pyt.yaml +++ b/tests/functional_tests/jet_recipes/build-pyt.yaml @@ -5,7 +5,7 @@ spec: name: pyt platforms: [linux/amd64] source: - image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci:24.01v2 + image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci:24.01v3 --- type: build diff --git a/tests/functional_tests/jet_recipes/local-generator.py b/tests/functional_tests/jet_recipes/local-generator.py index 047ae2f31c..513c6abcdf 100644 --- a/tests/functional_tests/jet_recipes/local-generator.py +++ b/tests/functional_tests/jet_recipes/local-generator.py @@ -5,7 +5,7 @@ import yaml SBATCH_TEMPLATE = ''' -srun --container-image nvcr.io/nvidia/pytorch:23.04-py3 \\ +srun --container-image nvcr.io/nvidia/pytorch:24.01-py3 \\ --container-mounts "{}:{},{}:/workspace/megatron-lm" \\ bash -c \" \n{} diff --git a/tests/functional_tests/python_test_utils/jet_test_pipeline.py b/tests/functional_tests/python_test_utils/jet_test_pipeline.py index 92d2a06d00..2700639e0b 100644 --- a/tests/functional_tests/python_test_utils/jet_test_pipeline.py +++ b/tests/functional_tests/python_test_utils/jet_test_pipeline.py @@ -115,14 +115,14 @@ def save_scripts(results, save_dir): if result['obj_workload']['obj_spec']['flat_artifacts']: dataset_mount = list(result['obj_workload']['obj_spec']['flat_artifacts'].keys())[0] content = f''' - srun --container-image nvcr.io/nvidia/pytorch:23.04-py3 \\ + srun --container-image nvcr.io/nvidia/pytorch:24.01-py3 \\ --container-mounts "/path/to/data:{dataset_mount},/path/to/megatron-lm:/workspace/megatron-lm" \\ bash -c''' content = dedent(content) content += f' \'\n{script}\n\'' else: content = ''' - srun --container-image nvcr.io/nvidia/pytorch:23.04-py3 \\ + srun --container-image nvcr.io/nvidia/pytorch:24.01-py3 \\ --container-mounts "/path/to/megatron-lm:/workspace/megatron-lm" \\ bash -c''' content = dedent(content) diff --git a/tests/unit_tests/__init__.py b/tests/unit_tests/__init__.py index e69de29bb2..1d3c586a5d 100644 --- a/tests/unit_tests/__init__.py +++ b/tests/unit_tests/__init__.py @@ -0,0 +1,2 @@ +import torch._dynamo +torch._dynamo.config.suppress_errors = True \ No newline at end of file diff --git a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py index 6912708157..af7bad3319 100644 --- a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py +++ b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py @@ -15,6 +15,7 @@ def 
teardown_method(self, method): Utils.destroy_model_parallel() @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(120) @pytest.mark.parametrize("tp_size,ep_size", [ (1, 8), (8, 1), @@ -33,6 +34,7 @@ def test_forward_backward(self, tp_size, ep_size): container.dispatcher_dropless_test() @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(120) @pytest.mark.parametrize("tp_size,ep_size", [ (1, 8), (8, 1) @@ -52,6 +54,7 @@ def test_capacity_forward_backward(self, tp_size, ep_size): container.dispacher_capacity_test() @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(120) @pytest.mark.parametrize("tp_size,ep_size", [ (1, 8), (8, 1), diff --git a/tools/retro/README.md b/tools/retro/README.md index f7a38c8a04..395005e73b 100644 --- a/tools/retro/README.md +++ b/tools/retro/README.md @@ -185,7 +185,7 @@ An example command to run instruction tuning on 843M Retro is as follows: ```bash [blend-dataset-name] [model-size] [batch-size] [lr] [checkpoints] -bash tools/retro/sft/sft_retro_lm.sh open_inst 843m 128 5e-6 +bash tools/retro/sft/sft_retro_lm.sh open_inst 843m 128 5e-6 ``` The `blend_dataset_name` argument will blend all the datasets within the `$DATA_HOME` following the weights and From 73acfcdfb91b5d10a7236925bfe17ae18f8d82b0 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 10 May 2024 12:01:25 -0700 Subject: [PATCH 21/92] Nemo fix --- tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml | 2 +- .../test_scripts/gpt3/pretrain_gpt3_nemo_test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml b/tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml index f898c890eb..6bc7e98787 100644 --- a/tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml +++ b/tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml @@ -8,7 +8,7 @@ launchers: no_container_mount_home: 'true' spec: name: "{model}_{variant}_{scope}_{platforms}_{nodes}N{gpus}G_\ - mbs{mbs}_gbs{gbs}_ \ + mbs{mbs}_gbs{gbs}_\ {'mcore_' if use_mcore else ''}{'te_' if use_te else ''}\ tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\ {'_'+args_meta if args_meta else ''}" diff --git a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh index 063ee5c258..74d6a45f54 100755 --- a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh +++ b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh @@ -53,7 +53,7 @@ build_run_cmd() { model.megatron_amp_O2=True \ model.data.data_prefix=[] \ model.data.data_impl=mock \ - model.data.splits_string=[99990,8,2] \ + model.data.splits_string=\'[99990,8,2]\' \ model.optim.name=distributed_fused_adam \ model.optim.weight_decay=0.1 \ exp_manager.create_checkpoint_callback=False \ From 795b45cc0eb4225e4bdb72a4b9cedc648a41f07c Mon Sep 17 00:00:00 2001 From: Mike Chrzanowski Date: Fri, 10 May 2024 12:11:31 -0700 Subject: [PATCH 22/92] Put Per-Token-Cross-Entropy calculation behind an argument --- .../distributed/distributed_data_parallel.py | 19 ++++++++++++++++--- .../core/distributed/finalize_model_grads.py | 16 +++++++--------- .../core/distributed/param_and_grad_buffer.py | 13 +++++++++++++ megatron/core/pipeline_parallel/schedules.py | 17 +++++++++++++---- .../core/transformer/transformer_config.py | 4 ++++ megatron/training/arguments.py | 3 +++ pretrain_gpt.py | 9 +++++---- 
pretrain_t5.py | 10 ++++++++-- .../functional_tests/jet_recipes/MR-gpt.yaml | 3 ++- .../jet_recipes/MR-multimodal.yaml | 2 +- tests/functional_tests/jet_recipes/MR-t5.yaml | 4 ++-- .../python_test_utils/test_ci_pipeline.py | 8 +++++--- ...rge-request-dgx-a100-1n8g-tp1-pp4-vp2.json | 2 +- ...m-merge-request-dgx-a100-1n8g-tp2-pp2.json | 2 +- ...-tp1-pp4-vp1-calculate-per-token-loss.json | 1 + ...2-pp1-te-8experts2parallel-top2router.json | 2 +- ...rge-request-dgx-a100-1n8g-tp1-pp4-vp1.json | 2 +- ...m-merge-request-dgx-a100-1n8g-tp2-pp2.json | 2 +- ...tp1-pp1-vp1-calculate-per-token-loss.json} | 0 19 files changed, 85 insertions(+), 34 deletions(-) create mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-calculate-per-token-loss.json rename tests/functional_tests/test_results/jet/{t5-220m-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1.json => t5-220m-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1-calculate-per-token-loss.json} (100%) diff --git a/megatron/core/distributed/distributed_data_parallel.py b/megatron/core/distributed/distributed_data_parallel.py index cd0fb41526..cdb58594d9 100644 --- a/megatron/core/distributed/distributed_data_parallel.py +++ b/megatron/core/distributed/distributed_data_parallel.py @@ -94,7 +94,9 @@ def __init__( else: expert_parallel_params.append(param) - def allocate_buffers_for_parameters(input_params, data_parallel_group): + def allocate_buffers_for_parameters( + input_params, data_parallel_group, gradient_scaling_factor, + ): param_and_grad_dtype_to_params = {} # Group parameters by their gradient type. @@ -121,6 +123,7 @@ def allocate_buffers_for_parameters(input_params, data_parallel_group): data_parallel_group, self.bucket_size, param_to_name, + gradient_scaling_factor, ) ) for param in params: @@ -128,12 +131,22 @@ def allocate_buffers_for_parameters(input_params, data_parallel_group): return buffers + if config.calculate_per_token_loss: + gradient_scaling_factor = 1.0 + else: + data_parallel_world_size = torch.distributed.get_world_size(data_parallel_group) + gradient_scaling_factor = 1.0 / data_parallel_world_size + # Allocate the param+grad buffers for dense params' grads. - self.buffers = allocate_buffers_for_parameters(dense_params, data_parallel_group,) + self.buffers = allocate_buffers_for_parameters( + dense_params, data_parallel_group, gradient_scaling_factor=gradient_scaling_factor, + ) # Allocate separate param+grad buffers for expert parallel params' grads. self.expert_parallel_buffers = allocate_buffers_for_parameters( - expert_parallel_params, expert_data_parallel_group, + expert_parallel_params, + expert_data_parallel_group, + gradient_scaling_factor=gradient_scaling_factor, ) # Delete references to weight_tensor if they exist since we don't want two parameter copies diff --git a/megatron/core/distributed/finalize_model_grads.py b/megatron/core/distributed/finalize_model_grads.py index addfd12996..4eaa776b48 100644 --- a/megatron/core/distributed/finalize_model_grads.py +++ b/megatron/core/distributed/finalize_model_grads.py @@ -131,11 +131,9 @@ def finalize_model_grads(model: List[torch.nn.Module], num_tokens: Optional[torc if config.timers is not None: config.timers('embedding-grads-all-reduce').stop() - # normalize gradients. + # normalize gradients for per-token loss normalization. # if we are using by the number of tokens, then we use that as a divisor. this number # will be the total number of non-padded tokens in the global batch. 
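Reading note for the hunks around this point: when `--calculate-per-token-loss` is enabled, the grad buffers are built with a `gradient_scaling_factor` of 1.0 and `finalize_model_grads` instead divides by the total number of non-padded tokens summed over all data-parallel ranks; otherwise the buffers keep the usual 1/DP-world-size averaging. The sketch below is illustrative only — `scale_gradients_sketch`, `grads`, `num_local_tokens`, and `dp_group` are placeholder names, not Megatron-LM APIs.

```python
import torch
import torch.distributed as dist

def scale_gradients_sketch(grads, num_local_tokens, dp_group, calculate_per_token_loss):
    """Toy version of the two gradient-normalization modes wired up in this patch."""
    if calculate_per_token_loss:
        # Buffers reduce unscaled grads; divide once by the global count of
        # non-padded tokens after summing it across data-parallel ranks.
        num_tokens = num_local_tokens.clone()
        dist.all_reduce(num_tokens, group=dp_group)
        if num_tokens.item() > 0:
            for g in grads:
                g.div_(num_tokens)
    else:
        # Legacy behavior: average over data-parallel replicas, which assumes
        # every rank contributes the same (possibly padded) number of tokens.
        dp_world_size = dist.get_world_size(group=dp_group)
        for g in grads:
            g.div_(dp_world_size)
```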
- # otherwise, we simply divide by the number of data parallel ranks, which is the original - # behavior in megatron and is identical to the previous version when sequences are not padded. if num_tokens is not None: # the number of tokens is only present on the last stage, so broadcast it # to the other ranks in the pipeline parallel group. @@ -144,9 +142,9 @@ def finalize_model_grads(model: List[torch.nn.Module], num_tokens: Optional[torc src=parallel_state.get_pipeline_model_parallel_last_rank(), group=parallel_state.get_pipeline_model_parallel_group(), ) - for model_chunk in model: - if num_tokens is not None and num_tokens > 0: - scaling = 1.0 / num_tokens - else: - scaling = 1.0 / parallel_state.get_data_parallel_world_size() - model_chunk.scale_gradients(scaling) + # all-reduce across DP ranks. + torch.distributed.all_reduce(num_tokens, group=parallel_state.get_data_parallel_group()) + for model_chunk in model: + if num_tokens > 0: + scaling = 1.0 / num_tokens + model_chunk.scale_gradients(scaling) diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py index 445cb17e5a..54aeaab2b9 100644 --- a/megatron/core/distributed/param_and_grad_buffer.py +++ b/megatron/core/distributed/param_and_grad_buffer.py @@ -46,6 +46,9 @@ class Bucket: numel_unpadded: Number of unpadded elements in bucket. data_parallel_group: Data-parallel process group. data_parallel_world_size: World size using the data-parallel group group. + gradient_scaling_factor: This factor is utilized to scale gradients prior to their + communication. Its application is twofold: it facilitates the averaging of gradients + and the scaling of gradients in the context of the Mixture of Experts (MoE) model. """ def __init__( @@ -58,6 +61,7 @@ def __init__( numel_unpadded: int, data_parallel_group: torch.distributed.ProcessGroup, data_parallel_world_size: int, + gradient_scaling_factor: float, ): self.ddp_config = ddp_config @@ -77,6 +81,7 @@ def __init__( self.data_parallel_group = data_parallel_group self.data_parallel_world_size = data_parallel_world_size self.data_parallel_rank = torch.distributed.get_rank(group=data_parallel_group) + self.gradient_scaling_factor = gradient_scaling_factor self.reset() @@ -112,6 +117,8 @@ def start_grad_sync(self): f'Device: {torch.cuda.current_device()}, node: {os.uname()[1]}' ) + if self.gradient_scaling_factor != 1.0: + self.grad_data *= self.gradient_scaling_factor # Use async_op only when overlap_grad_reduce is True. if self.ddp_config.use_distributed_optimizer: local_data_view = shard_buffer(self.grad_data, self.data_parallel_world_size)[ @@ -181,6 +188,9 @@ class ParamAndGradBuffer: data_parallel_group: Data-parallel process group. bucket_size: The rough size of each bucket in terms of number of parameters. param_to_name: Mapping from `torch.nn.Parameter` to name (for logging purposes). + gradient_scaling_factor: This factor is utilized to scale gradients prior to their + communication. Its application is twofold: it facilitates the averaging of gradients + and the scaling of gradients in the context of the Mixture of Experts (MoE) model. 
""" def __init__( @@ -192,6 +202,7 @@ def __init__( data_parallel_group: torch.distributed.ProcessGroup, bucket_size: int, param_to_name: Dict[torch.nn.Parameter, str], + gradient_scaling_factor: float, ): self.ddp_config = ddp_config @@ -209,6 +220,7 @@ def __init__( self.data_parallel_world_size = torch.distributed.get_world_size( group=self.data_parallel_group ) + self.gradient_scaling_factor = gradient_scaling_factor self.is_last_microbatch = True # Data structures to store underlying buckets and relevant indexing data. @@ -455,6 +467,7 @@ def _set_bucket( numel_unpadded=numel_unpadded, data_parallel_group=self.data_parallel_group, data_parallel_world_size=self.data_parallel_world_size, + gradient_scaling_factor=self.gradient_scaling_factor, ) self.buckets.append(bucket) for bucket_param in bucket_params: diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index b1907dac03..1700619e97 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -215,11 +215,14 @@ def forward_step( outputs = loss_func(output_tensor) if len(outputs) == 3: output_tensor, num_tokens, loss_reduced = outputs + if not config.calculate_per_token_loss: + output_tensor /= num_tokens + output_tensor /= num_microbatches else: # preserve legacy loss averaging behavior (ie, over the number of microbatches) assert len(outputs) == 2 output_tensor, loss_reduced = outputs - output_tensor = output_tensor / num_microbatches + output_tensor /= num_microbatches forward_data_store.append(loss_reduced) else: data = loss_func(output_tensor, non_loss_data=True) @@ -415,7 +418,9 @@ def forward_backward_no_pipelining( if config.finalize_model_grads_func is not None and not forward_only: # Finalize model grads (perform full grad all-reduce / reduce-scatter for # data parallelism and layernorm all-reduce for sequence parallelism). - config.finalize_model_grads_func([model], total_num_tokens) + config.finalize_model_grads_func( + [model], total_num_tokens if config.calculate_per_token_loss else None + ) if config.timers is not None: config.timers('forward-backward').stop() @@ -1021,7 +1026,9 @@ def backward_step_helper(microbatch_id): # Finalize model grads (perform full grad all-reduce / reduce-scatter for # data parallelism, layernorm all-reduce for sequence parallelism, and # embedding all-reduce for pipeline parallelism). - config.finalize_model_grads_func(model, total_num_tokens) + config.finalize_model_grads_func( + model, total_num_tokens if config.calculate_per_token_loss else None + ) if config.timers is not None: config.timers('forward-backward').stop() @@ -1390,7 +1397,9 @@ def enable_grad_sync(): # Finalize model grads (perform full grad all-reduce / reduce-scatter for # data parallelism, layernorm all-reduce for sequence parallelism, and # embedding all-reduce for pipeline parallelism). 
- config.finalize_model_grads_func([model], total_num_tokens) + config.finalize_model_grads_func( + [model], total_num_tokens if config.calculate_per_token_loss else None + ) if config.timers is not None: config.timers('forward-backward').stop() diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index d68e7aed4b..0235d1e753 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -98,6 +98,10 @@ class TransformerConfig(ModelParallelConfig): test_mode: bool = False """Whether to run real-time tests.""" + calculate_per_token_loss: bool = False + """Whether cross entropy loss is calculated over the actual number of non-padded tokens in the + global batch, versus the default behavior of assuming all tokens are non-padded.""" + #################### # initialization #################### diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index a0d573bea1..1f8a5ce99f 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1028,6 +1028,9 @@ def _add_training_args(parser): 'means slower execution, but is good for debugging and testing.') group.add_argument('--check-weight-hash-across-dp-replicas-interval', type=int, default=None, help='Interval to check weight hashes are same across DP replicas. If not specified, weight hashes not checked.') + group.add_argument('--calculate-per-token-loss', action='store_true', + help=('Scale cross entropy loss by the number of non-padded tokens in the ' + 'global batch, versus the default behavior of assuming all tokens are non-padded.')) # deprecated group.add_argument('--checkpoint-activations', action='store_true', diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 7f2ad3ed4e..6ba99de751 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -121,8 +121,9 @@ def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor): Returns: the loss scalar for this micro-batch - the total number of tokens across all data parallel ranks and microbatches - a dict containing reporting metrics on the loss and number of tokens across the data parallel ranks + the number of non-padded tokens in this microbatch + a dict containing reporting metrics on the loss and number of tokens across + the data parallel ranks """ args = get_args() @@ -146,10 +147,10 @@ def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor): reporting_loss = loss.clone().detach() torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group()) - num_tokens = reporting_loss[1].clone().detach().to(torch.int) + local_num_tokens = loss[1].clone().detach().to(torch.int) return ( loss[0] * args.context_parallel_size, - num_tokens, + local_num_tokens, {'lm loss': (reporting_loss[0], reporting_loss[1])}, ) diff --git a/pretrain_t5.py b/pretrain_t5.py index a271850c3d..a5dfdc0403 100644 --- a/pretrain_t5.py +++ b/pretrain_t5.py @@ -155,6 +155,12 @@ def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor): Args: loss_mask (torch.Tensor): Used to mask out some portions of the loss output_tensor (torch.Tensor): The tensor with the losses + + Returns: + the loss scalar for this micro-batch + the number of non-padded tokens in this microbatch + a dict containing reporting metrics on the loss and number of tokens across + the data parallel ranks """ lm_loss_ = output_tensor.float() total_tokens = loss_mask.sum() @@ -162,10 +168,10 @@ def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor): 
lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1)) lm_loss = torch.cat([lm_loss.view(1), total_tokens.view(1)]) - reporting_loss = lm_loss.detach() + reporting_loss = lm_loss.clone().detach() torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group()) - num_tokens = lm_loss[1].detach().to(torch.int) + num_tokens = lm_loss[1].clone().detach().to(torch.int) return lm_loss[0], num_tokens, {'lm loss': (reporting_loss[0], reporting_loss[1])} diff --git a/tests/functional_tests/jet_recipes/MR-gpt.yaml b/tests/functional_tests/jet_recipes/MR-gpt.yaml index db0fb855d1..ac382ef295 100644 --- a/tests/functional_tests/jet_recipes/MR-gpt.yaml +++ b/tests/functional_tests/jet_recipes/MR-gpt.yaml @@ -24,7 +24,7 @@ spec: batch_size: 32 # GBS, JET schema requires 'batch_size' moe_grouped_gemm: 0 precision: bf16 - time_limit: 1200 + time_limit: 1500 artifacts: {/workspace/data/gpt3_data: text/the_pile/shard00} ckpt_format: torch_dist ckpt_resume: 0 @@ -59,6 +59,7 @@ products: - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--no-create-attention-mask-in-dataloader"], args_meta: ["no_create_attention_mask_in_dataloader"]} - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--no-mmap-bin-files"], args_meta: ["no_mmap_bin_files"]} - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1]} + - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], extra_args: ["--calculate-per-token-loss"], args_meta: ["calculate_per_token_loss"]} - {tp_size: [4], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--qk-layernorm --test-mode"'], args_meta: ["qk_layernorm_test_mode"]} - {tp_size: [1], pp_size: [2], ckpt_resume: [0, 1], extra_args: ['"--position-embedding-type rope"'], args_meta: ["rope_embeddings"]} - {tp_size: [1], pp_size: [2], ckpt_resume: [0, 1], extra_args: ['"--position-embedding-type rope --rotary-interleaved --no-rope-fusion"'], args_meta: ["rope_embeddings_interleaved_no_fusion"]} diff --git a/tests/functional_tests/jet_recipes/MR-multimodal.yaml b/tests/functional_tests/jet_recipes/MR-multimodal.yaml index deab2ce0dc..3f16288645 100644 --- a/tests/functional_tests/jet_recipes/MR-multimodal.yaml +++ b/tests/functional_tests/jet_recipes/MR-multimodal.yaml @@ -49,4 +49,4 @@ spec: JOB_NAME={key.split("/")[1]} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - - {use_te: [True], tp_size: [1], pp_size: [1]} + - {use_te: [True], tp_size: [1], pp_size: [1], ckpt_resume: [0, 1]} diff --git a/tests/functional_tests/jet_recipes/MR-t5.yaml b/tests/functional_tests/jet_recipes/MR-t5.yaml index 566d943b12..a05c6ad85e 100644 --- a/tests/functional_tests/jet_recipes/MR-t5.yaml +++ b/tests/functional_tests/jet_recipes/MR-t5.yaml @@ -10,7 +10,7 @@ spec: {'_'+args_meta if args_meta else ''}" model: t5 variant: 220m - build: mcore-pyt + build: mcore-pyt scope: merge-request nodes: 1 gpus: 8 @@ -48,4 +48,4 @@ spec: JOB_NAME={key.split("/")[1]} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - - {use_te: [True], tp_size: [1], pp_size: [1], vp_size: [1]} + - {use_te: [True], tp_size: [1], pp_size: [1], vp_size: [1], extra_args: ["--calculate-per-token-loss"], args_meta: ["calculate_per_token_loss"]} diff --git a/tests/functional_tests/python_test_utils/test_ci_pipeline.py b/tests/functional_tests/python_test_utils/test_ci_pipeline.py index 0930dadc0f..4bda2242d8 100644 --- a/tests/functional_tests/python_test_utils/test_ci_pipeline.py +++ 
b/tests/functional_tests/python_test_utils/test_ci_pipeline.py @@ -19,13 +19,15 @@ def _setup(self): if os.path.exists(EXPECTED_METRICS_FILE): with open(EXPECTED_METRICS_FILE) as f: self.expected = json.load(f) + else: + print(f"File {EXPECTED_METRICS_FILE} not found!") def _get_actual(self, loss_type): return read_tb_logs_as_list(LOGS_DIR, loss_type) def _test_helper(self, loss_type, test_type): if self.expected is None: - raise FileNotFoundError("Expected data is none") + raise FileNotFoundError(f"Expected data is none") expected = self.expected[loss_type] expected_list = expected["values"] print(f"The list of expected values: {expected_list}") @@ -55,10 +57,10 @@ def test_num_zeros_deterministic(self): # Expected validation loss curve at different global steps. self._setup() self._test_helper("num-zeros", TypeOfTest.DETERMINISTIC) - + def iteration_timing_node(self): expected_iteration_timing_avg = self.expected["train_step_timing_avg"] iteration_time = read_tb_logs_as_list(LOGS_DIR, "iteration-time") - idx = len(iteration_time)//3 + idx = len(iteration_time)//3 iteration_time_avg = sum(iteration_time[idx:])/len(iteration_time[idx:]) assert expected_iteration_timing_avg == pytest.approx(expected=iteration_time_avg, rel=self.margin_time), f"The time per global step must be approximately {expected_iteration_timing_avg} but it is {iteration_time_avg}." diff --git a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp2.json b/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp2.json index abf6da1c26..85940e2f42 100644 --- a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp2.json +++ b/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp2.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54308, 10.53881, 10.55633, 10.53805, 10.52649, 10.49841, 10.45926, 10.32763, 10.17142, 9.96795]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22775.0, 23916.0, 27495.0, 22901.0, 22718.0, 20518.0, 23379.0]}, "iteration_timing_avg": 0.7692817647058824} +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54308, 10.53881, 10.55633, 10.53805, 10.52589, 10.49569, 10.4596, 10.32846, 10.17265, 9.96951]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22584.0, 20590.0, 27627.0, 22759.0, 22567.0, 20671.0, 23229.0]}, "iteration_timing_avg": 0.7692817647058824} diff --git a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json b/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json index f6a0f47fa8..5e5b762761 100644 --- a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json +++ b/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.43755, 10.43587, 10.44704, 10.44395, 10.45023, 10.44561, 10.38646, 10.25229, 10.12594, 9.95549]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [25037.0, 25599.0, 28336.0, 25502.0, 24023.0, 19471.0, 22109.0]}, "iteration_timing_avg": 0.7523635294117648} +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.43755, 10.43587, 10.44704, 10.44395, 10.44965, 10.44295, 10.32757, 10.23341, 10.09049, 9.93294]}, 
"num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27979.0, 20991.0, 29735.0, 24779.0, 26808.0, 33075.0, 24387.0]}, "iteration_timing_avg": 0.7523635294117648} diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-calculate-per-token-loss.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-calculate-per-token-loss.json new file mode 100644 index 0000000000..939863d9d8 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-calculate-per-token-loss.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json index 3ad535db01..e946d83fa3 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80324, 10.86865, 10.87469, 10.79787, 10.66376, 10.57925, 10.05295, 10.18001, 10.09173, 9.74805]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [13563.0, 16221.0, 16838.0, 16335.0, 14835.0, 15726.0, 14714.0, 17118.0, 17526.0, 18766.0]}, "iteration_timing_avg": 0.3051714705882352} +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80324, 10.86905, 10.87593, 10.79804, 10.66451, 10.5803, 10.05453, 10.18348, 10.09461, 9.7533]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [13204.0, 16437.0, 17053.0, 16247.0, 14948.0, 15533.0, 14496.0, 17106.0, 17472.0, 18590.0]}, "iteration_timing_avg": 0.3051714705882352} diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp1.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp1.json index 474abd4ef0..68d9fe822f 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp1.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp1.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.83137, 10.81979, 10.74667, 10.80852, 10.8044, 10.6368]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [28515.0, 27094.0, 26111.0, 29819.0]}, "iteration_timing_avg": 0.1211408823529412} +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 
2987.0]}, "iteration_timing_avg": 0.1211408823529412} diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json index 3a4e85afcc..87df9ed6c0 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88381, 10.86694, 10.82041, 10.84998, 10.83732, 10.70774]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [29453.0, 30329.0, 28824.0, 29477.0]}, "iteration_timing_avg": 0.14292588235294112} +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.14292588235294112} diff --git a/tests/functional_tests/test_results/jet/t5-220m-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1.json b/tests/functional_tests/test_results/jet/t5-220m-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1-calculate-per-token-loss.json similarity index 100% rename from tests/functional_tests/test_results/jet/t5-220m-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1.json rename to tests/functional_tests/test_results/jet/t5-220m-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1-calculate-per-token-loss.json From 8d31792f9e8c10081c033a2078ffefdb7803629c Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Mon, 13 May 2024 13:02:08 -0700 Subject: [PATCH 23/92] Multimodal text generation --- .../core/models/multimodal/llava_model.py | 49 ++++- megatron/core/models/vision/clip_vit_model.py | 13 +- megatron/inference/text_generation/api.py | 20 +- .../inference/text_generation/forward_step.py | 195 ++++++++--------- .../inference/text_generation/generation.py | 41 ++-- pretrain_vlm.py | 20 +- ...equest-dgx-a100-1n8g-mcore-te-tp1-pp1.json | 2 +- .../unit_tests/models/test_clip_vit_model.py | 5 +- tests/unit_tests/models/test_llava_model.py | 24 +- tools/run_vlm_text_generation.py | 207 ++++++++++++++++++ 10 files changed, 418 insertions(+), 158 deletions(-) create mode 100644 tools/run_vlm_text_generation.py diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py index 08132fa607..1c6c01c96d 100644 --- a/megatron/core/models/multimodal/llava_model.py +++ b/megatron/core/models/multimodal/llava_model.py @@ -6,7 +6,7 @@ import torch -from megatron.core import parallel_state, tensor_parallel +from megatron.core import InferenceParams, parallel_state from megatron.core.models.gpt import GPTModel from megatron.core.models.vision.clip_vit_model import CLIPViTModel from megatron.core.models.vision.multimodal_projector import MultimodalProjector @@ -22,10 +22,12 @@ class LLaVAModel(MegatronModule): Args: language_transformer_config (TransformerConfig): Transformer config for the language model. language_transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers of the language model. + language_position_embedding_type (str): Type of the positional embedding to use in the language model. vocab_size (int): Vocabulary size. 
max_sequence_length (int): maximum sequence length. This is used for positional embedding. vision_transformer_config (TransformerConfig): Transformer config for the vision model. vision_transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers of the vision model. + drop_vision_class_token (bool): Drop vision class token(s) before input to the language model. vision_projection_config (TransformerConfig): Config for the projection from vision model outputs to language model inputs. vision_projection_layer_spec (ModuleSpec): Specifies the module to use for the vision projection. vision_projection_type (str): Type of the vision projection to use. Default is a 2-layer MLP. @@ -36,10 +38,12 @@ def __init__( self, language_transformer_config: TransformerConfig, language_transformer_layer_spec: ModuleSpec, + language_position_embedding_type: str, vocab_size: int, max_sequence_length: int, vision_transformer_config: TransformerConfig, vision_transformer_layer_spec: ModuleSpec, + drop_vision_class_token: bool, vision_projection_config: TransformerConfig, vision_projection_layer_spec: ModuleSpec, vision_projection_type: str = "mlp", @@ -59,9 +63,11 @@ def __init__( language_transformer_layer_spec, vocab_size, max_sequence_length, + position_embedding_type=language_position_embedding_type, ) self.vision_model = CLIPViTModel(vision_transformer_config, vision_transformer_layer_spec) + self._drop_vision_class_token = drop_vision_class_token # Map (intermediate) vision model outputs to the language model input dimension. self.vision_projection = MultimodalProjector( @@ -123,6 +129,7 @@ def forward( position_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor = None, + inference_params: InferenceParams = None, ) -> torch.Tensor: """Forward function of the LLaVA model. @@ -132,22 +139,44 @@ def forward( position_ids (torch.Tensor): input text position ids [batch, text_seq_len]. attention_mask (torch.Tensor): attention mask for the language model [batch, 1, combined_seq_len, combined_seq_len]. labels (torch.Tensor): Optional target text labels [batch, combined_seq_len]. + inference_params (InferenceParams): Inference-time parameters including KV cache. Returns: output (torch.Tensor): Loss of shape [b, s] if labels are provided, otherwise logits of shape [b, s, vocab_size]. """ - image_embeddings = self.vision_model(images) # [b, img_seq_len, h_vision] - - # map vision model output size to language model input size. - image_embeddings = self.vision_projection(image_embeddings) # [b, img_seq_len, h_language] - - image_embeddings = image_embeddings.permute(1, 0, 2) # [img_seq_len, b, h_language] language_embeddings = self.language_model.embedding( input_ids=input_ids, position_ids=position_ids ) # [text_seq_len, b, h_language] - combined_embeddings = torch.cat( - [image_embeddings, language_embeddings], dim=0 - ) # [combined_seq_len, b, h_language] + + # If running inference, we can skip image token computation if they were computed already earlier for this sample. + if ( + inference_params is not None + and "image_tokens_count" in inference_params.key_value_memory_dict + ): + combined_embeddings = language_embeddings + else: + image_embeddings = self.vision_model(images) # [b, img_seq_len, h_vision] + + if self._drop_vision_class_token: + image_embeddings = image_embeddings[:, self.vision_model.class_token_len :, :] + + image_embeddings = image_embeddings.permute(1, 0, 2) # [img_seq_len, b, h_vision] + + # map vision model output size to language model input size. 
+ image_embeddings = self.vision_projection( + image_embeddings + ) # [b, img_seq_len, h_language] + + # If running inference, the language model KV cache will be updated for image token positions. + # Here we store the image tokens sequence length, which can be used as an offset to the KV cache later. + if inference_params is not None: + inference_params.key_value_memory_dict[ + "image_tokens_count" + ] = image_embeddings.shape[1] + + combined_embeddings = torch.cat( + [image_embeddings, language_embeddings], dim=0 + ) # [combined_seq_len, b, h_language] # Embedding is computed above so we can discard input and position ids. input_ids = None diff --git a/megatron/core/models/vision/clip_vit_model.py b/megatron/core/models/vision/clip_vit_model.py index e5b005c0a9..84be735695 100644 --- a/megatron/core/models/vision/clip_vit_model.py +++ b/megatron/core/models/vision/clip_vit_model.py @@ -82,24 +82,23 @@ def __init__( self.model_type = ModelType.encoder_or_decoder - # Transformer + final layer norm (via post_process) + # Transformer layers. # TODO: Follow-up changes will make pre and post_process configurable. They are needed for supporting pipeline parallelism. - self.transformer = TransformerBlock( + # Note: a final layer norm and/or linear layer present in some implementations are omitted here. They can be added separately where needed. + self.decoder = TransformerBlock( config=transformer_config, spec=transformer_layer_spec, pre_process=True, - post_process=True, + post_process=False, ) - # Note: a final linear layer present in some implementations is omitted here. It can be added separately where needed. - def set_input_tensor(self, input_tensor: torch.Tensor) -> None: """Sets input tensor to the model. Args: input_tensor (Tensor): Sets the input tensor for the model. """ - self.transformer.set_input_tensor(input_tensor) + self.decoder.set_input_tensor(input_tensor) def forward( self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None @@ -133,7 +132,7 @@ def forward( if attention_mask is None: attention_mask = torch.ones(1, 1, x.shape[0], x.shape[0]).cuda() # [1, 1, s, s] attention_mask = attention_mask < 0.5 # to bool - x = self.transformer(x.contiguous(), attention_mask) + x = self.decoder(x.contiguous(), attention_mask) x = x.permute(1, 0, 2) # [s, b, h] -> [b, s, h] x = x.contiguous() diff --git a/megatron/inference/text_generation/api.py b/megatron/inference/text_generation/api.py index 4557ff3c12..4015ac5cdb 100644 --- a/megatron/inference/text_generation/api.py +++ b/megatron/inference/text_generation/api.py @@ -14,8 +14,10 @@ from .tokenization import ( tokenize_prompts, detokenize_generations) +from .forward_step import ForwardStep def generate_and_post_process(model, + forward_step=ForwardStep, prompts=None, tokens_to_generate=0, return_output_log_probs=False, @@ -37,6 +39,7 @@ def generate_and_post_process(model, # Main inference. tokens, lengths, output_log_probs, logits = generate( model, + forward_step=forward_step, prompts=prompts, tokens_to_generate=tokens_to_generate, return_output_log_probs=return_output_log_probs, @@ -74,6 +77,7 @@ def generate_and_post_process(model, return None def generate(model, + forward_step=None, prompts=None, tokens_to_generate=0, return_output_log_probs=False, @@ -127,18 +131,18 @@ def generate(model, # Note that these tensors are broadcaseted to all ranks. 
if torch.distributed.get_rank() == 0: assert prompts is not None - + context_tokens_tensor, context_length_tensor = tokenize_prompts( prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS) if tokens_to_generate == 0: return score_and_return_on_first_stage( model, context_tokens_tensor, context_length_tensor) - + # Main inference function. # Note that the outputs are available on the first stage. return generate_tokens_probs_and_return_on_first_stage( - model, context_tokens_tensor, context_length_tensor, + model, forward_step, context_tokens_tensor, context_length_tensor, return_output_log_probs=return_output_log_probs, top_k=top_k_sampling, top_p=top_p_sampling, @@ -151,6 +155,7 @@ def generate(model, prevent_newline_after_colon=prevent_newline_after_colon) def beam_search_and_post_process(model, + forward_step=ForwardStep, prompts=None, tokens_to_generate=0, beam_size=0, @@ -164,6 +169,7 @@ def beam_search_and_post_process(model, # Main inference. tokens, scores = beam_search(model, + forward_step=forward_step, prompts=prompts, tokens_to_generate=tokens_to_generate, beam_size=beam_size, @@ -174,14 +180,14 @@ def beam_search_and_post_process(model, prevent_newline_after_colon=prevent_newline_after_colon) # Only post-process on first stage. if mpu.is_pipeline_first_stage(): - lengths = tokens.size(1)*torch.ones(beam_size, dtype=torch.int64, device=torch.cuda.current_device()) + lengths = tokens.size(1)*torch.ones(beam_size, dtype=torch.int64, device=torch.cuda.current_device()) tokens, prompts_plus_generations, prompts_plus_generations_segments = detokenize_generations(tokens, lengths, True) scores = scores.cpu().numpy().tolist() return prompts_plus_generations, prompts_plus_generations_segments, scores return None -def beam_search(model, prompts=None, tokens_to_generate=0, beam_size=0, add_BOS=False, stop_token=50256, num_return_gen=1, length_penalty=1, prevent_newline_after_colon=False): +def beam_search(model, forward_step, prompts=None, tokens_to_generate=0, beam_size=0, add_BOS=False, stop_token=50256, num_return_gen=1, length_penalty=1, prevent_newline_after_colon=False): # Make sure input params are avaialble to all ranks. values = [tokens_to_generate, beam_size, @@ -201,7 +207,7 @@ def beam_search(model, prompts=None, tokens_to_generate=0, beam_size=0, add_BOS= context_tokens_tensor, context_length_tensor = tokenize_prompts( prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS) - - return beam_search_and_return_on_first_stage(model, context_tokens_tensor, context_length_tensor, + + return beam_search_and_return_on_first_stage(model, forward_step, context_tokens_tensor, context_length_tensor, beam_size, stop_token=stop_token, num_return_gen=num_return_gen, length_penalty=length_penalty, prevent_newline_after_colon=prevent_newline_after_colon) diff --git a/megatron/inference/text_generation/forward_step.py b/megatron/inference/text_generation/forward_step.py index e6951966c6..4d4878d337 100644 --- a/megatron/inference/text_generation/forward_step.py +++ b/megatron/inference/text_generation/forward_step.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
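The remainder of this file's diff folds the old module-level helpers (`_forward_step_helper`, `_no_pipelining_forward_step`, `_with_pipelining_forward_step`) into methods of `ForwardStep`, with the model call isolated in `_forward`. That is what lets `generate()` above accept a forward-step class as an argument: a multimodal pipeline can subclass it and change only how the model is invoked. A minimal sketch of such a subclass follows; the class name and the image handling are hypothetical illustrations, not code from this patch.

```python
from megatron.inference.text_generation.forward_step import ForwardStep

class VLMForwardStep(ForwardStep):
    """Hypothetical forward step that also feeds an image batch to the model."""

    def __init__(self, images, model, max_batch_size, max_sequence_length):
        super().__init__(model, max_batch_size, max_sequence_length)
        self._images = images  # model-specific extra input, kept on the instance

    def _forward(self, tokens, position_ids, attention_mask):
        # Same contract as the base class _forward, but the wrapped model
        # (e.g. a LLaVA-style model) also consumes the cached image batch.
        return self.model(
            self._images,
            tokens,
            position_ids,
            attention_mask,
            inference_params=self.inference_params,
        )
```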
"""Forward step utilities.""" @@ -36,6 +36,8 @@ def __init__(self, model, max_batch_size, max_sequence_length): self.pipelining_batch_x_seqlen = \ args.inference_batch_times_seqlen_threshold + def _forward(self, tokens, position_ids, attention_mask): + return self.model(tokens, position_ids, attention_mask, inference_params=self.inference_params) def __call__(self, tokens, position_ids, attention_mask): """Invocation of the forward methods. Note that self.inference_params @@ -46,132 +48,117 @@ def __call__(self, tokens, position_ids, attention_mask): if current_batch_x_seqlen >= self.pipelining_batch_x_seqlen: micro_batch_size = \ max(1, self.pipelining_batch_x_seqlen // tokens.size(1)) - return _with_pipelining_forward_step(self.model, - tokens, - position_ids, - attention_mask, - self.inference_params, - micro_batch_size) + return self._with_pipelining_forward_step(tokens, + position_ids, + attention_mask, + micro_batch_size) - return _no_pipelining_forward_step(self.model, - tokens, - position_ids, - attention_mask, - self.inference_params) + return self._no_pipelining_forward_step(tokens, + position_ids, + attention_mask) + def _forward_step_helper(self, tokens, position_ids, attention_mask, recv_buffer=None): + """Single forward step. Update the allocate memory flag so + only the first time the memory is allocated.""" + batch_size = tokens.size(0) + sequence_length = tokens.size(1) + if recv_buffer is None: + recv_buffer = _allocate_recv_buffer(batch_size, sequence_length) -def _get_recv_buffer_dtype(args): - """Receive happens between the layers.""" - if args.fp32_residual_connection: - return torch.float - return args.params_dtype - - - -def _allocate_recv_buffer(batch_size, sequence_length): - """Receive happens between the layers with size [s, b, h].""" - if mpu.is_pipeline_first_stage(): - return None - args = get_args() - recv_size = (sequence_length, batch_size, args.hidden_size) - return torch.empty(recv_size, - dtype=_get_recv_buffer_dtype(args), - device=torch.cuda.current_device()) - + # Receive from previous stage. + recv_from_prev_pipeline_rank_(recv_buffer) + # Forward pass through the model. + self.model.set_input_tensor(recv_buffer) + output_tensor = self._forward(tokens, position_ids, attention_mask) -def _forward_step_helper(model, tokens, position_ids, attention_mask, - inference_params, recv_buffer=None): - """Single forward step. Update the allocate memory flag so - only the first time the memory is allocated.""" - batch_size = tokens.size(0) - sequence_length = tokens.size(1) - if recv_buffer is None: - recv_buffer = _allocate_recv_buffer(batch_size, sequence_length) + # Send output to the next stage. + send_to_next_pipeline_rank(output_tensor) - # Receive from previous stage. - recv_from_prev_pipeline_rank_(recv_buffer) + return output_tensor - # Forward pass through the model. - model.set_input_tensor(recv_buffer) - output_tensor = model(tokens, position_ids, attention_mask, - inference_params=inference_params) - # Send output to the next stage. - send_to_next_pipeline_rank(output_tensor) - - return output_tensor + def _no_pipelining_forward_step(self, tokens, position_ids, attention_mask, + recv_buffer=None): + """If recv_buffer is none, we will allocate one on the fly.""" + # Run a simple forward pass. + output_tensor = self._forward_step_helper(tokens, position_ids, + attention_mask, recv_buffer=recv_buffer) + # Update the sequence length offset. 
+ self.inference_params.sequence_len_offset += tokens.size(1) + logits = None + if mpu.is_pipeline_last_stage(): + logits = output_tensor -def _no_pipelining_forward_step(model, tokens, position_ids, attention_mask, - inference_params, recv_buffer=None): - """If recv_buffer is none, we will allocate one on the fly.""" - # Run a simple forward pass. - output_tensor = _forward_step_helper(model, tokens, position_ids, - attention_mask, inference_params, - recv_buffer=recv_buffer) - # Update the sequence length offset. - inference_params.sequence_len_offset += tokens.size(1) + return logits - logits = None - if mpu.is_pipeline_last_stage(): - logits = output_tensor - return logits + def _with_pipelining_forward_step(self, tokens, position_ids, attention_mask, micro_batch_size): + """No interleaving is supported.""" + sequence_length = tokens.size(1) + batch_size = tokens.size(0) + # Divide the batch dimension into micro batches. + num_micro_batches, last_chunk = divmod(batch_size, + micro_batch_size) + if last_chunk > 0: + num_micro_batches += 1 + # Preallocate memory for output logits. + logits = None + if mpu.is_pipeline_last_stage(): + args = get_args() + logits = torch.empty( + (batch_size, sequence_length, args.padded_vocab_size), + dtype=torch.float32, device=torch.cuda.current_device()) -def _with_pipelining_forward_step(model, tokens, position_ids, attention_mask, - inference_params, micro_batch_size): - """No interleaving is supported.""" - sequence_length = tokens.size(1) - batch_size = tokens.size(0) + # Preallocate recv buffer. + recv_buffer = _allocate_recv_buffer(micro_batch_size, sequence_length) - # Divide the batch dimension into micro batches. - num_micro_batches, last_chunk = divmod(batch_size, - micro_batch_size) - if last_chunk > 0: - num_micro_batches += 1 + for micro_batch_index in range(num_micro_batches): + # Slice among the batch dimenion. + start = micro_batch_index * micro_batch_size + end = min(start + micro_batch_size, batch_size) + this_micro_batch_size = end - start + tokens2use = tokens[start:end, ...] + position_ids2use = position_ids[start:end, ...] - # Preallocate memory for output logits. - logits = None - if mpu.is_pipeline_last_stage(): - args = get_args() - logits = torch.empty( - (batch_size, sequence_length, args.padded_vocab_size), - dtype=torch.float32, device=torch.cuda.current_device()) + # Run a simple forward pass. + if this_micro_batch_size != micro_batch_size: + recv_buffer = None + output = self._forward_step_helper(tokens2use, position_ids2use, attention_mask, recv_buffer=recv_buffer) - # Preallocate recv buffer. - recv_buffer = _allocate_recv_buffer(micro_batch_size, sequence_length) + # Adjust the batch size offset to account for the micro-batch. + self.inference_params.batch_size_offset += this_micro_batch_size - for micro_batch_index in range(num_micro_batches): - # Slice among the batch dimenion. - start = micro_batch_index * micro_batch_size - end = min(start + micro_batch_size, batch_size) - this_micro_batch_size = end - start - tokens2use = tokens[start:end, ...] - position_ids2use = position_ids[start:end, ...] + # Copy logits. + if mpu.is_pipeline_last_stage(): + logits[start:end, ...] = output - # Run a simple forward pass. - if this_micro_batch_size != micro_batch_size: - recv_buffer = None - output = _forward_step_helper(model, tokens2use, position_ids2use, - attention_mask, inference_params, - recv_buffer=recv_buffer) + # Once we are done with all the micro-batches, we can + # adjust the sequence length offset. 
+ self.inference_params.sequence_len_offset += sequence_length + # and reset the batch size offset + self.inference_params.batch_size_offset = 0 - # Adjust the batch size offset to account for the micro-batch. - inference_params.batch_size_offset += this_micro_batch_size + return logits - # Copy logits. - if mpu.is_pipeline_last_stage(): - logits[start:end, ...] = output - # Once we are done with all the micro-batches, we can - # adjust the sequence length offset. - inference_params.sequence_len_offset += sequence_length - # and reset the batch size offset - inference_params.batch_size_offset = 0 +def _get_recv_buffer_dtype(args): + """Receive happens between the layers.""" + if args.fp32_residual_connection: + return torch.float + return args.params_dtype - return logits +def _allocate_recv_buffer(batch_size, sequence_length): + """Receive happens between the layers with size [s, b, h].""" + if mpu.is_pipeline_first_stage(): + return None + args = get_args() + recv_size = (sequence_length, batch_size, args.hidden_size) + return torch.empty(recv_size, + dtype=_get_recv_buffer_dtype(args), + device=torch.cuda.current_device()) diff --git a/megatron/inference/text_generation/generation.py b/megatron/inference/text_generation/generation.py index 84e4af160f..e17ea2b9cb 100644 --- a/megatron/inference/text_generation/generation.py +++ b/megatron/inference/text_generation/generation.py @@ -35,10 +35,10 @@ def score_and_return_on_first_stage(model, tokens, lengths): batch_size = tokens.size(0) max_prompt_length = lengths.max().item() assert max_prompt_length == tokens.size(1) - + if max_prompt_length > args.max_position_embeddings: raise ValueError("Length of prompt + tokens_to_generate longer than allowed") - + if max_prompt_length * batch_size > args.max_tokens_to_oom: raise ValueError("Too many tokens. " + str(max_prompt_length*batch_size)+ " is greater than "+str(args.max_tokens_to_oom)) @@ -52,18 +52,18 @@ def score_and_return_on_first_stage(model, tokens, lengths): # Log probability of the sequence (prompt + generated tokens). output_log_probs = None output_log_probs_size = (batch_size, max_prompt_length - 1) - + if mpu.is_pipeline_last_stage(): output_log_probs = torch.empty(output_log_probs_size, dtype=torch.float32, device=torch.cuda.current_device()) - + # ============= # Run infernece # ============= with torch.no_grad(): attention_mask, position_ids = _build_attention_mask_and_position_ids(tokens) - + # logits will be meanigful only in the last pipeline stage. logits = forward_step(tokens, position_ids, attention_mask) @@ -71,24 +71,24 @@ def score_and_return_on_first_stage(model, tokens, lengths): # Always the last stage should have an output. assert logits is not None log_probs = F.log_softmax(logits, dim=2) - + # Pick the tokens that we need to get the log # probabilities for. Note that next input token is # the token which we selected in the current logits, # so shift by 1. indices = torch.unsqueeze(tokens[:, 1:], 2) output_log_probs = torch.gather(log_probs, 2, indices).squeeze(2) - + # ====================================== # Broadcast to the first pipeline stage. 
# ====================================== output_log_probs = broadcast_from_last_to_first_pipeline_stage( output_log_probs_size, torch.float32, output_log_probs) - + return tokens, lengths, output_log_probs, logits def generate_tokens_probs_and_return_on_first_stage( - model, tokens, lengths, + model, forward_step, tokens, lengths, return_output_log_probs=False, top_k=0, top_p=0.0, top_p_decay=0.0, top_p_bound=0.0, temperature=1.0, @@ -101,6 +101,7 @@ def generate_tokens_probs_and_return_on_first_stage( Args: model: no interleaving is supported. + forward_step (ForwardStep): Class for running the model forward step. tokens: prompt tokens extended to be of size [b, max-sequence-length] lengths: original prompt length, size: [b] return_output_log_probs: flag to calculate the log probability of @@ -135,12 +136,12 @@ def generate_tokens_probs_and_return_on_first_stage( if max_sequence_length > args.max_position_embeddings: raise ValueError("Length of prompt + tokens_to_generate longer than allowed") - + if max_sequence_length * batch_size > args.max_tokens_to_oom: raise ValueError("Too many tokens. " + str(max_sequence_length*batch_size)+ " is greater than "+str(args.max_tokens_to_oom)) # forward step. - forward_step = ForwardStep(model, batch_size, max_sequence_length) + forward_step = forward_step(model, batch_size, max_sequence_length) # Added termination_id to support the case that we want to terminate the # generation once that id is generated. @@ -166,7 +167,7 @@ def generate_tokens_probs_and_return_on_first_stage( generated_sequence_lengths = torch.ones( batch_size, dtype=torch.int64, device=torch.cuda.current_device()) * max_sequence_length - + # Whether we have reached a termination id. is_generation_done = torch.zeros(batch_size, dtype=torch.uint8, device=torch.cuda.current_device()) @@ -252,10 +253,10 @@ def generate_tokens_probs_and_return_on_first_stage( hit_double_eol = (new_sample == 628).byte() & started.byte() hit_eol = (new_sample == 198).byte() & started.byte() done_token = hit_double_eol | hit_eol - else: + else: done_token = (new_sample == termination_id).byte() & \ started.byte() - + just_finished = (done_token & ~is_generation_done).bool() generated_sequence_lengths[just_finished.view(-1)] = \ context_length + 1 @@ -265,7 +266,7 @@ def generate_tokens_probs_and_return_on_first_stage( tensor=done) if use_eod_token_for_early_termination and done: break - + # =================================================== # Update the length of based on max generated length. # =================================================== @@ -288,7 +289,7 @@ def generate_tokens_probs_and_return_on_first_stage( return tokens, generated_sequence_lengths, output_log_probs, None -def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, stop_token, num_return_gen, length_penalty, prevent_newline_after_colon=True): +def beam_search_and_return_on_first_stage(model, forward_step, tokens, lengths, beam_size, stop_token, num_return_gen, length_penalty, prevent_newline_after_colon=True): args = get_args() tokenizer = get_tokenizer() @@ -297,13 +298,13 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto prompt_length = lengths.item() final_sequence_length = tokens.size(1) final_sequence_length = min(final_sequence_length, args.max_position_embeddings) - + # If the context is too big, this happens if prompt_length >= final_sequence_length: raise ValueError("context length + tokens_to_generate too large") # forward step. 
- forward_step = ForwardStep(model, beam_size, final_sequence_length) + forward_step = forward_step(model, beam_size, final_sequence_length) beam_hyp = BeamHypotheses(beam_size, length_penalty) best_batches = None @@ -369,12 +370,12 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto if beam_hyp.is_done(best_scores.max().item(), context_length + 1 - prompt_length): done = torch.ones(1, dtype=torch.uint8, device=torch.cuda.current_device()) - + best_batches = tokens.new([item[2] for item in next_beams]) tokens = tokens[best_batches,:] tokens[:, context_length] = tokens.new([item[0] for item in next_beams]) scores = scores.new([item[1] for item in next_beams]).unsqueeze(1) - + # torch.distributed.barrier() done = broadcast_from_last_pipeline_stage(1, torch.uint8, done) if done: diff --git a/pretrain_vlm.py b/pretrain_vlm.py index cd44cc99e5..8df6584fbb 100644 --- a/pretrain_vlm.py +++ b/pretrain_vlm.py @@ -6,8 +6,6 @@ import torch -from megatron.training import get_args, get_timers, get_tokenizer, print_rank_0 -from megatron.training.arguments import core_transformer_config_from_args from megatron.core import tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import MockGPTLowLevelDataset @@ -17,7 +15,8 @@ from megatron.core.models.vision.vit_layer_specs import get_vit_layer_with_transformer_engine_spec from megatron.core.models.multimodal.llava_model import LLaVAModel from megatron.core.transformer.spec_utils import import_module -from megatron.training import pretrain +from megatron.training import get_args, get_timers, get_tokenizer, pretrain, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args from pretrain_gpt import is_dataset_built_on_rank, loss_func @@ -57,10 +56,12 @@ def model_provider(pre_process=True, post_process=True) -> LLaVAModel: model = LLaVAModel( language_transformer_config=language_transformer_config, language_transformer_layer_spec=language_transformer_layer_spec, + language_position_embedding_type=args.position_embedding_type, vocab_size=args.padded_vocab_size, max_sequence_length=args.max_position_embeddings, vision_transformer_config=vision_transformer_config, vision_transformer_layer_spec=vision_transformer_layer_spec, + drop_vision_class_token=args.drop_vision_class_token, vision_projection_config=vision_projection_config, vision_projection_layer_spec=vision_projection_modules, vision_projection_type=vision_projection_type, @@ -192,6 +193,18 @@ def forward_step(data_iterator, model: LLaVAModel): return output_tensor, partial(loss_func, loss_mask) +def add_vlm_extra_args(parser): + """Extra arguments.""" + group = parser.add_argument_group(title='vision language model specific arguments') + group.add_argument( + "--drop-vision-class-token", + action="store_true", + default=False, + help="Drop vision class token before input to the language model.", + ) + return parser + + if __name__ == "__main__": train_valid_test_datasets_provider.is_distributed = True @@ -201,4 +214,5 @@ def forward_step(data_iterator, model: LLaVAModel): ModelType.encoder_or_decoder, forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + extra_args_provider=add_vlm_extra_args, ) diff --git a/tests/functional_tests/test_results/jet/multimodal-llava-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1.json b/tests/functional_tests/test_results/jet/multimodal-llava-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1.json index 
dcdf8cd82d..a3efbeb21e 100644 --- a/tests/functional_tests/test_results/jet/multimodal-llava-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1.json +++ b/tests/functional_tests/test_results/jet/multimodal-llava-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13273, 9.13911, 9.13383, 9.12657, 9.09489, 9.07765, 9.02826, 9.00005, 8.96948, 8.92915]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2594526.0, 2527198.0, 2601909.0, 2496960.0, 2554383.0, 2678214.0, 2491802.0, 2610525.0, 2656421.0, 2684195.0]}, "iteration_timing_avg": 0.1316635294117647} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13475, 9.1392, 9.13457, 9.12454, 9.09413, 9.07808, 9.02886, 9.00177, 8.96967, 8.92995]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2594425.0, 2527253.0, 2602008.0, 2497235.0, 2554616.0, 2677868.0, 2491787.0, 2610638.0, 2656468.0, 2684047.0]}, "iteration_timing_avg": 0.2253964705882353} \ No newline at end of file diff --git a/tests/unit_tests/models/test_clip_vit_model.py b/tests/unit_tests/models/test_clip_vit_model.py index 3c15684fb4..b20ab2ddf1 100644 --- a/tests/unit_tests/models/test_clip_vit_model.py +++ b/tests/unit_tests/models/test_clip_vit_model.py @@ -1,5 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - import pytest import torch @@ -29,7 +28,7 @@ def test_constructor(self): assert isinstance(self.model, CLIPViTModel) num_weights = sum([p.numel() for p in self.model.parameters()]) - assert num_weights == 174848 + assert num_weights == 174720 def test_set_input_tensor(self): # [s, b, h] expected to the transformer. 
@@ -38,7 +37,7 @@ def test_set_input_tensor(self): self.model.set_input_tensor(input_tensor) - assert self.model.transformer.input_tensor.shape == torch.Size(expected_shape) + assert self.model.decoder.input_tensor.shape == torch.Size(expected_shape) def test_forward(self): self.model.cuda() diff --git a/tests/unit_tests/models/test_llava_model.py b/tests/unit_tests/models/test_llava_model.py index 7b4ca0e5f8..9635f2e3b2 100644 --- a/tests/unit_tests/models/test_llava_model.py +++ b/tests/unit_tests/models/test_llava_model.py @@ -4,6 +4,7 @@ import pytest import torch +from megatron.core import InferenceParams from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.models.multimodal.llava_model import LLaVAModel from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed @@ -37,10 +38,12 @@ def setup_method(self, method): self.model = LLaVAModel( language_transformer_config=language_config, language_transformer_layer_spec=language_layer_spec, + language_position_embedding_type="rope", vocab_size=2048, max_sequence_length=1024, vision_transformer_config=vision_config, vision_transformer_layer_spec=vision_layer_spec, + drop_vision_class_token=False, vision_projection_config=vision_projection_config, vision_projection_layer_spec=vision_projection_spec, ) @@ -52,13 +55,13 @@ def test_constructor(self): assert isinstance(self.model, LLaVAModel) num_weights = sum([p.numel() for p in self.model.parameters()]) - assert num_weights == 1439432 + assert num_weights == 1308232 def test_set_input_tensor(self): expected_shape = (1, 2, 3, 4) input_tensor = torch.zeros(expected_shape) self.model.set_input_tensor(input_tensor) - assert self.model.vision_model.transformer.input_tensor.shape == expected_shape + assert self.model.vision_model.decoder.input_tensor.shape == expected_shape def test_forward(self): self.model.cuda() @@ -72,13 +75,28 @@ def test_forward(self): attention_mask = attention_mask < 0.5 labels = torch.randint(0, 2048, (2, 1601)).cuda() - # Try with and without labels. + # Try with labels. loss = self.model.forward(img, input_ids, position_ids, attention_mask, labels) assert loss.shape == torch.Size((2, 1601)) + # Try without labels and without inference params. logits = self.model.forward(img, input_ids, position_ids, attention_mask, labels=None) assert logits.shape == torch.Size((2, 1601, 2048)) + # Try without labels and with inference params. + inference_params = InferenceParams(2, 1601) + logits = self.model.forward( + img, + input_ids, + position_ids, + attention_mask, + labels=None, + inference_params=inference_params, + ) + assert logits.shape == torch.Size((2, 1601, 2048)) + # Check KV cache got created. + assert len(inference_params.key_value_memory_dict) > 0 + def test_save_load(self, tmp_path): path = tmp_path / "model.pt" torch.save(self.model.state_dict(), path) diff --git a/tools/run_vlm_text_generation.py b/tools/run_vlm_text_generation.py new file mode 100644 index 0000000000..ab0a2df41d --- /dev/null +++ b/tools/run_vlm_text_generation.py @@ -0,0 +1,207 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +"""Generate text using a vision language model.""" +import glob +import json +import logging +import os +import sys +from collections import defaultdict +from functools import partial + +# Add megatron to the path. 
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) + +import numpy as np +import torch +from PIL import Image +from torchvision.transforms import Compose, Resize, ToPILImage + +from megatron.inference.text_generation.api import generate_and_post_process +from megatron.inference.text_generation.forward_step import ForwardStep +from megatron.training import get_args, get_model, print_rank_0 +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron +from pretrain_vlm import model_provider + + +def add_text_generation_args(parser): + """Text generation arguments.""" + group = parser.add_argument_group(title='Vision language model text generation') + + group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') + group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') + group.add_argument("--top_k", type=int, default=0, help='Top k sampling.') + group.add_argument( + "--out-seq-length", type=int, default=1024, help='Size of the output generated text.' + ) + group.add_argument("--output-path", type=str, required=True, help='Output file path') + group.add_argument('--input-path', type=str, required=True, help="Input directory") + group.add_argument( + '--num-partitions', type=int, default=0, help="Number of partitions for inputs." + ) + group.add_argument('--partition-id', type=int, default=0, help="Partition index") + group.add_argument("--drop-vision-class-token", action="store_true", default=False) + group.add_argument("--gt-path", type=str, help="Optional ground truth file") + + return parser + + +def _convert_image_to_rgb(image): + return image.convert("RGB") + + +def _transform_test(img_h, img_w): + return Compose([ToPILImage(), Resize((img_h, img_w)), _convert_image_to_rgb]) + + +def preprocess(img_h, img_w, img): + # Example image preprocessing. + pixel_mean = [123.675, 116.28, 103.53] # Imagenet's mean. + pixel_std = [58.395, 57.12, 57.375] + pixel_mean = torch.Tensor(pixel_mean).view(-1, 1, 1) + pixel_std = torch.Tensor(pixel_std).view(-1, 1, 1) + + raw_h, raw_w = img.shape[0], img.shape[1] + ratio = float(max(img_h, img_w)) / max(raw_h, raw_w) + H, W = int(raw_h * ratio + 0.5), int(raw_w * ratio + 0.5) + image_transform = _transform_test(H, W) + img = image_transform(img) + img = (torch.Tensor(np.array(img)).permute(2, 0, 1) - pixel_mean) / pixel_std + delta_h, delta_w = img_h - H, img_w - W + padded_img = torch.nn.functional.pad(img, (0, delta_w, 0, delta_h)) + + return padded_img + + +def generate_samples(model): + """Text generation using a trained vision language model. This is an example for the COCO dataset.""" + args = get_args() + + image_files = sorted(glob.glob(args.input_path + "/*")) + # Optionally, process only a subset of the input files. + if args.num_partitions > 0: + per_part = len(image_files) // args.num_partitions + image_files = image_files[per_part * args.partition_id : per_part * (args.partition_id + 1)] + + num_samples = len(image_files) + images = [] + + # Run image preprocessing. + for image_file in image_files: + img = np.array(Image.open(image_file)) + img = preprocess(args.img_h, args.img_w, img) + + images.append(img.reshape(-1, 3, args.img_h, args.img_w)) + + # Load optional ground truth. 
+ gt_image_id_to_captions = defaultdict(list) + if args.gt_path: + gts = json.load(open(args.gt_path)) + for gt in gts["annotations"]: + gt_image_id_to_captions[gt["image_id"]].append(gt['caption']) + + idx = 0 + while True: + image = images[idx].cuda() + image_id = int(image_files[idx].split("_")[-1].split(".")[0]) + + forward_step = partial(VLMForwardStep, image) + + if torch.distributed.get_rank() == 0: + prompt = "Give a short and clear explanation of the subsequent image.\n" + + resp_sentences, _, _, _ = generate_and_post_process( + model, + forward_step=forward_step, + prompts=[prompt], + tokens_to_generate=args.out_seq_length, + return_output_log_probs=False, + top_k_sampling=args.top_k, + top_p_sampling=args.top_p, + add_BOS=False, + temperature=args.temperature, + random_seed=123, + ) + + for prompt, generation in zip([prompt], resp_sentences): + output = { + "question_id": image_id, + "prompt": prompt, + "caption": generation[len(prompt) :], + } + + output["ground_truth"] = gt_image_id_to_captions[image_id] + + print_rank_0(output) + + yield output + idx += 1 + if idx >= num_samples: + break + else: + generate_and_post_process(model, forward_step=forward_step) + + idx += 1 + if idx >= num_samples: + break + + +def generate_and_write_samples(model): + args = get_args() + + for output in generate_samples(model): + if torch.distributed.get_rank() == 0: + with open(args.output_path, 'a') as f: + f.write(json.dumps(output) + "\n") + + +class VLMForwardStep(ForwardStep): + def __init__(self, images, model, max_batch_size, max_sequence_length): + super().__init__(model, max_batch_size, max_sequence_length) + self._images = images + + def _forward(self, tokens, position_ids, attention_mask): + return self.model( + self._images, + tokens, + position_ids, + attention_mask, + inference_params=self.inference_params, + ) + + def __call__(self, tokens, position_ids, attention_mask): + logits = super().__call__(tokens, position_ids, attention_mask) + + # On the first inference iteration, we compute image tokens. + # Update the sequence length offset by the number of image tokens. + num_tokens = tokens.size(1) + if num_tokens > 1: + self.inference_params.sequence_len_offset += self.inference_params.key_value_memory_dict[ + "image_tokens_count" + ] + + return logits + + +def main(): + """Vision language model text generation.""" + + logging.getLogger(__name__).warning("Models using pipeline parallelism are not supported yet.") + + initialize_megatron(extra_args_provider=add_text_generation_args) + + # Set up model and load checkpoint. 
+ model = get_model(model_provider, wrap_with_ddp=False) + + args = get_args() + if args.load is not None: + _ = load_checkpoint(model, None, None) + + model = model[0] + model.eval() + + generate_and_write_samples(model) + + +if __name__ == "__main__": + main() From 6b014641212d815cf00018fa8ae017e808ebce0c Mon Sep 17 00:00:00 2001 From: Mikolaj Blaz Date: Mon, 13 May 2024 15:02:13 -0700 Subject: [PATCH 24/92] Decrease fully parallel save/load logging verbosity --- .../strategies/fully_parallel.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/megatron/core/dist_checkpointing/strategies/fully_parallel.py b/megatron/core/dist_checkpointing/strategies/fully_parallel.py index 1fafcf4b86..7ec9b78201 100644 --- a/megatron/core/dist_checkpointing/strategies/fully_parallel.py +++ b/megatron/core/dist_checkpointing/strategies/fully_parallel.py @@ -121,10 +121,10 @@ def apply_saving_parallelization(self, sharded_state_dict: ShardedStateDict) -> Returns: None """ if self.do_cache_distribution and self.cached_distribution is not None: - logger.info(f'Apply *cached* save parallelization') + logger.debug(f'Apply *cached* save parallelization') precomputed_distribution = self.cached_distribution else: - logger.info(f'Apply save parallelization') + logger.debug(f'Apply save parallelization') precomputed_distribution = determine_main_replica_uniform_distribution( sharded_state_dict, self.parallelization_group ) @@ -223,7 +223,7 @@ def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> St precomputed_distribution is not None ), 'Expecting non-trivial distribution for non-trivial parallelization group' end = time() - logger.info(f'self.apply_loading_parallelization took {end - start}s') + logger.debug(f'self.apply_loading_parallelization took {end - start}s') start = end # Step 3: load part of the checkpoint. 
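A practical note on the verbosity change in this hunk: because the per-phase timing messages are demoted from INFO to DEBUG, they no longer appear under a default INFO logging setup. A minimal sketch of how a user could surface them again, assuming only that the strategy module names its logger after its import path (the file shown in this diff):

```
import logging

# Raise only the fully-parallel save/load strategy logger to DEBUG to
# bring back the per-phase timing lines this patch demoted from INFO.
# The active handlers must also be willing to emit DEBUG records.
logging.getLogger(
    "megatron.core.dist_checkpointing.strategies.fully_parallel"
).setLevel(logging.DEBUG)
```
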
@@ -238,18 +238,18 @@ def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> St loaded_state_dict = self.base_strategy.load(sharded_state_dict, checkpoint_dir) end = time() - logger.info(f'Base load of ShardedObjects took {end - start}s') + logger.debug(f'Base load of ShardedObjects took {end - start}s') start = end # Load sharded tensors separately loaded_tensors = self.base_strategy.load(to_load_shards, checkpoint_dir) end = time() - logger.info(f'Base load of ShardedTensors took {end - start}s') + logger.debug(f'Base load of ShardedTensors took {end - start}s') start = end # Step 4: exchange data between ranks - logger.info(f'Applying parallel load with algo {self.exchange_algo}') + logger.debug(f'Applying parallel load with algo {self.exchange_algo}') if self.exchange_algo == 'gather_object': exchange_fn = self.exchange_loaded_tensors_gather_object elif self.exchange_algo == 'gather_rounds': @@ -271,8 +271,8 @@ def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> St sync_start = time() torch.cuda.synchronize() end = time() - logger.info(f'torch.cuda.synchronize took {end - sync_start}s') - logger.info(f'self.exchange_loaded_tensors took {end - start}s') + logger.debug(f'torch.cuda.synchronize took {end - sync_start}s') + logger.debug(f'self.exchange_loaded_tensors took {end - start}s') self.fill_in_deferred_sharded_tensors(sharded_tensors, all_loaded_tensors) merge(loaded_state_dict, sharded_tensors) @@ -344,10 +344,10 @@ def apply_loading_parallelization( SaveLoadDistribution (optional): the computed loading distribution """ if self.do_cache_distribution and self.cached_distribution is not None: - logger.info(f'Apply *cached* load parallelization') + logger.debug(f'Apply *cached* load parallelization') precomputed_distribution = self.cached_distribution else: - logger.info(f'Apply load parallelization') + logger.debug(f'Apply load parallelization') precomputed_distribution = determine_main_replica_uniform_distribution( sharded_state_dict, self.parallelization_group, True ) @@ -493,7 +493,7 @@ def exchange_loaded_tensors_gather_rounds( end = time() if torch.distributed.get_rank() == 0: - logger.info(f'{dtype} exchange rounds all_gather schedule took {end - start}s') + logger.debug(f'{dtype} exchange rounds all_gather schedule took {end - start}s') return all_loaded_tensors @@ -547,7 +547,7 @@ def exchange_loaded_tensors_broadcast( end = time() if torch.distributed.get_rank() == 0: - logger.info(f'exchange broadcast schedule took {end - start}s') + logger.debug(f'exchange broadcast schedule took {end - start}s') return all_loaded_tensors @@ -821,6 +821,6 @@ def distribute_shards_to_ranks( shard_to_saving_rank[shard_id] = rank rank_sizes[rank] = (size + shard_to_size[shard_id], rank) - logger.info(f'distribute_shards_to_ranks distribution: {rank_sizes}') + logger.debug(f'distribute_shards_to_ranks distribution: {rank_sizes}') return shard_to_saving_rank From 4b44f0a1ee43982ef021487b960af0928ee4ea1f Mon Sep 17 00:00:00 2001 From: Deepak Narayanan Date: Mon, 13 May 2024 17:44:45 -0700 Subject: [PATCH 25/92] Workaround for TE bug where it can pick the wrong cuBLAS algorithm --- .../core/distributed/distributed_data_parallel.py | 4 +--- megatron/core/distributed/param_and_grad_buffer.py | 12 ++++++++---- megatron/core/optimizer/__init__.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/megatron/core/distributed/distributed_data_parallel.py b/megatron/core/distributed/distributed_data_parallel.py index 
cdb58594d9..b587c36b57 100644 --- a/megatron/core/distributed/distributed_data_parallel.py +++ b/megatron/core/distributed/distributed_data_parallel.py @@ -61,9 +61,7 @@ def __init__( self.ddp_config = ddp_config if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: - logger.info( - f'Setting up DistributedDataParallel with {type(self.ddp_config).__name__}: {self.ddp_config}' - ) + logger.info(f'Setting up DistributedDataParallel with config {self.ddp_config}') # Turn off bucketing if we are on a pipeline stage that is not the first (since # data-parallel communication on these stages is not on the critical path), or if diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py index 54aeaab2b9..1d037c86e9 100644 --- a/megatron/core/distributed/param_and_grad_buffer.py +++ b/megatron/core/distributed/param_and_grad_buffer.py @@ -228,15 +228,19 @@ def __init__( self.param_to_bucket = {} # Param -> bucket mapping. self.param_index_map = {} # Param -> location in buffer mapping (used in dist. optimizer). + def _pad(number_to_be_padded: int, divisor: int) -> int: + return int(math.ceil(number_to_be_padded / divisor) * divisor) + def _pad_if_needed(data_index: int) -> int: """ Pads data indices if using distributed optimizer (to ensure uniform sharding). """ if self.ddp_config.use_distributed_optimizer: - return ( - int(math.ceil(data_index / self.data_parallel_world_size)) - * self.data_parallel_world_size - ) + # Workaround for TE bug causing cuBLAS to pick an incompatible algorithm. + # This also helps cuBLAS pick more efficient algorithms for GEMMs. + # We now ensure that all buckets start at a memory address that is 256-byte + # aligned (128 values since params and grads use >= 16-bit precision). + return _pad(data_index, math.lcm(self.data_parallel_world_size, 128)) return data_index # First, figure out how many elements should be in the underlying buffer storage. diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py index 3f3f3fe877..95e6c31377 100644 --- a/megatron/core/optimizer/__init__.py +++ b/megatron/core/optimizer/__init__.py @@ -278,7 +278,7 @@ def get_megatron_optimizer( """ if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: - logger.info(f'Setting up optimizer with {type(config).__name__}: {config}') + logger.info(f'Setting up optimizer with config {config}') # Collect param groups. param_groups = _get_param_groups( From 7aa929544fd8ccc2f2d967e6370578e06bf4244c Mon Sep 17 00:00:00 2001 From: Lawrence McAfee Date: Tue, 14 May 2024 11:42:11 -0700 Subject: [PATCH 26/92] some updates. 
--- tools/bert_embedding/embed.py | 2 +- tools/retro/cli/__main__.py | 2 +- tools/retro/cli/cli.py | 4 ++-- tools/retro/preprocess_data.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/bert_embedding/embed.py b/tools/bert_embedding/embed.py index b1f7eb86f2..2236182a75 100644 --- a/tools/bert_embedding/embed.py +++ b/tools/bert_embedding/embed.py @@ -16,7 +16,7 @@ from megatron.core.enums import ModelType from megatron.core.pipeline_parallel import get_forward_backward_func from megatron.legacy.model import BertModel -from megatron.training import setup_model_and_optimizer +from megatron.training.training import setup_model_and_optimizer from pretrain_bert import model_provider, get_batch, loss_func, forward_step from .dataset import BertEmbeddingDataset diff --git a/tools/retro/cli/__main__.py b/tools/retro/cli/__main__.py index 7c196fe69b..37d096a953 100644 --- a/tools/retro/cli/__main__.py +++ b/tools/retro/cli/__main__.py @@ -6,4 +6,4 @@ if __name__ == "__main__": - retro.init(os.environ["RETRO_WORKDIR"]) + retro.init(os.environ["RETRO_PROJECT_DIR"]) diff --git a/tools/retro/cli/cli.py b/tools/retro/cli/cli.py index 18da6c7779..2a75679a37 100644 --- a/tools/retro/cli/cli.py +++ b/tools/retro/cli/cli.py @@ -13,8 +13,8 @@ get_merged_train_dataset as get_db_dataset, ) from megatron.core.datasets.retro.query.retro_dataset import get_retro_datasets, RetroDataset -from megatron.global_vars import set_global_variables -from megatron.training import build_train_valid_test_datasets, update_train_iters +from megatron.training.global_vars import set_global_variables +from megatron.training.training import build_train_valid_test_datasets, update_train_iters from pretrain_retro import train_valid_test_datasets_provider from tools.retro.preprocess_data import get_tokenizers diff --git a/tools/retro/preprocess_data.py b/tools/retro/preprocess_data.py index c2896e24ef..dd36eb0667 100644 --- a/tools/retro/preprocess_data.py +++ b/tools/retro/preprocess_data.py @@ -13,8 +13,6 @@ import sys import torch -from megatron import get_args, initialize_megatron, print_rank_0 -from megatron.arguments import core_transformer_config_from_args from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.utils import get_blend_from_list from megatron.core.datasets.retro.db import build_db @@ -37,6 +35,8 @@ get_config_path, get_gpt_data_dir, ) +from megatron.training import get_args, initialize_megatron, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args from megatron.training.tokenizer.tokenizer import ( _BertWordPieceTokenizer, _GPT2BPETokenizer, From 4e7d6de8e62fc661febd1dae271b2e8c2594278d Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Tue, 14 May 2024 16:25:49 -0700 Subject: [PATCH 27/92] examples/multimodal vision model converter --- examples/multimodal/README.md | 11 ++ examples/multimodal/clip_converter.py | 154 ++++++++++++++++++++++++++ 2 files changed, 165 insertions(+) create mode 100644 examples/multimodal/README.md create mode 100644 examples/multimodal/clip_converter.py diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md new file mode 100644 index 0000000000..cc00bb2925 --- /dev/null +++ b/examples/multimodal/README.md @@ -0,0 +1,11 @@ +# Multimodal Example + +NOTE: This is work in progress. + +## Vision model. + +This example uses the OpenAI CLIP `ViT-L/14@336px` Vision model. 
To download the weights from OpenAI and convert them to a format that can be loaded in megatron, please run the following: + +``` +python examples/multimodal/clip_converter.py --download-root /some/download/folder --output /some/output/folder --tensor-parallel-size 4 +``` \ No newline at end of file diff --git a/examples/multimodal/clip_converter.py b/examples/multimodal/clip_converter.py new file mode 100644 index 0000000000..e6c0fd8cc5 --- /dev/null +++ b/examples/multimodal/clip_converter.py @@ -0,0 +1,154 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import argparse +import os + +import clip +import torch + + +def convert(download_root, output_path, tensor_parallel_size, use_te_layernorm_linear): + device = "cuda" + + model, _ = clip.load("ViT-L/14@336px", device=device, download_root=download_root) + + state_dict = model.state_dict() + new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)] + + # Indices from mapping pytorch multihead attention to megatron. + kv_channels = 64 + hidden_dim = 1024 + num_heads = 16 + indices = [] + for i in range(num_heads): + lb = i * kv_channels + ub = (i + 1) * kv_channels + indices.append(torch.arange(lb, ub, dtype=torch.int)) + indices.append(torch.arange(hidden_dim + lb, hidden_dim + ub, dtype=torch.int)) + indices.append(torch.arange(2 * hidden_dim + lb, 2 * hidden_dim + ub, dtype=torch.int)) + + indices = torch.cat(indices) + + for name, tensor in state_dict.items(): + # Skip text model. + if "visual" not in name: + continue + + # Skip final layers not used in our model. + if name == "visual.proj" or "ln_post" in name: + continue + + # Map parameter names to ones used in megatron. + new_name = "" + new_tensor = tensor + if new_tensor.dtype == torch.float16: + new_tensor = new_tensor.to(torch.float32) + + # This is used for chunking some tensors to target tensor parallel size. + chunk_dim = None + + if "class_embedding" in name: + new_name = "class_token" + # Our model uses class token that is expanded to input dimensions already. 
+ new_tensor = new_tensor.expand(1, 1, -1) + elif "positional_embedding" in name: + new_name = "position_embeddings.weight" + elif "conv1" in name: + new_name = "conv1.weight" + elif "ln_pre.weight" in name: + new_name = "ln_pre.weight" + elif "ln_pre.bias" in name: + new_name = "ln_pre.bias" + elif "transformer.resblocks" in name: + layer_idx = name.split(".")[3] + base = f"decoder.layers.{layer_idx}" + + if "attn.in_proj_weight" in name: + new_name = f"{base}.self_attention.linear_qkv.weight" + new_tensor = new_tensor[indices] + chunk_dim = 0 + elif "attn.in_proj_bias" in name: + new_name = f"{base}.self_attention.linear_qkv.bias" + new_tensor = new_tensor[indices] + chunk_dim = 0 + elif "attn.out_proj.weight" in name: + new_name = f"{base}.self_attention.linear_proj.weight" + chunk_dim = 1 + elif "attn.out_proj.bias" in name: + new_name = f"{base}.self_attention.linear_proj.bias" + elif "ln_1.weight" in name: + new_name = f"{base}.input_layernorm.weight" + if use_te_layernorm_linear: + new_name = f"{base}.self_attention.linear_qkv.layer_norm_weight" + elif "ln_1.bias" in name: + new_name = f"{base}.input_layernorm.bias" + if use_te_layernorm_linear: + new_name = f"{base}.self_attention.linear_qkv.layer_norm_bias" + elif "mlp.c_fc.weight" in name: + new_name = f"{base}.mlp.linear_fc1.weight" + chunk_dim = 0 + elif "mlp.c_fc.bias" in name: + new_name = f"{base}.mlp.linear_fc1.bias" + chunk_dim = 0 + elif "mlp.c_proj.weight" in name: + new_name = f"{base}.mlp.linear_fc2.weight" + chunk_dim = 1 + elif "mlp.c_proj.bias" in name: + new_name = f"{base}.mlp.linear_fc2.bias" + elif "ln_2.weight" in name: + new_name = f"{base}.pre_mlp_layernorm.weight" + if use_te_layernorm_linear: + new_name = f"{base}.mlp.linear_fc1.layer_norm_weight" + elif "ln_2.bias" in name: + new_name = f"{base}.pre_mlp_layernorm.bias" + if use_te_layernorm_linear: + new_name = f"{base}.mlp.linear_fc1.layer_norm_bias" + + assert new_name != "", f"unexpected layer name {name}" + + if chunk_dim is None: + new_tensors = [new_tensor for _ in range(tensor_parallel_size)] + else: + new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim) + + for i in range(tensor_parallel_size): + new_state_dicts[i]["model"][new_name] = new_tensors[i] + + for i in range(tensor_parallel_size): + output_path_tp = os.path.join(output_path, f"state_dict_tp_{i}.pt") + torch.save(new_state_dicts[i], output_path_tp) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=""" +Convert OpenAI CLIP VIT weights to megatron format. 
+ + +Example usage: +python clip_converter.py --download-root /some/download/folder --output /some/output/folder --tensor-parallel-size 4 +""", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + parser.add_argument( + "--download-root", type=str, required=True, help="Download folder for OpenAI CLIP weights", + ) + parser.add_argument( + "--output", type=str, required=True, help="output directory for megatron state dict file(s)" + ) + parser.add_argument( + "--tensor-parallel-size", type=int, default=1, help="model tensor parallel size", + ) + parser.add_argument( + "--use-te-layernorm-linear", + action="store_true", + help="Use Transformer Engine's LayerNormLinear", + ) + + args = parser.parse_args() + + convert( + args.download_root, args.output, args.tensor_parallel_size, args.use_te_layernorm_linear + ) + + print("done.") From 80bc60c23481359ead0f6e4f28945f9004182b2b Mon Sep 17 00:00:00 2001 From: Lawrence McAfee Date: Wed, 15 May 2024 09:15:19 -0700 Subject: [PATCH 28/92] debugged dataset type discrepency. --- .../blended_megatron_dataset_builder.py | 32 +++++++++++++++++-- pretrain_retro.py | 11 +++++++ tools/retro/cli/cli.py | 14 ++++++++ 3 files changed, 54 insertions(+), 3 deletions(-) diff --git a/megatron/core/datasets/blended_megatron_dataset_builder.py b/megatron/core/datasets/blended_megatron_dataset_builder.py index 1fdb749be7..f7af4bda39 100644 --- a/megatron/core/datasets/blended_megatron_dataset_builder.py +++ b/megatron/core/datasets/blended_megatron_dataset_builder.py @@ -124,6 +124,11 @@ def build(self) -> List[Optional[TopLevelDataset]]: """ datasets = self._build_blended_dataset_splits() + # >>> + # from lutil import pax + # pax("datasets") + # <<< + for dataset in datasets: if dataset is not None and len(dataset) > 0: if isinstance(dataset, BlendedDataset): @@ -137,6 +142,11 @@ def build(self) -> List[Optional[TopLevelDataset]]: f"{type(dataset).__name__} blend goes out of bounds for {type([dataset_and_size[0]]).__name__} {i} for {dataset.split.name} split" ) + # >>> + # from lutil import pax + # pax("datasets") + # <<< + return datasets def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: @@ -169,9 +179,15 @@ def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: split = self.config.split_matrix - # Blend consists of a single prefix - if len(prefixes) == 1: - return self._build_megatron_dataset_splits(prefixes[0], split, self.sizes) + # >>> + if 0: + # Blend consists of a single prefix + if len(prefixes) == 1: + # >>> + # raise Exception("hi.") + # <<< + return self._build_megatron_dataset_splits(prefixes[0], split, self.sizes) + # <<< # Build the mid-level datasets if weights is None: @@ -214,6 +230,11 @@ def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: self.config, ) + # >>> + # from lutil import pax + # pax("blended_datasets") + # <<< + return blended_datasets ## @@ -278,6 +299,11 @@ def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: self.config, ) + # >>> + from lutil import pax + pax("blended_datasets") + # <<< + return blended_datasets def _build_megatron_datasets_parallel( diff --git a/pretrain_retro.py b/pretrain_retro.py index e50e3077c1..0aa3475d3d 100644 --- a/pretrain_retro.py +++ b/pretrain_retro.py @@ -205,12 +205,23 @@ def train_valid_test_datasets_provider(train_valid_test_num_samples): data_config, ).build() + # >>> + # from lutil import pax + # pax("train_valid_test_num_samples") + # pax({"datasets": [ train_ds, valid_ds, test_ds ]}) + # 
<<< + gpt_datasets = { "train" : (train_ds, train_valid_test_num_samples[0]), "valid" : (valid_ds, train_valid_test_num_samples[1]), "test" : (test_ds, train_valid_test_num_samples[2]), } + # >>> + from lutil import pax + pax({k:"%s, %d" % (len(d) if d else "--", n) for k, (d, n) in gpt_datasets.items()}) + # <<< + # Retro datasets. if args.retro_add_retriever: return get_retro_datasets( diff --git a/tools/retro/cli/cli.py b/tools/retro/cli/cli.py index 2a75679a37..ea89e4d5fc 100644 --- a/tools/retro/cli/cli.py +++ b/tools/retro/cli/cli.py @@ -60,6 +60,15 @@ def init(cls, project_dir: str) -> None: cls.config.retro_gpt_chunk_length, cls.config.retro_tokenizers.gpt.eod) + # >>> + # from megatron.training.training import build_train_valid_test_data_loaders + # args.iteration = 0 + # train_loader, valid_loader, test_loader = \ + # build_train_valid_test_data_loaders( + # train_valid_test_datasets_provider) + # pax("train_loader, valid_loader, test_loader") + # <<< + # Pretraining datasets. pt_train_ds, pt_valid_ds, pt_test_ds = build_train_valid_test_datasets( train_valid_test_datasets_provider) @@ -69,6 +78,11 @@ def init(cls, project_dir: str) -> None: test=pt_test_ds, ) + # >>> + from lscratch import analyze_retro_dataset + analyze_retro_dataset("0.7", pt_train_ds) + # <<< + # Print usage. cls.print_usage() From 7968fd65326594d649f8a10de10f21188d3e294c Mon Sep 17 00:00:00 2001 From: Zijie Yan Date: Wed, 15 May 2024 10:52:05 -0700 Subject: [PATCH 29/92] Fix the typo in topk_with_capacity. --- megatron/core/transformer/moe/moe_utils.py | 4 +++- megatron/core/transformer/transformer_config.py | 4 ++-- megatron/training/arguments.py | 2 +- .../transformer/moe/test_a2a_token_dispatcher.py | 11 +++++++++-- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index ef6a64661b..9af23f1911 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -310,7 +310,7 @@ def topk_softmax_with_capacity( topk_mask = torch.zeros_like(logits).scatter(1, top_indices, 1) # Maskout exceeded tokens - if drop_policy == "prob": + if drop_policy == "probs": capacity_probs, capacity_indices = torch.topk( topk_masked_gates, k=expert_capacity, dim=0, sorted=False ) @@ -319,6 +319,8 @@ def topk_softmax_with_capacity( _, capacity_indices = torch.topk(topk_mask, k=expert_capacity, dim=0, sorted=False) capacity_mask = torch.zeros_like(logits).scatter(0, capacity_indices, 1) capacity_probs = torch.gather(topk_masked_gates, 0, capacity_indices) + else: + raise ValueError(f"Invalid drop_policy: {drop_policy}") if pad_to_capacity: final_probs, final_indices = ( diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 0235d1e753..250b2fdcd2 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -261,8 +261,8 @@ class TransformerConfig(ModelParallelConfig): moe_pad_expert_input_to_capacity: bool = False """moe_pad_expert_input_to_capacity (bool): If True, pads the input for each expert to match the expert capacity length, effective only after the moe_expert_capacity_factor is set. The default setting is False.""" - moe_token_drop_policy: str = 'position' - """The policy to drop tokens. Can be either "prob" or "position". If "prob", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped. 
+ moe_token_drop_policy: str = 'probs' + """The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped. """ moe_layer_recompute: bool = False """Memory optimization: checkpointing moe_layer to save actiavtion memory.""" diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 1f8a5ce99f..881c60e921 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1652,7 +1652,7 @@ def _add_moe_args(parser): group.add_argument('--moe-pad-expert-input-to-capacity', action='store_true', help='Pads the input for each expert to match the expert capacity length, effective only after the --moe-expert-capacity-factor is set.') group.add_argument('--moe-token-drop-policy', type=str, default='probs', choices=['probs', 'position'], - help='The policy to drop tokens. Can be either "prob" or "position". If "prob", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped.') + help='The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped.') group.add_argument('--moe-layer-recompute', action='store_true', help='Enable checkpointing for moe_layer, should be used when memory is not sufficient.') group.add_argument('--moe-extended-tp', action='store_true', diff --git a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py index af7bad3319..c6cfcac18b 100644 --- a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py +++ b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py @@ -19,7 +19,8 @@ def teardown_method(self, method): @pytest.mark.parametrize("tp_size,ep_size", [ (1, 8), (8, 1), - (4, 2) + (4, 2), + (1, 1), ]) def test_forward_backward(self, tp_size, ep_size): container = MoEModelTestContainer( @@ -37,7 +38,9 @@ def test_forward_backward(self, tp_size, ep_size): @pytest.mark.timeout(120) @pytest.mark.parametrize("tp_size,ep_size", [ (1, 8), - (8, 1) + (8, 1), + (4, 2), + (1, 1), ]) def test_capacity_forward_backward(self, tp_size, ep_size): container = MoEModelTestContainer( @@ -48,6 +51,7 @@ def test_capacity_forward_backward(self, tp_size, ep_size): moe_router_topk=2, moe_router_load_balancing_type="aux_loss", moe_token_dispatcher_type="alltoall", + moe_token_drop_policy="probs", moe_expert_capacity_factor=0.5, moe_pad_expert_input_to_capacity=False, ) @@ -58,6 +62,8 @@ def test_capacity_forward_backward(self, tp_size, ep_size): @pytest.mark.parametrize("tp_size,ep_size", [ (1, 8), (8, 1), + (4, 2), + (1, 1) ]) def test_capacity_padding_forward_backward(self, tp_size, ep_size): import time @@ -70,6 +76,7 @@ def test_capacity_padding_forward_backward(self, tp_size, ep_size): moe_router_topk=2, moe_router_load_balancing_type="aux_loss", moe_token_dispatcher_type="alltoall", + moe_token_drop_policy="probs", moe_expert_capacity_factor=0.5, moe_pad_expert_input_to_capacity=True, ) From f32c51f2176d001a10cb03c46f2590a5b0d14904 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 15 May 2024 11:50:10 -0700 Subject: [PATCH 30/92] Use new NeMo repo/image for NeMo tests --- tests/functional_tests/jet_recipes/build-pyt.yaml | 2 +- .../test_scripts/gpt3/pretrain_gpt3_nemo_test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff 
--git a/tests/functional_tests/jet_recipes/build-pyt.yaml b/tests/functional_tests/jet_recipes/build-pyt.yaml index e5184d7b11..b42a39f178 100644 --- a/tests/functional_tests/jet_recipes/build-pyt.yaml +++ b/tests/functional_tests/jet_recipes/build-pyt.yaml @@ -28,7 +28,7 @@ spec: name: nemo platforms: [linux/amd64] source: - image: nvcr.io/nvidian/bignlp-train:nemofw-nightly + image: nvcr.io/nvidian/nemo:nightly --- type: build diff --git a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh index 74d6a45f54..7367b1d318 100755 --- a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh +++ b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh @@ -21,7 +21,7 @@ MASTER_PORT=6000 NODE_RANK=0 WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) -command="export CUDA_DEVICE_MAX_CONNECTIONS=1; export HF_HOME=/workspace/huggingface/hub;" +command="export CUDA_DEVICE_MAX_CONNECTIONS=1;" set +x # Runs the "126m" parameter model From c839ce396f95346d8534056c3eb70b71600ccdef Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Wed, 15 May 2024 14:44:52 -0700 Subject: [PATCH 31/92] Llava additional config options --- .../core/models/multimodal/llava_model.py | 26 ++++++++++++------- pretrain_vlm.py | 15 ++++++----- tests/unit_tests/models/test_llava_model.py | 7 +++-- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py index 1c6c01c96d..65f45c795b 100644 --- a/megatron/core/models/multimodal/llava_model.py +++ b/megatron/core/models/multimodal/llava_model.py @@ -22,9 +22,8 @@ class LLaVAModel(MegatronModule): Args: language_transformer_config (TransformerConfig): Transformer config for the language model. language_transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers of the language model. - language_position_embedding_type (str): Type of the positional embedding to use in the language model. - vocab_size (int): Vocabulary size. - max_sequence_length (int): maximum sequence length. This is used for positional embedding. + language_vocab_size (int): Language model vocabulary size. + language_max_sequence_length (int): Language model maximum sequence length. This is used for positional embedding. vision_transformer_config (TransformerConfig): Transformer config for the vision model. vision_transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers of the vision model. drop_vision_class_token (bool): Drop vision class token(s) before input to the language model. @@ -32,15 +31,17 @@ class LLaVAModel(MegatronModule): vision_projection_layer_spec (ModuleSpec): Specifies the module to use for the vision projection. vision_projection_type (str): Type of the vision projection to use. Default is a 2-layer MLP. allow_missing_vision_projection_checkpoint (bool): Allow vision projection weights to be missing when loading a checkpoint. Default False. + parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks. This is typically True for training and False for inference. + language_position_embedding_type (str): Position embedding type to use in the language model. Default learned absolute. + language_rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings in the language model. Defaults to 1.0. 
""" def __init__( self, language_transformer_config: TransformerConfig, language_transformer_layer_spec: ModuleSpec, - language_position_embedding_type: str, - vocab_size: int, - max_sequence_length: int, + language_vocab_size: int, + language_max_sequence_length: int, vision_transformer_config: TransformerConfig, vision_transformer_layer_spec: ModuleSpec, drop_vision_class_token: bool, @@ -48,6 +49,9 @@ def __init__( vision_projection_layer_spec: ModuleSpec, vision_projection_type: str = "mlp", allow_missing_vision_projection_checkpoint: bool = False, + parallel_output: bool = True, + language_position_embedding_type: str = 'learned_absolute', + language_rotary_percent: float = 1.0, ) -> None: super().__init__(config=language_transformer_config) @@ -59,11 +63,13 @@ def __init__( raise NotImplementedError("pipeline parallelism is not supported in this model yet.") self.language_model = GPTModel( - language_transformer_config, - language_transformer_layer_spec, - vocab_size, - max_sequence_length, + config=language_transformer_config, + transformer_layer_spec=language_transformer_layer_spec, + vocab_size=language_vocab_size, + max_sequence_length=language_max_sequence_length, + parallel_output=parallel_output, position_embedding_type=language_position_embedding_type, + rotary_percent=language_rotary_percent, ) self.vision_model = CLIPViTModel(vision_transformer_config, vision_transformer_layer_spec) diff --git a/pretrain_vlm.py b/pretrain_vlm.py index 8df6584fbb..2bee06913b 100644 --- a/pretrain_vlm.py +++ b/pretrain_vlm.py @@ -12,15 +12,15 @@ from megatron.core.datasets.multimodal_dataset import MockMultimodalDataset, MultimodalDatasetConfig from megatron.core.enums import ModelType from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec -from megatron.core.models.vision.vit_layer_specs import get_vit_layer_with_transformer_engine_spec from megatron.core.models.multimodal.llava_model import LLaVAModel +from megatron.core.models.vision.vit_layer_specs import get_vit_layer_with_transformer_engine_spec from megatron.core.transformer.spec_utils import import_module from megatron.training import get_args, get_timers, get_tokenizer, pretrain, print_rank_0 from megatron.training.arguments import core_transformer_config_from_args from pretrain_gpt import is_dataset_built_on_rank, loss_func -def model_provider(pre_process=True, post_process=True) -> LLaVAModel: +def model_provider(pre_process=True, post_process=True, parallel_output=True) -> LLaVAModel: """Builds the model. Note: currently, only LLaVA model is supported. Follow-up changes will make this configurable. @@ -28,6 +28,7 @@ def model_provider(pre_process=True, post_process=True) -> LLaVAModel: Args: pre_process (bool): Enable preprocessing in the model. NOTE: Not used at the moment. post_process (bool): Enable postprocessing in the model. NOTE: Not used at the moment. + parallel_output (bool): Enable model parallel output. Returns: model (megatron.core.models.multimodal.llava_model.LLaVAModel): A multimodal model @@ -43,7 +44,7 @@ def model_provider(pre_process=True, post_process=True) -> LLaVAModel: language_transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( args.num_experts, args.moe_grouped_gemm ) - + vision_transformer_layer_spec = get_vit_layer_with_transformer_engine_spec() # TODO: Make these configurable via input .yaml config. 
@@ -56,15 +57,17 @@ def model_provider(pre_process=True, post_process=True) -> LLaVAModel: model = LLaVAModel( language_transformer_config=language_transformer_config, language_transformer_layer_spec=language_transformer_layer_spec, - language_position_embedding_type=args.position_embedding_type, - vocab_size=args.padded_vocab_size, - max_sequence_length=args.max_position_embeddings, + language_vocab_size=args.padded_vocab_size, + language_max_sequence_length=args.max_position_embeddings, vision_transformer_config=vision_transformer_config, vision_transformer_layer_spec=vision_transformer_layer_spec, drop_vision_class_token=args.drop_vision_class_token, vision_projection_config=vision_projection_config, vision_projection_layer_spec=vision_projection_modules, vision_projection_type=vision_projection_type, + parallel_output=parallel_output, + language_position_embedding_type=args.position_embedding_type, + language_rotary_percent=args.rotary_percent, ) return model diff --git a/tests/unit_tests/models/test_llava_model.py b/tests/unit_tests/models/test_llava_model.py index 9635f2e3b2..6a9ab594af 100644 --- a/tests/unit_tests/models/test_llava_model.py +++ b/tests/unit_tests/models/test_llava_model.py @@ -38,9 +38,8 @@ def setup_method(self, method): self.model = LLaVAModel( language_transformer_config=language_config, language_transformer_layer_spec=language_layer_spec, - language_position_embedding_type="rope", - vocab_size=2048, - max_sequence_length=1024, + language_vocab_size=2048, + language_max_sequence_length=1024, vision_transformer_config=vision_config, vision_transformer_layer_spec=vision_layer_spec, drop_vision_class_token=False, @@ -55,7 +54,7 @@ def test_constructor(self): assert isinstance(self.model, LLaVAModel) num_weights = sum([p.numel() for p in self.model.parameters()]) - assert num_weights == 1308232 + assert num_weights == 1439304 def test_set_input_tensor(self): expected_shape = (1, 2, 3, 4) From 4b99f57c2bb480c8f34f95af824e1597206c851f Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Wed, 15 May 2024 14:53:08 -0700 Subject: [PATCH 32/92] Multimodal example - initial training scripts --- examples/multimodal/README.md | 24 ++- examples/multimodal/config.py | 92 +++++++++ examples/multimodal/layer_specs.py | 98 ++++++++++ examples/multimodal/pretrain_8b.sh | 124 ++++++++++++ examples/multimodal/sft_8b.sh | 118 ++++++++++++ examples/multimodal/train.py | 296 +++++++++++++++++++++++++++++ 6 files changed, 749 insertions(+), 3 deletions(-) create mode 100644 examples/multimodal/config.py create mode 100644 examples/multimodal/layer_specs.py create mode 100755 examples/multimodal/pretrain_8b.sh create mode 100755 examples/multimodal/sft_8b.sh create mode 100644 examples/multimodal/train.py diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md index cc00bb2925..ce483e1998 100644 --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -1,11 +1,29 @@ # Multimodal Example -NOTE: This is work in progress. +NOTE: This is work in progress and not fully functional yet. -## Vision model. +## Setup + +### Vision model This example uses the OpenAI CLIP `ViT-L/14@336px` Vision model. 
To download the weights from OpenAI and convert them to a format that can be loaded in megatron, please run the following: ``` python examples/multimodal/clip_converter.py --download-root /some/download/folder --output /some/output/folder --tensor-parallel-size 4 -``` \ No newline at end of file +``` + +## Training + +### Pretraining + +Run the following script: +``` +examples/multimodal/pretrain_8b.sh +``` + +### SFT + +Run the following script: +``` +examples/multimodal/sft_8b.sh +``` diff --git a/examples/multimodal/config.py b/examples/multimodal/config.py new file mode 100644 index 0000000000..5d5830bf7a --- /dev/null +++ b/examples/multimodal/config.py @@ -0,0 +1,92 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import torch + +from megatron.training.activations import quick_gelu, squared_relu + + +def get_language_model_config(config): + if config.language_model_type == "2b": + config.add_bias_linear = False + config.bias_activation_fusion = False + config.gated_linear_unit = True + config.apply_query_key_layer_scaling = True + config.layernorm_zero_centered_gamma = True + config.bias_dropout_fusion = False + config.rotary_percent = 0.5 + config.apply_rope_fusion = False + config.attention_softmax_in_fp32 = True + elif config.language_model_type == "8b": + config.add_bias_linear = False + config.bias_activation_fusion = False + config.gated_linear_unit = False + config.apply_query_key_layer_scaling = True + config.layernorm_zero_centered_gamma = True + config.bias_dropout_fusion = False + config.rotary_percent = 0.5 + config.attention_dropout = 0.0 + config.apply_rope_fusion = False + config.activation_func = squared_relu + config.ffn_hidden_size = 16384 + config.masked_softmax_fusion = True + config.attention_softmax_in_fp32 = True + config.num_query_groups = 32 + config.kv_channels = 128 + config.rotary_interleaved = False + elif config.my_model_type == "llama3_8b": + config.activation_func = torch.nn.functional.silu + config.add_bias_linear = False + config.bias_activation_fusion = False + config.gated_linear_unit = True + config.apply_query_key_layer_scaling = True + config.layernorm_zero_centered_gamma = ( + False # Zero centered gamma not supported for RMSNorm + ) + config.bias_dropout_fusion = False + config.te_attn_mask_type = None + config.rotary_percent = 0.5 + config.apply_rope_fusion = False + config.attention_softmax_in_fp32 = True + config.ffn_hidden_size = 14336 + + return config + + +def get_vision_model_config(config, apply_query_key_layer_scaling=False): + config.num_layers = 24 + config.num_attention_heads = 16 + config.add_bias_linear = True + config.add_qkv_bias = True + config.hidden_size = 1024 + config.hidden_dropout = 0.0 + config.attention_dropout = 0.0 + config.ffn_hidden_size = 4096 + config.gated_linear_unit = False + config.activation_func = quick_gelu + config.kv_channels = 64 + config.num_attention_heads = 16 + config.num_query_groups = 16 + config.layernorm_zero_centered_gamma = False + config.apply_query_key_layer_scaling = apply_query_key_layer_scaling + config.bias_activation_fusion = False + config.bias_dropout_fusion = False + config.attention_softmax_in_fp32 = True + + return config + + +def get_vision_projection_config(config, hidden_size): + config.gated_linear_unit = False + config.bias_activation_fusion = False + config.add_bias_linear = False + config.hidden_size = hidden_size + if config.language_model_type == "2b": + config.ffn_hidden_size = 5440 + config.activation_func = torch.nn.functional.gelu + if 
config.language_model_type == "8b": + config.ffn_hidden_size = 16384 + config.activation_func = squared_relu + elif config.language_model_type == "llama3_8b": + config.ffn_hidden_size = 14336 + config.activation_func = torch.nn.functional.silu + + return config diff --git a/examples/multimodal/layer_specs.py b/examples/multimodal/layer_specs.py new file mode 100644 index 0000000000..c80b84ec0e --- /dev/null +++ b/examples/multimodal/layer_specs.py @@ -0,0 +1,98 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import torch + +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.fusions.fused_layer_norm import FusedLayerNorm +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEDotProductAttention, + TEColumnParallelLinear, + TELayerNormColumnParallelLinear, + TEColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules +from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + +class TorchLayerNormWrapper(torch.nn.LayerNorm): + def __init__(self, config, hidden_size, eps): + super().__init__(hidden_size, eps) + + +def get_layer_spec(is_vit=False) -> ModuleSpec: + mlp = get_mlp_module_spec(use_te=False) + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=FusedLayerNorm if not is_vit else TorchLayerNormWrapper, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=FusedLayerNorm if not is_vit else TorchLayerNormWrapper, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + + +def get_layer_spec_te(is_vit=False) -> ModuleSpec: + attn_mask_type = AttnMaskType.no_mask if is_vit else AttnMaskType.causal + + mlp = get_mlp_module_spec_te() + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": attn_mask_type}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + +def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec: + # Dense MLP w/ or w/o TE modules. 
+ return ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, + ), + ) + + +def get_mlp_module_spec_te() -> ModuleSpec: + return ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ) \ No newline at end of file diff --git a/examples/multimodal/pretrain_8b.sh b/examples/multimodal/pretrain_8b.sh new file mode 100755 index 0000000000..efa638360e --- /dev/null +++ b/examples/multimodal/pretrain_8b.sh @@ -0,0 +1,124 @@ +#!/bin/bash + +# Pretrain a multimodal model. + +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +DATETIME=`date +'%y-%m-%d-%H-%M-%S'` +MODEL_NAME="mcore-llava-8b-${DATETIME}" + +# Check that the user has set an output path for model checkpoints. +if [[ -z $WORKSPACE ]]; then + echo "Please set WORKSPACE for storing your model checkpoints." + exit 1 +fi + +SOURCE=`pwd` +OUTPUT_BASE="${WORKSPACE}/output" +OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}" + +FINETUNE_DIR=${OUTPUT}/checkpoints +LOGS_DIR="${OUTPUT}/logs" +TENSORBOARD_DIR="${OUTPUT}/tensorboard" + +if [[ -z $LOAD_NAME ]]; then + echo "Please set LOAD_NAME for input model name." + exit 1 +fi + +if [[ -z $TOKENIZER_MODEL ]]; then + echo "Please set TOKENIZER_MODEL for tokenizer model name." + exit 1 +fi + +CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}" + +DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml" +DATA_VALID="${SOURCE}/examples/multimodal/pretrain_dataset.yaml" + +DEBUG=1 +if [[ $DEBUG -eq 1 ]]; then + BZ=8 + NW=1 + HD=0.0 + LI=1 + EXTRA_ARGS="" + NONDETERMINISTIC_ATTN=0 +else + BZ=256 + NW=2 + HD=0.1 + LI=1 + EXTRA_ARGS="" + NONDETERMINISTIC_ATTN=1 +fi + +OPTIONS=" \ + --num-workers ${NW} \ + --exit-duration-in-mins 230 \ + --use-flash-attn \ + --apply-layernorm-1p \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --rotary-percent 0.5 \ + --squared-relu \ + --attention-dropout 0.0 \ + --hidden-dropout ${HD} \ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 1 \ + --num-layers 32 \ + --hidden-size 4096 \ + --num-attention-heads 32 \ + --seq-length 1024 \ + --max-position-embeddings 4096 \ + --train-samples 410000 \ + --micro-batch-size 1 \ + --global-batch-size ${BZ} \ + --lr-decay-samples 25600000 \ + --lr-warmup-samples 83200 \ + --lr 1e-5 \ + --min-lr 2.5e-6 \ + --lr-decay-style cosine \ + --log-interval ${LI} \ + --eval-iters 10 \ + --eval-interval 1000 \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ + --data-path ${DATA_TRAIN} \ + --valid-path ${DATA_VALID} \ + --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ + --dataset-config ${SOURCE}/examples/multimodal/dataset_config.yaml \ + --save-interval 1000 \ + --save ${FINETUNE_DIR} \ + --load ${CHECKPOINT_DIR} \ + --split 100,0,0 \ + --clip-grad 0.5 \ + --weight-decay 0.1 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --init-method-std 0.014 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --bf16 \ + --eod-mask-loss \ + --finetune \ + --freeze-LM \ + --freeze-ViT \ + --patch-dim 14 \ + --img-h 336 \ + --img-w 336 \ + --dataloader-type external \ + --tensorboard-dir ${TENSORBOARD_DIR} \ + --language-model-type=8b \ + --disable-vision-class-token \ + ${EXTRA_ARGS} \ + --distributed-timeout-minutes 60 \ + --allow-missing-vision-projection-checkpoint \ +" + +export 
NVTE_APPLY_QK_LAYER_SCALING=1 +export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${NONDETERMINISTIC_ATTN} + +# MULTI GPU +torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS} diff --git a/examples/multimodal/sft_8b.sh b/examples/multimodal/sft_8b.sh new file mode 100755 index 0000000000..a88c51870e --- /dev/null +++ b/examples/multimodal/sft_8b.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# Run SFT on a multimodal model. + +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +DATETIME=`date +'%y-%m-%d-%H-%M-%S'` +MODEL_NAME="mcore-llava-sft-${DATETIME}" + +# Check that the user has set an output path for model checkpoints. +if [[ -z $WORKSPACE ]]; then + echo "Please set WORKSPACE for storing your model checkpoints." + exit 1 +fi + +SOURCE=`pwd` +OUTPUT_BASE="${WORKSPACE}/output" +OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}" + +FINETUNE_DIR=${OUTPUT}/checkpoints +LOGS_DIR="${OUTPUT}/logs" +TENSORBOARD_DIR="${OUTPUT}/tensorboard" + +if [[ -z $LOAD_NAME ]]; then + echo "Please set LOAD_NAME for input model name." + exit 1 +fi + +if [[ -z $TOKENIZER_MODEL ]]; then + echo "Please set TOKENIZER_MODEL for tokenizer model name." + exit 1 +fi + +CHECKPOINT_DIR="${WORKSPACE}/output/${LOAD_NAME}/checkpoints" + +DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml" +DATA_VALID="${SOURCE}/examples/multimodal/sft_dataset.yaml" + +DEBUG=0 +if [[ $DEBUG -eq 1 ]]; then + BZ=8 + NW=1 + HD=0.0 + EXTRA_ARGS="" +else + BZ=128 + NW=1 + HD=0.1 + EXTRA_ARGS="" +fi + +OPTIONS=" \ + --num-workers ${NW} \ + --use-flash-attn \ + --apply-layernorm-1p \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --rotary-percent 0.5 \ + --squared-relu \ + --attention-dropout 0.0 \ + --hidden-dropout ${HD} \ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 1 \ + --num-layers 32 \ + --hidden-size 4096 \ + --num-attention-heads 32 \ + --seq-length 1024 \ + --max-position-embeddings 4096 \ + --train-samples 665000 \ + --micro-batch-size 1 \ + --global-batch-size ${BZ} \ + --lr-decay-samples 25600000 \ + --lr-warmup-samples 83200 \ + --lr 1e-6 \ + --min-lr 1e-7 \ + --lr-decay-style cosine \ + --log-interval 10 \ + --eval-iters 10 \ + --eval-interval 1000 \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ + --data-path ${DATA_TRAIN} \ + --valid-path ${DATA_VALID} \ + --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ + --dset-config ${SOURCE}/examples/multimodal/dataset_config.yaml \ + --save-interval 1000 \ + --exit-duration-in-mins 230 \ + --save ${FINETUNE_DIR} \ + --load ${CHECKPOINT_DIR} \ + --split 100,0,0 \ + --clip-grad 0.5 \ + --weight-decay 0.1 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --init-method-std 0.014 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --bf16 \ + --eod-mask-loss \ + --finetune \ + --freeze-ViT \ + --patch-dim 14 \ + --img-h 336 \ + --img-w 336 \ + --dataloader-type external \ + --tensorboard-dir ${TENSORBOARD_DIR} \ + --language-model-type=8b \ + --disable-vision-class-token \ + ${EXTRA_ARGS} \ + --distributed-timeout-minutes 60 \ +" + +export NVTE_APPLY_QK_LAYER_SCALING=1 + +# MULTI GPU +torchrun --nproc_per_node 8 pretrain_multimodal.py ${OPTIONS} diff --git a/examples/multimodal/train.py b/examples/multimodal/train.py new file mode 100644 index 0000000000..836185aacb --- /dev/null +++ b/examples/multimodal/train.py @@ -0,0 +1,296 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+"""Pretrain or SFT multimodal.""" +from copy import deepcopy +from functools import partial +import os +import sys + +import torch + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir, os.path.pardir))) + +from megatron.training import get_args, get_timers, get_tokenizer, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core import mpu, tensor_parallel +from megatron.core.enums import ModelType +from config import get_language_model_config, get_vision_model_config, get_vision_projection_config +from megatron.core.models.multimodal.llava_model import LLaVAModel +from layer_specs import get_layer_spec, get_mlp_module_spec, get_layer_spec_te +from megatron.training import pretrain +from megatron.training.utils import average_losses_across_data_parallel_group + + +def model_provider(pre_process=True, post_process=True, parallel_output=True) -> LLaVAModel: + """Builds the model. + + Args: + pre_process (bool): Enable preprocessing in the model. NOTE: Not used at the moment. + post_process (bool): Enable postprocessing in the model. NOTE: Not used at the moment. + parallel_output (bool): Enable parallel model output. + + Returns: + model: A multimodal model. + """ + args = get_args() + + use_te = args.use_te + + print_rank_0('building a multimodal model ...') + + base_config = core_transformer_config_from_args(get_args()) + base_config.language_model_type = args.language_model_type + + language_config = deepcopy(base_config) + language_config = get_language_model_config(language_config) + + if use_te: + language_transformer_layer_spec = get_layer_spec_te(is_vit=False) + else: + language_transformer_layer_spec = get_layer_spec(is_vit=False) + + vision_config = deepcopy(base_config) + vision_config = get_vision_model_config(vision_config, apply_query_key_layer_scaling=use_te) + + if use_te: + vision_transformer_layer_spec = get_layer_spec_te(is_vit=True) + else: + vision_transformer_layer_spec = get_layer_spec(is_vit=True) + + vision_projection_config = deepcopy(base_config) + vision_projection_config = get_vision_projection_config(vision_projection_config, language_config.hidden_size) + vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te) + + model = LLaVAModel( + language_transformer_config=language_config, + language_transformer_layer_spec=language_transformer_layer_spec, + language_vocab_size=args.padded_vocab_size, + language_max_sequence_length=args.max_position_embeddings, + vision_transformer_config=vision_config, + vision_transformer_layer_spec=vision_transformer_layer_spec, + drop_vision_class_token=args.disable_vision_class_token, + vision_projection_config=vision_projection_config, + vision_projection_layer_spec=vision_projection_layer_spec, + vision_projection_type="mlp", + allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint, + parallel_output=parallel_output, + language_position_embedding_type=args.position_embedding_type, + language_rotary_percent=args.rotary_percent, + ) + + model.freeze(freeze_language_model=args.freeze_LM, freeze_vision_model=args.freeze_ViT, freeze_vision_projection=False) + + return model + + +def get_batch(data_iterator): + """Generate a batch""" + + args = get_args() + + tokens = None + labels = None + loss_mask = None + attention_mask = None + position_ids = None + + # Broadcast data. 
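A note on the broadcast step that follows: only one rank in each tensor-parallel group actually draws a sample from `data_iterator`; the other ranks pass `None` and receive the tensors through `tensor_parallel.broadcast_data`, so every rank in the group ends up with an identical batch. A minimal, hedged sketch of that idiom (the explicit rank check is an assumption about how the caller wires up the iterator, not code from this file):

```
# Hedged sketch of the broadcast pattern used in get_batch below.
import torch
from megatron.core import mpu, tensor_parallel

def read_and_broadcast_text(data_iterator):
    # Only the first rank of the tensor-parallel group touches the iterator.
    if mpu.get_tensor_model_parallel_rank() == 0:
        data = next(data_iterator)          # e.g. {"text": LongTensor[b, s + 1], ...}
    else:
        data = None
    # broadcast_data ships the requested keys from that rank to the rest of the group.
    return tensor_parallel.broadcast_data(["text"], data, torch.int64)["text"]
```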
+ torch.cuda.nvtx.range_push("get_data") + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + + data_text = tensor_parallel.broadcast_data(["text"], data, torch.int64)["text"] + data_img = tensor_parallel.broadcast_data(["img"], data, torch.float32) + prompt_len = tensor_parallel.broadcast_data(["prompt_len"], data, torch.int64)["prompt_len"] + + torch.cuda.nvtx.range_pop() + + tokens_ = data_text.long() + + img_raw = data_img['img'].reshape(-1, 3, args.img_h, args.img_w) + + torch.cuda.nvtx.range_push("index tokens") + tokenizer = get_tokenizer() + tokens = tokens_[:, :args.seq_length].contiguous() + labels = tokens_[:, 1:args.seq_length+1].contiguous() + + torch.cuda.nvtx.range_pop() + + torch.cuda.nvtx.range_push("get_ltor_masks_and_position_ids") + attention_mask, loss_mask, position_ids = \ + get_ltor_masks_and_position_ids(tokens, tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + question_length=prompt_len) + torch.cuda.nvtx.range_pop() + + loss_mask, labels, attention_mask = _preprocess_data_for_llava(loss_mask, labels, attention_mask) + + tokens = tokens[:, 1:] # drop image index token + + return tokens, labels, loss_mask, attention_mask, position_ids, img_raw + + +def _preprocess_data_for_llava(loss_mask, labels, attention_mask): + """Preprocess data sample to the format expected by a LLaVA model.""" + args = get_args() + + add_class_token = not args.disable_vision_class_token + + num_patches_per_dim_h = args.img_h // args.patch_dim + num_patches_per_dim_w = args.img_w // args.patch_dim + num_patches = num_patches_per_dim_h * num_patches_per_dim_w + num_image_tokens = num_patches + (1 if add_class_token else 0) + batch_size = loss_mask.shape[0] + + loss_mask2 = torch.cat( + [torch.zeros(batch_size, num_image_tokens - 1, dtype=torch.float32, device=loss_mask.device), loss_mask], dim=1 + ) + labels2 = torch.cat([torch.zeros(batch_size, num_image_tokens - 1, dtype=torch.int64, device=labels.device), labels], dim=1) + + full_seq_length = len(labels2[0]) + attention_mask2 = torch.tril(torch.ones((1, 1, full_seq_length, full_seq_length), device=attention_mask.device)) + attention_mask2 = attention_mask2 < 0.5 + + return loss_mask2, labels2, attention_mask2 + + +def get_ltor_masks_and_position_ids(data, + eod_token, + reset_position_ids, + reset_attention_mask, + eod_mask_loss, + question_length=None, + weights=None): + """Build masks and position id for left to right model.""" + + # Extract batch size and sequence length. + micro_batch_size, seq_length = data.size() + + # Attention mask (lower triangular). + if reset_attention_mask: + att_mask_batch = micro_batch_size + else: + att_mask_batch = 1 + attention_mask = torch.tril(torch.ones( + (att_mask_batch, seq_length, seq_length), device=data.device)).view( + att_mask_batch, 1, seq_length, seq_length) + + # Loss mask. + loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) + if eod_mask_loss: + loss_mask[data == eod_token] = 0.0 + + # Position ids. + position_ids = torch.arange(seq_length, dtype=torch.long, + device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data) + # We need to clone as the ids will be modifed based on batch index. 
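Before the position-id and attention-mask resets below, it may help to make the image-token bookkeeping from `_preprocess_data_for_llava` above concrete. The numbers here are illustrative only, derived from the 336x336 image size and patch size 14 used in the example scripts:

```
# Illustrative arithmetic only; not code from this file.
img_h, img_w, patch_dim, seq_length = 336, 336, 14, 1024

num_patches = (img_h // patch_dim) * (img_w // patch_dim)        # 24 * 24 = 576

# With the vision class token kept:
num_image_tokens = num_patches + 1                               # 577
# With --disable-vision-class-token (as in the example scripts): 576 instead.

# num_image_tokens - 1 placeholder positions are prepended to labels and loss_mask;
# the remaining position lines up with the image-index token dropped from `tokens`
# in get_batch. The language model therefore sees a combined sequence of:
full_seq_length = (num_image_tokens - 1) + seq_length            # 576 + 1024 = 1600
```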
+ if reset_position_ids: + position_ids = position_ids.clone() + + + if question_length is not None: + for b in range(micro_batch_size): + loss_mask[b, :max(0, question_length[b].item() - 1)] = 0.0 + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(micro_batch_size): + + # Find indecies where EOD token is. + eod_index = position_ids[b, data[b] == eod_token] + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.size()[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1):] -= (i + 1 - prev_index) + prev_index = i + 1 + + # Convert attention mask to binary: + attention_mask = (attention_mask < 0.5) + if weights is not None: + loss_mask = loss_mask * weights + + return attention_mask, loss_mask, position_ids + + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + if loss_mask is not None: + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / max( 1,loss_mask.sum() ) + else: + loss = torch.mean(losses) + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + + +def forward_step(data_iterator, model: LLaVAModel): + """Forward training step. + + Args: + data_iterator (torch.utils.data.dataloader): Input data iterator + model: Multimodal model + + Returns: + output_tensor (torch.Tensor): Loss of shape [b, s] if labels are provided, otherwise logits of shape [b, s, vocab_size]. + loss_func (callable): Loss function with a loss mask specified. + """ + timers = get_timers() + + # Get the batch. + timers('batch-generator', log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids, images = get_batch(data_iterator) + timers('batch-generator').stop() + + output_tensor = model(images, tokens, position_ids, attention_mask, labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + +def add_multimodal_extra_args(parser): + """Extra arguments.""" + group = parser.add_argument_group(title='multimodal arguments') + group.add_argument('--valid-path', nargs='*', default=None, + help='Path to the training dataset. 
Accepted format:' + '1) a single data path, 2) multiple datasets in the' + 'form: dataset1-weight dataset1-path dataset2-weight ' + 'dataset2-path ...') + group.add_argument('--dataset-config', type=str, default=None) + group.add_argument("--prompt-path", type=str, default=None) + group.add_argument('--freeze-LM', action='store_true', default=False) + group.add_argument('--freeze-ViT', action='store_true', default=False) + group.add_argument('--language-model-type', type=str, required=True) + group.add_argument("--disable-vision-class-token", action="store_true", default=False) + group.add_argument("--allow-missing-vision-projection-checkpoint", action="store_true", default=False) + group.add_argument("--use-te", action="store_true", default=False) + return parser + + +if __name__ == "__main__": + train_valid_test_datasets_provider.is_distributed = True + + pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + extra_args_provider=add_multimodal_extra_args, + ) From a26df8660965bc0b42e13c93e016d2291bb6e1cd Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 15 May 2024 21:56:11 -0700 Subject: [PATCH 33/92] Container for yq --- jet-tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/jet-tests.yml b/jet-tests.yml index 96518be5e5..c343d7c7bf 100644 --- a/jet-tests.yml +++ b/jet-tests.yml @@ -30,12 +30,11 @@ jet-setup: dotenv: config.env jet-configure: - image: alpine + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ci_yq:v1 extends: [.jet_common, .jet-configure] tags: - os/linux script: - - wget https://github.com/mikefarah/yq/releases/download/v4.35.2/yq_linux_amd64.tar.gz -O - | tar xz && mv yq_linux_amd64 /usr/local/bin/yq - cd tests/functional_tests/jet_recipes - | if [[ $CI_PIPELINE_SOURCE == "merge_request_event" ]]; then From 529c5c92f710346a45f32e5a4c7167424cc39d26 Mon Sep 17 00:00:00 2001 From: Lawrence McAfee Date: Thu, 16 May 2024 13:16:52 -0700 Subject: [PATCH 34/92] checking if weights is none. 
--- .../blended_megatron_dataset_builder.py | 22 +++++++++++++------ pretrain_retro.py | 4 ++-- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/megatron/core/datasets/blended_megatron_dataset_builder.py b/megatron/core/datasets/blended_megatron_dataset_builder.py index f7af4bda39..2c067df1fb 100644 --- a/megatron/core/datasets/blended_megatron_dataset_builder.py +++ b/megatron/core/datasets/blended_megatron_dataset_builder.py @@ -180,13 +180,21 @@ def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: split = self.config.split_matrix # >>> - if 0: - # Blend consists of a single prefix - if len(prefixes) == 1: - # >>> - # raise Exception("hi.") - # <<< - return self._build_megatron_dataset_splits(prefixes[0], split, self.sizes) + # if 0: + # Blend consists of a single prefix + # >>> + # if len(prefixes) == 1: + if len(prefixes) == 1 and weights is None: + # <<< + # >>> + raise Exception("hi.") + # <<< + return self._build_megatron_dataset_splits(prefixes[0], split, self.sizes) + # <<< + + # >>> + from lutil import pax + pax("prefixes, weights") # <<< # Build the mid-level datasets diff --git a/pretrain_retro.py b/pretrain_retro.py index 0aa3475d3d..148396d3dc 100644 --- a/pretrain_retro.py +++ b/pretrain_retro.py @@ -218,8 +218,8 @@ def train_valid_test_datasets_provider(train_valid_test_num_samples): } # >>> - from lutil import pax - pax({k:"%s, %d" % (len(d) if d else "--", n) for k, (d, n) in gpt_datasets.items()}) + # from lutil import pax + # pax({k:"%s, %d" % (len(d) if d else "--", n) for k, (d, n) in gpt_datasets.items()}) # <<< # Retro datasets. From ee1d34a0da0727e805335992b7396920f82f3ee1 Mon Sep 17 00:00:00 2001 From: Lawrence McAfee Date: Thu, 16 May 2024 13:23:12 -0700 Subject: [PATCH 35/92] clean up. 
--- .../blended_megatron_dataset_builder.py | 34 ------------------- pretrain_retro.py | 11 ------ tools/retro/cli/cli.py | 14 -------- 3 files changed, 59 deletions(-) diff --git a/megatron/core/datasets/blended_megatron_dataset_builder.py b/megatron/core/datasets/blended_megatron_dataset_builder.py index 2c067df1fb..7a6187c7c1 100644 --- a/megatron/core/datasets/blended_megatron_dataset_builder.py +++ b/megatron/core/datasets/blended_megatron_dataset_builder.py @@ -124,11 +124,6 @@ def build(self) -> List[Optional[TopLevelDataset]]: """ datasets = self._build_blended_dataset_splits() - # >>> - # from lutil import pax - # pax("datasets") - # <<< - for dataset in datasets: if dataset is not None and len(dataset) > 0: if isinstance(dataset, BlendedDataset): @@ -142,11 +137,6 @@ def build(self) -> List[Optional[TopLevelDataset]]: f"{type(dataset).__name__} blend goes out of bounds for {type([dataset_and_size[0]]).__name__} {i} for {dataset.split.name} split" ) - # >>> - # from lutil import pax - # pax("datasets") - # <<< - return datasets def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: @@ -179,23 +169,9 @@ def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: split = self.config.split_matrix - # >>> - # if 0: # Blend consists of a single prefix - # >>> - # if len(prefixes) == 1: if len(prefixes) == 1 and weights is None: - # <<< - # >>> - raise Exception("hi.") - # <<< return self._build_megatron_dataset_splits(prefixes[0], split, self.sizes) - # <<< - - # >>> - from lutil import pax - pax("prefixes, weights") - # <<< # Build the mid-level datasets if weights is None: @@ -238,11 +214,6 @@ def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: self.config, ) - # >>> - # from lutil import pax - # pax("blended_datasets") - # <<< - return blended_datasets ## @@ -307,11 +278,6 @@ def _build_blended_dataset_splits(self,) -> List[Optional[TopLevelDataset]]: self.config, ) - # >>> - from lutil import pax - pax("blended_datasets") - # <<< - return blended_datasets def _build_megatron_datasets_parallel( diff --git a/pretrain_retro.py b/pretrain_retro.py index 148396d3dc..e50e3077c1 100644 --- a/pretrain_retro.py +++ b/pretrain_retro.py @@ -205,23 +205,12 @@ def train_valid_test_datasets_provider(train_valid_test_num_samples): data_config, ).build() - # >>> - # from lutil import pax - # pax("train_valid_test_num_samples") - # pax({"datasets": [ train_ds, valid_ds, test_ds ]}) - # <<< - gpt_datasets = { "train" : (train_ds, train_valid_test_num_samples[0]), "valid" : (valid_ds, train_valid_test_num_samples[1]), "test" : (test_ds, train_valid_test_num_samples[2]), } - # >>> - # from lutil import pax - # pax({k:"%s, %d" % (len(d) if d else "--", n) for k, (d, n) in gpt_datasets.items()}) - # <<< - # Retro datasets. if args.retro_add_retriever: return get_retro_datasets( diff --git a/tools/retro/cli/cli.py b/tools/retro/cli/cli.py index ea89e4d5fc..2a75679a37 100644 --- a/tools/retro/cli/cli.py +++ b/tools/retro/cli/cli.py @@ -60,15 +60,6 @@ def init(cls, project_dir: str) -> None: cls.config.retro_gpt_chunk_length, cls.config.retro_tokenizers.gpt.eod) - # >>> - # from megatron.training.training import build_train_valid_test_data_loaders - # args.iteration = 0 - # train_loader, valid_loader, test_loader = \ - # build_train_valid_test_data_loaders( - # train_valid_test_datasets_provider) - # pax("train_loader, valid_loader, test_loader") - # <<< - # Pretraining datasets. 
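The substantive change carried through this cleanup is the `weights is None` guard on the single-prefix shortcut in `_build_blended_dataset_splits` above: a blend with one prefix but an explicit weight must still go through the blended path. A hedged sketch of the blend shapes the builder now distinguishes (the prefix names are illustrative, and the `(prefixes, weights)` pairing is assumed from the surrounding code):

```
# Illustrative blend configurations as (prefixes, weights) pairs.
shortcut_case = (["corpus-a_text_document"], None)
# -> single prefix, no weights: plain MegatronDataset splits, no BlendedDataset wrapper.

weighted_blend = (["corpus-a_text_document", "corpus-b_text_document"], [0.3, 0.7])
# -> multiple prefixes: per-prefix datasets wrapped in a BlendedDataset.

single_prefix_with_weight = (["corpus-a_text_document"], [1.0])
# -> after this change, no longer takes the shortcut, because weights is not None.
```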
pt_train_ds, pt_valid_ds, pt_test_ds = build_train_valid_test_datasets( train_valid_test_datasets_provider) @@ -78,11 +69,6 @@ def init(cls, project_dir: str) -> None: test=pt_test_ds, ) - # >>> - from lscratch import analyze_retro_dataset - analyze_retro_dataset("0.7", pt_train_ds) - # <<< - # Print usage. cls.print_usage() From ae8317036994ee877d7be832720f0143e57f1b8e Mon Sep 17 00:00:00 2001 From: Lawrence McAfee Date: Thu, 16 May 2024 14:19:13 -0700 Subject: [PATCH 36/92] fixed package_info.py. --- megatron/core/package_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/package_info.py b/megatron/core/package_info.py index 980faab94b..4e7f4b2180 100644 --- a/megatron/core/package_info.py +++ b/megatron/core/package_info.py @@ -4,7 +4,7 @@ MAJOR = 0 MINOR = 8 PATCH = 0 -PRE_RELEASE = '' +PRE_RELEASE = 'rc0' # Use the following formatting: (major, minor, patch, pre-release) VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) From 7cb6f0e195595cec75591ed7da70e476ebd29810 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 16 May 2024 15:00:33 -0700 Subject: [PATCH 37/92] Add feature to run nightly tests in MRs --- .gitlab-ci.yml | 9 ++++++--- jet-tests.yml | 11 ++--------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6227c4928e..0f833a9dda 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,5 +1,11 @@ workflow: rules: + - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests and nightly/ + variables: + JET_CUSTOM_FILTER: "type == 'build' or 'merge-request' in spec.scope or 'nightly' in spec.scope" + - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/ + variables: + JET_CUSTOM_FILTER: "type == 'build' or 'merge-request' in spec.scope" # always run MR pipelines - if: $CI_PIPELINE_SOURCE == "merge_request_event" # always run web pipelines @@ -18,9 +24,6 @@ variables: &VARS SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron" DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data" PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate - TESTS_TO_RUN_AFTER_MERGING: "MR_TESTS NIGHTLY_TESTS" # Can specify levels - TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests - TEST_REGEX_ON_THIS_COMMIT: NONE #https://github.com/google/re2/wiki/Syntax (Can define regex as in this spec) e.g /.*gpt3.*/ JET_CUSTOM_FILTER: "" DISPLAY_OUTPUT: "True" # Set to true for new tests to copy the logs for creating golden truth file TIME_LIMIT: "10:00" # Default time limit for all jobs diff --git a/jet-tests.yml b/jet-tests.yml index 96518be5e5..203fd703ad 100644 --- a/jet-tests.yml +++ b/jet-tests.yml @@ -16,15 +16,8 @@ jet-setup: - os/linux script: - set -x - - | - if [[ $CI_PIPELINE_SOURCE == "merge_request_event" ]] && [[ $CI_MERGE_REQUEST_LABELS =~ "Run tests" ]]; then - JET_FILTER="type == 'build' or 'merge-request' in spec.scope" - elif [[ -n $JET_CUSTOM_FILTER && $CI_PIPELINE_SOURCE != 'merge_request_event' ]]; then - JET_FILTER=$JET_CUSTOM_FILTER - else - JET_FILTER="False" - fi - echo "_JET_FILTER=$JET_FILTER" | tee -a config.env + - JET_FILTER=${JET_CUSTOM_FILTER:-False} + - echo "_JET_FILTER=$JET_FILTER" | tee -a config.env artifacts: reports: dotenv: config.env From 3892df77051349e4fc5fe4f4a664d9854d0870f7 Mon Sep 17 00:00:00 2001 From: Xiaowei Ren Date: Thu, 16 May 2024 15:57:46 -0700 Subject: [PATCH 38/92] Add CP functional test --- tests/functional_tests/jet_recipes/MR-gpt.yaml | 4 +++- 
.../functional_tests/python_test_utils/test_ci_pipeline.py | 5 +++++ .../python_test_utils/test_resume_checkpoint_pipeline.py | 7 +++++++ ...t-dgx-a100-1n8g-mcore-tp2-pp1-cp2-nondeterministic.json | 1 + ...t-dgx-a100-1n8g-mcore-tp2-pp2-cp2-nondeterministic.json | 1 + 5 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-cp2-nondeterministic.json create mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-cp2-nondeterministic.json diff --git a/tests/functional_tests/jet_recipes/MR-gpt.yaml b/tests/functional_tests/jet_recipes/MR-gpt.yaml index ac382ef295..7315cdda61 100644 --- a/tests/functional_tests/jet_recipes/MR-gpt.yaml +++ b/tests/functional_tests/jet_recipes/MR-gpt.yaml @@ -28,6 +28,7 @@ spec: artifacts: {/workspace/data/gpt3_data: text/the_pile/shard00} ckpt_format: torch_dist ckpt_resume: 0 + allow_nondeterministic: 0 script: |- ls cd /workspace/megatron-lm @@ -51,6 +52,7 @@ spec: MOE_GROUPED_GEMM={moe_grouped_gemm} \ CKPT_FORMAT={ckpt_format} \ CHECKPOINT_RESUME_TEST={ckpt_resume} \ + ALLOW_NONDETERMINISTIC={allow_nondeterministic} \ JOB_NAME={key.split("/")[1]} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: @@ -68,7 +70,7 @@ products: - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ["--untie-embeddings-and-output-weights"], args_meta: ["untie_embeddings_and_outputs"]} - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --ckpt-fully-parallel-save --ckpt-fully-parallel-load"'], args_meta: ["sequence_parallel"]} - {tp_size: [1], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--recompute-granularity full --recompute-method uniform --recompute-num-layers 1"'], args_meta: ["uniform_full_recompute"]} - # - {tp_size: [2], pp_size: [1,2], extra_args: ['"--context-parallel-size 2 --sequence-parallel --hidden-dropout 0.0 --attention-dropout 0.0"']} # TODO: need updated container with TE > 1.0.0 + - {tp_size: [2], pp_size: [1,2], ckpt_resume: [0, 1], allow_nondeterministic: [1], extra_args: ['"--context-parallel-size 2 --sequence-parallel --hidden-dropout 0.0 --attention-dropout 0.0"'], args_meta: ["cp2_nondeterministic"]} - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1 --ckpt-fully-parallel-save --ckpt-fully-parallel-load"'], args_meta: ["te_8experts2parallel"]} - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --use-distributed-optimizer --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["te_8experts2parallel_dist_optimizer"]} ## TODO: MoE GroupedMLP dist-ckpt not supported, so must use 'torch' ckpt format diff --git a/tests/functional_tests/python_test_utils/test_ci_pipeline.py b/tests/functional_tests/python_test_utils/test_ci_pipeline.py index 4bda2242d8..076a54bebc 100644 --- a/tests/functional_tests/python_test_utils/test_ci_pipeline.py +++ b/tests/functional_tests/python_test_utils/test_ci_pipeline.py @@ -7,6 +7,7 @@ LOGS_DIR = os.getenv('LOGS_DIR') EXPECTED_METRICS_FILE = os.getenv('EXPECTED_METRICS_FILE') +ALLOW_NONDETERMINISTIC = os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO") # If we require a variation of tests for any of the other pipelines we can just inherit this class. 
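The idiom introduced in this file: read `NVTE_ALLOW_NONDETERMINISTIC_ALGO` once at module scope and use `pytest.mark.skipif` to route each run to either a bit-exact or an approximate loss comparison. A small self-contained sketch of the same pattern (the values and the 5% margin are stand-ins, not taken from the golden-value files):

```
import os

import pytest

ALLOW_NONDETERMINISTIC = bool(int(os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO", "0")))

# Stand-in values; the real tests read these from TensorBoard logs and golden JSON files.
ACTUAL, EXPECTED = 10.8876, 10.8876


@pytest.mark.skipif(ALLOW_NONDETERMINISTIC, reason="Nondeterministic algorithms are allowed.")
def test_lm_loss_exact():
    assert ACTUAL == EXPECTED


@pytest.mark.skipif(not ALLOW_NONDETERMINISTIC, reason="Run is deterministic; the exact test applies.")
def test_lm_loss_approx():
    assert abs(ACTUAL - EXPECTED) <= 0.05 * EXPECTED
```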
@@ -14,6 +15,7 @@ class TestCIPipeline: margin_loss, margin_time = 0.05, 0.1 expected = None + allow_nondeterministic = bool(int(ALLOW_NONDETERMINISTIC)) def _setup(self): if os.path.exists(EXPECTED_METRICS_FILE): @@ -43,16 +45,19 @@ def _test_helper(self, loss_type, test_type): else: assert actual_val == expected_val, f"The value at step {step} should be {expected_val} but it is {actual_val}." + @pytest.mark.skipif(allow_nondeterministic, reason="Nondeterministic is allowed.") def test_lm_loss_deterministic(self): # Expected training loss curve at different global steps. self._setup() self._test_helper("lm loss", TypeOfTest.DETERMINISTIC) + @pytest.mark.skipif(not allow_nondeterministic, reason="Nondeterministic is not allowed.") def test_lm_loss_approx(self): # Expected training loss curve at different global steps. self._setup() self._test_helper("lm loss", TypeOfTest.APPROX) + @pytest.mark.skipif(allow_nondeterministic, reason="Nondeterministic is allowed.") def test_num_zeros_deterministic(self): # Expected validation loss curve at different global steps. self._setup() diff --git a/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py b/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py index f540dc3c4c..6abc99c63d 100644 --- a/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py +++ b/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py @@ -12,6 +12,7 @@ from tests.functional_tests.python_test_utils.common import TypeOfTest LOGS_DIR = os.getenv('LOGS_DIR') +ALLOW_NONDETERMINISTIC = os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO") STEP_INTERVAL = 5 def read_tb_logs_as_list(path, summary_name, index): @@ -42,6 +43,7 @@ def collect_train_test_metrics(logs_dir, index): class TestCIPipeline: margin_loss = 0.005 + allow_nondeterministic = bool(int(ALLOW_NONDETERMINISTIC)) train_metrics_100 = collect_train_test_metrics(LOGS_DIR, 0) train_metrics_50_to_100 = collect_train_test_metrics(LOGS_DIR, 1) @@ -64,5 +66,10 @@ def _test_helper(self, loss_type, test_type): else: assert actual_val == expected_val, f"The value at step {step} should be {expected_val} but it is {actual_val}." 
+ @pytest.mark.skipif(allow_nondeterministic, reason="Nondeterministic is allowed.") def test_lm_loss_deterministic(self): self._test_helper("lm loss", TypeOfTest.DETERMINISTIC) + + @pytest.mark.skipif(not allow_nondeterministic, reason="Nondeterministic is not allowed.") + def test_lm_loss_deterministic(self): + self._test_helper("lm loss", TypeOfTest.APPROX) diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-cp2-nondeterministic.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-cp2-nondeterministic.json new file mode 100644 index 0000000000..b87c0bca78 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-cp2-nondeterministic.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.88759, 10.90846, 10.88099, 10.84518, 10.69285, 10.6019, 10.09544, 10.18239, 10.08764, 9.76749]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [578.0, 659.0, 683.0, 700.0, 697.0, 620.0, 572.0, 774.0, 807.0, 837.0]}, "iteration_timing_avg": 0.3462723529411765} diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-cp2-nondeterministic.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-cp2-nondeterministic.json new file mode 100644 index 0000000000..4c8008e6ac --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-cp2-nondeterministic.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.93292, 10.93657, 10.88788, 10.86131, 10.71505, 10.61066, 10.06697, 10.17616, 10.07539, 9.74965]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [607.0, 638.0, 643.0, 649.0, 648.0, 590.0, 548.0, 772.0, 834.0, 836.0]}, "iteration_timing_avg": 0.3993126470588235} From 264f7853ce4c53d333d8b92e6f5b9527e116d5de Mon Sep 17 00:00:00 2001 From: Gao Deng Date: Fri, 17 May 2024 12:50:02 -0700 Subject: [PATCH 39/92] Add geglu in MoE expert layer --- megatron/core/transformer/moe/experts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py index c97cb97b5b..7509126a66 100644 --- a/megatron/core/transformer/moe/experts.py +++ b/megatron/core/transformer/moe/experts.py @@ -39,13 +39,13 @@ def __init__(self, num_local_experts: int, config: TransformerConfig): self.expert_parallel = config.expert_model_parallel_size > 1 if self.config.gated_linear_unit: - if self.config.activation_func != F.silu: - raise ValueError("Activation function must be silu when using GroupedMLP.") + if self.config.activation_func not in (F.silu, F.gelu): + raise ValueError("Activation function must be silu or gelu when using GroupedMLP.") @jit_fuser def glu(x): x = torch.chunk(x, 2, dim=-1) - return F.silu(x[0]) * x[1] + return self.config.activation_func(x[0]) * x[1] self.activation_func = glu else: From cfd1b02c48f39a6041b040befb2ebd440df0d06d Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 21 May 2024 16:55:44 -0700 Subject: [PATCH 40/92] Fix flag issues in nightly bert fp16 tests and gpt3 tests using mcore models --- megatron/legacy/model/transformer.py | 3 ++- .../jet_recipes/nightly-bert.yaml | 1 - .../jet_recipes/nightly-gpt.yaml | 21 ++++++++++--------- .../bert/pretrain_bert_distributed_test.sh 
| 6 ++++++ .../gpt3/pretrain_gpt3_distributed_test.sh | 6 ++++++ .../pretrain_llava_distributed_test.sh | 6 ++++++ 6 files changed, 31 insertions(+), 12 deletions(-) diff --git a/megatron/legacy/model/transformer.py b/megatron/legacy/model/transformer.py index ef19656e00..53031f5512 100644 --- a/megatron/legacy/model/transformer.py +++ b/megatron/legacy/model/transformer.py @@ -1503,7 +1503,8 @@ def build_layer(layer_number): assert config.attention_softmax_in_fp32, "TransformerEngine only supports softmax compute in FP32." assert ( (bool(int(os.getenv("NVTE_APPLY_QK_LAYER_SCALING", "0"))) and args.fp16) == config.apply_query_key_layer_scaling - ), "Unsupported config for apply_query_key_layer_scaling in TransformerEngine." + ), ("Unsupported config for apply_query_key_layer_scaling in TransformerEngine. If --apply-query-key-layer-scaling is " + "provided, set env-var NVTE_APPLY_QK_LAYER_SCALING=1 and you must be using fp16.") return transformer_engine.pytorch.TransformerLayer( config.hidden_size, config.ffn_hidden_size, diff --git a/tests/functional_tests/jet_recipes/nightly-bert.yaml b/tests/functional_tests/jet_recipes/nightly-bert.yaml index 9336de141a..70b1f0641e 100644 --- a/tests/functional_tests/jet_recipes/nightly-bert.yaml +++ b/tests/functional_tests/jet_recipes/nightly-bert.yaml @@ -22,7 +22,6 @@ spec: args_meta: null micro_batch_size: 4 # MBS batch_size: 128 # GBS, JET schema requires 'batch_size' - precision: bf16 time_limit: 1200 ckpt_format: torch ckpt_resume: 0 diff --git a/tests/functional_tests/jet_recipes/nightly-gpt.yaml b/tests/functional_tests/jet_recipes/nightly-gpt.yaml index a4475e3d0b..a5f2b241c5 100644 --- a/tests/functional_tests/jet_recipes/nightly-gpt.yaml +++ b/tests/functional_tests/jet_recipes/nightly-gpt.yaml @@ -23,10 +23,9 @@ spec: micro_batch_size: 4 # MBS batch_size: 32 # GBS, JET schema requires 'batch_size' moe_grouped_gemm: 0 - precision: bf16 time_limit: 1200 artifacts: {/workspace/data/gpt3_data: text/the_pile/shard00} - ckpt_format: torch_dist + ckpt_format: torch ckpt_resume: 0 script: |- ls @@ -54,15 +53,17 @@ spec: JOB_NAME={key.split("/")[1]} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - - {use_mcore: [True, False], tp_size: [4], pp_size: [1], ckpt_resume: [0, 1]} - - {use_mcore: [True, False], tp_size: [4], pp_size: [1], ckpt_resume: [1], ckpt_format: [torch]} - - {use_mcore: [True, False], tp_size: [1], pp_size: [2,4], ckpt_resume: [0, 1]} - - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ['"--num-experts 2 --sequence-parallel --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["te_2experts"]} - - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --num-experts 4 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["te_4experts2parallel"]} - - {tp_size: [1], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"'], args_meta: ["dist_optimizer_overlap_grad_reduce_param_gather"]} + - {use_mcore: [True], tp_size: [4], pp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch_dist]} + - {use_mcore: [False], tp_size: [4], pp_size: [1], ckpt_resume: [0, 1]} + - {use_mcore: [True], tp_size: [4], pp_size: [1], ckpt_resume: [1]} + - {use_mcore: [True], tp_size: [1], pp_size: [2,4], ckpt_resume: [0, 1], ckpt_format: [torch_dist]} + - {use_mcore: [False], tp_size: [1], pp_size: [2,4], ckpt_resume: [0, 1]} + - {tp_size: 
[2], pp_size: [2], ckpt_resume: [0, 1], ckpt_format: [torch_dist], extra_args: ['"--num-experts 2 --sequence-parallel --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["te_2experts"]} + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], ckpt_format: [torch_dist], extra_args: ['"--sequence-parallel --num-experts 4 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["te_4experts2parallel"]} + - {tp_size: [1], pp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch_dist], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"'], args_meta: ["dist_optimizer_overlap_grad_reduce_param_gather"]} # Non-MCore - {use_mcore: [False], tp_size: [1,4], pp_size: [1], extra_args: ["--overlap-grad-reduce"], args_meta: ["overlap_grad_reduce"]} - {use_mcore: [False], tp_size: [1], pp_size: [1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce"'], args_meta: ["dist_optimizer_overlap_grad_reduce"]} - {use_mcore: [False], tp_size: [1], pp_size: [4], vp_size: [null, 1], extra_args: ["--overlap-grad-reduce"], args_meta: ["overlap_grad_reduce"]} - - {use_mcore: [False], tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], ckpt_format: [torch], extra_args: ["--overlap-grad-reduce"], args_meta: ["overlap_grad_reduce"]} - - {use_mcore: [False], tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], ckpt_format: [torch], extra_args: ['"--sequence-parallel --num-experts 4 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["4experts"]} + - {use_mcore: [False], tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--overlap-grad-reduce"], args_meta: ["overlap_grad_reduce"]} + - {use_mcore: [False], tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --num-experts 4 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["4experts"]} diff --git a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh index 97a9d1695b..4acff199dc 100755 --- a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh +++ b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh @@ -97,6 +97,12 @@ torch_run_cmd="torchrun $DISTRIBUTED_ARGS \ if [[ "${TRAINING_DTYPE}" == "fp16" ]]; then torch_run_cmd+=" --apply-query-key-layer-scaling" + # NVTE_APPLY_QK_LAYER_SCALING=1 is required if using: + # 1. --apply-query-key-layer-scaling + # 2. transformer_impl="transformer_engine" + # 3. TE >= 0.11 + # 4. fp16 + export NVTE_APPLY_QK_LAYER_SCALING=1 fi command="$command $torch_run_cmd" diff --git a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh index 0925c223d6..aa95d8d65a 100755 --- a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh +++ b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh @@ -133,6 +133,12 @@ build_torch_run_cmd() { if [[ "${TRAINING_DTYPE}" == "fp16" ]]; then torch_run_cmd+=" --apply-query-key-layer-scaling" + # NVTE_APPLY_QK_LAYER_SCALING=1 is required if using: + # 1. --apply-query-key-layer-scaling + # 2. transformer_impl="transformer_engine" + # 3. TE >= 0.11 + # 4. 
fp16 + export NVTE_APPLY_QK_LAYER_SCALING=1 fi } diff --git a/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh b/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh index 1b7bedb582..fa536f97ed 100755 --- a/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh +++ b/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh @@ -126,6 +126,12 @@ build_torch_run_cmd() { if [[ "${TRAINING_DTYPE}" == "fp16" ]]; then torch_run_cmd+=" --apply-query-key-layer-scaling" + # NVTE_APPLY_QK_LAYER_SCALING=1 is required if using: + # 1. --apply-query-key-layer-scaling + # 2. transformer_impl="transformer_engine" + # 3. TE >= 0.11 + # 4. fp16 + export NVTE_APPLY_QK_LAYER_SCALING=1 fi } From 9dca04b2c2308e9676529a81c5e4fbee79cf99c0 Mon Sep 17 00:00:00 2001 From: Hao Wang Date: Tue, 21 May 2024 23:22:30 -0700 Subject: [PATCH 41/92] Add a heuristic for data-cache building to improve speed and stability --- megatron/core/datasets/gpt_dataset.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/megatron/core/datasets/gpt_dataset.py b/megatron/core/datasets/gpt_dataset.py index b8ce1b0fc7..cbb800d866 100644 --- a/megatron/core/datasets/gpt_dataset.py +++ b/megatron/core/datasets/gpt_dataset.py @@ -416,8 +416,17 @@ def _build_document_sample_shuffle_indices( assert document_index.dtype == numpy.int32 assert self.dataset.sequence_lengths.dtype == numpy.int32 + if len(document_index) * 2 > len(self.dataset.sequence_lengths): + # Heuristic: if "access density" of sequence_lengths is relatively high, + # force loading the mmap-ed array into memory by taking a copy. + # System performance benefits come from two aspects: + # 1. **sequentially** pre-loading the whole file if we're gonna read a large fraction anyways. + # 2. GIL is held when calling into c++ code; making the c++ func faster improves parallelism. 
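The same heuristic in isolation: when most of a memory-mapped array is going to be read anyway, copying it into RAM up front replaces many random page faults with one sequential read, and the C++ helper then runs faster while holding the GIL. A generic, hedged sketch (the 2x threshold mirrors the comment above; the helper itself is illustrative):

```
import numpy as np

def maybe_materialize(sequence_lengths, num_index_entries):
    """Copy an mmap-backed array into RAM when access density is high
    (same 2x rule of thumb as the code below); otherwise keep the lazy view."""
    if num_index_entries * 2 > len(sequence_lengths):
        return np.array(sequence_lengths)   # one sequential read, then RAM-speed lookups
    return sequence_lengths

# Synthetic demo: every document appears roughly twice in the document index.
lengths = np.arange(1_000, dtype=np.int32)   # stand-in for dataset.sequence_lengths
lengths_for_cpp = maybe_materialize(lengths, num_index_entries=2 * len(lengths))
```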
+ sequence_lengths_for_cpp = self.dataset.sequence_lengths.copy() + else: + sequence_lengths_for_cpp = self.dataset.sequence_lengths sample_index = helpers.build_sample_idx( - self.dataset.sequence_lengths, + sequence_lengths_for_cpp, document_index, sequence_length, num_epochs, From 38e610be900ab06263de2badb8be72a78c3af5c1 Mon Sep 17 00:00:00 2001 From: Asha Anoosheh Date: Wed, 22 May 2024 05:30:09 -0700 Subject: [PATCH 42/92] Check if layernorm gradients even requires grad to avoid AttributeError --- megatron/core/distributed/finalize_model_grads.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/core/distributed/finalize_model_grads.py b/megatron/core/distributed/finalize_model_grads.py index 4eaa776b48..502f15abf2 100644 --- a/megatron/core/distributed/finalize_model_grads.py +++ b/megatron/core/distributed/finalize_model_grads.py @@ -81,7 +81,8 @@ def _allreduce_layernorm_grads(model: List[torch.nn.Module], config: Transformer for model_chunk in model: for name, param in get_attr_wrapped_model(model_chunk, 'named_parameters')(): if ( - getattr(param, 'sequence_parallel', False) + param.requires_grad + and getattr(param, 'sequence_parallel', False) or 'q_layernorm' in name or 'k_layernorm' in name ): From 1e58d09f8e5de14fd75a83f9d0369bf1bbe686a0 Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Wed, 22 May 2024 11:09:29 -0700 Subject: [PATCH 43/92] Multimodal small fixes --- megatron/core/models/multimodal/llava_model.py | 5 +++-- tests/unit_tests/models/test_llava_model.py | 11 +++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py index 65f45c795b..6a5f21e2cf 100644 --- a/megatron/core/models/multimodal/llava_model.py +++ b/megatron/core/models/multimodal/llava_model.py @@ -171,14 +171,14 @@ def forward( # map vision model output size to language model input size. image_embeddings = self.vision_projection( image_embeddings - ) # [b, img_seq_len, h_language] + ) # [img_seq_len, b, h_vision] # If running inference, the language model KV cache will be updated for image token positions. # Here we store the image tokens sequence length, which can be used as an offset to the KV cache later. if inference_params is not None: inference_params.key_value_memory_dict[ "image_tokens_count" - ] = image_embeddings.shape[1] + ] = image_embeddings.shape[0] combined_embeddings = torch.cat( [image_embeddings, language_embeddings], dim=0 @@ -195,6 +195,7 @@ def forward( attention_mask, decoder_input=combined_embeddings, labels=labels, + inference_params=inference_params, ) return output diff --git a/tests/unit_tests/models/test_llava_model.py b/tests/unit_tests/models/test_llava_model.py index 6a9ab594af..07609ca25c 100644 --- a/tests/unit_tests/models/test_llava_model.py +++ b/tests/unit_tests/models/test_llava_model.py @@ -93,8 +93,15 @@ def test_forward(self): inference_params=inference_params, ) assert logits.shape == torch.Size((2, 1601, 2048)) - # Check KV cache got created. - assert len(inference_params.key_value_memory_dict) > 0 + + # Check KV cache got created correctly. + kv_dict = inference_params.key_value_memory_dict + + assert kv_dict["image_tokens_count"] == 577 + for layer_no in range(1, 4): # 3 layers in the model. 
+ layer_kv = kv_dict[layer_no] + # Expected shape is [sequence_len, batch_size, num_heads, hidden_size_per_head] + assert layer_kv[0].shape == layer_kv[1].shape == torch.Size((1601, 2, 8, 16)) def test_save_load(self, tmp_path): path = tmp_path / "model.pt" From d661fd7893a249129f04cdd36898436f87938090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20B=C5=82a=C5=BC?= Date: Tue, 26 Mar 2024 18:50:50 +0100 Subject: [PATCH 44/92] Add FP32 dist ckpt impl --- megatron/core/optimizer/optimizer.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/megatron/core/optimizer/optimizer.py b/megatron/core/optimizer/optimizer.py index 0ae938212a..255161d31a 100644 --- a/megatron/core/optimizer/optimizer.py +++ b/megatron/core/optimizer/optimizer.py @@ -690,6 +690,21 @@ def state_dict(self): def load_state_dict(self, state_dict): self.optimizer.load_state_dict(state_dict) + def sharded_state_dict( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ): + if is_loading: + self.init_state_fn(self.optimizer) + + state_dict = self.state_dict() + id_to_sharded_param_map = get_param_id_to_sharded_param_map( + model_sharded_state_dict, self.get_parameters() + ) + optim_state_to_sharding_state(state_dict, id_to_sharded_param_map) + + return state_dict + + class ProxyDict: """ From a95c7d19441de2539106c96668353edbf0c59f17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20B=C5=82a=C5=BC?= Date: Thu, 23 May 2024 13:54:24 +0200 Subject: [PATCH 45/92] Add unit test --- .../dist_checkpointing/test_optimizer.py | 53 ++++++++++++++++--- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/tests/unit_tests/dist_checkpointing/test_optimizer.py b/tests/unit_tests/dist_checkpointing/test_optimizer.py index a8b7bc252f..82daa24d67 100644 --- a/tests/unit_tests/dist_checkpointing/test_optimizer.py +++ b/tests/unit_tests/dist_checkpointing/test_optimizer.py @@ -9,7 +9,8 @@ from torch.optim import Adam from megatron.core import parallel_state, DistributedDataParallel as DDP -from megatron.core.dist_checkpointing import ShardedTensor, save, load +from megatron.core.dist_checkpointing import ShardedTensor, save, load, \ + load_plain_tensors from megatron.core.dist_checkpointing.dict_utils import nested_values, diff from megatron.core.dist_checkpointing.optimizer import \ get_param_id_to_sharded_param_map, optim_state_to_sharding_state @@ -26,6 +27,7 @@ from megatron.core.transformer import TransformerConfig from megatron.core.utils import get_model_config from megatron.training.training import get_model +from megatron.training.utils import unwrap_model from pretrain_gpt import model_provider from tests.unit_tests.dist_checkpointing import TempNamedDir @@ -103,10 +105,10 @@ def initialize_gpt_model(pre_process=True, post_process=True, seed=0, **config_k return model -def init_mock_args(args): +def init_mock_args(args, bf16=True): args.data_parallel_random_init = False args.virtual_pipeline_model_parallel_size = None - args.bf16 = True + args.bf16 = bf16 args.accumulate_allreduce_grads_in_fp32 = False args.overlap_grad_reduce = False args.use_distributed_optimizer = True @@ -114,12 +116,12 @@ def init_mock_args(args): return args -def setup_model_and_optimizer(seed): +def setup_model_and_optimizer(seed, bf16=True): with mock.patch('megatron.training.training.get_args', data_parallel_random_init=False) as mock_args: - init_mock_args(mock_args.return_value) + init_mock_args(mock_args.return_value, bf16) model = get_model(partial(initialize_gpt_model, seed=seed)) - config 
= OptimizerConfig(bf16=True, params_dtype=torch.bfloat16, use_distributed_optimizer=True) + config = OptimizerConfig(bf16=bf16, params_dtype=torch.bfloat16 if bf16 else torch.float, use_distributed_optimizer=bf16) optimizer = get_megatron_optimizer(config, model) torch.manual_seed(seed + 1) @@ -133,7 +135,7 @@ def setup_model_and_optimizer(seed): optimizer.reload_model_params() - return model, optimizer + return unwrap_model(model), optimizer class TestDistributedOptimizer: @@ -201,3 +203,40 @@ def test_full_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_ sleep(20) finally: Utils.set_world_size() + + +class TestFP32Optimizer: + @pytest.mark.parametrize( + ('src_tp_pp', 'dest_tp_pp'), + [ + ((2, 4), (2, 4)), + ((2, 4), (4, 2)), + ((8, 1), (1, 2)), + ] + ) + def test_fp32_optimizer_resharding(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp): + with TempNamedDir(tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_A', sync=False) as ckpt_dir_A: + with TempNamedDir(tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_B', sync=False) as ckpt_dir_B: + Utils.initialize_model_parallel(*src_tp_pp) + model_A, optimizer_A = setup_model_and_optimizer(seed=2, bf16=False) + + save(optimizer_A.sharded_state_dict(model_A[0].sharded_state_dict()), ckpt_dir_A) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP and save as checkpoint B + Utils.initialize_model_parallel(*dest_tp_pp) + model_B, optimizer_B = setup_model_and_optimizer(seed=3, bf16=False) + load_sharded_state_dict = optimizer_B.sharded_state_dict(model_B[0].sharded_state_dict()) + state_dict = load(load_sharded_state_dict, ckpt_dir_A) + + optimizer_B.load_state_dict(state_dict) + save(optimizer_B.sharded_state_dict(model_B[0].sharded_state_dict()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + plain_state_dict_A = load_plain_tensors(ckpt_dir_A) + plain_state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(plain_state_dict_A, plain_state_dict_B) + assert not any(map(bool, diffs)), diffs + Utils.destroy_model_parallel() From 826d11a24157ff0a155ee3c99e934a5a07e97ab3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20B=C5=82a=C5=BC?= Date: Thu, 23 May 2024 14:02:00 +0200 Subject: [PATCH 46/92] Fix formatting --- megatron/core/optimizer/optimizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/megatron/core/optimizer/optimizer.py b/megatron/core/optimizer/optimizer.py index 255161d31a..08d6fdb26c 100644 --- a/megatron/core/optimizer/optimizer.py +++ b/megatron/core/optimizer/optimizer.py @@ -705,7 +705,6 @@ def sharded_state_dict( return state_dict - class ProxyDict: """ A dictionary-like object that proxies to a list of dictionaries. 
From 4660d50dfe6ab6bfbcf162009aee7fff72407126 Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Thu, 23 May 2024 13:30:43 -0700 Subject: [PATCH 47/92] examples/multimodal - initial text generation script --- examples/multimodal/README.md | 11 ++ examples/multimodal/run_text_generation.py | 217 +++++++++++++++++++++ examples/multimodal/text_generation_8b.sh | 92 +++++++++ examples/multimodal/train.py | 13 +- 4 files changed, 330 insertions(+), 3 deletions(-) create mode 100644 examples/multimodal/run_text_generation.py create mode 100755 examples/multimodal/text_generation_8b.sh diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md index ce483e1998..159241ed1b 100644 --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -27,3 +27,14 @@ Run the following script: ``` examples/multimodal/sft_8b.sh ``` + + +### Evaluation + +## Generation + +Run the following script: + +``` +examples/multimodal/text_generation_8b.sh --input-path /path/to/input/images --output-path /some/output/directory --model-path /path/to/model.pt --tokenizer-path /path/to/tokenizer.model --gt-path /path/to/groundtruth/file +``` diff --git a/examples/multimodal/run_text_generation.py b/examples/multimodal/run_text_generation.py new file mode 100644 index 0000000000..9a912db6e0 --- /dev/null +++ b/examples/multimodal/run_text_generation.py @@ -0,0 +1,217 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +"""Generate text using a vision language model.""" +import glob +import json +import logging +import os +import sys +from collections import defaultdict +from functools import partial + +# Add megatron to the path. +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) +) + +import numpy as np +import torch +from PIL import Image +from torchvision.transforms import Compose, Resize, ToPILImage + +from megatron.inference.text_generation.api import generate_and_post_process +from megatron.inference.text_generation.forward_step import ForwardStep +from megatron.training import get_args, get_model, print_rank_0 +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron +from train import model_provider, get_image_token_count, add_multimodal_extra_args + + +def add_text_generation_args(parser): + """Text generation arguments.""" + group = parser.add_argument_group(title='Vision language model text generation') + + group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') + group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') + group.add_argument("--top_k", type=int, default=0, help='Top k sampling.') + group.add_argument( + "--out-seq-length", type=int, default=1024, help='Length of the output generated text.' + ) + group.add_argument("--output-path", type=str, required=True, help='Output file path') + group.add_argument('--input-path', type=str, required=True, help="Input directory") + group.add_argument( + '--num-partitions', type=int, default=0, help="Number of partitions for inputs." + ) + group.add_argument('--partition-id', type=int, default=0, help="Partition index") + group.add_argument("--drop-vision-class-token", action="store_true", default=False) + group.add_argument("--gt-path", type=str, help="Optional ground truth file") + + # Add common multimodal arguments needed for e.g. building the model. 
+ parser = add_multimodal_extra_args(parser) + + return parser + + +def _convert_image_to_rgb(image): + return image.convert("RGB") + + +def _transform_test(img_h, img_w): + return Compose([ToPILImage(), Resize((img_h, img_w)), _convert_image_to_rgb]) + + +def preprocess(img_h, img_w, img): + # Example image preprocessing. + pixel_mean = [123.675, 116.28, 103.53] # Imagenet's mean. + pixel_std = [58.395, 57.12, 57.375] + pixel_mean = torch.Tensor(pixel_mean).view(-1, 1, 1) + pixel_std = torch.Tensor(pixel_std).view(-1, 1, 1) + + raw_h, raw_w = img.shape[0], img.shape[1] + ratio = float(max(img_h, img_w)) / max(raw_h, raw_w) + H, W = int(raw_h * ratio + 0.5), int(raw_w * ratio + 0.5) + image_transform = _transform_test(H, W) + img = image_transform(img) + img = (torch.Tensor(np.array(img)).permute(2, 0, 1) - pixel_mean) / pixel_std + delta_h, delta_w = img_h - H, img_w - W + padded_img = torch.nn.functional.pad(img, (0, delta_w, 0, delta_h)) + + return padded_img + + +def generate_samples(model): + """Text generation using a trained vision language model. This is an example for the COCO dataset.""" + args = get_args() + + image_files = sorted(glob.glob(args.input_path + "/*")) + # Optionally, process only a subset of the input files. + if args.num_partitions > 0: + per_part = len(image_files) // args.num_partitions + image_files = image_files[per_part * args.partition_id : per_part * (args.partition_id + 1)] + + num_samples = len(image_files) + images = [] + + # Run image preprocessing. + for image_file in image_files: + img = np.array(Image.open(image_file)) + img = preprocess(args.img_h, args.img_w, img) + + images.append(img.reshape(-1, 3, args.img_h, args.img_w)) + + # Load optional ground truth. + gt_image_id_to_captions = defaultdict(list) + if args.gt_path: + gts = json.load(open(args.gt_path)) + for gt in gts["annotations"]: + gt_image_id_to_captions[gt["image_id"]].append(gt['caption']) + + num_image_tokens = get_image_token_count() + + idx = 0 + while idx < num_samples: + try: + image = images[idx].cuda() + except: + breakpoint() + pass + + image_id = int(image_files[idx].split("_")[-1].split(".")[0]) + + forward_step = partial(VLMForwardStep, image, num_image_tokens) + + if torch.distributed.get_rank() == 0: + prompt = "Give a short and clear explanation of the subsequent image.\n" + + resp_sentences, _, _, _ = generate_and_post_process( + model, + forward_step=forward_step, + prompts=[prompt], + tokens_to_generate=args.out_seq_length, + return_output_log_probs=False, + top_k_sampling=args.top_k, + top_p_sampling=args.top_p, + add_BOS=False, + temperature=args.temperature, + random_seed=123, + ) + + for prompt, generation in zip([prompt], resp_sentences): + output = { + "question_id": image_id, + "prompt": prompt, + "caption": generation[len(prompt) :], + } + + output["ground_truth"] = gt_image_id_to_captions[image_id] + + print_rank_0(output) + + yield output + idx += 1 + else: + generate_and_post_process(model, forward_step=forward_step) + idx += 1 + + +def generate_and_write_samples(model): + args = get_args() + + for output in generate_samples(model): + if torch.distributed.get_rank() == 0: + with open(args.output_path, 'a') as f: + f.write(json.dumps(output) + "\n") + + +class VLMForwardStep(ForwardStep): + def __init__(self, images, num_image_tokens, model, max_batch_size, max_sequence_length): + super().__init__(model, max_batch_size, max_sequence_length + num_image_tokens) + self._images = images + + def _forward(self, tokens, position_ids, attention_mask): + return 
self.model( + self._images, + tokens, + position_ids, + attention_mask=None, + inference_params=self.inference_params, + ) + + def __call__(self, tokens, position_ids, attention_mask): + logits = super().__call__(tokens, position_ids, attention_mask) + + # On the first inference iteration, we compute image tokens. + # Update the sequence length offset by the number of image tokens. + num_tokens = tokens.size(1) + if num_tokens > 1: + self.inference_params.sequence_len_offset += self.inference_params.key_value_memory_dict[ + "image_tokens_count" + ] + + return logits + + +def main(): + """Vision language model text generation.""" + + logging.getLogger(__name__).warning("Models using pipeline parallelism are not supported yet.") + + initialize_megatron(extra_args_provider=add_text_generation_args) + + def wrapped_model_provider(pre_process, post_process): + return model_provider(pre_process, post_process, parallel_output=False) + + # Set up model and load checkpoint. + model = get_model(wrapped_model_provider, wrap_with_ddp=False) + + args = get_args() + if args.load is not None: + _ = load_checkpoint(model, None, None) + + model = model[0] + model.eval() + + generate_and_write_samples(model) + + +if __name__ == "__main__": + main() diff --git a/examples/multimodal/text_generation_8b.sh b/examples/multimodal/text_generation_8b.sh new file mode 100755 index 0000000000..b3b1deea8c --- /dev/null +++ b/examples/multimodal/text_generation_8b.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_APPLY_QK_LAYER_SCALING=1 + + +while [[ $# -gt 0 ]]; do + case $1 in + -i|--input-path) + INPUT_PATH="$2" + shift + shift + ;; + -o|--output-path) + OUTPUT_PATH="$2" + shift + shift + ;; + -m|--model-path) + MODEL_PATH="$2" + shift + shift + ;; + -t|--tokenizer-path) + TOKENIZER_PATH="$2" + shift + shift + ;; + -g|--gt-path) + GROUNDTRUTH_PATH="$2" + shift + shift + ;; + --default) + DEFAULT=YES + shift # past argument + ;; + -*|--*) + echo "Invalid option $1" + exit 1 + ;; + esac +done + +# Please modify these as needed. 
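+# NUM_PARTITIONS tells run_text_generation.py how many slices to split the input image
+# directory into; START/END select which partition ids this loop actually generates
+# (START=0, END=0 runs only the first partition).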
+NUM_PARTITIONS=100
+START=0
+END=0
+
+for PARTITION_ID in $( eval echo {$START..$END} )
+do
+    torchrun --nproc_per_node 4 examples/multimodal/run_text_generation.py \
+        --use-flash-attn \
+        --language-model-type 8b \
+        --apply-layernorm-1p \
+        --untie-embeddings-and-output-weights \
+        --disable-bias-linear \
+        --position-embedding-type rope \
+        --rotary-percent 0.5 \
+        --squared-relu \
+        --attention-dropout 0.0 \
+        --hidden-dropout 0.0 \
+        --tensor-model-parallel-size 4 \
+        --pipeline-model-parallel-size 1 \
+        --num-layers 32 \
+        --hidden-size 4096 \
+        --num-attention-heads 32 \
+        --max-position-embeddings 4096 \
+        --no-masked-softmax-fusion \
+        --load ${MODEL_PATH} \
+        --tokenizer-type GPTSentencePieceTokenizer \
+        --tokenizer-model ${TOKENIZER_PATH} \
+        --bf16 \
+        --micro-batch-size 1 \
+        --seq-length 99 \
+        --out-seq-length 700 \
+        --temperature 1.0 \
+        --img-h 336 \
+        --img-w 336 \
+        --patch-dim 14 \
+        --seed 153 \
+        --top_k 1 \
+        --disable-vision-class-token \
+        --no-load-rng \
+        --no-load-optim \
+        --input-path ${INPUT_PATH} \
+        --num-partitions ${NUM_PARTITIONS} \
+        --partition-id ${PARTITION_ID} \
+        --output-path ${OUTPUT_PATH}/${PARTITION_ID}.jsonl \
+        --gt-path ${GROUNDTRUTH_PATH}
+done
diff --git a/examples/multimodal/train.py b/examples/multimodal/train.py
index 836185aacb..2a448f248b 100644
--- a/examples/multimodal/train.py
+++ b/examples/multimodal/train.py
@@ -59,7 +59,7 @@ def model_provider(pre_process=True, post_process=True, parallel_output=True) ->
     vision_projection_config = deepcopy(base_config)
     vision_projection_config = get_vision_projection_config(vision_projection_config, language_config.hidden_size)
 
-    vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te)
+    vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules
 
     model = LLaVAModel(
         language_transformer_config=language_config,
@@ -134,8 +134,7 @@ def get_batch(data_iterator):
     return tokens, labels, loss_mask, attention_mask, position_ids, img_raw
 
 
-def _preprocess_data_for_llava(loss_mask, labels, attention_mask):
-    """Preprocess data sample to the format expected by a LLaVA model."""
+def get_image_token_count():
     args = get_args()
 
     add_class_token = not args.disable_vision_class_token
@@ -144,6 +143,14 @@
     num_patches_per_dim_w = args.img_w // args.patch_dim
     num_patches = num_patches_per_dim_h * num_patches_per_dim_w
     num_image_tokens = num_patches + (1 if add_class_token else 0)
+
+    return num_image_tokens
+
+
+def _preprocess_data_for_llava(loss_mask, labels, attention_mask):
+    """Preprocess data sample to the format expected by a LLaVA model."""
+    num_image_tokens = get_image_token_count()
+
     batch_size = loss_mask.shape[0]
 
     loss_mask2 = torch.cat(

From 6ebd707d0235dfa2bc51d53e41e31aa492c234a5 Mon Sep 17 00:00:00 2001
From: Deepak Narayanan
Date: Tue, 21 May 2024 12:45:02 -0700
Subject: [PATCH 48/92] Unit tests for ParamAndGradBuffer in mcore/distributed

---
 .../core/distributed/param_and_grad_buffer.py |  13 +-
 .../distributed/test_param_and_grad_buffer.py | 161 ++++++++++++++++++
 tests/unit_tests/test_utilities.py            |  41 ++++-
 3 files changed, 205 insertions(+), 10 deletions(-)
 create mode 100644 tests/unit_tests/distributed/test_param_and_grad_buffer.py

diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py
index 1d037c86e9..c07b15b94a 100644
--- a/megatron/core/distributed/param_and_grad_buffer.py
+++ b/megatron/core/distributed/param_and_grad_buffer.py
@@ 
-91,7 +91,7 @@ def reset(self): """ self.params_with_grad = set() self.communication_handle = None - self.communication_issued = False + self.is_communication_outstanding = False def start_grad_sync(self): """ @@ -103,8 +103,8 @@ def start_grad_sync(self): synchronous call. """ assert ( - self.communication_handle is None and not self.communication_issued - ), 'Should not have multiple communication calls in flight at once' + self.communication_handle is None and not self.is_communication_outstanding + ), 'Should not have multiple communication calls outstanding at once' # Make sure norm of grads in bucket are not NaN # prior to data-parallel all-reduce / reduce-scatter. @@ -136,7 +136,10 @@ def start_grad_sync(self): group=self.data_parallel_group, async_op=self.ddp_config.overlap_grad_reduce, ) - self.communication_issued = True + if self.ddp_config.overlap_grad_reduce: + self.is_communication_outstanding = True + else: + self.is_communication_outstanding = False def finish_grad_sync(self): """ @@ -150,7 +153,7 @@ def finish_grad_sync(self): if not self.ddp_config.overlap_grad_reduce: self.start_grad_sync() return - assert self.communication_handle is not None and self.communication_issued, ( + assert self.communication_handle is not None and self.is_communication_outstanding, ( f'Communication call has not been issued for this bucket ' f'({len(self.params_with_grad)}/{len(self.params)} params have grad available)' ) diff --git a/tests/unit_tests/distributed/test_param_and_grad_buffer.py b/tests/unit_tests/distributed/test_param_and_grad_buffer.py new file mode 100644 index 0000000000..ee2c4cd0e0 --- /dev/null +++ b/tests/unit_tests/distributed/test_param_and_grad_buffer.py @@ -0,0 +1,161 @@ +import contextlib +import math +import pytest +import torch + +from megatron.core import parallel_state +from megatron.core.distributed import DistributedDataParallelConfig, ParamAndGradBuffer +from tests.unit_tests.test_utilities import Utils, TestModel + + +def get_model_and_buffers( + input_dim: int, + output_dim: int, + num_layers: int, + bias: bool, + bucket_size: int, + use_distributed_optimizer: bool, + overlap_grad_reduce: bool, +): + ddp_config = DistributedDataParallelConfig( + grad_reduce_in_fp32=True, + use_distributed_optimizer=use_distributed_optimizer, + overlap_grad_reduce=overlap_grad_reduce, + ) + model = TestModel(input_dim=input_dim, output_dim=output_dim, num_layers=num_layers, bias=bias) + params = list(model.parameters()) + param_to_name = {} + for name, param in model.named_parameters(): + param_to_name[param] = name + + param_and_grad_buffer = ParamAndGradBuffer( + ddp_config, + param_dtype=torch.bfloat16, + grad_dtype=torch.float32, + params=params, + data_parallel_group=parallel_state.get_data_parallel_group(), + bucket_size=bucket_size, + param_to_name=param_to_name, + gradient_scaling_factor=1.0, + ) + + return model, param_and_grad_buffer + + +@pytest.mark.parametrize("bucket_size", [None, 9999, 10000, 10001, 19999, 20000]) +@pytest.mark.parametrize("use_distributed_optimizer", [False, True]) +@pytest.mark.parametrize("bias", [False, True]) +def test_bucket_sizes(bucket_size: int, use_distributed_optimizer: bool, bias: bool): + Utils.initialize_model_parallel() + + input_dim = 100 + output_dim = 100 + num_layers = 10 + _, param_and_grad_buffer = get_model_and_buffers( + input_dim=input_dim, + output_dim=output_dim, + num_layers=num_layers, + bias=bias, + bucket_size=bucket_size, + use_distributed_optimizer=use_distributed_optimizer, + overlap_grad_reduce=False, + ) + + 
actual_numel_in_each_bucket = [ + bucket.numel_unpadded for bucket in param_and_grad_buffer.buckets + ] + actual_numel_padded_in_each_bucket = [ + bucket.grad_data.numel() for bucket in param_and_grad_buffer.buckets + ] + + def _pad_if_needed(numel_unpadded): + # Want 128-byte alignment for distributed optimizer. + divisor = math.lcm(parallel_state.get_data_parallel_world_size(), 128) + if use_distributed_optimizer: + return math.ceil(numel_unpadded / divisor) * divisor + return numel_unpadded + + if bucket_size is None: + # If bucket_size is infinite (None), number of buckets should be 1. + assert len(param_and_grad_buffer.buckets) == 1 + else: + # Else, compute number of buckets. + numel_in_each_bucket = [] + numel_padded_in_each_bucket = [] + numel_in_last_bucket = 0 + for _ in range(num_layers): + numel_in_last_bucket += input_dim * output_dim + if bias: + numel_in_last_bucket += output_dim # Include bias term. + if numel_in_last_bucket >= bucket_size: + numel_in_each_bucket.append(numel_in_last_bucket) + numel_padded_in_each_bucket.append(_pad_if_needed(numel_in_last_bucket)) + numel_in_last_bucket = 0 + if numel_in_last_bucket > 0: + numel_in_each_bucket.append(numel_in_last_bucket) + numel_padded_in_each_bucket.append(_pad_if_needed(numel_in_last_bucket)) + + assert len(param_and_grad_buffer.buckets) == len(numel_in_each_bucket) + assert actual_numel_in_each_bucket == numel_in_each_bucket, ( + f"Number of parameters in each bucket should be {numel_in_each_bucket}, " + f"but is {actual_numel_in_each_bucket}" + ) + assert actual_numel_padded_in_each_bucket == numel_padded_in_each_bucket, ( + f"Number of parameters in each padded bucket should be {numel_padded_in_each_bucket}, " + f"but is {actual_numel_padded_in_each_bucket}" + ) + + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize("use_distributed_optimizer", [False, True]) +@pytest.mark.parametrize("overlap_grad_reduce", [False, True]) +def test_grad_sync(use_distributed_optimizer: bool, overlap_grad_reduce: bool): + Utils.initialize_model_parallel() + + input_dim = 100 + output_dim = 100 + num_layers = 10 + model, param_and_grad_buffer = get_model_and_buffers( + input_dim=input_dim, + output_dim=output_dim, + num_layers=num_layers, + bias=True, + bucket_size=None, # Group all params into single bucket. + use_distributed_optimizer=use_distributed_optimizer, + overlap_grad_reduce=overlap_grad_reduce, + ) + + param_and_grad_buffer.grad_data.data.fill_(1.0) + expected_grad_data_value_after_collective = 1 + if torch.distributed.get_rank() == 0 or not use_distributed_optimizer: + expected_grad_data_value_after_collective = parallel_state.get_data_parallel_world_size() + + params = list(model.parameters()) + for i, param in enumerate(params): + register_grad_sync_context = ( + contextlib.nullcontext() if overlap_grad_reduce else pytest.raises(AssertionError) + ) + finish_grad_sync_context = contextlib.nullcontext() + if i < (len(params) - 1) and overlap_grad_reduce: + # Can't finish grad sync until all params have been registered ready. + finish_grad_sync_context = pytest.raises(AssertionError) + + with register_grad_sync_context: + param_and_grad_buffer.register_grad_ready(param) + with finish_grad_sync_context: + # When overlap_grad_reduce is True, this should throw an assertion error until all + # params in the model have registered their grad above. + # When overlap_grad_reduce is False, the collective is forced through. 
+ param_and_grad_buffer.finish_grad_sync() + + expected_grad_data_value = expected_grad_data_value_after_collective + if overlap_grad_reduce and i < (len(params) - 1): + expected_grad_data_value = 1 + assert int(param_and_grad_buffer.grad_data[0]) == expected_grad_data_value + + if not overlap_grad_reduce: + # Reset grad_data for subsequent collectives. + param_and_grad_buffer.grad_data.data.fill_(1.0) + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/test_utilities.py b/tests/unit_tests/test_utilities.py index 9896a67441..0464866bb8 100644 --- a/tests/unit_tests/test_utilities.py +++ b/tests/unit_tests/test_utilities.py @@ -2,6 +2,15 @@ import torch import megatron.core.parallel_state as ps + +class TestModel(torch.nn.Module): + def __init__(self, input_dim: int, output_dim: int, num_layers: int, bias: bool): + super().__init__() + self.layers = torch.nn.ModuleList( + [torch.nn.Linear(input_dim, output_dim, bias) for _ in range(num_layers)] + ) + + class Utils: world_size = torch.cuda.device_count() @@ -10,20 +19,30 @@ class Utils: @staticmethod def initialize_distributed(): if not torch.distributed.is_initialized() and Utils.rank >= 0: - print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') + print( + f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}' + ) torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) init_method = 'tcp://' master_ip = os.getenv('MASTER_ADDR', 'localhost') master_port = os.getenv('MASTER_PORT', '6000') init_method += master_ip + ':' + master_port - torch.distributed.init_process_group(backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method) + torch.distributed.init_process_group( + backend='nccl', + world_size=Utils.world_size, + rank=Utils.rank, + init_method=init_method, + ) torch.distributed.barrier() @staticmethod def set_world_size(world_size=None, rank=None): Utils.world_size = torch.cuda.device_count() if world_size is None else world_size - if torch.distributed.is_initialized() and Utils.world_size != torch.distributed.get_world_size(): + if ( + torch.distributed.is_initialized() + and Utils.world_size != torch.distributed.get_world_size() + ): torch.distributed.destroy_process_group() if rank is None: @@ -39,7 +58,19 @@ def destroy_model_parallel(): torch.distributed.barrier() @staticmethod - def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None, **kwargs): + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + **kwargs, + ): ps.destroy_model_parallel() Utils.initialize_distributed() - ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank, **kwargs) \ No newline at end of file + ps.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank, + **kwargs, + ) From f993b3138c3b8bdb7dd50d49efdaa92e0ac74b09 Mon Sep 17 00:00:00 2001 From: Lawrence McAfee Date: Thu, 23 May 2024 23:35:27 -0700 Subject: [PATCH 49/92] updated help string. 
--- tools/checkpoint/loader_llama2.py | 2 +- tools/checkpoint/loader_mcore.py | 2 +- tools/checkpoint/loader_megatron.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/checkpoint/loader_llama2.py b/tools/checkpoint/loader_llama2.py index 80beeea8d3..b7fd02f73a 100644 --- a/tools/checkpoint/loader_llama2.py +++ b/tools/checkpoint/loader_llama2.py @@ -32,7 +32,7 @@ def add_arguments(parser): group.add_argument('--tokenizer-model', required=True, help='Sentencepiece tokenizer model.') group.add_argument('--megatron-path', type=str, default=None, - help='Base directory of deepspeed repository') + help='Base directory of Megatron repository') group.add_argument('--loader-transformer-impl', default='local', choices=['local', 'transformer_engine'], help='Which Transformer implementation to use.') diff --git a/tools/checkpoint/loader_mcore.py b/tools/checkpoint/loader_mcore.py index 1f734a7d26..8e571c91c5 100644 --- a/tools/checkpoint/loader_mcore.py +++ b/tools/checkpoint/loader_mcore.py @@ -18,7 +18,7 @@ def add_arguments(parser): help='Path to the vocab file. If specified will use this to get vocab size and ' 'trim padding from the embedding table.') group.add_argument('--megatron-path', type=str, default=None, - help='Base directory of deepspeed repository') + help='Base directory of Megatron repository') group.add_argument('--position-embedding-type', type=str, default='learned_absolute', diff --git a/tools/checkpoint/loader_megatron.py b/tools/checkpoint/loader_megatron.py index 371e426046..7ce41db6c8 100644 --- a/tools/checkpoint/loader_megatron.py +++ b/tools/checkpoint/loader_megatron.py @@ -17,7 +17,7 @@ def add_arguments(parser): help='Path to the vocab file. If specified will use this to get vocab size and ' 'trim padding from the embedding table.') group.add_argument('--megatron-path', type=str, default=None, - help='Base directory of deepspeed repository') + help='Base directory of Megatron repository') group.add_argument('--position-embedding-type', type=str, default='learned_absolute', From bea17d229d38bd1d8222479cd39181a076ff6259 Mon Sep 17 00:00:00 2001 From: Chenhan Yu Date: Fri, 24 May 2024 15:48:33 -0700 Subject: [PATCH 50/92] Update nvidia-ammo 0.7 to nvidia-modelopt 0.11 --- .gitlab-ci.yml | 14 ++ examples/inference/README.md | 44 +++-- examples/inference/ptq_trtllm_llama_7b.sh | 36 ++-- examples/inference/ptq_trtllm_nemotron3_8b.sh | 35 ++-- examples/inference/text_generation_ptq.py | 169 +++++++++--------- examples/inference/trtllm_text_generation.py | 49 +++-- megatron/core/inference/gpt/model_specs.py | 29 +-- .../core/inference/gpt/state_dict_hooks.py | 8 +- .../core/transformer/transformer_config.py | 7 - megatron/inference/arguments.py | 21 +-- megatron/inference/gpt/model_provider.py | 54 +++--- .../inference/test_modelopt_gpt_model.py | 44 +++++ 12 files changed, 299 insertions(+), 211 deletions(-) create mode 100644 tests/unit_tests/inference/test_modelopt_gpt_model.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0f833a9dda..f5b6d9cf63 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -96,6 +96,20 @@ unit_tests-fusions: when: never - when: always +unit_tests-inference: + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 + tags: + - 8xL40S + stage: test + script: + - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/inference + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' + when: never + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + when: never 
+ - when: always + unit_tests-models: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 tags: diff --git a/examples/inference/README.md b/examples/inference/README.md index 7251a8d015..a70ff84cc2 100644 --- a/examples/inference/README.md +++ b/examples/inference/README.md @@ -4,10 +4,10 @@ We recommend that users follow TensorRT-LLM's official installation guide to build it from source and proceed with a containerized environment (`docker.io/tensorrt_llm/release:latest`): -``` +```sh git clone https://github.com/NVIDIA/TensorRT-LLM.git cd TensorRT-LLM -git checkout v0.7.1 +git checkout v0.9.0 make -C docker release_build ``` @@ -15,18 +15,17 @@ make -C docker release_build > you may need to copy the entire dir as `COPY ./ /src/tensorrt_llm` since a `git submodule` is > called later which requires `.git` to continue. -Once the container is built, install `nvidia-ammo` and additional dependencies for sharded checkpoint support: -``` -pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo +Once the container is built, install `nvidia-modelopt` and additional dependencies for sharded checkpoint support: +```sh +pip install "nvidia-modelopt[all]~=0.11.0" --extra-index-url https://pypi.nvidia.com pip install zarr tensorstore==0.1.45 ``` -TensorRT-LLM quantization functionalities are currently packaged in `nvidia-ammo`. -You can find more documentation about `nvidia-ammo` in [TensorRT-LLM's quantization -examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/quantization). +TensorRT-LLM quantization functionalities are currently packaged in `nvidia-modelopt`. +You can find more documentation about `nvidia-modelopt` [here](https://nvidia.github.io/TensorRT-Model-Optimizer/). ## Support Matrix -The following matrix shows the current support for the PTQ + TensorRT-LLM export flow. +The following matrix shows the current support for the PTQ + TensorRT-LLM export flow. | model | fp16 | int8_sq | fp8 | int4_awq | |-----------------------------|------|---------| ----| -------- | @@ -40,17 +39,17 @@ Our PTQ + TensorRT-LLM flow has native support on MCore `GPTModel` with a mixed and Transformer-Engine Norm (`TENorm`). Note that this is not the default mcore gpt spec. You can still load the following checkpoint formats with some remedy: -| GPTModel | sharded | remedy arguments | -|-----------------------------------|---------|-----------------------------------------| -| megatron.legacy.model | | `--ammo-load-classic-megatron-to-mcore` | -| TE-Fused (default mcore gpt spec) | | `--ammo-convert-te-to-local-spec` | -| TE-Fused (default mcore gpt spec) | x | | +| GPTModel | sharded | remedy arguments | +|-----------------------------------|---------|---------------------------------------------| +| megatron.legacy.model | | `--export-legacy-megatron` | +| TE-Fused (default mcore gpt spec) | | `--export-te-mcore-model` | +| TE-Fused (default mcore gpt spec) | x | | > **TROUBLE SHOOTING:** If you are trying to load an unpacked `.nemo` sharded checkpoint, then typically you will -> need to adding `additional_sharded_prefix="model."` to `ammo_load_checkpoint()` since NeMo has an additional +> need to adding `additional_sharded_prefix="model."` to `modelopt_load_checkpoint()` since NeMo has an additional > `model.` wrapper on top of the `GPTModel`. -> **NOTE:** flag `--ammo-load-classic-megatron-to-mcore` may not work on all legacy checkpoint versions. +> **NOTE:** flag `--export-legacy-megatron` may not work on all legacy checkpoint versions. 
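+
+For reference, a minimal sketch of the `.nemo` remedy mentioned in the trouble-shooting note above
+(it assumes you are inside `examples/inference/text_generation_ptq.py`, where `model` comes from
+`get_model(...)` and the `modelopt_load_checkpoint` helper is defined) could look like:
+
+```python
+# Hypothetical usage only: account for NeMo's extra `model.` prefix when loading an
+# unpacked .nemo sharded checkpoint.
+modelopt_load_checkpoint(model, additional_sharded_prefix="model.")
+```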
## Examples @@ -75,12 +74,13 @@ cd .. ``` Now launch the PTQ + TensorRT-LLM export script, -``` +```sh bash examples/inference/ptq_trtllm_nemotron3_8b ./nemotron-3-8b-base-4k None ``` By default, `cnn_dailymail` is used for calibration. The `GPTModel` will have quantizers for simulating the quantization effect. The checkpoint will be saved optionally (with quantizers as additional states) and can -be restored for further evaluation. TensorRT-LLM engine is exported to `/tmo/ammo` by default. +be restored for further evaluation. TensorRT-LLM checkpoint and engine are exported to `/tmp/trtllm_ckpt` and +built in `/tmp/trtllm_engine` by default. The script expects `${CHECKPOINT_DIR}` (`./nemotron-3-8b-base-4k`) to have the following structure: ``` @@ -101,14 +101,10 @@ The script expects `${CHECKPOINT_DIR}` (`./nemotron-3-8b-base-4k`) to have the f > some special tokens, `encode`, and `batch_decode`. As a result, the tokenizer behavior in TensorRT-LLM engine may > not match exactly. -> **TROUBLE SHOOTING:** If you are loading `.nemo` sharded checkpoint here, call -> `ammo_load_checkpoint(..., additional_sharded_prefix="model.")` with additional sharded prefix in -> `text_generation_ptq.py` to align the sharded keys. - ### llama2-text-7b INT8 SmoothQuant and TensorRT-LLM Deployment > **NOTE:** Due to the LICENSE issue, we do not provide a MCore checkpoint to download. Users can follow -> the instruction in `docs/llama2.md` to convert the checkpoint to megatron classic `GPTModel` format and -> use `--ammo-load-classic-megatron-to-mcore` flag which will remap the checkpoint to the MCore `GPTModel` spec +> the instruction in `docs/llama2.md` to convert the checkpoint to megatron legacy `GPTModel` format and +> use `--export-legacy-megatron` flag which will remap the checkpoint to the MCore `GPTModel` spec > that we support. ```sh diff --git a/examples/inference/ptq_trtllm_llama_7b.sh b/examples/inference/ptq_trtllm_llama_7b.sh index 4b285f95f9..1c8322203f 100644 --- a/examples/inference/ptq_trtllm_llama_7b.sh +++ b/examples/inference/ptq_trtllm_llama_7b.sh @@ -1,4 +1,6 @@ #!/bin/bash +set -e + DEFAULT_NAME="/checkpoints/llama2-text-7b_v0.2.0" NAME="${1:-$DEFAULT_NAME}" @@ -7,7 +9,6 @@ QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}" # CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER. TP="8" -PP=1 INFERENCE_TP=${TP} DECODER_TYPE="llama" CHECKPOINT_LOAD_DIR="${NAME}" @@ -19,19 +20,21 @@ if [ "$QUANT_CFG" = "int4_awq" ]; then fi additional_options=" \ - --ammo-quant-cfg ${QUANT_CFG} \ - --ammo-load-classic-megatron-to-mcore \ + --export-quant-cfg ${QUANT_CFG} \ + --export-legacy-megatron \ + --export-te-mcore-model \ + --calib-batch-size 8 \ --decoder ${DECODER_TYPE} \ - --engine-dir /tmp/ammo \ - --max-input-len 2048 \ - --max-output-len 512 \ - --max-batch-size 8 \ + --export-dir /tmp/trtllm_ckpt \ --inference-tensor-parallel ${INFERENCE_TP} " trtllm_options=" \ - --engine-dir /tmp/ammo \ + --tensorrt-llm-checkpoint-dir /tmp/trtllm_ckpt \ + --engine-dir /tmp/trtllm_engine \ --tokenizer ${CHECKPOINT_LOAD_DIR}/hf \ - --max-output-len 512 " + --max-input-len 2048 \ + --max-output-len 512 \ + --max-batch-size 8 " # DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!! 
export CUDA_DEVICE_MAX_CONNECTIONS=1 @@ -39,10 +42,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 options=" \ --disable-bias-linear \ --swiglu \ + --no-rope-fusion \ --untie-embeddings-and-output-weights \ --use-rotary-position-embeddings \ --normalization RMSNorm \ - --norm-epsilon 1e-5 \ + --rotary-percent 1.0 \ --no-position-embedding \ --no-masked-softmax-fusion \ --no-bias-gelu-fusion \ @@ -54,26 +58,26 @@ options=" \ --hidden-size 4096 \ --ffn-hidden-size 11008 \ --num-attention-heads 32 \ - --seq-length 2048 \ + --seq-length 4096 \ --max-position-embeddings 4096 \ --micro-batch-size 1 \ --make-vocab-size-divisible-by 1 \ --tokenizer-type Llama2Tokenizer \ --tokenizer-model ${TOKENIZER_MODEL} \ --save-interval 1000000 \ - --bf16 \ + --use-dist-ckpt \ + --load ${CHECKPOINT_LOAD_DIR} + --fp16 \ --use-mcore-models " -set +x - # Precompile CUDA extentions -python -c "import ammo.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)" +python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)" # Acquire launch configuration where variable launch_config will be set launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/text_generation_ptq.py ${options} ${additional_options} --load ${CHECKPOINT_LOAD_DIR} +torchrun ${launch_config} examples/inference/text_generation_ptq.py ${options} ${additional_options} # This script is using mpi4py which will fork multiple processes. python examples/inference/trtllm_text_generation.py ${trtllm_options} diff --git a/examples/inference/ptq_trtllm_nemotron3_8b.sh b/examples/inference/ptq_trtllm_nemotron3_8b.sh index 2a90367d4c..2a42d1f10c 100644 --- a/examples/inference/ptq_trtllm_nemotron3_8b.sh +++ b/examples/inference/ptq_trtllm_nemotron3_8b.sh @@ -1,5 +1,7 @@ #!/bin/bash -DEFAULT_NAME="/checkpoints/nemotron3-8b_v0.2.0" +set -e + +DEFAULT_NAME="/checkpoints/nemotron3-8b_v0.3.0" NAME="${1:-$DEFAULT_NAME}" DEFAULT_QUANT_CFG="fp8" @@ -10,26 +12,28 @@ TP="8" INFERENCE_TP=${TP} DECODER_TYPE="gptnext" CHECKPOINT_LOAD_DIR="${NAME}" -TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model" +TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/tokenizer.model" if [ "$QUANT_CFG" = "int4_awq" ]; then INFERENCE_TP="1" fi additional_options=" \ - --ammo-quant-cfg ${QUANT_CFG} \ - --ammo-load-classic-megatron-to-mcore \ + --export-quant-cfg ${QUANT_CFG} \ + --export-legacy-megatron \ + --export-te-mcore-model \ + --calib-batch-size 8 \ --decoder ${DECODER_TYPE} \ - --engine-dir /tmp/ammo \ - --max-input-len 2048 \ - --max-output-len 512 \ - --max-batch-size 8 \ + --export-dir /tmp/trtllm_ckpt \ --inference-tensor-parallel ${INFERENCE_TP} " trtllm_options=" \ - --engine-dir /tmp/ammo \ + --tensorrt-llm-checkpoint-dir /tmp/trtllm_ckpt \ + --engine-dir /tmp/trtllm_engine \ --tokenizer ${TOKENIZER_MODEL} \ - --max-output-len 512 " + --max-input-len 2048 \ + --max-output-len 512 \ + --max-batch-size 8 " # DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!! 
export CUDA_DEVICE_MAX_CONNECTIONS=1 @@ -38,6 +42,7 @@ options=" \ --apply-layernorm-1p \ --untie-embeddings-and-output-weights \ --disable-bias-linear \ + --no-rope-fusion \ --no-position-embedding \ --use-rotary-position-embeddings \ --rotary-percent 0.5 \ @@ -56,20 +61,18 @@ options=" \ --tokenizer-model ${TOKENIZER_MODEL} \ --save-interval 1000000 \ --load ${CHECKPOINT_LOAD_DIR} \ - --bf16 \ + --fp16 \ + --use-dist-ckpt \ --use-mcore-models " -set +x - # Precompile CUDA extentions -python -c "import ammo.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)" +python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)" # Acquire launch configuration where variable launch_config will be set launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/text_generation_ptq.py ${options} ${additional_options} --load ${CHECKPOINT_LOAD_DIR} +torchrun ${launch_config} examples/inference/text_generation_ptq.py ${options} ${additional_options} # This script is using mpi4py which will fork multiple processes. python examples/inference/trtllm_text_generation.py ${trtllm_options} - diff --git a/examples/inference/text_generation_ptq.py b/examples/inference/text_generation_ptq.py index 85aa4d13db..b6c2b445b4 100644 --- a/examples/inference/text_generation_ptq.py +++ b/examples/inference/text_generation_ptq.py @@ -8,46 +8,42 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) -import ammo.torch.quantization as atq +import modelopt.torch.quantization as mtq import torch from datasets import load_dataset +from modelopt.torch.utils.distributed import set_data_parallel_group, set_tensor_parallel_group +from tqdm import tqdm -# [ModelOpt]: changing the default model provider to the AMMO version -from megatron.training import get_args, print_rank_0 -from megatron.training.checkpointing import load_checkpoint, save_checkpoint +# [ModelOpt]: changing the default model provider to the ModelOpt version from megatron.core import mpu from megatron.core.dist_checkpointing import load -from megatron.inference.arguments import add_ammo_args +from megatron.inference.arguments import add_modelopt_args from megatron.inference.gpt.model_provider import model_provider -from megatron.training.initialize import initialize_megatron from megatron.inference.text_generation import generate_and_post_process -from megatron.training import get_model -from megatron.training.utils import unwrap_model +from megatron.training import get_args, get_model, initialize_megatron +from megatron.training.checkpointing import load_checkpoint, save_checkpoint +from megatron.training.utils import print_rank_0, unwrap_model QUANT_CFG_CHOICES = { - "int8": atq.INT8_DEFAULT_CFG, - "int8_sq": atq.INT8_SMOOTHQUANT_CFG, - "fp8": atq.FP8_DEFAULT_CFG, - "int4_awq": atq.INT4_AWQ_CFG, - "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, + "int8": mtq.INT8_DEFAULT_CFG, + "int8_sq": mtq.INT8_SMOOTHQUANT_CFG, + "fp8": mtq.FP8_DEFAULT_CFG, + "int4_awq": mtq.INT4_AWQ_CFG, + "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG, + "int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, } -def add_trtllm_args(parser): +def add_trtllm_ckpt_export_args(parser): """Add additional arguments for TensorRT-LLM.""" group = parser.add_argument_group(title="trtllm") group.add_argument( - "--engine-dir", type=str, help="The output TensorRT-LLM engine dir.", + "--export-dir", type=str, help="The output TensorRT-LLM checkpoint.", ) 
group.add_argument( "--decoder", type=str, choices=["gptnext", 'llama'], help="The decoder type of the model.", ) - group.add_argument("--max-input-len", type=int, help="Max input sequence length.", default=2048) - group.add_argument( - "--max-output-len", type=int, help="Max output sequence length.", default=512 - ) - group.add_argument("--max-batch-size", type=int, help="Max batch size.", default=32) group.add_argument( "--inference-tensor-parallel", type=int, @@ -57,8 +53,8 @@ def add_trtllm_args(parser): def add_text_generate_ptq_args(parser): - """Add additional arguments for AMMO text generation PTQ.""" - group = parser.add_argument_group(title='AMMO text generation ptq') + """Add additional arguments for ModelOpt text generation PTQ.""" + group = parser.add_argument_group(title='ModelOpt text generation ptq') group.add_argument( "--calib-dataset", type=str, @@ -66,7 +62,10 @@ def add_text_generate_ptq_args(parser): help="Calibration datasets from HuggingFace datasets.", ) group.add_argument( - "--calib-steps", type=int, default=512, help="Steps to perform atq.quantize calibration." + "--calib-batch-size", type=int, default=4, help="Batch size to use for ptq calibration." + ) + group.add_argument( + "--calib-size", type=int, default=512, help="Samples to use for ptq calibration." ) parser.add_argument( "--prompts", @@ -76,15 +75,20 @@ def add_text_generate_ptq_args(parser): ), help="Input texts. Please use | to separate different batches.", ) - add_ammo_args(parser) - add_trtllm_args(parser) + add_modelopt_args(parser) + add_trtllm_ckpt_export_args(parser) return parser def get_calib_dataloader( data="cnn_dailymail", batch_size=4, calib_size=512, max_sequence_length=512 ): - if data == "wikitext": + if data == "pileval": + dataset = load_dataset( + "json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train" + ) + text_column = "text" + elif data == "wikitext": dataset = load_dataset("wikitext", "wikitext-103-v1", split="train") text_column = "text" elif data == "cnn_dailymail": @@ -99,8 +103,8 @@ def get_calib_dataloader( yield batch -def ammo_load_checkpoint( - model, optimizer=None, opt_param_scheduler=None, strict=True, additional_sharded_prefix="" +def modelopt_load_checkpoint( + model, optimizer=None, opt_param_scheduler=None, strict=True, additional_sharded_prefix="model." ): """Load a megatron checkpoint depending its format. @@ -108,7 +112,7 @@ def ammo_load_checkpoint( model: MCoreGPTModel instance optimizer: Megatron optimizer instance opt_param_scheduler: Megatron scheduler instance - strict: if True, no extra or missing keys are allowed while loading the state_dict + strict: if True, no extra or missing keys are allowed while loading the state_dict additional_sharded_prefix (str): Append additional prefix to align the sharded checkpoint keys. When loading an .nemo sharded checkpoint, this is usually `model.`. Otherwise, this is typically an empty string. """ @@ -159,28 +163,29 @@ def _remove_prefix_state_dict_pre_hook( args = get_args() if args.num_layers_per_virtual_pipeline_stage is not None: - print("Interleaved pipeline schedule is not yet supported for text generation.") + print_rank_0("Interleaved pipeline schedule is not yet supported for text generation.") exit() + print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text generation.") + args.exit_on_missing_checkpoint = True + + # Set up model and load checkpoint + # [ModelOpt]: make sure that output logits are allgathered. 
text_generation_model_provider = functools.partial(model_provider, parallel_output=False) model = get_model(text_generation_model_provider, wrap_with_ddp=False) - assert len(model) == 1, "Above condition should have caught this" if args.load is not None: - _ = ammo_load_checkpoint( - model, - None, - None, - strict=not args.untie_embeddings_and_output_weights, - additional_sharded_prefix="model.", - ) - else: - print_rank_0("WARNING: No checkpoint is loaded for PTQ! The process will still continue.") + modelopt_load_checkpoint(model) + print_rank_0("Done loading checkpoint") + + # Removing virtual pipeline parallel and other wrapper + assert len(model) == 1, "Above condition should have caught this" + unwrapped_model = unwrap_model(model) all_prompts = args.prompts.split("|") - def custom_prompt_forward_loop_func(): - for prompt in all_prompts: + def custom_prompt_forward_loop_func(model): + for prompt in tqdm(all_prompts): if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: ( prompts_plus_generations, @@ -188,7 +193,7 @@ def custom_prompt_forward_loop_func(): logprobs, _, ) = generate_and_post_process( - model[0], + model, prompts=[prompt], tokens_to_generate=128, return_output_log_probs=True, @@ -196,11 +201,11 @@ def custom_prompt_forward_loop_func(): ) print_rank_0(prompts_plus_generations) else: - generate_and_post_process(model[0]) + generate_and_post_process(model) - def hf_dataset_forword_loop_func(): - dataloader = get_calib_dataloader(args.calib_dataset, calib_size=args.calib_steps) - for prompts in dataloader: + def hf_dataset_forword_loop_func(model): + dataloader = get_calib_dataloader(args.calib_dataset, args.calib_batch_size, args.calib_size) + for prompts in tqdm(dataloader, total=args.calib_size//args.calib_batch_size): if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: ( prompts_plus_generations, @@ -208,66 +213,58 @@ def hf_dataset_forword_loop_func(): logprobs, _, ) = generate_and_post_process( - model[0], + model, prompts=prompts, tokens_to_generate=0, return_output_log_probs=True, temperature=1.0, ) else: - generate_and_post_process(model[0]) + generate_and_post_process(model) ptq_forward_loop_func = custom_prompt_forward_loop_func if args.calib_dataset is not None: ptq_forward_loop_func = hf_dataset_forword_loop_func - if args.ammo_quant_cfg in QUANT_CFG_CHOICES: - atq_config = QUANT_CFG_CHOICES[args.ammo_quant_cfg] - if "awq" in args.ammo_quant_cfg: - weight_quantizer = atq_config["quant_cfg"]["*weight_quantizer"] # type: ignore + # Setting data parallel and tensor parallel group + set_data_parallel_group(mpu.get_data_parallel_group()) + set_tensor_parallel_group(mpu.get_tensor_model_parallel_group()) + + if args.export_quant_cfg in QUANT_CFG_CHOICES: + mtq_config = QUANT_CFG_CHOICES[args.export_quant_cfg] + if "*output_layer*" not in mtq_config["quant_cfg"]: + mtq_config["quant_cfg"]["*output_layer*"] = {"enable": False} + if "awq" in args.export_quant_cfg: + weight_quantizer = mtq_config["quant_cfg"]["*weight_quantizer"] # type: ignore if isinstance(weight_quantizer, list): weight_quantizer = weight_quantizer[0] weight_quantizer["block_sizes"][-1] = 128 - atq_config["quant_cfg"]["*.output_layer.*"] = {"enable": False} - print_rank_0("atq.quantize: output_layer quantization is disable") - atq.quantize(model[0], atq_config, ptq_forward_loop_func) - custom_prompt_forward_loop_func() - if args.save: - save_checkpoint(1, model, None, None) - else: - custom_prompt_forward_loop_func() + print_rank_0("Quantizing the 
model...") + mtq.quantize(unwrapped_model[0], mtq_config, ptq_forward_loop_func) - if args.engine_dir: - from ammo.deploy.llm import model_config_to_tensorrt_llm - from ammo.torch.export import torch_to_model_config + custom_prompt_forward_loop_func(model[0]) - assert args.decoder in ["gptnext", "llama"], f"Decoder type {args.decoder} not supported." + if args.save is not None and args.export_quant_cfg in QUANT_CFG_CHOICES: + save_checkpoint(1, unwrapped_model, None, None, 0) - Path(args.engine_dir).mkdir(parents=True, exist_ok=True) + print_rank_0(f"Fake Quantized Model:\n {unwrapped_model[0]}") - print_rank_0("Exporting model_configs for TRT LLM.") - model = unwrap_model(model) - model = model[0] + if args.export_dir: + assert args.decoder in ["gptnext", "llama"], f"Decoder type {args.decoder} not supported." + Path(args.export_dir).mkdir(parents=True, exist_ok=True) + print_rank_0("Exporting TensorRT-LLM checkpoints.") + + from modelopt.torch.export import export_tensorrt_llm_checkpoint # In TRT LLM, squared relu activation does not support bf16. So we use fp16 by default. - model_configs = torch_to_model_config( - model, + export_tensorrt_llm_checkpoint( + unwrapped_model[0], args.decoder, - torch.float16, + torch.bfloat16 if args.bf16 else torch.float16, + export_dir=args.export_dir, inference_tensor_parallel=args.inference_tensor_parallel, + inference_pipeline_parallel=1, + use_nfs_workspace=True, ) - print_rank_0("Building TRT LLM engines.") - for model_config in model_configs: - model_config_to_tensorrt_llm( - model_config, - args.engine_dir, - max_input_len=args.max_input_len, - max_output_len=args.max_output_len, - max_batch_size=args.max_batch_size, - max_beam_width=1, - num_build_workers=1, - inflight_batching=False, - enable_sparsity=False, - ) - print_rank_0(f"TRT LLM engines saved to {args.engine_dir}") + print_rank_0(f"TensorRT-LLM checkpoints saved to {args.export_dir}") diff --git a/examples/inference/trtllm_text_generation.py b/examples/inference/trtllm_text_generation.py index c6c0098f20..17a47bfa3c 100644 --- a/examples/inference/trtllm_text_generation.py +++ b/examples/inference/trtllm_text_generation.py @@ -7,7 +7,7 @@ import numpy as np import torch -from ammo.deploy.llm import generate, load, unload +from modelopt.deploy.llm import LLM, build_tensorrt_llm from transformers import AutoTokenizer, T5Tokenizer @@ -23,19 +23,30 @@ def __init__(self, model): super().__init__(model, extra_ids=0, bos_token="", pad_token="") def encode(self, text, add_special_tokens: bool = True, **kwargs): - return self.sp_model.encode_as_ids(text) + return torch.Tensor(self.sp_model.encode_as_ids(text)) + + def batch_encode_plus( + self, batch_text_or_text_pairs, add_special_tokens: bool = True, **kwargs + ): + return {'input_ids': self.sp_model.encode_as_ids(batch_text_or_text_pairs)} def batch_decode(self, sequences, skip_special_tokens: bool = False, **kwargs): if isinstance(sequences, np.ndarray) or torch.is_tensor(sequences): sequences = sequences.tolist() return self.sp_model.decode(sequences) + def decode(self, token_ids, skip_special_tokens: bool = False, **kwargs): + return self.sp_model.decode([token_ids])[0] + def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument("--tokenizer", type=str, default="") - parser.add_argument("--max-output-len", type=int, default=100) - parser.add_argument("--engine-dir", type=str, default="/tmp/ammo") + parser.add_argument("--max-input-len", type=int, default=4096) + parser.add_argument("--max-output-len", type=int, 
default=512) + parser.add_argument("--max-batch-size", type=int, default=8) + parser.add_argument("--tensorrt-llm-checkpoint-dir", type=str, default=None) + parser.add_argument("--engine-dir", type=str, default="/tmp/trtllm_engine") parser.add_argument( "--input-texts", type=str, @@ -44,7 +55,7 @@ def parse_arguments(): ), help="Input texts. Please use | to separate different batches.", ) - parser.add_argument("--max-num-beams", type=int, default=1) + parser.add_argument("--max-beam-width", type=int, default=1) parser.add_argument("--profiler-output", type=str, default="") return parser.parse_args() @@ -62,6 +73,7 @@ def run(args): raise ValueError( "arg.tokenizer must be a dir to a hf tokenizer checkpoint for llama or a SentencePiece .model file for gptnext" ) + print(tokenizer, tokenizer.vocab_size) if not hasattr(args, "profiler_output"): args.profiler_output = "" @@ -70,22 +82,33 @@ def run(args): assert input_texts, "input_text not specified" print(input_texts) + if args.tensorrt_llm_checkpoint_dir is not None: + print("Building TensorRT-LLM engines.") + build_tensorrt_llm( + args.tensorrt_llm_checkpoint_dir + "/config.json", + args.engine_dir, + max_input_len=args.max_input_len, + max_batch_size=args.max_batch_size, + max_beam_width=args.max_beam_width, + num_build_workers=1, + ) + print(f"TensorRT-LLM engines saved to {args.engine_dir}") + free_memory_before = torch.cuda.mem_get_info() - host_context = load( - tokenizer=tokenizer, engine_dir=args.engine_dir, num_beams=args.max_num_beams - ) + # This is a ModelOpt wrapper on top of tensorrt_llm.hlapi.llm.LLM + llm_engine = LLM(args.engine_dir, tokenizer) + torch.cuda.cudart().cudaProfilerStart() - outputs = generate(input_texts, args.max_output_len, host_context, None, args.profiler_output) - print(outputs) + # outputs = llm_engine.generate_text(input_texts, args.max_output_len, args.max_beam_width) + outputs = llm_engine.generate(input_texts) torch.cuda.cudart().cudaProfilerStop() free_memory_after = torch.cuda.mem_get_info() print( - f"Use GPU memory: {(free_memory_before[0] - free_memory_after[0]) / 1024 / 1024 / 1024} GB" + f"Used GPU memory: {(free_memory_before[0] - free_memory_after[0]) / 1024 / 1024 / 1024} GB" ) - - unload(host_context) + print(outputs) if __name__ == "__main__": diff --git a/megatron/core/inference/gpt/model_specs.py b/megatron/core/inference/gpt/model_specs.py index 50467ef414..5d6d0d7d44 100644 --- a/megatron/core/inference/gpt/model_specs.py +++ b/megatron/core/inference/gpt/model_specs.py @@ -3,22 +3,30 @@ from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules -from megatron.core.transformer.custom_layers.transformer_engine import TENorm -from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.custom_layers.transformer_engine import TEDotProductAttention, TENorm from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules -# Use this spec for AMMO PTQ and TensorRT-LLM export -def get_gpt_layer_ammo_spec() -> ModuleSpec: +# Use this spec for ModelOpt PTQ and TensorRT-LLM export +def 
get_gpt_layer_modelopt_spec( + remap_te_layernorm: bool = False, qk_layernorm: bool = False +) -> ModuleSpec: """Mix the native spec with TENorm. This is essentially the native local spec except for the layernorm implementation - is using TENorm from Transformer-Engine. This TENorm supports both FusedLayerNorm and RMSNorm and - prevents the apex dependency. + is using TENorm from Transformer-Engine. The issue is that FusedLayerNorm from apex + has stopped supporting RMSNorm needed by llama. """ + sharded_state_dict_keys_map = {} + if remap_te_layernorm: + sharded_state_dict_keys_map = { + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + } return ModuleSpec( module=TransformerLayer, submodules=TransformerLayerSubmodules( @@ -28,8 +36,10 @@ def get_gpt_layer_ammo_spec() -> ModuleSpec: params={"attn_mask_type": AttnMaskType.causal}, submodules=SelfAttentionSubmodules( linear_qkv=ColumnParallelLinear, - core_attention=DotProductAttention, + core_attention=TEDotProductAttention, linear_proj=RowParallelLinear, + q_layernorm=TENorm if qk_layernorm else IdentityOp, + k_layernorm=TENorm if qk_layernorm else IdentityOp, ), ), self_attn_bda=get_bias_dropout_add, @@ -42,9 +52,6 @@ def get_gpt_layer_ammo_spec() -> ModuleSpec: ), mlp_bda=get_bias_dropout_add, # Map TE-layernorm-fusion keys back - sharded_state_dict_keys_map={ - 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', - 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', - }, + sharded_state_dict_keys_map=sharded_state_dict_keys_map, ), ) diff --git a/megatron/core/inference/gpt/state_dict_hooks.py b/megatron/core/inference/gpt/state_dict_hooks.py index 7d6197d655..7222c78460 100644 --- a/megatron/core/inference/gpt/state_dict_hooks.py +++ b/megatron/core/inference/gpt/state_dict_hooks.py @@ -7,15 +7,15 @@ logger = getLogger(__name__) -def mcore_gpt_load_classic_state_dict_pre_hook( +def mcore_gpt_load_legacy_state_dict_pre_hook( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs, ): """Register a pre-hook to fix the state_dict key difference. - This prehook is used when trying to load the classic Megatron-LM GPTModel into its + This prehook is used when trying to load the legacy Megatron-LM GPTModel into its megatron/core variant that uses native ParallelLinear and Transformer-Engine Norm. Only this particular spec supports post-training quantization and TensorRT-LLM - config export through `nvidia-ammo` package. + config export through `nvidia-modelopt` package. Args: state_dict: state dictionary @@ -89,7 +89,7 @@ def mcore_gpt_load_te_state_dict_pre_hook( fused Transformer-Engine ParallelLinear into the variant that uses native ParallelLinear and Transformer-Engine Norm (effectively to restore the fusion). Only this particular spec supports post-training quantization and TensorRT-LLM - config export through `nvidia-ammo` package. + config export through `nvidia-modelopt` package. Args: state_dict: state dictionary diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 250b2fdcd2..93210ef657 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -280,13 +280,6 @@ class TransformerConfig(ModelParallelConfig): enable_cuda_graph: bool = False """When set to true, TransformerLayer blocks are wrapped with CUDA graph.""" - # These 2 attributes are WAR for TRTLLM export. DO NOT USE!! WILL BE DEPRECATED SOON!! 
- max_position_embeddings: int = 0 - """Deprecated. Do not use.""" - - rotary_percent: float = 0 - """Deprecated. Do not use.""" - def __post_init__(self): """ Python dataclass method that is used to modify attributes after initialization. See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. diff --git a/megatron/inference/arguments.py b/megatron/inference/arguments.py index c03e70cdb6..7fcd7a7dc3 100644 --- a/megatron/inference/arguments.py +++ b/megatron/inference/arguments.py @@ -1,25 +1,26 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -def add_ammo_args(parser): - """Add additional arguments for ammo.""" - group = parser.add_argument_group(title="ammo-generic") + +def add_modelopt_args(parser): + """Add additional arguments for using TensorRT Model Optimizer (modelopt) features.""" + group = parser.add_argument_group(title="modelopt-generic") group.add_argument( - "--ammo-load-classic-megatron-to-mcore", + "--export-legacy-megatron", action="store_true", - help="Load a classic megatron-lm checkpoint to a new megatron-core model.", + help="Export a legacy megatron-lm checkpoint.", ) group.add_argument( - "--ammo-convert-te-to-local-spec", + "--export-te-mcore-model", action="store_true", - help="Load a megatron-core transformer-engine checkpoint to a model with local spec.", + help="Export a megatron-core transformer-engine checkpoint.", ) group.add_argument( - "--ammo-quant-cfg", + "--export-quant-cfg", type=str, default=None, - choices=["int8_sq", "fp8", "int4_awq", "None"], - help="Algorithms supported by atq.quantize.", + choices=["int8", "int8_sq", "fp8", "int4_awq", "w4a8_awq", "int4", "None"], + help="Specify a quantization config from the supported choices.", ) return parser diff --git a/megatron/inference/gpt/model_provider.py b/megatron/inference/gpt/model_provider.py index e0cc326861..c6d3761de6 100644 --- a/megatron/inference/gpt/model_provider.py +++ b/megatron/inference/gpt/model_provider.py @@ -2,24 +2,22 @@ """ModelOpt GPT model provider.""" -from typing import Union - -from megatron.training import get_args, print_rank_0 -from megatron.training.arguments import core_transformer_config_from_args -from megatron.core.inference.gpt.model_specs import get_gpt_layer_ammo_spec +from megatron.core.inference.gpt.model_specs import get_gpt_layer_modelopt_spec from megatron.core.inference.gpt.state_dict_hooks import ( - mcore_gpt_load_classic_state_dict_pre_hook, + mcore_gpt_load_legacy_state_dict_pre_hook, mcore_gpt_load_te_state_dict_pre_hook, ) from megatron.core.models.gpt import GPTModel as MCoreGPTModel +from megatron.core.parallel_state import get_tensor_model_parallel_rank +from megatron.core.transformer.spec_utils import import_module +from megatron.training import get_args, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args -def model_provider( - pre_process=True, post_process=True, parallel_output=True, -) -> Union[MCoreGPTModel]: - """Builds the GPT model. +def model_provider(pre_process=True, post_process=True, parallel_output=True) -> MCoreGPTModel: + """Builds the model. - This model_provider only sypport use_mcore_models=True. + If you set the use_mcore_models to True, it will return the mcore GPT model and if not the legacy GPT model. Args: pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. @@ -28,21 +26,23 @@ def model_provider( True if `model_provider` is called in text_generation_server. 
Returns: - Union[MCoreGPTModel]: The returned model + MCoreGPTModel: The returned model """ args = get_args() print_rank_0("building GPT model ...") + + # ModelOpt by default assumes none homogenous layers. This affect the storage format of the sharded checkpoint. config = core_transformer_config_from_args(get_args()) + config.non_homogeneous_layers = True if args.use_mcore_models: if args.spec is not None: - raise ValueError("Custom layer specs are not supported!") + transformer_layer_spec = import_module(args.spec) else: - if args.num_experts is None: - transformer_layer_spec = get_gpt_layer_ammo_spec() - else: - raise ValueError("MoE is not supported for now!") + transformer_layer_spec = get_gpt_layer_modelopt_spec( + remap_te_layernorm=args.export_te_mcore_model, qk_layernorm=False, + ) model_type = MCoreGPTModel model_kwargs = { @@ -59,15 +59,21 @@ def model_provider( "rotary_percent": args.rotary_percent, } else: - raise ValueError("Classic Megatron-LM models are not supported!") + raise ValueError( + "ModelOpt integration only support MCore models. Use --use-mcore-modules instead." + ) model = model_type(**model_kwargs) - print_rank_0(str(model)) - if args.use_mcore_models: - if args.ammo_load_classic_megatron_to_mcore: - model._register_load_state_dict_pre_hook(mcore_gpt_load_classic_state_dict_pre_hook) - elif args.ammo_convert_te_to_local_spec: - model._register_load_state_dict_pre_hook(mcore_gpt_load_te_state_dict_pre_hook) + # Register some load_state_dict prehooks to handle some known state_dict key mismatch. + # (legacy <-> modelopt) and (default te <-> modelopt) + if args.export_legacy_megatron: + model._register_load_state_dict_pre_hook(mcore_gpt_load_legacy_state_dict_pre_hook) + if args.export_te_mcore_model: + model._register_load_state_dict_pre_hook(mcore_gpt_load_te_state_dict_pre_hook) + + # Print models on all pp ranks. + if get_tensor_model_parallel_rank() == 0: + print(str(model)) return model diff --git a/tests/unit_tests/inference/test_modelopt_gpt_model.py b/tests/unit_tests/inference/test_modelopt_gpt_model.py new file mode 100644 index 0000000000..4060b1f259 --- /dev/null +++ b/tests/unit_tests/inference/test_modelopt_gpt_model.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.inference.gpt.model_specs import get_gpt_layer_modelopt_spec +from megatron.core.inference.gpt.state_dict_hooks import mcore_gpt_load_te_state_dict_pre_hook + + +class TestModelOptGPTModel: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + use_cpu_initialization=True, + ) + self.gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), + vocab_size=100, + max_sequence_length=4, + ) + # Ensure that a GPTModel can be built with the modelopt spec. 
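The key-remapping pre-hooks above (and the test that follows) build on PyTorch's `_register_load_state_dict_pre_hook` mechanism, which lets a module rewrite checkpoint keys in place before `load_state_dict` matches them against its own parameters. Below is a minimal, self-contained sketch of that pattern; the `OldModel`/`NewModel` classes and the `KEY_MAP` entry are invented for illustration, and only the hook signature, registration, and `handle.remove()` usage mirror the code in this patch.

```
from torch import nn

# Hypothetical rename map, analogous in spirit to the
# 'input_layernorm.' -> 'self_attention.linear_qkv.layer_norm_' remapping above.
KEY_MAP = {"norm.": "final_norm."}


def rename_keys_pre_hook(
    state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
):
    """Rename checkpoint keys in place before they are matched against the model."""
    for old_key in list(state_dict.keys()):
        for src, dst in KEY_MAP.items():
            if old_key.startswith(prefix + src):
                state_dict[old_key.replace(src, dst, 1)] = state_dict.pop(old_key)
                break


class OldModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.norm = nn.LayerNorm(8)


class NewModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.final_norm = nn.LayerNorm(8)


new_model = NewModel()
handle = new_model._register_load_state_dict_pre_hook(rename_keys_pre_hook)
new_model.load_state_dict(OldModel().state_dict())  # keys remapped by the hook
handle.remove()
```

In the patch, `mcore_gpt_load_te_state_dict_pre_hook` plays the role of `rename_keys_pre_hook`, translating key names from the fused Transformer-Engine layout into the layout expected by the ModelOpt spec.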
+ self.modelopt_gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_modelopt_spec(), + vocab_size=100, + max_sequence_length=4, + ) + + def test_load_te_state_dict_pre_hook(self): + handle = self.modelopt_gpt_model._register_load_state_dict_pre_hook( + mcore_gpt_load_te_state_dict_pre_hook + ) + self.modelopt_gpt_model.load_state_dict(self.gpt_model.state_dict()) + handle.remove() + + def teardown_method(self, method): + Utils.destroy_model_parallel() From 9ad1a56f82a55f2bb55dfb42d392ec8c06c362e0 Mon Sep 17 00:00:00 2001 From: Jared Casper Date: Fri, 24 May 2024 19:56:12 -0700 Subject: [PATCH 51/92] Change default to use mcore models, not legacy. --- examples/bert/train_bert_340m_distributed.sh | 1 - examples/detxoify_lm/generate_samples_gpt.py | 23 ++++----- examples/gpt3/gpt_config.yaml | 2 +- examples/gpt3/train_gpt3_175b_distributed.sh | 1 - examples/inference/ptq_trtllm_llama_7b.sh | 3 +- examples/inference/ptq_trtllm_nemotron3_8b.sh | 3 +- examples/retro/train_retro_2b_distributed.sh | 1 - examples/t5/train_t5_220m_distributed.sh | 1 - megatron/core/transformer/moe/README.md | 1 - megatron/inference/gpt/model_provider.py | 48 +++++++++---------- megatron/training/arguments.py | 28 +++++++---- pretrain_bert.py | 23 ++++----- pretrain_gpt.py | 24 ++++------ pretrain_retro.py | 21 ++++---- pretrain_t5.py | 28 ++++++----- .../bert/pretrain_bert_distributed_test.sh | 5 +- .../gpt3/pretrain_gpt3_distributed_test.sh | 5 +- .../pretrain_llava_distributed_test.sh | 5 +- .../retro/pretrain_retro_distributed_test.sh | 5 +- .../t5/pretrain_t5_distributed_test.sh | 6 +-- tools/checkpoint/loader_llama2.py | 2 +- tools/checkpoint/loader_mcore.py | 4 +- tools/checkpoint/loader_megatron.py | 2 +- tools/checkpoint/saver_mcore.py | 2 +- tools/checkpoint/saver_megatron.py | 2 +- .../text_generation/retro_text_generation.py | 7 +-- tools/run_text_generation_server.py | 22 ++++----- 27 files changed, 138 insertions(+), 137 deletions(-) diff --git a/examples/bert/train_bert_340m_distributed.sh b/examples/bert/train_bert_340m_distributed.sh index 7d489917e5..649c579129 100644 --- a/examples/bert/train_bert_340m_distributed.sh +++ b/examples/bert/train_bert_340m_distributed.sh @@ -46,7 +46,6 @@ TRAINING_ARGS=( --weight-decay 1e-2 --lr-warmup-fraction .01 --clip-grad 1.0 - --use-mcore-models ) MODEL_PARALLEL_ARGS=( diff --git a/examples/detxoify_lm/generate_samples_gpt.py b/examples/detxoify_lm/generate_samples_gpt.py index 01c22a1011..895a45d024 100644 --- a/examples/detxoify_lm/generate_samples_gpt.py +++ b/examples/detxoify_lm/generate_samples_gpt.py @@ -29,7 +29,7 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: """Builds the model. - If you set the use_mcore_models to True, it will return the mcore GPT model and if not the legacy GPT model. + If you set the use_legacy_models to True, it will return the legacy GPT model and if not the core GPT model. Args: pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. 
@@ -44,8 +44,15 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat print_rank_0('building GPT model ...') config = core_transformer_config_from_args(args) - if args.use_mcore_models: - + if args.use_legacy_models: + model = megatron.legacy.model.GPTModel( + config, + num_tokentypes=0, + parallel_output=False, + pre_process=pre_process, + post_process=post_process + ) + else: if args.spec is None: if args.transformer_impl == 'local': transformer_layer_spec = get_gpt_layer_local_spec( @@ -80,16 +87,6 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat position_embedding_type=args.position_embedding_type, rotary_percent=args.rotary_percent ) - else: - assert(args.context_parallel_size == 1), "Context parallelism is only supported with Megatron Core!" - - model = megatron.legacy.model.GPTModel( - config, - num_tokentypes=0, - parallel_output=False, - pre_process=pre_process, - post_process=post_process - ) return model diff --git a/examples/gpt3/gpt_config.yaml b/examples/gpt3/gpt_config.yaml index 652cd4d43e..8e4b527cda 100644 --- a/examples/gpt3/gpt_config.yaml +++ b/examples/gpt3/gpt_config.yaml @@ -132,7 +132,7 @@ model_parallel: barrier_with_L1_time: True # training: -use_mcore_models: True +use_legacy_models: False spec: null micro_batch_size: 2 global_batch_size: 128 diff --git a/examples/gpt3/train_gpt3_175b_distributed.sh b/examples/gpt3/train_gpt3_175b_distributed.sh index ccba78784b..b164ae2e91 100755 --- a/examples/gpt3/train_gpt3_175b_distributed.sh +++ b/examples/gpt3/train_gpt3_175b_distributed.sh @@ -49,7 +49,6 @@ TRAINING_ARGS=( --min-lr 6.0e-6 --lr-warmup-fraction .001 --lr-decay-iters 430000 - --use-mcore-models ) MODEL_PARALLEL_ARGS=( diff --git a/examples/inference/ptq_trtllm_llama_7b.sh b/examples/inference/ptq_trtllm_llama_7b.sh index 1c8322203f..3a798bf1b3 100644 --- a/examples/inference/ptq_trtllm_llama_7b.sh +++ b/examples/inference/ptq_trtllm_llama_7b.sh @@ -67,8 +67,7 @@ options=" \ --save-interval 1000000 \ --use-dist-ckpt \ --load ${CHECKPOINT_LOAD_DIR} - --fp16 \ - --use-mcore-models " + --fp16" # Precompile CUDA extentions python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)" diff --git a/examples/inference/ptq_trtllm_nemotron3_8b.sh b/examples/inference/ptq_trtllm_nemotron3_8b.sh index 2a42d1f10c..988f8fc6e8 100644 --- a/examples/inference/ptq_trtllm_nemotron3_8b.sh +++ b/examples/inference/ptq_trtllm_nemotron3_8b.sh @@ -62,8 +62,7 @@ options=" \ --save-interval 1000000 \ --load ${CHECKPOINT_LOAD_DIR} \ --fp16 \ - --use-dist-ckpt \ - --use-mcore-models " + --use-dist-ckpt" # Precompile CUDA extentions python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)" diff --git a/examples/retro/train_retro_2b_distributed.sh b/examples/retro/train_retro_2b_distributed.sh index 3bbfc9bcb6..c8276b56f4 100644 --- a/examples/retro/train_retro_2b_distributed.sh +++ b/examples/retro/train_retro_2b_distributed.sh @@ -65,7 +65,6 @@ EVAL_AND_LOGGING_ARGS=( TRAINING_ARGS=" \ --retro-project-dir ${RETRO_PROJECT_DIR} \ - --use-mcore-models \ --transformer-impl transformer_engine \ --num-workers 8 \ --micro-batch-size 4 \ diff --git a/examples/t5/train_t5_220m_distributed.sh b/examples/t5/train_t5_220m_distributed.sh index 4a55bb6e95..5d9357ab0e 100755 --- a/examples/t5/train_t5_220m_distributed.sh +++ b/examples/t5/train_t5_220m_distributed.sh @@ -51,7 +51,6 @@ T5_ARGS=" --transformer-impl transformer_engine \ 
--tensor-model-parallel-size 1 \ --pipeline-model-parallel-size 1 \ - --use-mcore-models \ " DATA_ARGS=" diff --git a/megatron/core/transformer/moe/README.md b/megatron/core/transformer/moe/README.md index 88feec002b..a1771c7028 100644 --- a/megatron/core/transformer/moe/README.md +++ b/megatron/core/transformer/moe/README.md @@ -126,7 +126,6 @@ DISTRIBUTED_ARGS=( ) MODEL_ARGS=( - --use-mcore-models --disable-bias-linear --seq-length 4096 --max-position-embeddings 32768 diff --git a/megatron/inference/gpt/model_provider.py b/megatron/inference/gpt/model_provider.py index c6d3761de6..b242ed90a1 100644 --- a/megatron/inference/gpt/model_provider.py +++ b/megatron/inference/gpt/model_provider.py @@ -17,7 +17,7 @@ def model_provider(pre_process=True, post_process=True, parallel_output=True) -> MCoreGPTModel: """Builds the model. - If you set the use_mcore_models to True, it will return the mcore GPT model and if not the legacy GPT model. + If you set the use_legacy_models to True, it will return the legacy GPT model and if not the core GPT model. Args: pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. @@ -36,33 +36,33 @@ def model_provider(pre_process=True, post_process=True, parallel_output=True) -> config = core_transformer_config_from_args(get_args()) config.non_homogeneous_layers = True - if args.use_mcore_models: - if args.spec is not None: - transformer_layer_spec = import_module(args.spec) - else: - transformer_layer_spec = get_gpt_layer_modelopt_spec( - remap_te_layernorm=args.export_te_mcore_model, qk_layernorm=False, - ) - - model_type = MCoreGPTModel - model_kwargs = { - "config": config, - "transformer_layer_spec": transformer_layer_spec, - "vocab_size": args.padded_vocab_size, - "max_sequence_length": args.max_position_embeddings, - "pre_process": pre_process, - "post_process": post_process, - "fp16_lm_cross_entropy": args.fp16_lm_cross_entropy, - "parallel_output": parallel_output, - "share_embeddings_and_output_weights": not args.untie_embeddings_and_output_weights, - "position_embedding_type": args.position_embedding_type, - "rotary_percent": args.rotary_percent, - } - else: + if args.use_legacy_models: raise ValueError( "ModelOpt integration only support MCore models. Use --use-mcore-modules instead." ) + if args.spec is not None: + transformer_layer_spec = import_module(args.spec) + else: + transformer_layer_spec = get_gpt_layer_modelopt_spec( + remap_te_layernorm=args.export_te_mcore_model, qk_layernorm=False, + ) + + model_type = MCoreGPTModel + model_kwargs = { + "config": config, + "transformer_layer_spec": transformer_layer_spec, + "vocab_size": args.padded_vocab_size, + "max_sequence_length": args.max_position_embeddings, + "pre_process": pre_process, + "post_process": post_process, + "fp16_lm_cross_entropy": args.fp16_lm_cross_entropy, + "parallel_output": parallel_output, + "share_embeddings_and_output_weights": not args.untie_embeddings_and_output_weights, + "position_embedding_type": args.position_embedding_type, + "rotary_percent": args.rotary_percent, + } + model = model_type(**model_kwargs) # Register some load_state_dict prehooks to handle some known state_dict key mismatch. 
diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 6b038669f7..0ef141e1a0 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -59,7 +59,8 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False): # Experimental yaml if args.yaml_cfg is not None: from .yaml_arguments import load_yaml - assert args.yaml_cfg and args.use_mcore_models, "To use yaml, mcore must be enabled" + assert args.yaml_cfg and not args.use_legacy_models, \ + "Yaml config is not supported with legacy models." args = load_yaml(args.yaml_cfg) @@ -264,7 +265,7 @@ def validate_args(args, defaults={}): '--overlap-param-gather only supported with distributed optimizer' assert args.overlap_grad_reduce, \ '--overlap-grad-reduce should be turned on when using --overlap-param-gather' - assert args.use_mcore_models, \ + assert not args.use_legacy_models, \ '--overlap-param-gather only supported with MCore models' # Parameters dtype. @@ -481,8 +482,8 @@ def validate_args(args, defaults={}): "retro currently does not support pipeline parallelism." if args.decoupled_lr is not None or args.decoupled_min_lr is not None: - assert args.use_mcore_models, \ - '--decoupled-lr and --decoupled-min-lr only supported by Megatron Core, please add --use-mcore-models.' + assert not args.use_legacy_models, \ + '--decoupled-lr and --decoupled-min-lr is not supported in legacy models.' assert not args.use_dist_ckpt, "Distributed checkpointing does not work with decoupled LR yet." # Legacy RoPE arguments @@ -490,8 +491,8 @@ def validate_args(args, defaults={}): args.position_embedding_type = 'rope' if args.rotary_interleaved and args.apply_rope_fusion: raise RuntimeError('--rotary-interleaved does not work with rope_fusion.') - if args.rotary_interleaved and not args.use_mcore_models: - raise RuntimeError('--rotary-interleaved only support Megatron Core, please add --use-mcore-models.') + if args.rotary_interleaved and args.use_legacy_models: + raise RuntimeError('--rotary-interleaved is not supported in legacy models.') # Would just need to add 'NoPE' as a position_embedding_type to support this, but for now # don't allow it to keep things simple @@ -505,6 +506,10 @@ def validate_args(args, defaults={}): assert args.sequence_parallel, \ "When using MoE and tensor parallelism, sequence parallelism must be used." + # Context parallel + if args.context_parallel_size > 1: + assert not args.use_legacy_models, "Context parallelism is not supported in legacy models." + # Expert parallelism check if args.expert_model_parallel_size > 1: assert args.num_experts is not None, "num_experts must be non None to use expert model parallelism" @@ -514,8 +519,8 @@ def validate_args(args, defaults={}): "Expert parallelism is not supported with fp16 training." # Distributed checkpointing checks - if args.use_dist_ckpt and not args.use_mcore_models: - raise RuntimeError('--use-dist-ckpt only support Megatron Core, please add --use-mcore-models.') + if args.use_dist_ckpt and args.use_legacy_models: + raise RuntimeError('--use-dist-ckpt is not supported in legacy models.') # Data blend checks assert args.mock_data + \ @@ -1110,7 +1115,12 @@ def _add_training_args(parser): 'gradient computation of linear layers', dest='gradient_accumulation_fusion') group.add_argument('--use-mcore-models', action='store_true', - help='Use the implementation from megatron core') + dest='deprecated_use_mcore_models', + help='DEPRECATED. Use the implementation from megatron core.' 
+ 'Now ignored and mcore models are the default, use ' + '--use-legacy-models to not use core models.') + group.add_argument('--use-legacy-models', action='store_true', + help='Use the legacy Megatron models, not Megatron-Core models.') group.add_argument('--manual-gc', action='store_true', help='Disable the threshold-based default garbage ' 'collector and trigger the garbage collection manually. ' diff --git a/pretrain_bert.py b/pretrain_bert.py index 0f751cad9b..f5c553029c 100644 --- a/pretrain_bert.py +++ b/pretrain_bert.py @@ -35,9 +35,15 @@ def model_provider(pre_process=True, post_process=True): config = core_transformer_config_from_args(args) num_tokentypes = 2 if args.bert_binary_head else 0 - if args.use_mcore_models: - - + if args.use_legacy_models: + model = megatron.legacy.model.BertModel( + config=config, + num_tokentypes=num_tokentypes, + add_binary_head=args.bert_binary_head, + parallel_output=True, + pre_process=pre_process, + post_process=post_process) + else: if args.spec is None: transformer_layer_spec = bert_layer_with_transformer_engine_spec #default spec elif args.spec[0] == 'local': @@ -46,7 +52,6 @@ def model_provider(pre_process=True, post_process=True): else : transformer_layer_spec = import_module(args.spec) - model = BertModel( config=config, transformer_layer_spec=transformer_layer_spec, @@ -58,14 +63,6 @@ def model_provider(pre_process=True, post_process=True): parallel_output=True, pre_process=pre_process, post_process=post_process) - else: - model = megatron.legacy.model.BertModel( - config=config, - num_tokentypes=num_tokentypes, - add_binary_head=args.bert_binary_head, - parallel_output=True, - pre_process=pre_process, - post_process=post_process) return model @@ -192,4 +189,4 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): pretrain(train_valid_test_datasets_provider, model_provider, ModelType.encoder_or_decoder, - forward_step, args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) \ No newline at end of file + forward_step, args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 6ba99de751..194ae22783 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -38,7 +38,7 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: """Builds the model. - If you set the use_mcore_models to True, it will return the mcore GPT model and if not the legacy GPT model. + If you set the use_legacy_models to True, it will return the legacy GPT model and if not the mcore GPT model. Args: pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. @@ -58,7 +58,15 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat else: config = core_transformer_config_from_args(args) - if args.use_mcore_models: + if args.use_legacy_models: + model = megatron.legacy.model.GPTModel( + config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + ) + else: # using core models if args.spec is not None: transformer_layer_spec = import_module(args.spec) else: @@ -80,18 +88,6 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat position_embedding_type=args.position_embedding_type, rotary_percent=args.rotary_percent, ) - else: - assert ( - args.context_parallel_size == 1 - ), "Context parallelism is only supported with Megatron Core!" 
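The `dest='deprecated_use_mcore_models'` wiring above keeps existing launch scripts parseable while `--use-legacy-models` becomes the only switch that changes behavior. How the two flags are reconciled afterwards is not shown in this hunk; the sketch below is purely illustrative of one way a validation step could treat the pair (the function name and warning text are assumptions, not code from this patch).

```
import argparse
import warnings

parser = argparse.ArgumentParser()
# Deprecated flag: accepted so that old command lines do not break, but otherwise ignored.
parser.add_argument('--use-mcore-models', action='store_true',
                    dest='deprecated_use_mcore_models')
parser.add_argument('--use-legacy-models', action='store_true')


def resolve_model_family(args):
    """Illustrative only: warn on the deprecated flag and reject contradictory usage."""
    if args.deprecated_use_mcore_models:
        warnings.warn('--use-mcore-models is deprecated; Megatron-Core models are already '
                      'the default. Use --use-legacy-models to opt out of them.')
        assert not args.use_legacy_models, \
            '--use-mcore-models and --use-legacy-models are mutually exclusive.'
    return 'legacy' if args.use_legacy_models else 'mcore'


args = parser.parse_args(['--use-mcore-models'])
print(resolve_model_family(args))  # 'mcore', plus a deprecation warning
```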
- - model = megatron.legacy.model.GPTModel( - config, - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - ) return model diff --git a/pretrain_retro.py b/pretrain_retro.py index e50e3077c1..a0d8f9d922 100644 --- a/pretrain_retro.py +++ b/pretrain_retro.py @@ -70,7 +70,10 @@ def model_provider(pre_process=True, post_process=True): """ args = get_args() - provider = core_model_provider if (args.use_mcore_models and args.retro_add_retriever) else default_model_provider + if not args.use_legacy_models and args.retro_add_retriever: + provider = core_model_provider + else: + provider = default_model_provider model = provider(pre_process=pre_process, post_process=post_process) return model @@ -149,7 +152,13 @@ def forward_step(data_iterator, model): timers('batch-generator').stop() # Model call. - if args.use_mcore_models: + if args.use_legacy_models: + forward_kwargs = { + "retriever_input_ids" : neighbor_tokens, + "retriever_position_ids" : neighbor_position_ids, + "retriever_attn_mask" : neighbor_attention_mask, + } + else: if args.retro_add_retriever: forward_kwargs = { "context_input_ids" : neighbor_tokens, @@ -158,13 +167,7 @@ def forward_step(data_iterator, model): } else: forward_kwargs = {} - else: - forward_kwargs = { - "retriever_input_ids" : neighbor_tokens, - "retriever_position_ids" : neighbor_position_ids, - "retriever_attn_mask" : neighbor_attention_mask, - } - + output_tensor = model(tokens, position_ids, attention_mask, labels=labels, **forward_kwargs) diff --git a/pretrain_t5.py b/pretrain_t5.py index a5dfdc0403..e9702c3072 100644 --- a/pretrain_t5.py +++ b/pretrain_t5.py @@ -3,6 +3,7 @@ """Pretrain T5""" from functools import partial +from typing import Union import torch @@ -29,7 +30,7 @@ get_t5_decoder_with_transformer_engine_block_spec, get_t5_encoder_with_local_block_spec, get_t5_decoder_with_local_block_spec) -from megatron.legacy.model import T5Model as NonCoreT5Model +from megatron.legacy.model import T5Model as LegacyT5Model """ Pipeline parallelism for T5 @@ -70,7 +71,7 @@ def model_provider( pre_process=True, post_process=True, add_encoder=True, add_decoder=True -) -> T5Model: +) -> Union[LegacyT5Model, T5Model]: """Builds the model. 
Args: @@ -84,7 +85,17 @@ def model_provider( args = get_args() config = core_transformer_config_from_args(args) - if args.use_mcore_models: + if args.use_legacy_models: + model = LegacyT5Model( + config=config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + add_encoder=add_encoder, + add_decoder=add_decoder, + ) + else: if args.transformer_impl == "local": en_block_spec = get_t5_encoder_with_local_block_spec(args.encoder_num_layers) de_block_spec = get_t5_decoder_with_local_block_spec(args.decoder_num_layers) @@ -110,16 +121,7 @@ def model_provider( position_embedding_type=args.position_embedding_type, rotary_percent=args.rotary_percent, ) - else: - model = NonCoreT5Model( - config=config, - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - add_encoder=add_encoder, - add_decoder=add_decoder, - ) + return model diff --git a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh index 4acff199dc..dd9e40fa99 100755 --- a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh +++ b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh @@ -36,11 +36,12 @@ else ADDITIONAL_PARAMS+=" --deterministic-mode" fi +USE_LEGACY=1 if [[ $USE_CORE -eq 1 ]]; then echo "Running using megatron core" TRANSFORMER_IMPL=local TRAINING_DTYPE=bf16 - USE_MCORE=1 + unset USE_LEGACY fi if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then echo "Running checkpoint resume test..." @@ -89,7 +90,7 @@ torch_run_cmd="torchrun $DISTRIBUTED_ARGS \ --tensor-model-parallel-size $TP_SIZE \ --pipeline-model-parallel-size $PP_SIZE \ ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ - ${USE_MCORE:+--use-mcore-models} \ + ${USE_LEGACY:+--use-legacy-models} \ ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS} \ --no-gradient-accumulation-fusion \ ${DATA_CACHE:+--data-cache-path "$DATA_CACHE"} \ diff --git a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh index aa95d8d65a..61940984ef 100755 --- a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh +++ b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh @@ -39,11 +39,12 @@ else ADDITIONAL_PARAMS+=" --deterministic-mode" fi +USE_LEGACY=1 if [[ $USE_CORE -eq 1 ]]; then echo "Running using megatron core" TRANSFORMER_IMPL=transformer_engine TRAINING_DTYPE=bf16 - USE_MCORE=1 + unset USE_LEGACY fi if [[ $USE_FP8 -eq 1 ]]; then @@ -126,7 +127,7 @@ build_torch_run_cmd() { --pipeline-model-parallel-size $PP_SIZE \ ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS} \ - ${USE_MCORE:+--use-mcore-models} \ + ${USE_LEGACY:+--use-legacy-models} \ --no-gradient-accumulation-fusion \ ${DATA_CACHE:+--data-cache-path "$DATA_CACHE"} \ --${TRAINING_DTYPE}" diff --git a/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh b/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh index fa536f97ed..dffdf95b99 100755 --- a/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh +++ b/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh @@ -37,11 +37,12 @@ else ADDITIONAL_PARAMS+=" --deterministic-mode" fi +USE_LEGACY=1 if [[ $USE_CORE -eq 1 ]]; then echo "Running 
using megatron core" TRANSFORMER_IMPL=local TRAINING_DTYPE=bf16 - USE_MCORE=1 + unset USE_LEGACY fi if [[ $MOE_GROUPED_GEMM -eq 1 ]]; then @@ -116,7 +117,7 @@ build_torch_run_cmd() { --pipeline-model-parallel-size $PP_SIZE \ ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS} \ - ${USE_MCORE:+--use-mcore-models} \ + ${USE_LEGACY:+--use-legacy-models} \ --no-gradient-accumulation-fusion \ --${TRAINING_DTYPE} \ --img-h 336 \ diff --git a/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh b/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh index eccbe00200..45c0c264b9 100755 --- a/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh +++ b/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh @@ -28,12 +28,13 @@ command="export CUDA_DEVICE_MAX_CONNECTIONS=1;" TRANSFORMER_IMPL=local TRAINING_DTYPE=bf16 +USE_LEGACY=1 if [[ $USE_CORE -eq 1 ]]; then echo "Running using megatron core" TRANSFORMER_IMPL=local TRAINING_DTYPE=bf16 command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0;" - USE_MCORE=1 + unset USE_LEGACY export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 fi @@ -114,7 +115,7 @@ build_args() { --bf16 \ --transformer-impl $TRANSFORMER_IMPL \ --${TRAINING_DTYPE} \ - ${USE_MCORE:+--use-mcore-models} \ + ${USE_LEGACY:+--use-legacy-models} \ ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS} \ --retro-workdir /workspace/data/retro_data/neighbors --retro-add-retriever \ diff --git a/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh b/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh index 7ad640bb77..ea546d04ba 100755 --- a/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh +++ b/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh @@ -37,11 +37,12 @@ else ADDITIONAL_PARAMS+=" --deterministic-mode" fi +USE_LEGACY=1 if [[ $USE_CORE -eq 1 ]]; then echo "Running using megatron core" TRANSFORMER_IMPL=local TRAINING_DTYPE=bf16 - USE_MCORE=1 + unset USE_LEGACY fi if [[ $NO_FA -eq 1 ]]; then @@ -103,7 +104,6 @@ torch_run_cmd="torchrun $DISTRIBUTED_ARGS \ --vocab-extra-ids 100 \ --init-method-std 0.015 \ --transformer-impl $TRANSFORMER_IMPL \ - --use-mcore-models \ --data-path $DATA_PATH \ --vocab-file $VOCAB_PATH \ --tokenizer-type BertWordPieceCase \ @@ -122,7 +122,7 @@ torch_run_cmd="torchrun $DISTRIBUTED_ARGS \ --eval-iters 10 \ --distributed-backend nccl \ ${DATA_CACHE:+--data-cache-path "$DATA_CACHE"} \ - ${USE_MCORE:+--use-mcore-models} \ + ${USE_LEGACY:+--use-legacy-models} \ ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS}" command="$command $torch_run_cmd" diff --git a/tools/checkpoint/loader_llama2.py b/tools/checkpoint/loader_llama2.py index b7fd02f73a..9b53860f4f 100644 --- a/tools/checkpoint/loader_llama2.py +++ b/tools/checkpoint/loader_llama2.py @@ -433,7 +433,7 @@ def _load_checkpoint(queue, args): margs = validate_args(margs) - margs.use_mcore_models = False + margs.use_legacy_models = True margs.transformer_impl = args.loader_transformer_impl def check_for_arg(arg_name, default=None): diff --git a/tools/checkpoint/loader_mcore.py b/tools/checkpoint/loader_mcore.py index 8e571c91c5..52ffb9740c 100644 --- a/tools/checkpoint/loader_mcore.py +++ b/tools/checkpoint/loader_mcore.py @@ -82,7 +82,7 @@ def _load_checkpoint(queue, args): # Validate margs. 
margs = validate_args(margs) - margs.use_mcore_models = True + margs.use_legacy_models = False margs.transformer_impl = args.loader_transformer_impl def check_for_arg(arg_name, default=None): @@ -229,7 +229,7 @@ def get_models(count, dtype): md.true_vocab_size = true_vocab_size md.make_vocab_size_divisible_by = margs.make_vocab_size_divisible_by md.checkpoint_args = checkpoint_args - md.use_mcore_models = margs.use_mcore_models + md.use_legacy_models = margs.use_legacy_models # Get transformer block (named either 'encoder' or 'decoder'). transformer_block_key = get_mcore_transformer_block_key(md.model_type) diff --git a/tools/checkpoint/loader_megatron.py b/tools/checkpoint/loader_megatron.py index 7ce41db6c8..b11fd93fd7 100644 --- a/tools/checkpoint/loader_megatron.py +++ b/tools/checkpoint/loader_megatron.py @@ -80,7 +80,7 @@ def _load_checkpoint(queue, args): # Validate margs. margs = validate_args(margs) - margs.use_mcore_models = False + margs.use_legacy_models = True margs.transformer_impl = args.loader_transformer_impl def check_for_arg(arg_name, default=None): diff --git a/tools/checkpoint/saver_mcore.py b/tools/checkpoint/saver_mcore.py index 656103f360..a06ea18554 100644 --- a/tools/checkpoint/saver_mcore.py +++ b/tools/checkpoint/saver_mcore.py @@ -383,7 +383,7 @@ def check_message(msg): validate_args(margs) # Use M-core models & unset loaded paths. - margs.use_mcore_models = True + margs.use_legacy_models = False margs.blendable_index_path = None margs.data_path = [] margs.load = None diff --git a/tools/checkpoint/saver_megatron.py b/tools/checkpoint/saver_megatron.py index 9722576943..38f80f1c48 100644 --- a/tools/checkpoint/saver_megatron.py +++ b/tools/checkpoint/saver_megatron.py @@ -165,7 +165,7 @@ def check_message(msg): validate_args(margs) # Use MLM models. - margs.use_mcore_models = False + margs.use_legacy_models = True margs.transformer_impl = args.saver_transformer_impl # Do not instantiate Tensorboard diff --git a/tools/retro/text_generation/retro_text_generation.py b/tools/retro/text_generation/retro_text_generation.py index c1cdcafb79..2705009044 100755 --- a/tools/retro/text_generation/retro_text_generation.py +++ b/tools/retro/text_generation/retro_text_generation.py @@ -28,8 +28,6 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: """Builds the model. - If you set the use_mcore_models to True, it will return the mcore GPT model and if not the legacy GPT model. - Args: pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. 
@@ -39,8 +37,11 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model """ print_rank_0('building GPT model ...') - config = core_transformer_config_from_args(get_args()) + args = get_args() + config = core_transformer_config_from_args(args) + assert args.use_legacy_models, 'retro text generation only implemented for legacy models' + # not support core model yet model = megatron.legacy.model.GPTModel( config, diff --git a/tools/run_text_generation_server.py b/tools/run_text_generation_server.py index ed92846dec..3dad098bee 100644 --- a/tools/run_text_generation_server.py +++ b/tools/run_text_generation_server.py @@ -31,7 +31,7 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: """Builds the model. - If you set the use_mcore_models to True, it will return the mcore GPT model and if not the legacy GPT model. + If you set the use_legacy_models to True, it will return the legacy GPT model and if not the core GPT model. Args: pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. @@ -53,7 +53,15 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat else: config = core_transformer_config_from_args(args) - if args.use_mcore_models: + if args.use_legacy_models: + model = megatron.legacy.model.GPTModel( + config, + num_tokentypes=0, + parallel_output=False, + pre_process=pre_process, + post_process=post_process + ) + else: if args.spec is not None: transformer_layer_spec = import_module(args.spec) else: @@ -75,16 +83,6 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat position_embedding_type=args.position_embedding_type, rotary_percent=args.rotary_percent ) - else: - assert(args.context_parallel_size == 1), "Context parallelism is only supported with Megatron Core!" 
- - model = megatron.legacy.model.GPTModel( - config, - num_tokentypes=0, - parallel_output=False, - pre_process=pre_process, - post_process=post_process - ) return model From 4d2b3c5a2ffb5964313c7cbe03360d14e6161583 Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Thu, 30 May 2024 12:24:44 -0700 Subject: [PATCH 52/92] multimodal example - coco eval --- examples/multimodal/Dockerfile | 22 ++++++++++ examples/multimodal/README.md | 19 +++++++-- examples/multimodal/evaluate_coco.py | 60 ++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 examples/multimodal/Dockerfile create mode 100644 examples/multimodal/evaluate_coco.py diff --git a/examples/multimodal/Dockerfile b/examples/multimodal/Dockerfile new file mode 100644 index 0000000000..0ac8f91b75 --- /dev/null +++ b/examples/multimodal/Dockerfile @@ -0,0 +1,22 @@ +FROM nvcr.io/nvidia/pytorch:24.02-py3 + +RUN apt update && \ + apt -y upgrade && \ + apt install -y --no-install-recommends \ + software-properties-common \ + build-essential \ + python3-pip \ + python3-dev \ + bash \ + git \ + vim \ + python-is-python3 \ + default-jre + +RUN pip install --upgrade pip +RUN pip install einops sentencepiece braceexpand webdataset +RUN pip install pytest-cov pytest_mock nltk wrapt +RUN pip install zarr "tensorstore==0.1.45" +RUN pip install git+https://github.com/fanshiqing/grouped_gemm@main +RUN pip install black==19.10b0 isort click==8.0.2 +RUN pip install pycocoevalcap megatron-energon \ No newline at end of file diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md index 159241ed1b..f3117d2533 100644 --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -4,6 +4,10 @@ NOTE: This is work in progress and not fully functional yet. ## Setup +### Docker container + +You can build a docker container using `examples/multimodal/Dockerfile` to run this example. + ### Vision model This example uses the OpenAI CLIP `ViT-L/14@336px` Vision model. To download the weights from OpenAI and convert them to a format that can be loaded in megatron, please run the following: @@ -28,13 +32,20 @@ Run the following script: examples/multimodal/sft_8b.sh ``` +## Evaluation -### Evaluation - -## Generation +### Generation Run the following script: ``` -examples/multimodal/text_generation_8b.sh --input-path /path/to/input/images --output-path /some/output/directory --model-path /path/to/model.pt --tokenizer-path /path/to/tokenizer.model --gt-path /path/to/groundtruth/file +examples/multimodal/text_generation_8b.sh --input-image-path /path/to/input/images --output-path /some/output/directory --model-path /path/to/model.pt --tokenizer-path /path/to/tokenizer.model --gt-path /path/to/groundtruth/file --task generation-task-name +``` + +### COCO captioning + +First, run text generation using `--task captioning`. Then, run the following command: + +``` +python examples/multimodal/evaluate_coco.py --input-path /output/directory/from/generation --groundtruth-path /path/to/groundtruth/file ``` diff --git a/examples/multimodal/evaluate_coco.py b/examples/multimodal/evaluate_coco.py new file mode 100644 index 0000000000..501a5df499 --- /dev/null +++ b/examples/multimodal/evaluate_coco.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+import argparse +import glob +import json + +from pycocoevalcap.eval import COCOEvalCap +from pycocotools.coco import COCO + + +def convert_to_coco_format(input_path): + """Convert input files to COCO compatible format.""" + output_file_path = input_path + "-captioning-merged.json" + + pattern = input_path + "-captioning-[0-9].*jsonl" + input_file_paths = glob.glob(pattern) + + captions = [] + + for input_file_path in input_file_paths: + with open(input_file_path, "r") as input_file: + for line in input_file: + res = json.loads(line) + + question_id = res['sample_id'] + caption = res['caption'].rstrip('.').lower() + + captions.append({"image_id": question_id, "caption": caption}) + + with open(output_file_path, "w") as output_file: + json.dump(captions, output_file) + + return output_file_path + + +def coco_captioning_eval(input_path, groundtruth_file): + """Run COCO captioning evaluation.""" + coco = COCO(groundtruth_file) + input_file = convert_to_coco_format(input_path) + coco_result = coco.loadRes(input_file) + + coco_eval = COCOEvalCap(coco, coco_result) + + # Evaluate on the input subset of images. + coco_eval.params['image_id'] = coco_result.getImgIds() + + coco_eval.evaluate() + + for metric, score in coco_eval.eval.items(): + print(metric, score) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input-path", type=str, required=True, help="Path to input file(s)") + parser.add_argument( + "--groundtruth-path", type=str, required=True, help="Path to groundtruth file" + ) + args = parser.parse_args() + + coco_captioning_eval(args.input_path, args.groundtruth_path) From e024654349f1c7cba7c216e2ff569b6d5792aa57 Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Wed, 29 May 2024 15:30:05 -0700 Subject: [PATCH 53/92] multimodal example - model combiner script --- examples/multimodal/combine_state_dicts.py | 76 ++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 examples/multimodal/combine_state_dicts.py diff --git a/examples/multimodal/combine_state_dicts.py b/examples/multimodal/combine_state_dicts.py new file mode 100644 index 0000000000..928be4782d --- /dev/null +++ b/examples/multimodal/combine_state_dicts.py @@ -0,0 +1,76 @@ +import argparse +import os +import sys + +import torch + +# Add megatron to the path. +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) +) + + +def combine(input_files, module_prefixes, output_files): + num_inputs_per_output = int(len(input_files) / len(output_files)) + + for output_idx, output_file in enumerate(output_files): + combined_state_dict = None + + lb = output_idx * num_inputs_per_output + ub = (output_idx + 1) * num_inputs_per_output + current_input_files = input_files[lb:ub] + current_module_prefixes = module_prefixes[lb:ub] + + for i, (input_file, module_prefix) in enumerate( + zip(current_input_files, current_module_prefixes) + ): + # initialize the combined state dict using the first provided input file + current_state_dict = torch.load(input_file) + if i == 0: + combined_state_dict = current_state_dict.copy() + combined_state_dict["model"] = dict() + + # copy model state dict and prefix names with the given module keys. 
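The loop below does the actual combining: every parameter key from an input checkpoint is namespaced under the prefix given on the command line, while the non-model entries (iteration count and so on) are inherited from the first input file. A toy, runnable sketch of the resulting key layout follows; the tensors and key names are invented, and only the `"%s.%s"` prefixing scheme comes from this script.

```
import torch

# Stand-ins for the language and vision checkpoints named in the example usage below.
language_sd = {"model": {"decoder.weight": torch.zeros(2)}, "iteration": 1000}
vision_sd = {"model": {"encoder.weight": torch.ones(2)}, "iteration": 1000}

combined = language_sd.copy()  # non-model keys come from the first input
combined["model"] = {}
for prefix, sd in [("language_model", language_sd), ("vision_model", vision_sd)]:
    for name, tensor in sd["model"].items():
        combined["model"]["%s.%s" % (prefix, name)] = tensor

print(sorted(combined["model"]))
# ['language_model.decoder.weight', 'vision_model.encoder.weight']
```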
+ for k, v in current_state_dict["model"].items(): + combined_state_dict["model"]["%s.%s" % (module_prefix, k)] = v + + torch.save(combined_state_dict, output_file) + print("saved:", output_file) + + print("done.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=""" +Combine multiple state dicts into a single state dict. +The combined state dict is first initialized by taking a copy of the first provided input state dict. +To avoid conflicts in model parameter names, a prefix must be provided for each input file. +Model parameter names will be renamed from to .. + + +Example usage: +python combine_state_dicts.py --input language_model.pt vision_model.pt --prefixes language_model vision_model --output multimodal.pt +""", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("--input", nargs="*", required=True, help="paths to input state dict files") + parser.add_argument( + "--prefixes", + nargs="*", + required=True, + help="prefixes to use with each input model's parameters", + ) + parser.add_argument( + "--output", nargs="*", required=True, help="path(s) to output state dict file" + ) + + args = parser.parse_args() + + assert len(args.input) > 1, "must provide more than 1 input model to combine" + assert len(args.input) == len(args.prefixes), "each input model must have a corresponding key" + assert ( + len(args.input) % len(args.output) == 0 + ), "each output file must use the same number of input files" + + combine(args.input, args.prefixes, args.output) From abf10f85907ece699300edc2204649bbb47d4073 Mon Sep 17 00:00:00 2001 From: Jack Chang Date: Thu, 30 May 2024 15:50:23 -0700 Subject: [PATCH 54/92] Fix dual-optimizer gradient clipping issue --- megatron/core/optimizer/__init__.py | 13 +- megatron/core/optimizer/clip_grads.py | 58 +++--- megatron/core/optimizer/distrib_optimizer.py | 9 +- megatron/core/optimizer/optimizer.py | 181 +++++++++++++----- megatron/core/parallel_state.py | 23 ++- ...1-te-8experts2parallel-dist-optimizer.json | 2 +- ...-pp1-te-8experts2parallel-groupedgemm.json | 2 +- ...-grad-reduce-param-gather-groupedgemm.json | 2 +- ...2-pp1-te-8experts2parallel-top2router.json | 2 +- ...8g-mcore-tp2-pp1-te-8experts2parallel.json | 2 +- 10 files changed, 210 insertions(+), 84 deletions(-) diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py index 95e6c31377..66d518675d 100644 --- a/megatron/core/optimizer/__init__.py +++ b/megatron/core/optimizer/__init__.py @@ -152,6 +152,7 @@ def _get_megatron_optimizer_based_on_param_groups( config: OptimizerConfig, param_groups: List, per_model_buffers: Optional[Dict[int, List[ParamAndGradBuffer]]] = None, + model_parallel_group: Optional[torch.distributed.ProcessGroup] = None, data_parallel_group: Optional[torch.distributed.ProcessGroup] = None, data_parallel_group_gloo: Optional[torch.distributed.ProcessGroup] = None, data_parallel_group_idx: Optional[int] = None, @@ -245,11 +246,13 @@ def init_state_fn(opt): ) else: optimizer = Float16OptimizerWithFloat16Params(*optimizer_args) + setattr(optimizer, 'model_parallel_group', model_parallel_group) + else: + # FP32 optimizer. + optimizer = FP32Optimizer(optimizer, config, init_state_fn,) + setattr(optimizer, 'model_parallel_group', model_parallel_group) - return optimizer - - # FP32. 
- return FP32Optimizer(optimizer, config, init_state_fn,) + return optimizer def get_megatron_optimizer( @@ -316,6 +319,7 @@ def get_megatron_optimizer( config, param_groups=dense_param_groups, per_model_buffers=per_model_buffers, + model_parallel_group=mpu.get_model_parallel_group(), data_parallel_group=mpu.get_data_parallel_group(with_context_parallel=True), data_parallel_group_gloo=mpu.get_data_parallel_group_gloo(with_context_parallel=True), data_parallel_group_idx=model_parallel_rank, @@ -329,6 +333,7 @@ def get_megatron_optimizer( config, param_groups=moe_param_groups, per_model_buffers=per_model_ep_buffers, + model_parallel_group=mpu.get_model_parallel_group(with_expert_parallel=True), data_parallel_group=mpu.get_data_modulo_expert_parallel_group(), data_parallel_group_gloo=mpu.get_data_modulo_expert_parallel_group_gloo(), data_parallel_group_idx=expert_parallel_rank * model_parallel_world_size diff --git a/megatron/core/optimizer/clip_grads.py b/megatron/core/optimizer/clip_grads.py index cfb0c332f5..6c61be86fe 100644 --- a/megatron/core/optimizer/clip_grads.py +++ b/megatron/core/optimizer/clip_grads.py @@ -14,49 +14,32 @@ from ..transformer.module import param_is_not_shared -def clip_grad_norm_fp32( - parameters: Union[List[torch.Tensor], torch.Tensor], +def get_grad_norm_fp32( grads_for_norm: Union[List[torch.Tensor], torch.Tensor], - max_norm: Union[int, float], norm_type: Union[int, float] = 2, model_parallel_group: Optional[torch.distributed.ProcessGroup] = None, ) -> float: - """Clips gradient norm of an iterable of parameters whose gradients - are in fp32. + """Calculate the norm of gradients in fp32. This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and - added functionality to handle model parallel parameters. Note that - the gradients are modified in place. + added functionality to handle model parallel parameters. - Args: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized. - grads_for_norm (Iterable[Tensor]): an iterable of Tensors or a single + Arguments: + grads_for_norm (Iterable[Tensor] or Tensor): an iterable of Tensors or a single Tensor that will be used for calculating the grad norm. - max_norm (float or int): max norm of the gradients. norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. - model_parallel_group (torch.distributed.ProcessGroup, optional): model-parallel - group over which grad norm needs to be aggregated. + model_parallel_group (group): given the nature of the distributed + optimizer, this is passed as an argument. Returns: Total norm of the parameters (viewed as a single vector). """ - if isinstance(parameters, torch.Tensor): - parameters = [parameters] if isinstance(grads_for_norm, torch.Tensor): grads_for_norm = [grads_for_norm] - # Grads. - grads = [] - for param in parameters: - if param.grad is not None: - assert param.grad.type() == 'torch.cuda.FloatTensor' - grads.append(param.grad.detach()) - # Norm parameters. - max_norm = float(max_norm) norm_type = float(norm_type) total_norm = 0.0 @@ -100,6 +83,31 @@ def clip_grad_norm_fp32( ) total_norm = total_norm.item() ** (1.0 / norm_type) + return total_norm + + +def clip_grad_by_total_norm_fp32( + parameters: Union[List[torch.Tensor], torch.Tensor], + max_norm: Union[int, float], + total_norm: float, +): + """Clips gradient of an iterable of parameters in fp32 by total norm. + + Note that the gradients are modified in place. 
+ + Args: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized. + max_norm (float or int): max norm of the gradients. + total_norm (float): total norm of the gradients. + """ + # Grads. + grads = [] + for param in parameters: + if param.grad is not None: + assert param.grad.type() == 'torch.cuda.FloatTensor' + grads.append(param.grad.detach()) + # Scale. clip_coeff = max_norm / (total_norm + 1.0e-6) if clip_coeff < 1.0: @@ -108,8 +116,6 @@ def clip_grad_norm_fp32( amp_C.multi_tensor_scale, dummy_overflow_buf, [grads, grads], clip_coeff ) - return total_norm - def count_zeros_fp32( parameters: Union[List[torch.Tensor], torch.Tensor], diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py index 3e71e0ad2b..c297f4ef4d 100644 --- a/megatron/core/optimizer/distrib_optimizer.py +++ b/megatron/core/optimizer/distrib_optimizer.py @@ -1420,13 +1420,12 @@ def _reset_metadata_and_sync_gather_all_model_params(self, force_sync: bool): self._dispatch_gather_model_params(all_gather_handle_index, force_sync=force_sync) @torch.no_grad() - def step(self): - """ - Step optimizer. + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful. Under the hood, either launch synchronous param all-gathers or get ready to launch asynchorous all-gathers that get overlapped with the next forward pass. """ - self.update_successful, grad_norm, num_zeros_in_grad = super().step() + self.update_successful = super().step_with_ready_grads() timers = self.config.timers if timers is not None: @@ -1440,4 +1439,4 @@ def step(self): if timers is not None: timers('params-all-gather').stop() - return self.update_successful, grad_norm, num_zeros_in_grad + return self.update_successful diff --git a/megatron/core/optimizer/optimizer.py b/megatron/core/optimizer/optimizer.py index 0ae938212a..b84e523a05 100644 --- a/megatron/core/optimizer/optimizer.py +++ b/megatron/core/optimizer/optimizer.py @@ -21,7 +21,7 @@ ) from ..dist_checkpointing.utils import add_prefix_for_sharding from ..transformer.module import param_is_not_shared -from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32 +from .clip_grads import clip_grad_by_total_norm_fp32, count_zeros_fp32, get_grad_norm_fp32 from .grad_scaler import MegatronGradScaler from .optimizer_config import OptimizerConfig @@ -119,15 +119,37 @@ def get_main_grads_for_grad_norm(self) -> List[torch.Tensor]: def get_model_parallel_group(self) -> torch.distributed.ProcessGroup: """Default returned here, but the distributed optimizer overrides this.""" + if hasattr(self, 'model_parallel_group'): + return self.model_parallel_group return parallel_state.get_model_parallel_group() + @abstractmethod + def prepare_grads(self) -> bool: + """Pre-processing gradients before the optimizer step, returns whether inf/nan is found.""" + return False + + @abstractmethod + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful.""" + return True + + @torch.no_grad() + def get_grad_norm(self): + grads_for_norm = self.get_main_grads_for_grad_norm() + total_norm = get_grad_norm_fp32( + grads_for_norm, model_parallel_group=self.get_model_parallel_group(), + ) + return total_norm + def clip_grad_norm(self, clip_grad: float) -> float: """Compute grad norm.""" params = self.get_parameters() grads_for_norm = self.get_main_grads_for_grad_norm() - return clip_grad_norm_fp32( - params, grads_for_norm, 
clip_grad, model_parallel_group=self.get_model_parallel_group(), + grad_norm = get_grad_norm_fp32( + grads_for_norm, model_parallel_group=self.get_model_parallel_group() ) + clip_grad_by_total_norm_fp32(params, clip_grad, grad_norm) + return grad_norm def count_zeros(self) -> float: """Count number of zeros in model's gradients.""" @@ -297,8 +319,8 @@ def _unscale_main_grads_and_check_for_nan(self): return found_inf_flag @torch.no_grad() - def step(self): - + def prepare_grads(self) -> bool: + """Pre-processing gradients before the optimizer step, returns whether inf/nan is found.""" timers = self.config.timers # Copy gradients from model params to main params. @@ -327,9 +349,41 @@ def step(self): # so we can update the loss scale. self.grad_scaler.update(found_inf_flag) - # If we found inf/nan, skip the update. - if found_inf_flag: - return False, None, None + return found_inf_flag + + return False + + @torch.no_grad() + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful.""" + timers = self.config.timers + # Step the optimizer. + if timers is not None: + timers('optimizer-inner-step', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + self.optimizer.step() + if timers is not None: + timers('optimizer-inner-step').stop() + + # Update params from main params. + if timers is not None: + timers('optimizer-copy-main-to-model-params', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + self._copy_main_params_to_model_params() + if timers is not None: + timers('optimizer-copy-main-to-model-params').stop() + + return True + + @torch.no_grad() + def step(self): + timers = self.config.timers + + found_inf_flag = self.prepare_grads() + if found_inf_flag: + return False, None, None # Clip the main gradients. if timers is not None: @@ -351,26 +405,10 @@ def step(self): if timers is not None: timers('optimizer-count-zeros').stop() - # Step the optimizer. - if timers is not None: - timers('optimizer-inner-step', log_level=1).start( - barrier=self.config.barrier_with_L1_time - ) - self.optimizer.step() - if timers is not None: - timers('optimizer-inner-step').stop() - - # Update params from main params. - if timers is not None: - timers('optimizer-copy-main-to-model-params', log_level=1).start( - barrier=self.config.barrier_with_L1_time - ) - self._copy_main_params_to_model_params() - if timers is not None: - timers('optimizer-copy-main-to-model-params').stop() + success = self.step_with_ready_grads() # Successful update. - return True, grad_norm, num_zeros_in_grad + return success, grad_norm, num_zeros_in_grad class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer): @@ -632,10 +670,8 @@ def get_loss_scale(self): return self._scale @torch.no_grad() - def step(self): - """Clip gradients (if needed) and step the base optimizer. - Always return successful since there is no overflow.""" - + def prepare_grads(self) -> bool: + """Pre-processing gradients before the optimizer step, returns whether inf/nan is found.""" timers = self.config.timers # Copy main_grads to grads. @@ -649,6 +685,34 @@ def step(self): if timers is not None: timers('optimizer-copy-to-main-grad').stop() + return False + + @torch.no_grad() + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful.""" + timers = self.config.timers + + # Update parameters. 
+ if timers is not None: + timers('optimizer-inner-step', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + self.optimizer.step() + if timers is not None: + timers('optimizer-inner-step').stop() + + return True + + @torch.no_grad() + def step(self): + """Clip gradients (if needed) and step the base optimizer. + Always return successful since there is no overflow.""" + timers = self.config.timers + + found_inf_flag = self.prepare_grads() + if found_inf_flag: + return False, None, None + # Clip gradients. if timers is not None: timers('optimizer-clip-main-grad', log_level=1).start( @@ -669,17 +733,10 @@ def step(self): if timers is not None: timers('optimizer-count-zeros').stop() - # Update parameters. - if timers is not None: - timers('optimizer-inner-step', log_level=1).start( - barrier=self.config.barrier_with_L1_time - ) - self.optimizer.step() - if timers is not None: - timers('optimizer-inner-step').stop() + success = self.step_with_ready_grads() # No overflow for FP32 optimizer. - return True, grad_norm, num_zeros_in_grad + return success, grad_norm, num_zeros_in_grad def reload_model_params(self): pass @@ -793,6 +850,24 @@ def load_state_dict(self, state_dict): for optimizer, state in zip(self.chained_optimizers, state_dict): optimizer.load_state_dict(state) + @torch.no_grad() + def prepare_grads(self) -> bool: + """Pre-processing gradients before the optimizer step, returns whether inf/nan is found.""" + found_inf_flag = False + for optimizer in self.chained_optimizers: + found_inf_flag |= optimizer.prepare_grads() + + return found_inf_flag + + @torch.no_grad() + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful.""" + success = True + for optimizer in self.chained_optimizers: + success &= optimizer.step_with_ready_grads() + + return success + def disable_pre_hook(self): for optimizer in self.chained_optimizers: if ( @@ -817,19 +892,39 @@ def enable_pre_hook(self): ) optimizer.enable_pre_hook() + @torch.no_grad() def step(self): """ChainedOptimizer will step all optimizers one by one. """ + found_inf_flag = self.prepare_grads() + if found_inf_flag: + return False, None, None - update_successful, grad_norm, num_zeros_in_grad = True, 0, 0 + # Get grad norm. grad_norms = [] for optimizer in self.chained_optimizers: - _update_successful, _grad_norm, _num_zeros_in_grad = optimizer.step() - update_successful &= _update_successful + _grad_norm = optimizer.get_grad_norm() grad_norms += [_grad_norm if _grad_norm else 0.0] - num_zeros_in_grad += _num_zeros_in_grad if _num_zeros_in_grad else 0 grad_norm = math.sqrt(sum([x ** 2 for x in grad_norms])) + # Clip gradients. + for optimizer in self.chained_optimizers: + if optimizer.config.clip_grad > 0.0: + clip_grad_by_total_norm_fp32( + optimizer.get_parameters(), + max_norm=optimizer.config.clip_grad, + total_norm=grad_norm, + ) + + # Count the zeros in the grads. 
+ num_zeros_in_grad = 0 + for optimizer in self.chained_optimizers: + num_zeros_in_grad += ( + optimizer.count_zeros() if optimizer.config.log_num_zeros_in_grad else 0 + ) + + update_successful = self.step_with_ready_grads() + return update_successful, grad_norm, num_zeros_in_grad def save_parameter_state(self, filename: str): diff --git a/megatron/core/parallel_state.py b/megatron/core/parallel_state.py index fdbff2c311..53b378260b 100644 --- a/megatron/core/parallel_state.py +++ b/megatron/core/parallel_state.py @@ -17,6 +17,8 @@ _PIPELINE_MODEL_PARALLEL_GROUP = None # Model parallel group (both intra- and pipeline) that the current rank belongs to. _MODEL_PARALLEL_GROUP = None +# Model parallel group (both intra-, pipeline, and expert) that the current rank belongs to. +_MODEL_AND_EXPERT_PARALLEL_GROUP = None # Embedding group. _EMBEDDING_GROUP = None # Position embedding group. @@ -554,6 +556,18 @@ def initialize_model_parallel( if rank in ranks: _MODEL_PARALLEL_GROUP = group + # Build the model-parallel groups with expert parallel + global _MODEL_AND_EXPERT_PARALLEL_GROUP + assert ( + _MODEL_AND_EXPERT_PARALLEL_GROUP is None + ), 'model and expert parallel group is already initialized' + for ranks in rank_generator.get_ranks('tp-ep-pp', independent_ep=True): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('mp_exp', nccl_comm_cfgs) + ) + if rank in ranks: + _MODEL_AND_EXPERT_PARALLEL_GROUP = group + # Build the tensor model-parallel groups. global _TENSOR_MODEL_PARALLEL_GROUP global _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS @@ -714,8 +728,13 @@ def model_parallel_is_initialized(): return True -def get_model_parallel_group(): +def get_model_parallel_group(with_expert_parallel=False): """Get the model parallel group the caller rank belongs to.""" + if with_expert_parallel: + assert ( + _MODEL_AND_EXPERT_PARALLEL_GROUP is not None + ), 'model parallel group is not initialized' + return _MODEL_AND_EXPERT_PARALLEL_GROUP assert _MODEL_PARALLEL_GROUP is not None, 'model parallel group is not initialized' return _MODEL_PARALLEL_GROUP @@ -1200,6 +1219,8 @@ def destroy_model_parallel(): """Set the groups to none.""" global _MODEL_PARALLEL_GROUP _MODEL_PARALLEL_GROUP = None + global _MODEL_AND_EXPERT_PARALLEL_GROUP + _MODEL_AND_EXPERT_PARALLEL_GROUP = None global _TENSOR_MODEL_PARALLEL_GROUP _TENSOR_MODEL_PARALLEL_GROUP = None global _PIPELINE_MODEL_PARALLEL_GROUP diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-dist-optimizer.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-dist-optimizer.json index 12df0ef48c..cd90f50218 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-dist-optimizer.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-dist-optimizer.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79756, 10.86453, 10.87233, 10.80777, 10.71193, 10.63878, 10.19208, 10.3079, 10.21681, 9.90869]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31137.0, 36902.0, 37803.0, 36259.0, 33529.0, 35091.0, 30918.0, 35455.0, 36584.0, 37538.0]}, "iteration_timing_avg": 0.2890776470588235} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79756, 
10.86462, 10.87239, 10.80678, 10.7118, 10.63911, 10.19319, 10.30944, 10.21988, 9.91603]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31137.0, 37033.0, 37783.0, 36040.0, 33452.0, 34761.0, 30933.0, 35487.0, 36392.0, 37655.0]}, "iteration_timing_avg": 0.3566726470588235} diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-groupedgemm.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-groupedgemm.json index b1e031706b..f2d71116c6 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-groupedgemm.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-groupedgemm.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80355, 10.86535, 10.86435, 10.80257, 10.71679, 10.64491, 10.21076, 10.31975, 10.2191, 9.92009]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [16395.0, 19716.0, 19656.0, 18538.0, 17152.0, 17399.0, 15327.0, 17720.0, 18390.0, 18684.0]}, "iteration_timing_avg": 0.19267441176470584} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80355, 10.86508, 10.86367, 10.80237, 10.71665, 10.6452, 10.21186, 10.32279, 10.22474, 9.93034]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31271.0, 37782.0, 38080.0, 36072.0, 33389.0, 34302.0, 30262.0, 35071.0, 36081.0, 36818.0]}, "iteration_timing_avg": 0.2153429411764706} diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-overlap-grad-reduce-param-gather-groupedgemm.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-overlap-grad-reduce-param-gather-groupedgemm.json index 7e169607b0..01e08844c2 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-overlap-grad-reduce-param-gather-groupedgemm.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-overlap-grad-reduce-param-gather-groupedgemm.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80355, 10.86512, 10.86334, 10.80317, 10.71694, 10.64429, 10.21025, 10.31925, 10.21976, 9.92004]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31271.0, 37837.0, 38276.0, 36315.0, 33331.0, 34715.0, 30485.0, 34571.0, 36189.0, 36953.0]}, "iteration_timing_avg": 0.17911029411764712} +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80355, 10.86508, 10.86435, 10.80239, 10.7159, 10.6454, 10.21181, 10.32236, 10.22471, 9.92956]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31271.0, 37782.0, 38276.0, 36278.0, 32946.0, 34291.0, 30145.0, 35217.0, 36060.0, 37032.0]}, "iteration_timing_avg": 0.21900323529411767} diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json index e946d83fa3..dc0db6b1f8 100644 --- 
a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80324, 10.86905, 10.87593, 10.79804, 10.66451, 10.5803, 10.05453, 10.18348, 10.09461, 9.7533]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [13204.0, 16437.0, 17053.0, 16247.0, 14948.0, 15533.0, 14496.0, 17106.0, 17472.0, 18590.0]}, "iteration_timing_avg": 0.3051714705882352} +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80324, 10.86872, 10.87553, 10.79762, 10.66445, 10.58091, 10.05497, 10.186, 10.0967, 9.75727]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [25918.0, 32306.0, 32291.0, 31879.0, 28498.0, 31096.0, 28681.0, 33729.0, 34593.0, 37080.0]}, "iteration_timing_avg": 0.27284176470588234} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel.json index 7e0b0a6092..50f16e7dd9 100644 --- a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel.json +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79756, 10.86447, 10.87277, 10.80684, 10.71251, 10.63895, 10.19317, 10.30823, 10.21751, 9.90833]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [16117.0, 19202.0, 19572.0, 18615.0, 17501.0, 17675.0, 15669.0, 18087.0, 18717.0, 19010.0]}, "iteration_timing_avg": 0.29991823529411765} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79756, 10.86506, 10.87184, 10.80703, 10.71158, 10.63915, 10.1929, 10.30937, 10.21969, 9.91592]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31137.0, 37021.0, 37806.0, 36157.0, 33974.0, 34873.0, 30957.0, 35062.0, 36419.0, 37713.0]}, "iteration_timing_avg": 0.35529294117647064} \ No newline at end of file From 707fe9088cec0f61d7654a2d55570ffd14d52220 Mon Sep 17 00:00:00 2001 From: Abhinav Khattar Date: Thu, 30 May 2024 16:19:38 -0700 Subject: [PATCH 55/92] groupedGEMM correct library version in import guard --- megatron/core/transformer/moe/grouped_gemm_util.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/megatron/core/transformer/moe/grouped_gemm_util.py b/megatron/core/transformer/moe/grouped_gemm_util.py index e7ef79d795..409244de7c 100644 --- a/megatron/core/transformer/moe/grouped_gemm_util.py +++ b/megatron/core/transformer/moe/grouped_gemm_util.py @@ -1,5 +1,9 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +from importlib.metadata import version + +from pkg_resources import packaging + try: import grouped_gemm except ImportError: @@ -13,7 +17,13 @@ def grouped_gemm_is_available(): def assert_grouped_gemm_is_available(): assert grouped_gemm_is_available(), ( "Grouped GEMM is not available. Please run " - "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.0`." 
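# ---- illustrative sketch, not part of the patch ----
# The guard above pins a minimum grouped_gemm version at import time. The same pattern,
# written against the standard importlib.metadata / packaging APIs instead of
# pkg_resources (an assumption of this sketch, not what the patch uses):

from importlib.metadata import PackageNotFoundError, version

from packaging.version import Version


def assert_min_version(package: str, minimum: str) -> None:
    """Fail fast with an actionable message when `package` is missing or too old."""
    try:
        installed = Version(version(package))
    except PackageNotFoundError as exc:
        raise AssertionError(f"{package} is not installed; please install >= {minimum}") from exc
    assert installed >= Version(minimum), f"{package} must be >= {minimum}, found {installed}"


# e.g. assert_min_version("grouped_gemm", "1.1.2")
# ---- end sketch ----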
+ "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.2`." + ) + + _gg_version = packaging.version.Version(version("grouped_gemm")) + assert _gg_version >= packaging.version.Version("1.1.2"), ( + "Grouped GEMM should be v1.1.2 or newer. Please run " + "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.2`." ) From 47efd0121383fc49728081051de7323c0a994516 Mon Sep 17 00:00:00 2001 From: Jared Casper Date: Thu, 30 May 2024 16:20:30 -0700 Subject: [PATCH 56/92] Revert "Merge branch 'gg_guard' into 'main'" This reverts merge request !1513 --- megatron/core/transformer/moe/grouped_gemm_util.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/megatron/core/transformer/moe/grouped_gemm_util.py b/megatron/core/transformer/moe/grouped_gemm_util.py index 409244de7c..e7ef79d795 100644 --- a/megatron/core/transformer/moe/grouped_gemm_util.py +++ b/megatron/core/transformer/moe/grouped_gemm_util.py @@ -1,9 +1,5 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -from importlib.metadata import version - -from pkg_resources import packaging - try: import grouped_gemm except ImportError: @@ -17,13 +13,7 @@ def grouped_gemm_is_available(): def assert_grouped_gemm_is_available(): assert grouped_gemm_is_available(), ( "Grouped GEMM is not available. Please run " - "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.2`." - ) - - _gg_version = packaging.version.Version(version("grouped_gemm")) - assert _gg_version >= packaging.version.Version("1.1.2"), ( - "Grouped GEMM should be v1.1.2 or newer. Please run " - "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.2`." + "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.0`." ) From 0f2ce07125124feeaa89cb0673d85f2fa2c8c1a8 Mon Sep 17 00:00:00 2001 From: Abhinav Khattar Date: Thu, 30 May 2024 16:25:12 -0700 Subject: [PATCH 57/92] Update groupedgemm version in test dockerfile --- Dockerfile.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.test b/Dockerfile.test index dd7638ae6d..e62aafba29 100644 --- a/Dockerfile.test +++ b/Dockerfile.test @@ -8,4 +8,4 @@ RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ RUN apt-get update && apt-get install -y --no-install-recommends -RUN pip3 install --no-cache-dir einops flask-restful nltk pytest pytest-cov pytest_mock sentencepiece wrapt git+https://github.com/fanshiqing/grouped_gemm@v1.1.1 \ No newline at end of file +RUN pip3 install --no-cache-dir einops flask-restful nltk pytest pytest-cov pytest_mock sentencepiece wrapt git+https://github.com/fanshiqing/grouped_gemm@v1.1.2 \ No newline at end of file From 28c8dd71f0e6070c12433ccf13bca76bed336770 Mon Sep 17 00:00:00 2001 From: Xuwen Chen Date: Thu, 30 May 2024 21:34:51 -0700 Subject: [PATCH 58/92] Fix issue #109 Weird outputs when inferring on models with GroupedGEMM --- megatron/core/tensor_parallel/layers.py | 4 ++-- megatron/core/transformer/moe/moe_layer.py | 10 ++++++++++ megatron/core/transformer/moe/token_dispatcher.py | 8 ++++++-- megatron/training/arguments.py | 3 --- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index fcb24d2585..ca7c2c3bdc 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -679,7 +679,7 @@ def __init__( self.disable_grad_reduce = disable_grad_reduce self.explicit_expert_comm = self.is_expert and ( - config.sequence_parallel or self.expert_parallel + 
config.tensor_model_parallel_size > 1 or self.expert_parallel ) if self.explicit_expert_comm and config.moe_extended_tp: world_size = get_tensor_and_expert_parallel_world_size() @@ -941,7 +941,7 @@ def __init__( raise RuntimeError("To enable `sequence_parallel`, `input_is_parallel` must be `True`") self.explicit_expert_comm = self.is_expert and ( - config.sequence_parallel or self.expert_parallel + config.tensor_model_parallel_size > 1 or self.expert_parallel ) # Divide the weight matrix along the last dimension. diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index ba37500116..d42f409a06 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -90,6 +90,16 @@ def __init__( self.moe_layer_recompute = config.moe_layer_recompute def forward(self, hidden_states: torch.Tensor): + if ( + self.training + and self.config.tensor_model_parallel_size > 1 + and not self.config.sequence_parallel + ): + raise ValueError( + "During training, performance may degrade if MoE and tensor parallelism" + "are enabled without also enabling sequence parallelism." + ) + # process MoE def custom_forward(hidden_states): probs, indices = self.router(hidden_states) diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index 515a96ff47..e0e112d94b 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -107,7 +107,9 @@ def token_permutation( hidden_states = hidden_states.view(-1, self.hidden_shape[-1]) # Permute the tokens across the expert parallel devices. - if self.config.sequence_parallel or (self.config.expert_model_parallel_size > 1): + if (self.config.tensor_model_parallel_size > 1) or ( + self.config.expert_model_parallel_size > 1 + ): with torch.no_grad(): global_indices = tensor_parallel.gather_from_sequence_parallel_region_to_moe( max_ind @@ -214,7 +216,9 @@ def token_unpermutation( output_bias_total = unpermuted_local_bias # Unpermute the tokens across expert parallel devices. - if self.config.sequence_parallel or (self.config.expert_model_parallel_size > 1): + if (self.config.tensor_model_parallel_size > 1) or ( + self.config.expert_model_parallel_size > 1 + ): assert ( self.global_local_map is not None ), "global_local_map is necessary for `AllGather`." diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 6b038669f7..c829c52f19 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -501,9 +501,6 @@ def validate_args(args, defaults={}): # MoE Spec check if args.num_experts is not None: assert args.spec is None, "Model Spec must be None when using MoEs" - if args.tensor_model_parallel_size > 1: - assert args.sequence_parallel, \ - "When using MoE and tensor parallelism, sequence parallelism must be used." 
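# ---- illustrative sketch, not part of the patch ----
# The hunks above gate explicit expert communication on tensor_model_parallel_size > 1
# instead of sequence_parallel, and move the MoE/TP/SP constraint from argument validation
# into MoELayer.forward, where it only fires during training (so inference without
# sequence parallelism is permitted). A standalone version of that check, with
# hypothetical argument names standing in for the config fields:

def check_moe_parallel_config(
    num_experts, tp_size: int, sequence_parallel: bool, training: bool
) -> None:
    """Raise if an MoE model is trained with tensor parallelism but without sequence parallelism."""
    if num_experts is None:
        return
    if training and tp_size > 1 and not sequence_parallel:
        raise ValueError(
            "During training, performance may degrade if MoE and tensor parallelism "
            "are enabled without also enabling sequence parallelism."
        )
# ---- end sketch ----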
# Expert parallelism check if args.expert_model_parallel_size > 1: From de48720f0f245085125dc6397f797d2321ba1f0d Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 31 May 2024 09:45:46 -0700 Subject: [PATCH 59/92] Update Nightly golden values * Updates bert/gpt nightly baselines * fixes an issue where resume tests weren't correctly testing the deterministic path * standardize using `{name}` instead of `{key.split('\')[1]}` since the latter requires assumptions of what the JET key logic is * renames all merge-request baselines to follow this convention --- .gitlab-ci.yml | 7 ++-- .../functional_tests/jet_recipes/MR-bert.yaml | 2 +- .../jet_recipes/MR-gpt-nemo.yaml | 2 +- .../functional_tests/jet_recipes/MR-gpt.yaml | 2 +- .../jet_recipes/MR-multimodal.yaml | 2 +- tests/functional_tests/jet_recipes/MR-t5.yaml | 2 +- .../jet_recipes/monthly-t5.yaml | 2 +- .../jet_recipes/nightly-bert.yaml | 2 +- .../jet_recipes/nightly-gpt.yaml | 2 +- .../jet_recipes/weekly-gpt.yaml | 2 +- .../python_test_utils/common.py | 32 ++++++++++++------- .../get_test_results_from_tensorboard_logs.py | 27 +--------------- .../python_test_utils/jet_test_pipeline.py | 10 +++--- .../test_resume_checkpoint_pipeline.py | 24 ++------------ .../run_selene_test_launcher_script.sh | 2 +- ...m-nightly-dgx-a100-1n8g-mcore-tp1-pp2.json | 1 - ...ghtly-dgx-a100-1n8g-mcore-tp1-pp4-vp2.json | 1 - ...m-nightly-dgx-a100-1n8g-mcore-tp4-pp1.json | 1 - ...rt-345m-nightly-dgx-a100-1n8g-tp1-pp2.json | 1 - ...rt-345m-nightly-dgx-a100-1n8g-tp4-pp1.json | 1 - ...-request_dgx_a100_1N8G_mcore_tp2_pp2.json} | 0 ...x_a100_1N8G_mcore_tp2_pp2_local_spec.json} | 0 ...ge-request_dgx_a100_1N8G_tp1_pp4_vp2.json} | 0 ..._merge-request_dgx_a100_1N8G_tp2_pp2.json} | 0 ...request_resume_dgx_a100_1N8G_tp1_pp2.json} | 0 ...m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json | 1 + ...ghtly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json | 1 + ...m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json | 1 + ...rt_345m_nightly_dgx_a100_1N8G_tp1_pp2.json | 1 + ...rt_345m_nightly_dgx_a100_1N8G_tp4_pp1.json | 1 + ...izer-overlap-grad-reduce-param-gather.json | 1 - ...m-nightly-dgx-a100-1n8g-mcore-tp1-pp2.json | 1 - ...m-nightly-dgx-a100-1n8g-mcore-tp1-pp4.json | 1 - ...x-a100-1n8g-mcore-tp2-pp2-te-2experts.json | 1 - ...8g-mcore-tp2-pp2-te-4experts2parallel.json | 1 - ...m-nightly-dgx-a100-1n8g-mcore-tp4-pp1.json | 1 - ...p1-dist-optimizer-overlap-grad-reduce.json | 1 - ...a100-1n8g-tp1-pp1-overlap-grad-reduce.json | 1 - ...t3-345m-nightly-dgx-a100-1n8g-tp1-pp2.json | 1 - ...a100-1n8g-tp1-pp4-overlap-grad-reduce.json | 1 - ...-1n8g-tp1-pp4-vp1-overlap-grad-reduce.json | 1 - ...t3-345m-nightly-dgx-a100-1n8g-tp1-pp4.json | 1 - ...ightly-dgx-a100-1n8g-tp2-pp2-4experts.json | 1 - ...a100-1n8g-tp2-pp2-overlap-grad-reduce.json | 1 - ...a100-1n8g-tp4-pp1-overlap-grad-reduce.json | 1 - ...t3-345m-nightly-dgx-a100-1n8g-tp4-pp1.json | 1 - ...00_1N8G_mcore_tp1_pp1_dist_optimizer.json} | 0 ...pp1_dist_optimizer_no_mmap_bin_files.json} | 0 ...mcore_tp1_pp1_uniform_full_recompute.json} | 0 ...0_1N8G_mcore_tp1_pp2_rope_embeddings.json} | 0 ...ope_embeddings_interleaved_no_fusion.json} | 0 ...8G_mcore_tp1_pp4_disable_bias_linear.json} | 0 ...1N8G_mcore_tp1_pp4_sequence_parallel.json} | 0 ...t_dgx_a100_1N8G_mcore_tp1_pp4_swiglu.json} | 0 ...tp1_pp4_untie_embeddings_and_outputs.json} | 0 ...uest_dgx_a100_1N8G_mcore_tp1_pp4_vp1.json} | 0 ...tp1_pp4_vp1_calculate_per_token_loss.json} | 0 ..._1N8G_mcore_tp1_pp4_vp1_decoupled_lr.json} | 0 ...1_dist_optimizer_overlap_grad_reduce.json} | 0 
...zer_overlap_grad_reduce_param_gather.json} | 0 ...optimizer_overlap_grad_reduce_untied.json} | 0 ...G_mcore_tp2_pp1_cp2_nondeterministic.json} | 0 ...G_mcore_tp2_pp1_te_8experts2parallel.json} | 0 ..._te_8experts2parallel_dist_optimizer.json} | 0 ...pp1_te_8experts2parallel_groupedGEMM.json} | 0 ...grad_reduce_param_gather_groupedGEMM.json} | 0 ..._pp1_te_8experts2parallel_top2router.json} | 0 ...-request_dgx_a100_1N8G_mcore_tp2_pp2.json} | 0 ...G_mcore_tp2_pp2_cp2_nondeterministic.json} | 0 ..._create_attention_mask_in_dataloader.json} | 0 ...1N8G_mcore_tp2_pp2_no_mmap_bin_files.json} | 0 ...1_dist_optimizer_overlap_grad_reduce.json} | 0 ...zer_overlap_grad_reduce_param_gather.json} | 0 ...mcore_tp4_pp1_qk_layernorm_test_mode.json} | 0 ...rge-request_dgx_a100_1N8G_te_tp2_pp2.json} | 0 ...ge-request_dgx_a100_1N8G_tp1_pp4_vp1.json} | 0 ..._merge-request_dgx_a100_1N8G_tp2_pp2.json} | 0 ...request_resume_dgx_a100_1N8G_tp1_pp2.json} | 0 ...izer_overlap_grad_reduce_param_gather.json | 1 + ...izer_overlap_grad_reduce_param_gather.json | 1 + ...m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json | 1 + ..._1N8G_mcore_tp1_pp2_resume_torch_dist.json | 1 + ...m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json | 1 + ..._1N8G_mcore_tp1_pp4_resume_torch_dist.json | 1 + ...tp2_pp2_resume_torch_dist_te_2experts.json | 1 + ...esume_torch_dist_te_4experts2parallel.json | 1 + ...x_a100_1N8G_mcore_tp2_pp2_te_2experts.json | 1 + ...8G_mcore_tp2_pp2_te_4experts2parallel.json | 1 + ...m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json | 1 + ..._a100_1N8G_mcore_tp4_pp1_resume_torch.json | 1 + ..._1N8G_mcore_tp4_pp1_resume_torch_dist.json | 1 + ...p1_dist_optimizer_overlap_grad_reduce.json | 1 + ...a100_1N8G_tp1_pp1_overlap_grad_reduce.json | 1 + ...t3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json | 1 + ...ly_dgx_a100_1N8G_tp1_pp2_resume_torch.json | 1 + ...t3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json | 1 + ...a100_1N8G_tp1_pp4_overlap_grad_reduce.json | 1 + ...ly_dgx_a100_1N8G_tp1_pp4_resume_torch.json | 1 + ..._1N8G_tp1_pp4_vp1_overlap_grad_reduce.json | 1 + ...ightly_dgx_a100_1N8G_tp2_pp2_4experts.json | 1 + ...a100_1N8G_tp2_pp2_overlap_grad_reduce.json | 1 + ...00_1N8G_tp2_pp2_resume_torch_4experts.json | 1 + ..._pp2_resume_torch_overlap_grad_reduce.json | 1 + ...t3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json | 1 + ...a100_1N8G_tp4_pp1_overlap_grad_reduce.json | 1 + ...ly_dgx_a100_1N8G_tp4_pp1_resume_torch.json | 1 + ...quest_dgx_a100_1N8G_mcore_te_tp1_pp1.json} | 0 ...tp1_pp1_vp1_calculate_per_token_loss.json} | 0 .../bert/pretrain_bert_distributed_test.sh | 5 ++- .../gpt3/pretrain_gpt3_distributed_test.sh | 4 +-- .../pretrain_llava_distributed_test.sh | 2 +- .../retro/pretrain_retro_distributed_test.sh | 2 +- .../t5/pretrain_t5_distributed_test.sh | 2 +- 113 files changed, 86 insertions(+), 103 deletions(-) delete mode 100644 tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp2.json delete mode 100644 tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp4-vp2.json delete mode 100644 tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-mcore-tp4-pp1.json delete mode 100644 tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-tp1-pp2.json delete mode 100644 tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-tp4-pp1.json rename tests/functional_tests/test_results/jet/{bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json => bert_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2.json} (100%) rename 
tests/functional_tests/test_results/jet/{bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-local-spec.json => bert_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_local_spec.json} (100%) rename tests/functional_tests/test_results/jet/{bert-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp2.json => bert_345m_merge-request_dgx_a100_1N8G_tp1_pp4_vp2.json} (100%) rename tests/functional_tests/test_results/jet/{bert-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json => bert_345m_merge-request_dgx_a100_1N8G_tp2_pp2.json} (100%) rename tests/functional_tests/test_results/jet/{bert-345m-merge-request-resume-dgx-a100-1n8g-tp1-pp2.json => bert_345m_merge-request_resume_dgx_a100_1N8G_tp1_pp2.json} (100%) create mode 100644 tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json create mode 100644 tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json create mode 100644 tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json create mode 100644 tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2.json create mode 100644 tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer-overlap-grad-reduce-param-gather.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp2.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp4.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp2-pp2-te-2experts.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp2-pp2-te-4experts2parallel.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp4-pp1.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp1-dist-optimizer-overlap-grad-reduce.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp1-overlap-grad-reduce.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp2.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp4-overlap-grad-reduce.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp4-vp1-overlap-grad-reduce.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp4.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp2-pp2-4experts.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp2-pp2-overlap-grad-reduce.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp4-pp1-overlap-grad-reduce.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp4-pp1.json rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer-no-mmap-bin-files.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer.json => 
gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_no_mmap_bin_files.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-uniform-full-recompute.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp1_uniform_full_recompute.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp2_rope_embeddings.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings-interleaved-no-fusion.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp2_rope_embeddings_interleaved_no_fusion.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-disable-bias-linear.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_disable_bias_linear.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-sequence-parallel.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_sequence_parallel.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-swiglu.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_swiglu.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-untie-embeddings-and-outputs.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_untie_embeddings_and_outputs.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-calculate-per-token-loss.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_calculate_per_token_loss.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-decoupled-lr.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_decoupled_lr.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-param-gather.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-untied.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-cp2-nondeterministic.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_cp2_nondeterministic.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel.json} (100%) rename 
tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-dist-optimizer.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel_dist_optimizer.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-groupedgemm.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel_groupedGEMM.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-overlap-grad-reduce-param-gather-groupedgemm.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel_top2router.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-create-attention-mask-in-dataloader.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-cp2-nondeterministic.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_cp2_nondeterministic.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-mmap-bin-files.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_no_create_attention_mask_in_dataloader.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_no_mmap_bin_files.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp4_pp1_dist_optimizer_overlap_grad_reduce.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce-param-gather.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-qk-layernorm-test-mode.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp4_pp1_qk_layernorm_test_mode.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-te-tp2-pp2.json => gpt3_345m_merge-request_dgx_a100_1N8G_te_tp2_pp2.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp1.json => gpt3_345m_merge-request_dgx_a100_1N8G_tp1_pp4_vp1.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json => gpt3_345m_merge-request_dgx_a100_1N8G_tp2_pp2.json} (100%) rename tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-resume-dgx-a100-1n8g-tp1-pp2.json => gpt3_345m_merge-request_resume_dgx_a100_1N8G_tp1_pp2.json} (100%) create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json create mode 100644 
tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_4experts2parallel.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_4experts2parallel.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch.json rename tests/functional_tests/test_results/jet/{multimodal-llava-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1.json => multimodal_llava_merge-request_dgx_a100_1N8G_mcore_te_tp1_pp1.json} (100%) rename 
tests/functional_tests/test_results/jet/{t5-220m-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1-calculate-per-token-loss.json => t5_220m_merge-request_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_calculate_per_token_loss.json} (100%) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f5b6d9cf63..f71be75984 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -20,11 +20,14 @@ stages: - test - jet -variables: &VARS +variables: SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron" DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data" PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate - JET_CUSTOM_FILTER: "" + JET_CUSTOM_FILTER: + description: | + Selects what functional tests to run. For merge-request tests: "type == 'build' or 'merge-request' in spec.scope". For nightly tests: "type == 'build' or 'nightly' in spec.scope" + value: "" DISPLAY_OUTPUT: "True" # Set to true for new tests to copy the logs for creating golden truth file TIME_LIMIT: "10:00" # Default time limit for all jobs MOE_GROUPED_GEMM: 0 # Set to 1 to enable grouped gemm for MoE diff --git a/tests/functional_tests/jet_recipes/MR-bert.yaml b/tests/functional_tests/jet_recipes/MR-bert.yaml index 05dfafec95..3851a98a56 100644 --- a/tests/functional_tests/jet_recipes/MR-bert.yaml +++ b/tests/functional_tests/jet_recipes/MR-bert.yaml @@ -46,7 +46,7 @@ spec: MBS={micro_batch_size} \ GBS={batch_size} \ CHECKPOINT_RESUME_TEST={ckpt_resume} \ - JOB_NAME={key.split("/")[1]} \ + JOB_NAME={name} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: # MCore diff --git a/tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml b/tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml index 6bc7e98787..b99576eb2d 100644 --- a/tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml +++ b/tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml @@ -38,7 +38,7 @@ spec: VP_SIZE={vp_size if vp_size is not None else '""'} \ MBS={mbs} \ GBS={gbs} \ - JOB_NAME={key.split("/")[1]} \ + JOB_NAME={name} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - {tp_size: [1], pp_size: [1], mbs: [4], gbs: [64], vp_size: [null]} diff --git a/tests/functional_tests/jet_recipes/MR-gpt.yaml b/tests/functional_tests/jet_recipes/MR-gpt.yaml index 7315cdda61..77bbea30d3 100644 --- a/tests/functional_tests/jet_recipes/MR-gpt.yaml +++ b/tests/functional_tests/jet_recipes/MR-gpt.yaml @@ -53,7 +53,7 @@ spec: CKPT_FORMAT={ckpt_format} \ CHECKPOINT_RESUME_TEST={ckpt_resume} \ ALLOW_NONDETERMINISTIC={allow_nondeterministic} \ - JOB_NAME={key.split("/")[1]} \ + JOB_NAME={name} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: # MCore diff --git a/tests/functional_tests/jet_recipes/MR-multimodal.yaml b/tests/functional_tests/jet_recipes/MR-multimodal.yaml index 3f16288645..a93e840b9f 100644 --- a/tests/functional_tests/jet_recipes/MR-multimodal.yaml +++ b/tests/functional_tests/jet_recipes/MR-multimodal.yaml @@ -46,7 +46,7 @@ spec: MOE_GROUPED_GEMM={moe_grouped_gemm} \ CKPT_FORMAT={ckpt_format} \ CHECKPOINT_RESUME_TEST={ckpt_resume} \ - JOB_NAME={key.split("/")[1]} \ + JOB_NAME={name} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - {use_te: [True], tp_size: [1], pp_size: [1], ckpt_resume: [0, 1]} diff --git a/tests/functional_tests/jet_recipes/MR-t5.yaml b/tests/functional_tests/jet_recipes/MR-t5.yaml index a05c6ad85e..8a267a4a56 100644 --- a/tests/functional_tests/jet_recipes/MR-t5.yaml +++ b/tests/functional_tests/jet_recipes/MR-t5.yaml @@ -45,7 
+45,7 @@ spec: VP_SIZE={vp_size if vp_size is not None else '""'} \ MBS={micro_batch_size} \ GBS={batch_size} \ - JOB_NAME={key.split("/")[1]} \ + JOB_NAME={name} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - {use_te: [True], tp_size: [1], pp_size: [1], vp_size: [1], extra_args: ["--calculate-per-token-loss"], args_meta: ["calculate_per_token_loss"]} diff --git a/tests/functional_tests/jet_recipes/monthly-t5.yaml b/tests/functional_tests/jet_recipes/monthly-t5.yaml index 1a67e9ad83..3dd6d6fae2 100644 --- a/tests/functional_tests/jet_recipes/monthly-t5.yaml +++ b/tests/functional_tests/jet_recipes/monthly-t5.yaml @@ -46,7 +46,7 @@ spec: MBS={micro_batch_size} \ GBS={batch_size} \ CHECKPOINT_RESUME_TEST={ckpt_resume} \ - JOB_NAME={key.split("/")[1]} \ + JOB_NAME={name} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - {tp_size: [1,2], pp_size: [1], vp_size: [1] } diff --git a/tests/functional_tests/jet_recipes/nightly-bert.yaml b/tests/functional_tests/jet_recipes/nightly-bert.yaml index 70b1f0641e..29d2857991 100644 --- a/tests/functional_tests/jet_recipes/nightly-bert.yaml +++ b/tests/functional_tests/jet_recipes/nightly-bert.yaml @@ -44,7 +44,7 @@ spec: VP_SIZE={vp_size if vp_size is not None else '""'} \ MBS={micro_batch_size} \ GBS={batch_size} \ - JOB_NAME={key.split("/")[1]} \ + JOB_NAME={name} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - {tp_size: [1], pp_size: [4], vp_size: [2]} diff --git a/tests/functional_tests/jet_recipes/nightly-gpt.yaml b/tests/functional_tests/jet_recipes/nightly-gpt.yaml index a5f2b241c5..5b072ea51f 100644 --- a/tests/functional_tests/jet_recipes/nightly-gpt.yaml +++ b/tests/functional_tests/jet_recipes/nightly-gpt.yaml @@ -50,7 +50,7 @@ spec: MOE_GROUPED_GEMM={moe_grouped_gemm} \ CKPT_FORMAT={ckpt_format} \ CHECKPOINT_RESUME_TEST={ckpt_resume} \ - JOB_NAME={key.split("/")[1]} \ + JOB_NAME={name} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - {use_mcore: [True], tp_size: [4], pp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch_dist]} diff --git a/tests/functional_tests/jet_recipes/weekly-gpt.yaml b/tests/functional_tests/jet_recipes/weekly-gpt.yaml index 516cead6a0..a0e3cf53d3 100644 --- a/tests/functional_tests/jet_recipes/weekly-gpt.yaml +++ b/tests/functional_tests/jet_recipes/weekly-gpt.yaml @@ -50,7 +50,7 @@ spec: GBS={batch_size} \ MOE_GROUPED_GEMM={moe_grouped_gemm} \ ALLOW_NONDETERMINISTIC={"1" if allow_nondeterministic else "0"} \ - JOB_NAME={key.split("/")[1]} \ + JOB_NAME={name} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - {use_mcore: [True], precision: [bf16], tp_size: [1], pp_size: [1], allow_nondeterministic: [False], args_meta: ["bf16_baseline"]} diff --git a/tests/functional_tests/python_test_utils/common.py b/tests/functional_tests/python_test_utils/common.py index bdfe794855..20b77ff2da 100644 --- a/tests/functional_tests/python_test_utils/common.py +++ b/tests/functional_tests/python_test_utils/common.py @@ -4,13 +4,22 @@ import enum +# By default TB tries to be smart about what to load in memory to avoid OOM +# Since we expect every step to be there when we do our comparisons, we explicitly +# set the size guidance to 0 so that we load everything. It's okay given our tests +# are small/short. 
+SIZE_GUIDANCE = { + event_accumulator.TENSORS: 0, + event_accumulator.SCALARS: 0, +} + class TypeOfTest(enum.Enum): APPROX = 1 DETERMINISTIC = 2 -def read_tb_logs_as_list(path, summary_name): +def read_tb_logs_as_list(path, summary_name, index=0): """Reads a TensorBoard Events file from the input path, and returns the summary specified as input as a list. @@ -23,14 +32,15 @@ def read_tb_logs_as_list(path, summary_name): """ files = glob.glob(f"{path}/events*tfevents*") files += glob.glob(f"{path}/results/events*tfevents*") + if not files: + raise FileNotFoundError(f"File not found matching: {path}/events* || {path}/results/events*") files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x))) - if files: - event_file = files[0] - ea = event_accumulator.EventAccumulator(event_file) - ea.Reload() - summary = ea.Scalars(summary_name) - summary_list = [round(x.value, 5) for x in summary] - print(f'\nObtained the following list for {summary_name} ------------------') - print(summary_list) - return summary_list - raise FileNotFoundError(f"File not found matching: {path}/events*") + + event_file = files[index] + ea = event_accumulator.EventAccumulator(event_file, size_guidance=SIZE_GUIDANCE) + ea.Reload() + summary = ea.Scalars(summary_name) + summary_list = [round(x.value, 5) for x in summary] + print(f'\nObtained the following list for {summary_name} ------------------') + print(summary_list) + return summary_list diff --git a/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py b/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py index 8699bc1f6e..ce2047eb08 100644 --- a/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py +++ b/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py @@ -1,34 +1,9 @@ import os os.environ['OPENBLAS_NUM_THREADS'] = '1' import sys -import glob -from tensorboard.backend.event_processing import event_accumulator +from tests.functional_tests.python_test_utils.common import read_tb_logs_as_list -def read_tb_logs_as_list(path, summary_name): - """Reads a TensorBoard Events file from the input path, and returns the - summary specified as input as a list. - - Args: - path: str, path to the dir where the events file is located. - summary_name: str, name of the summary to read from the TB logs. - - Returns: - summary_list: list, the values in the read summary list, formatted as a list. 
- """ - files = glob.glob(f"{path}/events*tfevents*") - files += glob.glob(f"{path}/results/events*tfevents*") - files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x))) - if files: - event_file = files[0] - ea = event_accumulator.EventAccumulator(event_file) - ea.Reload() - summary = ea.Scalars(summary_name) - summary_list = [round(x.value, 5) for x in summary] - print(f'\nObtained the following list for {summary_name} ------------------') - print(summary_list) - return summary_list - raise FileNotFoundError(f"File not found matching: {path}/events*") def collect_train_test_metrics(logs_dir, run_name): # TODO: Fetch current baseline diff --git a/tests/functional_tests/python_test_utils/jet_test_pipeline.py b/tests/functional_tests/python_test_utils/jet_test_pipeline.py index 2700639e0b..d4b7100868 100644 --- a/tests/functional_tests/python_test_utils/jet_test_pipeline.py +++ b/tests/functional_tests/python_test_utils/jet_test_pipeline.py @@ -50,7 +50,7 @@ def check_exitcodes(results, summary_jobid): for result in results: exit_codes.append(result.get('l_exit_code', -1)) log_urls.append(select_asset(result, 'output_script-0.log')) - names.append(result['obj_workload']['s_key'].split('basic/')[-1]) + names.append(result['obj_workload']['obj_spec']['s_name']) metrics_file_urls.append(select_asset(result, 'results.json')) # Results metrics table @@ -91,7 +91,7 @@ def check_exitcodes(results, summary_jobid): def _download_log(url, save_dir): import requests if not os.path.exists(save_dir): - os.mkdir(save_dir) + os.makedirs(save_dir, exist_ok=True) filepath = os.path.join(save_dir, url.split('/')[-1]) r = requests.get(url) @@ -108,7 +108,7 @@ def save_scripts(results, save_dir): for result in results: script = result['obj_workload']['obj_spec']['s_script'] - target_path = result['obj_workload']['s_key'].split('basic/')[-1] + '.sh' + target_path = result['obj_workload']['obj_spec']['s_name'] + '.sh' target_path = os.path.join(save_dir, target_path) from textwrap import dedent @@ -141,7 +141,7 @@ def check_baselines(results): # Download TB event logs for result in results: event_log_url = select_asset(result, 'events.out.tfevents') - target_dir = result['obj_workload']['s_key'].split('basic/')[-1] + target_dir = result['obj_workload']['obj_spec']['s_name'] target_dir = os.path.join(tmpdir, target_dir) _download_log(event_log_url, target_dir) @@ -156,7 +156,7 @@ def fetch_metrics_files(results, save_dir): for result in results: metrics_url = select_asset(result, 'results.json') if metrics_url is not None: - cfg = result['obj_workload']['s_key'].split('basic/')[-1] + cfg = result['obj_workload']['obj_spec']['s_name'] target_dir = os.path.join(save_dir, cfg) _download_log(metrics_url, target_dir) diff --git a/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py b/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py index 6abc99c63d..d648898559 100644 --- a/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py +++ b/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py @@ -1,33 +1,13 @@ import os - os.environ['OPENBLAS_NUM_THREADS'] = '1' -import glob -import json -import shutil -import sys - import pytest -from tensorboard.backend.event_processing import event_accumulator -from tests.functional_tests.python_test_utils.common import TypeOfTest +from tests.functional_tests.python_test_utils.common import TypeOfTest, read_tb_logs_as_list LOGS_DIR = os.getenv('LOGS_DIR') ALLOW_NONDETERMINISTIC = 
os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO") STEP_INTERVAL = 5 -def read_tb_logs_as_list(path, summary_name, index): - files = glob.glob(f"{path}/events*tfevents*") - files += glob.glob(f"{path}/results/events*tfevents*") - files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x))) - if files: - event_file = files[index] - ea = event_accumulator.EventAccumulator(event_file) - ea.Reload() - summary = ea.Scalars(summary_name) - summary_list = [round(x.value, 5) for x in summary] - print(summary_list) - return summary_list - raise FileNotFoundError(f"File not found matching: {path}/events*") def collect_train_test_metrics(logs_dir, index): train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss", index) @@ -71,5 +51,5 @@ def test_lm_loss_deterministic(self): self._test_helper("lm loss", TypeOfTest.DETERMINISTIC) @pytest.mark.skipif(not allow_nondeterministic, reason="Nondeterministic is not allowed.") - def test_lm_loss_deterministic(self): + def test_lm_loss_nondeterministic(self): self._test_helper("lm loss", TypeOfTest.APPROX) diff --git a/tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh b/tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh index d454932abb..ceae6e596d 100755 --- a/tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh +++ b/tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh @@ -69,7 +69,7 @@ if [ $? -ne 0 ]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PAT # step 8 : DISPLAYING THE GROUND TRUTH INFO FOR DEBUGGING OR UPDATING GROUND TRUTH VALUES source $PYTHON_VIRTUAL_ENV if [[ "$DISPLAY_OUTPUT" == "True" ]]; then - python3 $BUILD_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $LOGS_DIR $RUN_NAME + PYTHONPATH=$BUILD_DIR python3 $BUILD_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $LOGS_DIR $RUN_NAME fi # step 9 : COMPARING THE GROUND TRUTH VALUES TO THE OBTAINED VALUES FROM THE JOB diff --git a/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp2.json b/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp2.json deleted file mode 100644 index 9f4240cb65..0000000000 --- a/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp2.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49462, 10.49187, 10.49226, 10.47656, 10.4729, 10.35563, 10.17664, 10.07391, 9.87361, 9.66669]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2103.0, 2412.0, 2156.0, 2258.0, 2482.0, 2597.0, 3087.0, 3010.0, 2961.0, 2616.0]}, "iteration_timing_avg": 0.4599232352941175} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp4-vp2.json b/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp4-vp2.json deleted file mode 100644 index f22b1545d9..0000000000 --- a/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp4-vp2.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.47287, 10.45915, 10.45198, 10.44271, 10.40758, 10.33402, 10.11407, 10.05164, 9.86947, 9.68722]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2539.0, 2553.0, 2236.0, 2372.0, 2423.0, 2534.0, 3060.0, 3274.0, 3597.0, 3211.0]}, 
"iteration_timing_avg": 0.7434476470588235} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-mcore-tp4-pp1.json b/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-mcore-tp4-pp1.json deleted file mode 100644 index d3bc00d944..0000000000 --- a/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-mcore-tp4-pp1.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.42216, 10.43879, 10.42095, 10.41062, 10.38718, 10.32354, 10.134, 10.03405, 9.86954, 9.66363]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3334.0, 3577.0, 3277.0, 3334.0, 3481.0, 3515.0, 2958.0, 4206.0, 4587.0, 4107.0]}, "iteration_timing_avg": 1.4501132352941182} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-tp1-pp2.json b/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-tp1-pp2.json deleted file mode 100644 index cfe92b062e..0000000000 --- a/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-tp1-pp2.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50685, 10.49816, 10.47982, 10.48566, 10.49533, 10.46662, 10.42394, 10.30694, 10.15979, 9.96957]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [18772.0, 19035.0, 22296.0, 18412.0, 20887.0, 23006.0, 22439.0]}, "iteration_timing_avg": 0.4442270588235295} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-tp4-pp1.json b/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-tp4-pp1.json deleted file mode 100644 index bd1a0abc89..0000000000 --- a/tests/functional_tests/test_results/jet/bert-345m-nightly-dgx-a100-1n8g-tp4-pp1.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.48681, 10.48784, 10.4873, 10.50416, 10.49442, 10.47817, 10.41358, 10.28136, 10.14425, 9.94147]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27195.0, 19616.0, 25279.0, 24916.0, 21579.0, 19699.0, 20897.0]}, "iteration_timing_avg": 1.3253535294117644} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json b/tests/functional_tests/test_results/jet/bert_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2.json similarity index 100% rename from tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json rename to tests/functional_tests/test_results/jet/bert_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2.json diff --git a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-local-spec.json b/tests/functional_tests/test_results/jet/bert_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_local_spec.json similarity index 100% rename from tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-local-spec.json rename to tests/functional_tests/test_results/jet/bert_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_local_spec.json diff --git a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp2.json b/tests/functional_tests/test_results/jet/bert_345m_merge-request_dgx_a100_1N8G_tp1_pp4_vp2.json similarity index 100% rename from 
tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp2.json rename to tests/functional_tests/test_results/jet/bert_345m_merge-request_dgx_a100_1N8G_tp1_pp4_vp2.json diff --git a/tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json b/tests/functional_tests/test_results/jet/bert_345m_merge-request_dgx_a100_1N8G_tp2_pp2.json similarity index 100% rename from tests/functional_tests/test_results/jet/bert-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json rename to tests/functional_tests/test_results/jet/bert_345m_merge-request_dgx_a100_1N8G_tp2_pp2.json diff --git a/tests/functional_tests/test_results/jet/bert-345m-merge-request-resume-dgx-a100-1n8g-tp1-pp2.json b/tests/functional_tests/test_results/jet/bert_345m_merge-request_resume_dgx_a100_1N8G_tp1_pp2.json similarity index 100% rename from tests/functional_tests/test_results/jet/bert-345m-merge-request-resume-dgx-a100-1n8g-tp1-pp2.json rename to tests/functional_tests/test_results/jet/bert_345m_merge-request_resume_dgx_a100_1N8G_tp1_pp2.json diff --git a/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json new file mode 100644 index 0000000000..25faec6b8c --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49405, 10.48276, 10.49249, 10.47813, 10.46623, 10.35183, 10.17697, 10.07728, 9.8875, 9.68029]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2018.0, 2636.0, 2067.0, 2225.0, 2555.0, 2554.0, 2969.0, 2935.0, 2967.0, 2287.0]}, "iteration_timing_avg": 0.5847132352941178} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json new file mode 100644 index 0000000000..65fbb4d736 --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4681, 10.45734, 10.4491, 10.44102, 10.41779, 10.34626, 10.11378, 10.04382, 9.86692, 9.67893]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2373.0, 2593.0, 2187.0, 2403.0, 2412.0, 2617.0, 3083.0, 3341.0, 3558.0, 3213.0]}, "iteration_timing_avg": 0.8346488235294117} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json new file mode 100644 index 0000000000..423d346851 --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.42107, 10.42897, 10.43577, 10.40787, 10.38455, 10.32433, 10.13158, 10.04316, 9.86274, 9.65777]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2229.0, 3600.0, 3300.0, 3311.0, 3522.0, 3498.0, 4076.0, 4135.0, 4709.0, 4350.0]}, "iteration_timing_avg": 1.8964105882352944} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2.json 
b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2.json new file mode 100644 index 0000000000..05d590edf8 --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50096, 10.48594, 10.4936, 10.48501, 10.50417, 10.4773, 10.42153, 10.29719, 10.15831, 9.9675]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [18201.0, 19789.0, 21743.0, 18735.0, 21941.0, 19700.0, 21781.0]}, "iteration_timing_avg": 0.4730702941176471} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1.json b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1.json new file mode 100644 index 0000000000..8b1d0bcd77 --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.48681, 10.49275, 10.48836, 10.51349, 10.49399, 10.47549, 10.41922, 10.28044, 10.14255, 9.94736]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [26212.0, 19433.0, 24101.0, 23509.0, 21539.0, 17889.0, 19123.0]}, "iteration_timing_avg": 1.6886158823529411} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer-overlap-grad-reduce-param-gather.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer-overlap-grad-reduce-param-gather.json deleted file mode 100644 index 520501ff0e..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer-overlap-grad-reduce-param-gather.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83721, 10.87648, 10.85327, 10.79634, 10.67874, 10.60491, 10.12636, 10.22252, 10.13977, 9.82346]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1640.0, 1873.0, 1930.0, 1910.0, 1936.0, 1807.0, 1630.0, 1962.0, 2317.0, 2314.0]}, "iteration_timing_avg": 0.07326058823529409} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp2.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp2.json deleted file mode 100644 index 4090dd6feb..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp2.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83273, 10.86849, 10.89112, 10.80713, 10.68491, 10.61253, 10.09319, 10.21393, 10.13869, 9.80629]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1551.0, 1809.0, 1799.0, 1862.0, 1872.0, 1643.0, 1596.0, 1880.0, 2378.0, 2177.0]}, "iteration_timing_avg": 0.09853} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp4.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp4.json deleted file mode 100644 index 6dc5093bf6..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp1-pp4.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, 
"values": [10.79373, 10.86651, 10.89091, 10.78164, 10.66101, 10.58089, 10.08413, 10.19034, 10.13461, 9.81138]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1864.0, 1826.0, 1965.0, 1861.0, 1605.0, 1609.0, 1931.0, 2343.0, 2347.0]}, "iteration_timing_avg": 0.12984617647058824} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp2-pp2-te-2experts.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp2-pp2-te-2experts.json deleted file mode 100644 index 914b305c60..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp2-pp2-te-2experts.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79674, 10.84347, 10.81547, 10.76604, 10.65416, 10.56322, 10.08548, 10.21617, 10.1139, 9.8322]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2912.0, 3584.0, 3414.0, 3357.0, 3298.0, 3173.0, 2816.0, 3211.0, 3817.0, 3728.0]}, "iteration_timing_avg": 0.2900244117647059} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp2-pp2-te-4experts2parallel.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp2-pp2-te-4experts2parallel.json deleted file mode 100644 index afa120eb5f..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp2-pp2-te-4experts2parallel.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82194, 10.86461, 10.85816, 10.80566, 10.71345, 10.63249, 10.15902, 10.27938, 10.18516, 9.88286]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [7126.0, 8754.0, 8834.0, 8614.0, 7854.0, 8202.0, 7007.0, 8641.0, 9234.0, 9655.0]}, "iteration_timing_avg": 0.291154705882353} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp4-pp1.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp4-pp1.json deleted file mode 100644 index c5bc9f8b8c..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-mcore-tp4-pp1.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86174, 10.88685, 10.87663, 10.83061, 10.71359, 10.60783, 10.13039, 10.23076, 10.15871, 9.83396]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1747.0, 2204.0, 2061.0, 2108.0, 2163.0, 1914.0, 1682.0, 2267.0, 2474.0, 2569.0]}, "iteration_timing_avg": 0.21648441176470584} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp1-dist-optimizer-overlap-grad-reduce.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp1-dist-optimizer-overlap-grad-reduce.json deleted file mode 100644 index e669216b21..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp1-dist-optimizer-overlap-grad-reduce.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87174, 10.89545, 10.88847, 10.88533, 10.893, 10.84895, 10.70048, 10.64124, 10.53839, 10.3107]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1238.0, 1318.0, 1648.0, 1423.0, 
1535.0, 1350.0, 1271.0]}, "iteration_timing_avg": 0.0613035294117647} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp1-overlap-grad-reduce.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp1-overlap-grad-reduce.json deleted file mode 100644 index 7a4b5eb201..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp1-overlap-grad-reduce.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87174, 10.89545, 10.88847, 10.88533, 10.89299, 10.84895, 10.70048, 10.64124, 10.53839, 10.3107]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1317.0, 1498.0, 1568.0, 1417.0, 1386.0, 1236.0, 1196.0]}, "iteration_timing_avg": 0.07787176470588235} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp2.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp2.json deleted file mode 100644 index 5c669dbe2e..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp2.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84008, 10.89053, 10.90905, 10.87934, 10.86562, 10.83752, 10.64582, 10.62396, 10.53554, 10.25187]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2078.0, 2328.0, 2420.0, 2256.0, 2180.0, 2078.0, 2313.0]}, "iteration_timing_avg": 0.0974135294117647} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp4-overlap-grad-reduce.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp4-overlap-grad-reduce.json deleted file mode 100644 index c9ea06c056..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp4-overlap-grad-reduce.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81244, 10.87437, 10.90132, 10.84946, 10.84826, 10.81873, 10.61811, 10.61052, 10.52823, 10.22962]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2365.0, 2535.0, 2707.0, 2210.0, 2411.0, 2781.0, 2593.0]}, "iteration_timing_avg": 0.12205411764705883} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp4-vp1-overlap-grad-reduce.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp4-vp1-overlap-grad-reduce.json deleted file mode 100644 index 302e8172b4..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp4-vp1-overlap-grad-reduce.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.7951, 10.84939, 10.87411, 10.83459, 10.82865, 10.78676, 10.56492, 10.57063, 10.48545, 10.19548]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2561.0, 2771.0, 2141.0, 2656.0, 2737.0, 2472.0, 2991.0]}, "iteration_timing_avg": 0.12153911764705884} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp4.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp4.json deleted file mode 100644 index c86c48a045..0000000000 --- 
a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp1-pp4.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81244, 10.87437, 10.90132, 10.84946, 10.84826, 10.81873, 10.61811, 10.61052, 10.52823, 10.22962]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2365.0, 2535.0, 2707.0, 2210.0, 2411.0, 2781.0, 2593.0]}, "iteration_timing_avg": 0.12152588235294119} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp2-pp2-4experts.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp2-pp2-4experts.json deleted file mode 100644 index e5f0580685..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp2-pp2-4experts.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80342, 10.85864, 10.86188, 10.83807, 10.83268, 10.80489, 10.60813, 10.61632, 10.53669, 10.27118]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [8302.0, 7865.0, 7784.0, 8919.0, 9202.0, 9007.0, 9274.0]}, "iteration_timing_avg": 0.37709088235294125} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp2-pp2-overlap-grad-reduce.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp2-pp2-overlap-grad-reduce.json deleted file mode 100644 index 4f8e3aad92..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp2-pp2-overlap-grad-reduce.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85543, 10.89355, 10.87608, 10.87365, 10.88042, 10.84182, 10.67177, 10.62853, 10.52511, 10.2523]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2472.0, 2462.0, 2480.0, 2235.0, 2268.0, 2619.0, 2429.0]}, "iteration_timing_avg": 0.14843735294117646} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp4-pp1-overlap-grad-reduce.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp4-pp1-overlap-grad-reduce.json deleted file mode 100644 index 77b92ef7c0..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp4-pp1-overlap-grad-reduce.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85921, 10.8797, 10.87381, 10.88658, 10.88912, 10.84827, 10.6857, 10.62946, 10.54289, 10.26918]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2288.0, 2230.0, 2475.0, 1997.0, 2184.0, 2468.0, 2225.0]}, "iteration_timing_avg": 0.20612647058823536} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp4-pp1.json b/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp4-pp1.json deleted file mode 100644 index 10cbf8d244..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-nightly-dgx-a100-1n8g-tp4-pp1.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85921, 10.8797, 10.87381, 10.88658, 10.88912, 10.84827, 10.6857, 10.62946, 10.54289, 10.26918]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2288.0, 2230.0, 2475.0, 1997.0, 2184.0, 2468.0, 
2225.0]}, "iteration_timing_avg": 0.20541176470588232} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer-no-mmap-bin-files.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer-no-mmap-bin-files.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_no_mmap_bin_files.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-dist-optimizer.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_no_mmap_bin_files.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-uniform-full-recompute.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp1_uniform_full_recompute.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp1-uniform-full-recompute.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp1_uniform_full_recompute.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp2_rope_embeddings.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp2_rope_embeddings.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings-interleaved-no-fusion.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp2_rope_embeddings_interleaved_no_fusion.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp2-rope-embeddings-interleaved-no-fusion.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp2_rope_embeddings_interleaved_no_fusion.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-disable-bias-linear.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_disable_bias_linear.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-disable-bias-linear.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_disable_bias_linear.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-sequence-parallel.json 
b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_sequence_parallel.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-sequence-parallel.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_sequence_parallel.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-swiglu.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_swiglu.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-swiglu.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_swiglu.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-untie-embeddings-and-outputs.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_untie_embeddings_and_outputs.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-untie-embeddings-and-outputs.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_untie_embeddings_and_outputs.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-calculate-per-token-loss.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-calculate-per-token-loss.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_calculate_per_token_loss.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_calculate_per_token_loss.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-decoupled-lr.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_decoupled_lr.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-decoupled-lr.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_decoupled_lr.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce.json rename to 
tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-param-gather.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-param-gather.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-untied.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp1-pp4-vp1-dist-optimizer-overlap-grad-reduce-untied.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-cp2-nondeterministic.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_cp2_nondeterministic.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-cp2-nondeterministic.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_cp2_nondeterministic.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-dist-optimizer.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel_dist_optimizer.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-dist-optimizer.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel_dist_optimizer.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-groupedgemm.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel_groupedGEMM.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-groupedgemm.json rename to 
tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel_groupedGEMM.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-overlap-grad-reduce-param-gather-groupedgemm.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-overlap-grad-reduce-param-gather-groupedgemm.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel_top2router.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp1-te-8experts2parallel-top2router.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp1_te_8experts2parallel_top2router.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-create-attention-mask-in-dataloader.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-create-attention-mask-in-dataloader.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-cp2-nondeterministic.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_cp2_nondeterministic.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-cp2-nondeterministic.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_cp2_nondeterministic.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-mmap-bin-files.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_no_create_attention_mask_in_dataloader.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-no-mmap-bin-files.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_no_create_attention_mask_in_dataloader.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_no_mmap_bin_files.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_no_mmap_bin_files.json diff --git 
a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp4_pp1_dist_optimizer_overlap_grad_reduce.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp4_pp1_dist_optimizer_overlap_grad_reduce.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce-param-gather.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-dist-optimizer-overlap-grad-reduce-param-gather.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-qk-layernorm-test-mode.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp4_pp1_qk_layernorm_test_mode.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp4-pp1-qk-layernorm-test-mode.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp4_pp1_qk_layernorm_test_mode.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-te-tp2-pp2.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_te_tp2_pp2.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-te-tp2-pp2.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_te_tp2_pp2.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp1.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_tp1_pp4_vp1.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp1-pp4-vp1.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_tp1_pp4_vp1.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_tp2_pp2.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-tp2-pp2.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_tp2_pp2.json diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-resume-dgx-a100-1n8g-tp1-pp2.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_resume_dgx_a100_1N8G_tp1_pp2.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-resume-dgx-a100-1n8g-tp1-pp2.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_resume_dgx_a100_1N8G_tp1_pp2.json diff --git 
a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json new file mode 100644 index 0000000000..3bbdd74d44 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.0958791176470588} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather.json new file mode 100644 index 0000000000..153f5b0129 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312, 9.8347, 9.61264, 9.67965, 9.68133, 9.60021, 9.06887, 9.46573, 9.06116, 9.32103, 9.51104]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0, 2686.0, 2671.0, 3014.0, 3152.0, 2960.0, 3015.0, 3735.0, 2675.0, 2947.0, 3414.0]}, "iteration_timing_avg": 0.08244119402985074} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json new file mode 100644 index 0000000000..8ade75c02d --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1566.0, 1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0]}, "iteration_timing_avg": 0.11905411764705882} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist.json new file mode 100644 index 0000000000..fa1ca531db --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584, 9.83013, 9.60653, 9.67621, 9.68788, 9.59862, 9.07653, 9.47156, 9.06787, 9.32985, 9.51568]}, 
"num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1566.0, 1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0, 2472.0, 2970.0, 3076.0, 3074.0, 3018.0, 2972.0, 3783.0, 2794.0, 2743.0, 3289.0]}, "iteration_timing_avg": 0.12010238805970147} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json new file mode 100644 index 0000000000..43fa279808 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0]}, "iteration_timing_avg": 0.1541691176470588} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist.json new file mode 100644 index 0000000000..2d211e0a60 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153, 9.83685, 9.60745, 9.68285, 9.6869, 9.60677, 9.07989, 9.47324, 9.07018, 9.33019, 9.51809]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0, 2540.0, 2588.0, 3110.0, 3059.0, 2924.0, 2894.0, 3694.0, 2720.0, 2635.0, 3456.0]}, "iteration_timing_avg": 0.150555671641791} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts.json new file mode 100644 index 0000000000..7878654e71 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.79589, 10.84021, 10.81376, 10.76508, 10.65703, 10.56193, 10.08837, 10.21303, 10.11641, 9.83404, 9.85697, 9.65534, 9.71837, 9.74563, 9.63824, 9.13952, 9.51114, 9.10678, 9.3932, 9.56085]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3018.0, 3528.0, 3496.0, 3388.0, 3149.0, 3337.0, 2811.0, 3403.0, 3728.0, 3648.0, 4218.0, 4359.0, 4468.0, 5080.0, 4575.0, 4964.0, 5755.0, 4852.0, 4092.0, 5592.0]}, "iteration_timing_avg": 0.33336671641791044} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_4experts2parallel.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_4experts2parallel.json new file mode 100644 index 0000000000..b07f0421d4 --- /dev/null +++ 
b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_4experts2parallel.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81916, 10.86702, 10.85724, 10.80665, 10.71115, 10.63679, 10.16197, 10.277, 10.18384, 9.88281, 9.89125, 9.67734, 9.74917, 9.75758, 9.65591, 9.15592, 9.52069, 9.11526, 9.4051, 9.56814]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [7138.0, 8525.0, 8821.0, 8718.0, 7682.0, 8227.0, 7158.0, 8514.0, 9143.0, 9624.0, 9298.0, 10386.0, 10352.0, 12164.0, 10941.0, 12318.0, 13902.0, 11709.0, 10898.0, 12956.0]}, "iteration_timing_avg": 0.33394373134328353} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts.json new file mode 100644 index 0000000000..1c130d9b60 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79589, 10.84021, 10.81376, 10.76508, 10.65703, 10.56193, 10.08837, 10.21303, 10.11641, 9.83404]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3018.0, 3528.0, 3496.0, 3388.0, 3149.0, 3337.0, 2811.0, 3403.0, 3728.0, 3648.0]}, "iteration_timing_avg": 0.33478764705882363} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_4experts2parallel.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_4experts2parallel.json new file mode 100644 index 0000000000..c77c0fd291 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_4experts2parallel.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81916, 10.86702, 10.85724, 10.80665, 10.71115, 10.63679, 10.16197, 10.277, 10.18384, 9.88281]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [7138.0, 8525.0, 8821.0, 8718.0, 7682.0, 8227.0, 7158.0, 8514.0, 9143.0, 9624.0]}, "iteration_timing_avg": 0.34508176470588225} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json new file mode 100644 index 0000000000..d939d5423d --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87769, 10.83107, 10.71638, 10.60596, 10.13128, 10.22754, 10.15911, 9.83462]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2155.0, 2021.0, 1955.0, 1968.0, 1742.0, 2198.0, 2491.0, 2606.0]}, "iteration_timing_avg": 0.27329441176470587} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json new file mode 100644 index 0000000000..2f9d91c0d6 --- /dev/null +++ 
b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86126, 10.88643, 10.87769, 10.83107, 10.71638, 10.60596, 10.13128, 10.22754, 10.15911, 9.83462, 9.85168, 9.62946, 9.69489, 9.71388, 9.61776, 9.09854, 9.48539, 9.07183, 9.3531, 9.52651]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1752.0, 2067.0, 2155.0, 2021.0, 1955.0, 1968.0, 1742.0, 2198.0, 2491.0, 2606.0, 2850.0, 2977.0, 3220.0, 3391.0, 3297.0, 3203.0, 4083.0, 3048.0, 2939.0, 3838.0]}, "iteration_timing_avg": 0.27828194029850745} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist.json new file mode 100644 index 0000000000..46cdac4505 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86126, 10.88643, 10.87769, 10.83107, 10.71638, 10.60596, 10.13128, 10.22754, 10.15911, 9.83462, 9.85168, 9.62946, 9.69489, 9.71388, 9.61776, 9.09854, 9.48539, 9.07183, 9.3531, 9.52651]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1752.0, 2067.0, 2155.0, 2021.0, 1955.0, 1968.0, 1742.0, 2198.0, 2491.0, 2606.0, 2850.0, 2977.0, 3220.0, 3391.0, 3297.0, 3203.0, 4083.0, 3048.0, 2939.0, 3838.0]}, "iteration_timing_avg": 0.2851294029850746} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce.json new file mode 100644 index 0000000000..69ca350fdd --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.8893, 10.84864, 10.6962, 10.63918, 10.5393, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1298.0, 1352.0, 1590.0, 1403.0, 1435.0, 1266.0, 1195.0]}, "iteration_timing_avg": 0.07655911764705883} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce.json new file mode 100644 index 0000000000..96b8036e95 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.88931, 10.84864, 10.6962, 10.63918, 10.53931, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1131.0, 1173.0, 1218.0, 1783.0, 1278.0, 1244.0, 1555.0]}, "iteration_timing_avg": 0.07975499999999999} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json new file 
mode 100644 index 0000000000..6c6d8e79fc --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0]}, "iteration_timing_avg": 0.10581941176470588} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch.json new file mode 100644 index 0000000000..d4a5cfb78e --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831, 10.20828, 9.96658, 9.97022, 9.92437, 9.79137, 9.26612, 9.61914, 9.19057, 9.46177, 9.62185]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0, 2732.0, 2678.0, 2452.0, 2879.0, 2572.0, 3456.0, 3237.0, 2990.0, 3067.0, 3173.0]}, "iteration_timing_avg": 0.10533134328358208} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json new file mode 100644 index 0000000000..0f5ad40c1c --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.1367805882352941} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce.json new file mode 100644 index 0000000000..b9816fbf8b --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.13371323529411766} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch.json new file mode 100644 index 0000000000..4cf16ef911 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 
10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087, 10.19557, 9.94382, 9.95175, 9.90538, 9.79357, 9.25904, 9.61568, 9.19187, 9.46047, 9.6229]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0, 3566.0, 3139.0, 3236.0, 3208.0, 3413.0, 3913.0, 3194.0, 3581.0, 3625.0, 4695.0]}, "iteration_timing_avg": 0.1320626865671642} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce.json new file mode 100644 index 0000000000..302a1524b4 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1333435294117647} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts.json new file mode 100644 index 0000000000..114dfb1e2a --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0]}, "iteration_timing_avg": 0.3671870588235294} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce.json new file mode 100644 index 0000000000..b807a2e979 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.1660379411764706} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts.json new file mode 100644 index 0000000000..546ccfca5e --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186, 10.24338, 10.02058, 10.03017, 9.99471, 9.84885, 9.34867, 9.67263, 9.2457, 9.53365, 9.67548]}, "num-zeros": {"start_step": 
0, "end_step": 84, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0, 7960.0, 7684.0, 9743.0, 8727.0, 9382.0, 10992.0, 11177.0, 11270.0, 13404.0, 11533.0]}, "iteration_timing_avg": 0.3735462686567164} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce.json new file mode 100644 index 0000000000..c0a53bdb6c --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708, 10.19741, 9.9562, 9.96369, 9.91398, 9.79604, 9.2686, 9.61975, 9.19501, 9.47332, 9.62216]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0, 3656.0, 3275.0, 3203.0, 3297.0, 3364.0, 3789.0, 3277.0, 3660.0, 3733.0, 4815.0]}, "iteration_timing_avg": 0.1628459701492537} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json new file mode 100644 index 0000000000..18457f230d --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23144205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce.json new file mode 100644 index 0000000000..7b39f86c32 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23131970588235293} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch.json new file mode 100644 index 0000000000..47198f9ec6 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525, 10.21403, 9.9801, 9.96977, 9.93973, 9.81158, 9.28667, 9.63194, 9.19732, 9.48341, 9.62985]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, 
"values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0, 3451.0, 3205.0, 2940.0, 3143.0, 3310.0, 3884.0, 3232.0, 3491.0, 3751.0, 5022.0]}, "iteration_timing_avg": 0.22914074626865674} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/multimodal-llava-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1.json b/tests/functional_tests/test_results/jet/multimodal_llava_merge-request_dgx_a100_1N8G_mcore_te_tp1_pp1.json similarity index 100% rename from tests/functional_tests/test_results/jet/multimodal-llava-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1.json rename to tests/functional_tests/test_results/jet/multimodal_llava_merge-request_dgx_a100_1N8G_mcore_te_tp1_pp1.json diff --git a/tests/functional_tests/test_results/jet/t5-220m-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1-calculate-per-token-loss.json b/tests/functional_tests/test_results/jet/t5_220m_merge-request_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_calculate_per_token_loss.json similarity index 100% rename from tests/functional_tests/test_results/jet/t5-220m-merge-request-dgx-a100-1n8g-mcore-te-tp1-pp1-vp1-calculate-per-token-loss.json rename to tests/functional_tests/test_results/jet/t5_220m_merge-request_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_calculate_per_token_loss.json diff --git a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh index 4acff199dc..e812e5a612 100755 --- a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh +++ b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh @@ -96,6 +96,9 @@ torch_run_cmd="torchrun $DISTRIBUTED_ARGS \ --${TRAINING_DTYPE}" if [[ "${TRAINING_DTYPE}" == "fp16" ]]; then + # Both NVTE_APPLY_QK_LAYER_SCALING and --apply-query-key-layer-scaling must be passed + # to enable feature and be backward compatible with TE<0.11 + export NVTE_APPLY_QK_LAYER_SCALING=1 torch_run_cmd+=" --apply-query-key-layer-scaling" # NVTE_APPLY_QK_LAYER_SCALING=1 is required if using: # 1. --apply-query-key-layer-scaling @@ -117,7 +120,7 @@ echo "$command" > $SCRIPTS_DIR/pretrain_bert_distributed_command.sh eval $command echo "Saving test results to $TENSORBOARD_DIR" -python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ +PYTHONPATH=$PWD python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ tee ${TENSORBOARD_DIR}/results.json if [[ $SKIP_PYTEST != 1 ]]; then diff --git a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh index aa95d8d65a..1fceb0c074 100755 --- a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh +++ b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh @@ -79,7 +79,7 @@ fi if [[ -n "$CKPT_FORMAT" ]] && [[ "$CKPT_FORMAT" != 'torch' ]]; then echo "Using distributed checkpoint format $CKPT_FORMAT..." 
[[ "$CKPT_FORMAT" == 'zarr' ]] && command="$command pip install zarr tensorstore==0.1.45;" - ADDITIONAL_PARAMS+=" --use-dist-ckpt --dist-ckpt-format $CKPT_FORMAT" + ADDITIONAL_PARAMS+=" --use-dist-ckpt --dist-ckpt-format $CKPT_FORMAT --use-mcore-models" fi set +x # Runs the "345M" parameter model @@ -180,7 +180,7 @@ echo "$command" > $SCRIPTS_DIR/pretrain_gpt3_distributed_command.sh eval $command echo "Saving test results to $TENSORBOARD_DIR" -python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ +PYTHONPATH=$PWD python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ tee ${TENSORBOARD_DIR}/results.json if [[ $SKIP_PYTEST != 1 ]]; then diff --git a/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh b/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh index fa536f97ed..1315a23d01 100755 --- a/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh +++ b/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh @@ -173,7 +173,7 @@ echo "$command" > $SCRIPTS_DIR/pretrain_llava_distributed_command.sh eval $command echo "Saving test results to $TENSORBOARD_DIR" -python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ +PYTHONPATH=$PWD python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ tee ${TENSORBOARD_DIR}/results.json if [[ $SKIP_PYTEST != 1 ]]; then diff --git a/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh b/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh index eccbe00200..1d59228531 100755 --- a/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh +++ b/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh @@ -149,7 +149,7 @@ echo "$command" > $SCRIPTS_DIR/pretrain_retro_distributed_command.sh eval $command echo "Saving test results to $TENSORBOARD_DIR" -python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ +PYTHONPATH=$PWD python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ tee ${TENSORBOARD_DIR}/results.json if [[ $SKIP_PYTEST != 1 ]]; then diff --git a/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh b/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh index 7ad640bb77..9cf3904d9b 100755 --- a/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh +++ b/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh @@ -137,7 +137,7 @@ echo "$command" > $SCRIPTS_DIR/pretrain_t5_distributed_command.sh eval $command echo "Saving test results to $TENSORBOARD_DIR" -python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ +PYTHONPATH=$PWD python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ tee ${TENSORBOARD_DIR}/results.json if [[ $SKIP_PYTEST != 1 ]]; then From 5c97996ce835a3a767a13b9a527febee861334a8 Mon Sep 17 00:00:00 2001 From: Jon Barker Date: Fri, 31 May 2024 10:04:18 -0700 Subject: [PATCH 60/92] Add copyright to combine_state_dicts.py --- 
examples/multimodal/combine_state_dicts.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/multimodal/combine_state_dicts.py b/examples/multimodal/combine_state_dicts.py index 928be4782d..a01512ae12 100644 --- a/examples/multimodal/combine_state_dicts.py +++ b/examples/multimodal/combine_state_dicts.py @@ -1,3 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + import argparse import os import sys From 3ee489d9eabfc27e994d1a0c01b5d22e9e5040b8 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Fri, 31 May 2024 14:59:52 -0700 Subject: [PATCH 61/92] Enable virtual pipelining and P2P communication overlap at PP=2 --- megatron/core/parallel_state.py | 4 +- .../pipeline_parallel/p2p_communication.py | 42 +++++++++++++++---- megatron/training/arguments.py | 12 ++++-- .../pipeline_parallel/test_schedules.py | 3 ++ 4 files changed, 48 insertions(+), 13 deletions(-) diff --git a/megatron/core/parallel_state.py b/megatron/core/parallel_state.py index fdbff2c311..3b74e95b83 100644 --- a/megatron/core/parallel_state.py +++ b/megatron/core/parallel_state.py @@ -444,9 +444,9 @@ def initialize_model_parallel( num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size if virtual_pipeline_model_parallel_size is not None: - if not pipeline_model_parallel_size > 2: + if not pipeline_model_parallel_size > 1: raise RuntimeError( - "pipeline-model-parallel size should be greater than 2 with interleaved schedule" + "pipeline-model-parallel size should be greater than 1 with interleaved schedule" ) global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE diff --git a/megatron/core/pipeline_parallel/p2p_communication.py b/megatron/core/pipeline_parallel/p2p_communication.py index e5e7e5ab16..a95ed6398e 100644 --- a/megatron/core/pipeline_parallel/p2p_communication.py +++ b/megatron/core/pipeline_parallel/p2p_communication.py @@ -13,6 +13,7 @@ get_pipeline_model_parallel_next_rank, get_pipeline_model_parallel_prev_rank, get_pipeline_model_parallel_rank, + get_pipeline_model_parallel_world_size, ) # Types @@ -175,53 +176,78 @@ def _p2p_ops( ): reqs = [] rank = get_pipeline_model_parallel_rank() + even_send_odd_recv_group = group + if get_pipeline_model_parallel_world_size() == 2: + # Use the global process group for one of the two p2p communications + # to allow the overlap of the independent communications. + # Using the global process group is compatible because the pipeline-parallel + # communications set the source and destination by global rank. 
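+            # (With a two-stage pipeline the next and previous pipeline ranks are the
+            # same peer, so both transfers would otherwise share a single two-rank
+            # communicator and could end up serialized rather than overlapped.)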
+ even_recv_odd_send_group = torch.distributed.group.WORLD + else: + even_recv_odd_send_group = group if get_pipeline_model_parallel_rank() % 2 == 0: if tensor_send_next is not None: send_next_req = torch.distributed.isend( - tensor=tensor_send_next, dst=get_pipeline_model_parallel_next_rank(), group=group, + tensor=tensor_send_next, + dst=get_pipeline_model_parallel_next_rank(), + group=even_send_odd_recv_group, ) reqs.append(send_next_req) if tensor_recv_prev is not None: recv_prev_req = torch.distributed.irecv( - tensor=tensor_recv_prev, src=get_pipeline_model_parallel_prev_rank(), group=group, + tensor=tensor_recv_prev, + src=get_pipeline_model_parallel_prev_rank(), + group=even_recv_odd_send_group, ) reqs.append(recv_prev_req) if tensor_send_prev is not None: send_prev_req = torch.distributed.isend( - tensor=tensor_send_prev, dst=get_pipeline_model_parallel_prev_rank(), group=group, + tensor=tensor_send_prev, + dst=get_pipeline_model_parallel_prev_rank(), + group=even_send_odd_recv_group, ) reqs.append(send_prev_req) if tensor_recv_next is not None: recv_next_req = torch.distributed.irecv( - tensor=tensor_recv_next, src=get_pipeline_model_parallel_next_rank(), group=group, + tensor=tensor_recv_next, + src=get_pipeline_model_parallel_next_rank(), + group=even_recv_odd_send_group, ) reqs.append(recv_next_req) else: if tensor_recv_prev is not None: recv_prev_req = torch.distributed.irecv( - tensor=tensor_recv_prev, src=get_pipeline_model_parallel_prev_rank(), group=group, + tensor=tensor_recv_prev, + src=get_pipeline_model_parallel_prev_rank(), + group=even_send_odd_recv_group, ) reqs.append(recv_prev_req) if tensor_send_next is not None: send_next_req = torch.distributed.isend( - tensor=tensor_send_next, dst=get_pipeline_model_parallel_next_rank(), group=group, + tensor=tensor_send_next, + dst=get_pipeline_model_parallel_next_rank(), + group=even_recv_odd_send_group, ) reqs.append(send_next_req) if tensor_recv_next is not None: recv_next_req = torch.distributed.irecv( - tensor=tensor_recv_next, src=get_pipeline_model_parallel_next_rank(), group=group, + tensor=tensor_recv_next, + src=get_pipeline_model_parallel_next_rank(), + group=even_send_odd_recv_group, ) reqs.append(recv_next_req) if tensor_send_prev is not None: send_prev_req = torch.distributed.isend( - tensor=tensor_send_prev, dst=get_pipeline_model_parallel_prev_rank(), group=group, + tensor=tensor_send_prev, + dst=get_pipeline_model_parallel_prev_rank(), + group=even_recv_odd_send_group, ) reqs.append(send_prev_req) return reqs diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 6b038669f7..327f7b82e3 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -241,9 +241,15 @@ def validate_args(args, defaults={}): args.global_batch_size), flush=True) assert args.global_batch_size > 0 if args.num_layers_per_virtual_pipeline_stage is not None: - assert args.pipeline_model_parallel_size > 2, \ - 'pipeline-model-parallel size should be greater than 2 with ' \ - 'interleaved schedule' + if args.overlap_p2p_comm: + assert args.pipeline_model_parallel_size > 1, \ + 'when interleaved schedule is used, pipeline-model-parallel size '\ + 'should be greater than 1' + else: + assert args.pipeline_model_parallel_size > 2, \ + 'when interleaved schedule is used and p2p communication overlap is disabled, '\ + 'pipeline-model-parallel size should be greater than 2 to avoid having multiple '\ + 'p2p sends and recvs between same 2 ranks per communication batch' assert args.num_layers % 
args.transformer_pipeline_model_parallel_size == 0, \ 'number of layers should be divisible by the pipeline parallel size' num_layers_per_pipeline_stage = args.num_layers // args.transformer_pipeline_model_parallel_size diff --git a/tests/unit_tests/pipeline_parallel/test_schedules.py b/tests/unit_tests/pipeline_parallel/test_schedules.py index 68bd8041e5..02bdd2882b 100644 --- a/tests/unit_tests/pipeline_parallel/test_schedules.py +++ b/tests/unit_tests/pipeline_parallel/test_schedules.py @@ -17,6 +17,9 @@ def test_get_forward_backward_func(): Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4, virtual_pipeline_model_parallel_size=2) assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_with_interleaving) Utils.destroy_model_parallel() + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=4) + assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_with_interleaving) + Utils.destroy_model_parallel() def test_deallocate_output_tensor(): out = torch.tensor([[1, 2, 3], [4, 5, 6]]) From 6c7bec698fbf53e88093868838035767af30f749 Mon Sep 17 00:00:00 2001 From: John Kamalu Date: Tue, 4 Jun 2024 09:31:09 -0700 Subject: [PATCH 62/92] fix simple train loop and mock dataset --- examples/run_simple_mcore_train_loop.py | 45 ++++++++++----- megatron/core/QuickStart.md | 55 +++++++++++-------- .../blended_megatron_dataset_config.py | 14 +++-- megatron/core/datasets/gpt_dataset.py | 27 --------- 4 files changed, 75 insertions(+), 66 deletions(-) diff --git a/examples/run_simple_mcore_train_loop.py b/examples/run_simple_mcore_train_loop.py index ad0c7e750b..d5ffffeeaf 100644 --- a/examples/run_simple_mcore_train_loop.py +++ b/examples/run_simple_mcore_train_loop.py @@ -12,10 +12,16 @@ from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec -from megatron.core.datasets.utils import Split +from megatron.core.datasets.utils import compile_helpers +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset +from megatron.training.tokenizer.tokenizer import _NullTokenizer -def initialize_distributed(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1): + +_SEQUENCE_LENGTH = 64 + + +def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1): parallel_state.destroy_model_parallel() # Torch setup for distributed training @@ -35,31 +41,43 @@ def model_provider(): hidden_size=12, num_attention_heads=4, use_cpu_initialization=True, - pipeline_dtype=torch.float32) + pipeline_dtype=torch.float32, + ) gpt_model = GPTModel( config=transformer_config, transformer_layer_spec=get_gpt_layer_local_spec(), vocab_size=100, - max_sequence_length=64) + max_sequence_length=_SEQUENCE_LENGTH, + ) return gpt_model def get_train_data_iterator(): + if torch.distributed.is_available() and torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + compile_helpers() + torch.distributed.barrier() + else: + compile_helpers() + config = GPTDatasetConfig( - random_seed = 0, - sequence_length = 64, - blend=None, + random_seed=0, + sequence_length=_SEQUENCE_LENGTH, reset_position_ids=False, reset_attention_mask=False, eod_mask_loss=False, - 
tokenizer="dummy") + tokenizer=_NullTokenizer(vocab_size=_SEQUENCE_LENGTH), + ) - training_data= MockGPTDataset(Split.train, config) + datasets = BlendedMegatronDatasetBuilder( + MockGPTDataset, [1000, None, None], lambda: True, config + ).build() - train_dataloader = DataLoader(training_data, batch_size=8, shuffle=True) + train_dataloader = DataLoader(datasets[0], batch_size=8, shuffle=True) train_iterator = iter(train_dataloader) + return train_iterator def forward_step_func(data_iterator, model): @@ -119,9 +137,9 @@ def load_distributed_checkpoint(checkpoint_path, gpt_model): data_iterator=train_iterator, model=gpt_model, num_microbatches=1, - seq_length=64, + seq_length=_SEQUENCE_LENGTH, micro_batch_size=8, - decoder_seq_length=64, + decoder_seq_length=_SEQUENCE_LENGTH, forward_only=False) optim.step() @@ -136,4 +154,5 @@ def load_distributed_checkpoint(checkpoint_path, gpt_model): # Loading the model gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path) gpt_model.to(device) - print('Successfully loaded the model') + print('Successfully loaded the model') + diff --git a/megatron/core/QuickStart.md b/megatron/core/QuickStart.md index eb092d1e3c..ed8fbfed60 100644 --- a/megatron/core/QuickStart.md +++ b/megatron/core/QuickStart.md @@ -6,15 +6,13 @@ The following guide will show you how to quickly get started with Megatron Core. * We will save the model using the distributed checkpointing format * We will load the model saved above. -*NOTE: The following has been testing for megatron core version 0.5 and NGC Pytorch Container version 24.02 +*NOTE: The following has been testing for megatron core version 0.8.0 and NGC Pytorch Container version 24.02 ### Environment Setup ``` -docker run --ipc=host --shm-size=512m --gpus all -it nvcr.io/nvidia/pytorch:24.02-py3 +docker run --ipc=host --shm-size=512m --gpus 2 -it nvcr.io/nvidia/pytorch:24.02-py3 -pip install megatron_core -pip install tensorstore==0.1.45 -pip install zarr +git clone https://github.com/NVIDIA/Megatron-LM.git && cd Megatron-LM ```
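As an optional aside to the Environment Setup above (not prescribed by the patch itself), it can be worth confirming inside the container that both GPUs are visible before continuing:

```
nvidia-smi
python -c "import torch; print(torch.cuda.device_count())"
```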
@@ -80,26 +78,43 @@ The following shows you how you can quickly get started with a mock dataset util To find more information about megatron core data pipeline please refer to [this](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/datasets/readme.md?ref_type=heads) ``` +import torch from torch.utils.data import DataLoader -from megatron.core.datasets.utils import Split + +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset +from megatron.training.tokenizer.tokenizer import _NullTokenizer +from megatron.core.datasets.utils import compile_helpers + +_SEQUENCE_LENGTH = 64 def get_train_data_iterator(): + if torch.distributed.is_available() and torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + compile_helpers() + torch.distributed.barrier() + else: + compile_helpers() + config = GPTDatasetConfig( - random_seed=0, - sequence_length=64, - blend=None, - reset_position_ids=False, - reset_attention_mask=False, - eod_mask_loss=False, - tokenizer="dummy") + random_seed=0, + sequence_length=_SEQUENCE_LENGTH, + reset_position_ids=False, + reset_attention_mask=False, + eod_mask_loss=False, + tokenizer=_NullTokenizer(vocab_size=_SEQUENCE_LENGTH), + ) - training_data= MockGPTDataset(Split.train, config) + datasets = BlendedMegatronDatasetBuilder( + MockGPTDataset, [1000, None, None], lambda: True, config + ).build() - train_dataloader = DataLoader(training_data, batch_size=8, shuffle=True) + train_dataloader = DataLoader(datasets[0], batch_size=8, shuffle=True) train_iterator = iter(train_dataloader) + return train_iterator + ```
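As a brief, optional sanity check on the data pipeline above (not part of the patch), one can pull a single batch from the iterator and print its shapes; the key names ('tokens', 'labels', 'loss_mask', and so on) follow the GPT dataset convention and may vary across Megatron-Core versions:

```python
# Minimal sketch: inspect one batch from the mock dataset iterator defined above.
# Assumes get_train_data_iterator() from the preceding snippet is already defined.
train_iterator = get_train_data_iterator()
batch = next(train_iterator)
for name, value in batch.items():
    # Each entry is batched along the first dimension (batch_size=8 above).
    print(name, tuple(value.shape))
```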
@@ -138,8 +153,6 @@ def forward_step_func(data_iterator, model): **STEP 5 - Load and Save Distributed Checkpoint** Megatron core uses distributed checkpoint for loading and saving model. This gives you the flexiblity to convert model from one model parallel setting to another when you load a model (i.e A model trained with tensor parallel size 2, can now be loaded as tensor model parallel size 4 etc.) -*NOTE: Make sure you have zarr and tensorstore pip package installed as shown in the environment setup* - ```python from megatron.core import dist_checkpointing @@ -157,6 +170,7 @@ def load_distributed_checkpoint(checkpoint_path, gpt_model): **STEP 6 - Main Function** The following is the main function that needs to go into your script. + ```python from pathlib import Path from torch.optim import Adam @@ -206,13 +220,10 @@ if __name__ == "__main__":
**STEP 7 - Running the full example** -All the above steps are put to gether in a [run_simple_mcore_train_loop.py](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/run_simple_mcore_train_loop.py) script in examples folder in megatron . You can run it as follows +All the above steps are put together in a [run_simple_mcore_train_loop.py](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/run_simple_mcore_train_loop.py) script in the examples folder of Megatron. You can run it as follows after completing all steps in the Environment Setup section. ``` -git clone https://github.com/NVIDIA/Megatron-LM.git -cd Megatron-LM/examples -NUM_GPUS=2 -torchrun --nproc-per-node $NUM_GPUS run_simple_mcore_train_loop.py +PYTHONPATH=$PYTHON_PATH:./megatron torchrun --nproc-per-node 2 examples/run_simple_mcore_train_loop.py ```
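A note on the launch command above: if you prefer to put the repository root on `PYTHONPATH` explicitly (mirroring the `PYTHONPATH=$PWD` pattern used by the functional test scripts earlier in this series), an equivalent invocation from the root of the Megatron-LM checkout is:

```
PYTHONPATH=$PWD:$PYTHONPATH torchrun --nproc-per-node 2 examples/run_simple_mcore_train_loop.py
```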
diff --git a/megatron/core/datasets/blended_megatron_dataset_config.py b/megatron/core/datasets/blended_megatron_dataset_config.py index a4dd1b46d6..10cd5909b9 100644 --- a/megatron/core/datasets/blended_megatron_dataset_config.py +++ b/megatron/core/datasets/blended_megatron_dataset_config.py @@ -84,14 +84,11 @@ def __post_init__(self) -> None: self.blend_per_split[split.value][1] ), "blend per split prefixes and weights must be equal in number" else: - assert self.split is not None, "split must be provided in absence of blend_per_split" - split_vector = parse_and_normalize_split(self.split) - self.split_matrix = convert_split_vector_to_split_matrix(split_vector) - log_single_rank(logger, logging.INFO, f"Let split_matrix = {self.split_matrix}") if self.blend is not None: assert self.blend[1] is None or len(self.blend[0]) == len( self.blend[1] ), "blend prefixes and weights must be equal in number" + assert self.split is not None, "split must be provided when blend is not None" else: self.mock = True log_single_rank( @@ -99,6 +96,15 @@ def __post_init__(self) -> None: logging.INFO, f"Let mock = True, as both blend and blend_per_split are None", ) + self.split = "1,1,1" + log_single_rank( + logger, + logging.INFO, + f"Let split = {self.split}, an arbitrarily even split, as mock is True", + ) + split_vector = parse_and_normalize_split(self.split) + self.split_matrix = convert_split_vector_to_split_matrix(split_vector) + log_single_rank(logger, logging.INFO, f"Let split_matrix = {self.split_matrix}") def parse_and_normalize_split(split: str) -> List[float]: diff --git a/megatron/core/datasets/gpt_dataset.py b/megatron/core/datasets/gpt_dataset.py index b8ce1b0fc7..9ebb9de771 100644 --- a/megatron/core/datasets/gpt_dataset.py +++ b/megatron/core/datasets/gpt_dataset.py @@ -728,9 +728,6 @@ def __init__( ) -> None: assert config.mock - if num_samples is None: - num_samples = len(indices) - super().__init__(dataset, dataset_path, indices, num_samples, index_split, config) @staticmethod @@ -760,27 +757,3 @@ def build_low_level_dataset( MockGPTLowLevelDataset: The underlying MockGPTLowLevelDataset """ return MockGPTLowLevelDataset(config.tokenizer) - - def __len__(self) -> int: - """Abstract method implementation - - Returns: - int: The length of the dataset - """ - return self.num_samples - - def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: - """Abstract method implementation - - Args: - idx (int): The integer seed for mock data generation - - Returns: - Dict[str, numpy.ndarray]: The mock sample information wrapped in a dictionary - """ - if idx is not None and idx >= self.num_samples: - raise IndexError( - f"The index {idx} exceeds the available number of samples ({self.num_samples})" - ) - - return super().__getitem__(idx) From a4b31f2239dbcb9b91f9fd4408cdd8dc7640b323 Mon Sep 17 00:00:00 2001 From: Deepak Narayanan Date: Tue, 4 Jun 2024 13:58:08 -0700 Subject: [PATCH 63/92] Add option to average gradients directly in data-parallel collective --- megatron/core/datasets/blended_dataset.py | 3 +- .../blended_megatron_dataset_builder.py | 3 +- megatron/core/datasets/gpt_dataset.py | 3 +- megatron/core/datasets/indexed_dataset.py | 2 +- megatron/core/datasets/masked_dataset.py | 3 +- .../retro/query/multi_split_gpt_dataset.py | 3 +- megatron/core/datasets/retro/utils.py | 2 +- megatron/core/datasets/utils.py | 23 +----- .../distributed/distributed_data_parallel.py | 71 ++++++++++++------- .../distributed_data_parallel_config.py | 12 ++-- .../core/distributed/param_and_grad_buffer.py | 42 
++++++----- megatron/core/optimizer/__init__.py | 8 +-- megatron/core/utils.py | 40 ++++++++++- megatron/training/arguments.py | 2 + megatron/training/training.py | 5 +- .../functional_tests/jet_recipes/MR-gpt.yaml | 1 + ...ore-tp2-pp2-ddp-average-in-collective.json | 1 + 17 files changed, 143 insertions(+), 81 deletions(-) create mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-ddp-average-in-collective.json diff --git a/megatron/core/datasets/blended_dataset.py b/megatron/core/datasets/blended_dataset.py index a981cb32da..5fe71514cb 100644 --- a/megatron/core/datasets/blended_dataset.py +++ b/megatron/core/datasets/blended_dataset.py @@ -13,7 +13,8 @@ from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig from megatron.core.datasets.megatron_dataset import MegatronDataset -from megatron.core.datasets.utils import log_single_rank, normalize +from megatron.core.datasets.utils import normalize +from megatron.core.utils import log_single_rank logger = logging.getLogger(__name__) diff --git a/megatron/core/datasets/blended_megatron_dataset_builder.py b/megatron/core/datasets/blended_megatron_dataset_builder.py index 7a6187c7c1..23dd7eef84 100644 --- a/megatron/core/datasets/blended_megatron_dataset_builder.py +++ b/megatron/core/datasets/blended_megatron_dataset_builder.py @@ -11,8 +11,9 @@ from megatron.core.datasets.blended_dataset import BlendedDataset from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset -from megatron.core.datasets.utils import Split, log_single_rank, normalize +from megatron.core.datasets.utils import Split, normalize from megatron.core.parallel_state import get_virtual_pipeline_model_parallel_rank +from megatron.core.utils import log_single_rank logger = logging.getLogger(__name__) diff --git a/megatron/core/datasets/gpt_dataset.py b/megatron/core/datasets/gpt_dataset.py index b8ce1b0fc7..901e343c91 100644 --- a/megatron/core/datasets/gpt_dataset.py +++ b/megatron/core/datasets/gpt_dataset.py @@ -13,7 +13,8 @@ from megatron.core.datasets.indexed_dataset import IndexedDataset from megatron.core.datasets.megatron_dataset import MegatronDataset from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer -from megatron.core.datasets.utils import Split, log_single_rank +from megatron.core.datasets.utils import Split +from megatron.core.utils import log_single_rank logger = logging.getLogger(__name__) diff --git a/megatron/core/datasets/indexed_dataset.py b/megatron/core/datasets/indexed_dataset.py index 28ef414d42..5f9fbe7238 100644 --- a/megatron/core/datasets/indexed_dataset.py +++ b/megatron/core/datasets/indexed_dataset.py @@ -19,7 +19,7 @@ import numpy import torch -from megatron.core.datasets.utils import log_single_rank +from megatron.core.utils import log_single_rank logger = logging.getLogger(__name__) diff --git a/megatron/core/datasets/masked_dataset.py b/megatron/core/datasets/masked_dataset.py index 0768cd29e3..9c8b7a9f34 100644 --- a/megatron/core/datasets/masked_dataset.py +++ b/megatron/core/datasets/masked_dataset.py @@ -13,7 +13,8 @@ from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig from megatron.core.datasets.indexed_dataset import IndexedDataset from megatron.core.datasets.megatron_dataset import MegatronDataset -from megatron.core.datasets.utils import Split, log_single_rank 
+from megatron.core.datasets.utils import Split +from megatron.core.utils import log_single_rank logger = logging.getLogger(__name__) diff --git a/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py b/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py index 7dc3f44d6a..97a891fd14 100644 --- a/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +++ b/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py @@ -15,7 +15,8 @@ ) from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig from megatron.core.datasets.indexed_dataset import IndexedDataset -from megatron.core.datasets.utils import Split, log_single_rank +from megatron.core.datasets.utils import Split +from megatron.core.utils import log_single_rank logger = logging.getLogger(__name__) diff --git a/megatron/core/datasets/retro/utils.py b/megatron/core/datasets/retro/utils.py index 1f3a258d20..2d87630e1b 100644 --- a/megatron/core/datasets/retro/utils.py +++ b/megatron/core/datasets/retro/utils.py @@ -19,7 +19,7 @@ MultiSplitGPTDataset, MultiSplitGPTDatasetConfig, ) -from megatron.core.datasets.utils import log_single_rank +from megatron.core.utils import log_single_rank from .external_libs import h5py diff --git a/megatron/core/datasets/utils.py b/megatron/core/datasets/utils.py index 412626d05f..45203c256a 100644 --- a/megatron/core/datasets/utils.py +++ b/megatron/core/datasets/utils.py @@ -2,11 +2,13 @@ import logging from enum import Enum -from typing import Any, List, Optional, Tuple +from typing import List, Optional, Tuple import numpy import torch +from ..utils import log_single_rank + logger = logging.getLogger(__name__) @@ -30,25 +32,6 @@ def compile_helpers(): sys.exit(1) -def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, **kwargs: Any): - """If torch distributed is initialized, log only on rank - - Args: - logger (logging.Logger): The logger to write the logs - - args (Tuple[Any]): All logging.Logger.log positional arguments - - rank (int, optional): The rank to write on. Defaults to 0. - - kwargs (Dict[str, Any]): All logging.Logger.log keyword arguments - """ - if torch.distributed.is_initialized(): - if torch.distributed.get_rank() == rank: - logger.log(*args, **kwargs) - else: - logger.log(*args, **kwargs) - - def normalize(weights: List[float]) -> List[float]: """Do non-exponentiated normalization diff --git a/megatron/core/distributed/distributed_data_parallel.py b/megatron/core/distributed/distributed_data_parallel.py index b587c36b57..cf7faba148 100644 --- a/megatron/core/distributed/distributed_data_parallel.py +++ b/megatron/core/distributed/distributed_data_parallel.py @@ -1,7 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import logging from contextlib import contextmanager -from logging import getLogger from typing import Dict, Optional import torch @@ -9,10 +9,11 @@ from .. import parallel_state from ..transformer.module import MegatronModule from ..transformer.transformer_config import TransformerConfig +from ..utils import log_single_rank from .distributed_data_parallel_config import DistributedDataParallelConfig from .param_and_grad_buffer import ParamAndGradBuffer -logger = getLogger(__name__) +logger = logging.getLogger(__name__) class DistributedDataParallel(MegatronModule): @@ -27,12 +28,9 @@ class DistributedDataParallel(MegatronModule): config: Transformer config object. ddp_config: DistributedDataParallel config object. module: Underlying model. - data_parallel_group: Data-parallel process group. 
- expert_data_parallel_group: Optional data-parallel process group for experts in a MoE. disable_bucketing: If true, force assign all parameters to a single bucket. If false, use standard bucketing policy: assign parameters to smaller buckets and all-reduce per bucket _if_ overlap_grad_reduce is True and pp_rank is 0. - check_for_nan_in_grad: If true, check if local grad norm is NaN. """ @@ -41,8 +39,6 @@ def __init__( config: TransformerConfig, ddp_config: DistributedDataParallelConfig, module: torch.nn.Module, - data_parallel_group: torch.distributed.ProcessGroup, - expert_data_parallel_group: Optional[torch.distributed.ProcessGroup] = None, disable_bucketing: bool = False, ): super().__init__(config=config) @@ -53,15 +49,19 @@ def __init__( # ring-reduce implementations are large enough to remain bandwidth-bound rather than # latency-bound. if ddp_config.bucket_size is None: - dp_size = parallel_state.get_data_parallel_world_size() - ddp_config.bucket_size = max(40000000, 1000000 * dp_size) + ddp_config.bucket_size = max( + 40000000, 1000000 * parallel_state.get_data_parallel_world_size() + ) # Set bucket_size to infinity if overlap_grad_reduce is False. if not ddp_config.overlap_grad_reduce: ddp_config.bucket_size = None self.ddp_config = ddp_config - if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: - logger.info(f'Setting up DistributedDataParallel with config {self.ddp_config}') + log_single_rank( + logger, + logging.INFO, + f'Setting up DistributedDataParallel with config {self.ddp_config}', + ) # Turn off bucketing if we are on a pipeline stage that is not the first (since # data-parallel communication on these stages is not on the critical path), or if @@ -109,6 +109,18 @@ def allocate_buffers_for_parameters( params.append(param) param_and_grad_dtype_to_params[(param_dtype, grad_dtype)] = params + if not config.calculate_per_token_loss: + target_gradient_scaling_factor = 1.0 / parallel_state.get_data_parallel_world_size() + if self.ddp_config.average_in_collective: + # Collective is averaging gradients in collective with data_parallel_group. + assert ( + gradient_scaling_factor + / torch.distributed.get_world_size(group=data_parallel_group) + == target_gradient_scaling_factor + ) + else: + assert gradient_scaling_factor == target_gradient_scaling_factor + # Allocate the grad buffers and map the grads. buffers = [] for (param_dtype, grad_dtype), params in param_and_grad_dtype_to_params.items(): @@ -131,20 +143,30 @@ def allocate_buffers_for_parameters( if config.calculate_per_token_loss: gradient_scaling_factor = 1.0 + expert_gradient_scaling_factor = 1.0 else: - data_parallel_world_size = torch.distributed.get_world_size(data_parallel_group) - gradient_scaling_factor = 1.0 / data_parallel_world_size + if self.ddp_config.average_in_collective: + gradient_scaling_factor = 1.0 + expert_gradient_scaling_factor = ( + 1.0 / parallel_state.get_expert_model_parallel_world_size() + ) + else: + data_parallel_world_size = parallel_state.get_data_parallel_world_size() + gradient_scaling_factor = 1.0 / data_parallel_world_size + expert_gradient_scaling_factor = 1.0 / data_parallel_world_size # Allocate the param+grad buffers for dense params' grads. 
self.buffers = allocate_buffers_for_parameters( - dense_params, data_parallel_group, gradient_scaling_factor=gradient_scaling_factor, + dense_params, + parallel_state.get_data_parallel_group(with_context_parallel=True), + gradient_scaling_factor=gradient_scaling_factor, ) # Allocate separate param+grad buffers for expert parallel params' grads. self.expert_parallel_buffers = allocate_buffers_for_parameters( expert_parallel_params, - expert_data_parallel_group, - gradient_scaling_factor=gradient_scaling_factor, + parallel_state.get_data_modulo_expert_parallel_group(), + gradient_scaling_factor=expert_gradient_scaling_factor, ) # Delete references to weight_tensor if they exist since we don't want two parameter copies @@ -266,17 +288,16 @@ def broadcast_params(self): is_expert_parallel = not getattr(param, 'allreduce', True) if is_expert_parallel: - torch.distributed.broadcast( - param.data, - src=torch.distributed.get_process_group_ranks(self.expert_data_parallel_group), - group=self.expert_data_parallel_group, - ) + data_parallel_group = parallel_state.get_data_modulo_expert_parallel_group() else: - torch.distributed.broadcast( - param.data, - src=torch.distributed.get_process_group_ranks(self.data_parallel_group), - group=self.data_parallel_group, + data_parallel_group = parallel_state.get_data_parallel_group( + with_context_parallel=True ) + torch.distributed.broadcast( + param.data, + src=torch.distributed.get_global_rank(data_parallel_group, 0), + group=data_parallel_group, + ) def state_dict(self, prefix='', keep_vars=False): """ diff --git a/megatron/core/distributed/distributed_data_parallel_config.py b/megatron/core/distributed/distributed_data_parallel_config.py index b12be9255b..c1396e0f00 100644 --- a/megatron/core/distributed/distributed_data_parallel_config.py +++ b/megatron/core/distributed/distributed_data_parallel_config.py @@ -15,8 +15,8 @@ class DistributedDataParallelConfig: """If true, overlap grad all-reduce / reduce-scatter with backward compute.""" use_distributed_optimizer: bool = False - """If true, issue reduce-scatter collectives to aggregate gradients and clean up originally - allocated model parameters, otherwise issue all-reduce collectives. + """If true, issue reduce-scatter collectives to aggregate gradients and clean up + originally allocated model parameters, otherwise issue all-reduce collectives. """ check_for_nan_in_grad: bool = False @@ -24,5 +24,9 @@ class DistributedDataParallelConfig: bucket_size: Optional[int] = None """Maximum number of parameters in each bucket. If unspecified, MCore uses a default - value of max(40000000, 1000000 * dp_size) parameters (larger DP sizes need larger buckets - to ensure collectives do not become latency-bound).""" + value of max(40000000, 1000000 * dp_size) parameters (larger DP sizes need larger + buckets to ensure collectives do not become latency-bound).""" + + average_in_collective: bool = False + """If true, compute average in collective directly, as opposed to dividing by the + dp_size first and then computing sum in the collective.""" diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py index c07b15b94a..4d13943e93 100644 --- a/megatron/core/distributed/param_and_grad_buffer.py +++ b/megatron/core/distributed/param_and_grad_buffer.py @@ -1,17 +1,17 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+import logging import math import os from enum import Enum -from logging import getLogger from typing import Dict, List, Optional import torch -from .. import parallel_state +from ..utils import log_on_each_pipeline_stage from .distributed_data_parallel_config import DistributedDataParallelConfig -logger = getLogger(__name__) +logger = logging.getLogger(__name__) class BufferType(Enum): @@ -117,8 +117,16 @@ def start_grad_sync(self): f'Device: {torch.cuda.current_device()}, node: {os.uname()[1]}' ) + # gradient_scaling_factor already takes into account whether we are computing + # an average or sum in the data-parallel collective. if self.gradient_scaling_factor != 1.0: self.grad_data *= self.gradient_scaling_factor + + # Decide reduce_op. + reduce_op = torch.distributed.ReduceOp.SUM + if self.ddp_config.average_in_collective: + reduce_op = torch.distributed.ReduceOp.AVG + # Use async_op only when overlap_grad_reduce is True. if self.ddp_config.use_distributed_optimizer: local_data_view = shard_buffer(self.grad_data, self.data_parallel_world_size)[ @@ -127,12 +135,14 @@ def start_grad_sync(self): self.communication_handle = torch.distributed._reduce_scatter_base( local_data_view, self.grad_data, + op=reduce_op, group=self.data_parallel_group, async_op=self.ddp_config.overlap_grad_reduce, ) else: self.communication_handle = torch.distributed.all_reduce( self.grad_data, + op=reduce_op, group=self.data_parallel_group, async_op=self.ddp_config.overlap_grad_reduce, ) @@ -400,20 +410,18 @@ def _does_param_require_new_bucket(param): ) # Log buckets for all PP stages. - if ( - parallel_state.get_data_parallel_rank(with_context_parallel=True) == 0 - and parallel_state.get_tensor_model_parallel_rank() == 0 - ): - logger.info( - f'Number of buckets for gradient all-reduce / reduce-scatter: {len(self.buckets)}' - ) - for index, bucket in enumerate(self.buckets): - numel = 0 - for param in bucket.params: - numel += param.data.nelement() - logger.info(f'Params for bucket {index+1} ({numel} elements):') - for param in bucket.params: - logger.info(f' {param_to_name[param]}') + log_strs = [] + log_strs.append( + f'Number of buckets for gradient all-reduce / reduce-scatter: {len(self.buckets)}' + ) + for index, bucket in enumerate(self.buckets): + numel = 0 + for param in bucket.params: + numel += param.data.nelement() + log_strs.append(f'Params for bucket {index+1} ({numel} elements):') + for param in bucket.params: + log_strs.append(f'\t{param_to_name[param]}') + log_on_each_pipeline_stage(logger, logging.INFO, '\n'.join(log_strs)) def scale_gradients(self, scaling_factor: float) -> None: """Scale the gradient data by `scaling_factor`.""" diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py index 95e6c31377..5283e7b6f7 100644 --- a/megatron/core/optimizer/__init__.py +++ b/megatron/core/optimizer/__init__.py @@ -1,5 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-from logging import getLogger +import logging from typing import Callable, Dict, List, Optional import torch @@ -10,6 +10,7 @@ from ..distributed import ParamAndGradBuffer from ..transformer.module import MegatronModule +from ..utils import log_single_rank from .distrib_optimizer import DistributedOptimizer from .grad_scaler import ConstantGradScaler, DynamicGradScaler from .optimizer import ( @@ -20,7 +21,7 @@ ) from .optimizer_config import OptimizerConfig -logger = getLogger(__name__) +logger = logging.getLogger(__name__) def _get_param_groups( @@ -277,8 +278,7 @@ def get_megatron_optimizer( Instance of MegatronOptimizer. """ - if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: - logger.info(f'Setting up optimizer with config {config}') + log_single_rank(logger, logging.INFO, f'Setting up optimizer with config {config}') # Collect param groups. param_groups = _get_param_groups( diff --git a/megatron/core/utils.py b/megatron/core/utils.py index 2c5a1ed88b..159bbf1163 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -16,7 +16,7 @@ from datetime import datetime from functools import reduce from types import TracebackType -from typing import List, Optional, Tuple, Type, Union +from typing import Any, List, Optional, Tuple, Type, Union import torch @@ -198,6 +198,44 @@ def init_(tensor): return init_ +def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, **kwargs: Any): + """If torch distributed is initialized, log only on rank + + Args: + logger (logging.Logger): The logger to write the logs + + args (Tuple[Any]): All logging.Logger.log positional arguments + + rank (int, optional): The rank to write on. Defaults to 0. + + kwargs (Dict[str, Any]): All logging.Logger.log keyword arguments + """ + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == rank: + logger.log(*args, **kwargs) + else: + logger.log(*args, **kwargs) + + +def log_on_each_pipeline_stage(logger: logging.Logger, *args: Any, **kwargs: Any): + """Log on first rank in each pipeline stage + + Args: + logger (logging.Logger): The logger to write the logs + + args (Tuple[Any]): All logging.Logger.log positional arguments + + kwargs (Dict[str, Any]): All logging.Logger.log keyword arguments + """ + assert torch.distributed.is_initialized() + + if ( + parallel_state.get_data_parallel_rank(with_context_parallel=True) == 0 + and parallel_state.get_tensor_model_parallel_rank() == 0 + ): + logger.log(*args, **kwargs) + + def check_param_hashes_across_dp_replicas(model: List[torch.nn.Module]) -> bool: """Computes hashes of all parameters in model, all-gathers hashes across DP replicas, and then checks for equality between the locally-computed hashes and the hashes diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 6b038669f7..dcc3118d1d 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1338,6 +1338,8 @@ def _add_distributed_args(parser): dest='delay_grad_reduce') group.add_argument('--ddp-bucket-size', type=int, default=None, help='Bucket size for data-parallel communication') + group.add_argument('--ddp-average-in-collective', action='store_true', + default=False, help='If set, average directly in data-parallel communication collective.') group.add_argument('--overlap-param-gather', action='store_true', default=False, help='If set, overlap param all-gather in distributed optimizer.') group.add_argument('--delay-param-gather', action='store_true', diff --git 
a/megatron/training/training.py b/megatron/training/training.py index 67361d6b89..8c12268d24 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -421,12 +421,11 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap overlap_grad_reduce=args.overlap_grad_reduce, use_distributed_optimizer=args.use_distributed_optimizer, check_for_nan_in_grad=args.check_for_nan_in_loss_and_grad, - bucket_size=args.ddp_bucket_size) + bucket_size=args.ddp_bucket_size, + average_in_collective=args.ddp_average_in_collective) model = [DDP(config, ddp_config, model_chunk, - data_parallel_group=mpu.get_data_parallel_group(with_context_parallel=True), - expert_data_parallel_group=mpu.get_data_modulo_expert_parallel_group(), # Turn off bucketing for model_chunk 2 onwards, since communication for these # model chunks is overlapped with compute anyway. disable_bucketing=(model_chunk_idx > 0)) diff --git a/tests/functional_tests/jet_recipes/MR-gpt.yaml b/tests/functional_tests/jet_recipes/MR-gpt.yaml index 7315cdda61..17f84861d6 100644 --- a/tests/functional_tests/jet_recipes/MR-gpt.yaml +++ b/tests/functional_tests/jet_recipes/MR-gpt.yaml @@ -60,6 +60,7 @@ products: - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1]} - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--no-create-attention-mask-in-dataloader"], args_meta: ["no_create_attention_mask_in_dataloader"]} - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--no-mmap-bin-files"], args_meta: ["no_mmap_bin_files"]} + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--ddp-average-in-collective"], args_meta: ["ddp_average_in_collective"]} - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1]} - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], extra_args: ["--calculate-per-token-loss"], args_meta: ["calculate_per_token_loss"]} - {tp_size: [4], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--qk-layernorm --test-mode"'], args_meta: ["qk_layernorm_test_mode"]} diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-ddp-average-in-collective.json b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-ddp-average-in-collective.json new file mode 100644 index 0000000000..265ad7c9b9 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-ddp-average-in-collective.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file From ebcdfeb38f981144b2f8d31eb2ba6b1b58345fad Mon Sep 17 00:00:00 2001 From: Jon Barker Date: Tue, 4 Jun 2024 14:06:15 -0700 Subject: [PATCH 64/92] Llama3 and Mistral support --- docs/llama2.md | 178 -------- docs/llama_mistral.md | 391 ++++++++++++++++++ .../inference/text_generation/tokenization.py | 7 +- megatron/training/arguments.py | 2 + megatron/training/tokenizer/tokenizer.py | 68 +++ tools/checkpoint/convert.py | 2 +- ...ader_llama2.py => loader_llama_mistral.py} | 158 ++++--- tools/checkpoint/saver_mcore.py | 9 +- tools/preprocess_data.py | 2 +- 9 files changed, 583 insertions(+), 234 deletions(-) delete mode 
100644 docs/llama2.md create mode 100644 docs/llama_mistral.md rename tools/checkpoint/{loader_llama2.py => loader_llama_mistral.py} (81%) diff --git a/docs/llama2.md b/docs/llama2.md deleted file mode 100644 index 286a29c06f..0000000000 --- a/docs/llama2.md +++ /dev/null @@ -1,178 +0,0 @@ -# Llama-2 Inference and Finetuning - -The Llama-2 [family of models](https://ai.meta.com/llama/) are an open-source set of pretrained & finetuned (for chat) models that have achieved strong results across a wide set of benchmarks. At the time of release, Llama-2 models achieved among the best results for open-source models, and were competitive with the closed-source GPT-3.5 model (see https://arxiv.org/pdf/2307.09288.pdf). - -Llama-2 checkpoints can be loaded into Megatron for inference and for finetuning. Loading these checkpoints consists of three steps: - -1. Get access to download the checkpoints. -2. Convert the checkpoints from Meta/Huggingface format to Megatron format. -3. Setup arguments for launching the model. - -The following sections detail these steps. The final section lists benchmark result comparisons between: 1) Llama-2 inference code running the Meta-format checkpoints, and 2) Megatron inference code running the converted checkpoints. - -# Contents - * [Download Meta or Huggingface checkpoints](#download-meta-or-huggingface-checkpoints) - * [Convert checkpoint format](#convert-checkpoint-format) - * [Meta format](#meta-format) - * [Huggingface format](#huggingface-format) - * [Launch model](#launch-model) - * [Megatron](#launch-megatron) - * [Meta](#launch-meta) - * [Huggingface](#launch-hf) - * [Benchmark results](#benchmark-results) - -# Download Meta or Huggingface checkpoints - -Users must first apply for access to download the Llama-2 checkpoints either directly from [Meta](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or through [Huggingface](https://huggingface.co/docs/transformers/main/model_doc/llama2) (HF). The checkpoints are available in two formats, Meta's native format (available from both the Meta and HF links), and HF's format (available only from HF). Either format can be converted to Megatron, as detailed next. - -# Convert checkpoint format - -We recommend passing `--dtype bf16` for training or finetuning. Inference can be done in bfloat16 or float16. - -### Meta format - -The Meta format checkpoints are converted to HF format as an intermediate step before converting to Megatron format. The `transformers` package is required, and must have version >=4.31.0 (e.g., `pip install transformers>=4.31.0`). (**Note**: we have specifically tested with versions `4.31.0` and `4.32.0`; your experience may vary with newer versions.) Assuming the downloaded checkpoints are in `$CHECKPOINT_DIR` (with separate sub-directories for 7B, 13B, 70B, etc.), the following example command can be used to convert from Llama-2 format to HF format in bfloat16: - -``` -python tools/checkpoint/convert.py --model-type GPT \ -> --loader llama2 \ -> --saver megatron \ -> --checkpoint-type meta \ -> --model-size 7B \ -> --load-dir $LLAMA_META_FORMAT_DIR \ -> --save-dir ${MEGATRON_FORMAT_DIR} \ -> --tokenizer-model ${TOKENIZER_MODEL} \ -> --target-tensor-parallel-size ${TP} \ -> --target-pipeline-parallel-size ${PP} \ -> --bf16 -``` - -Valid values for `--model_size` include `7B`, `13B`, and `70B` (for pretrained-only models), and `7Bf`, `13Bf`, and `70Bf` (for chat-finetuned models). 
- -### Huggingface format - -The HF checkpoints can be converted to Megatron format by using Megatron's own Llama-2 checkpoint converter for HF format (see script `tools/checkpoint/loader_llama2.py`). One important argument that must be set correctly is the tensor parallel size (`TP`) for each model. The following table shows these values: - -| Model size | Tensor parallel size (`TP`) | -| ---------- | --------------------------- | -| 7B | 1 | -| 13B | 2 | -| 70B | 8 | - -Using these values for `TP`, along with the path to the Llama-2 tokenizer model (automatically downloaded with original checkpoint download; see `${TOKENIZER_MODEL}` below), run the following command from the root of your Megatron source code to convert from HF format to Megatron format: - -``` -$>: python tools/checkpoint/convert.py \ - > --model-type GPT \ - > --loader llama2 \ - > --saver megatron \ - > --target-tensor-parallel-size ${TP} \ - > --checkpoint-type hf - > --load-dir ${HF_FORMAT_DIR} \ - > --save-dir ${MEGATRON_FORMAT_DIR} \ - > --tokenizer-model ${TOKENIZER_MODEL} -``` - -After this conversion, we are ready to load the checkpoints into a Megatron GPT model. - -# Launch model - -### Launch Megatron - -If loading for either inference or finetuning, use the following arguments: - -``` ---tensor-model-parallel-size ${TP} \ ---pipeline-model-parallel-size 1 \ ---seq-length 4096 \ ---max-position-embeddings 4096 \ ---tokenizer-type Llama2Tokenizer \ ---tokenizer-model ${TOKENIZER_MODEL} \ ---load ${CHECKPOINT_DIR} \ ---exit-on-missing-checkpoint \ ---use-checkpoint-args \ ---no-load-optim \ ---no-load-rng \ ---untie-embeddings-and-output-weights \ ---use-rotary-position-embeddings \ ---normalization RMSNorm \ ---no-position-embedding \ ---no-masked-softmax-fusion \ ---attention-softmax-in-fp32 -``` - -### Launch Meta - -Meta checkpoints can be launched with: https://github.com/facebookresearch/llama - -### Launch Huggingface - -Huggingface checkpoints can be launched with: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py - -# Benchmark results - -The tables below list the benchmark comparisons between native Llama-2 (using Meta's checkpoint and Meta's inference code) and Megatron (using a converted HF checkpoint and Megatron's inference code). - -The values are the percent error between Megatron and Llama-2, calculated using the formula: `| - | / `, where the type of score is detailed before each table. Across all tests (80 total per model size), the mean error is 0.15%. The small difference in benchmark scores between the two models is due to minor arithmetic differences in implementation that alter the numerics slightly. Some of the factors that influence this difference include: - -- Megatron performs batch matrix multiplications in a couple places, such as within self attention and in SwiGLU, that Llama performs separately. -- Megatron uses `torch.baddbmm` within self attention, versus Llama using `torch.matmul`. -- Megatron uses a `sin`/`cos` implementation for rotary position embeddings, versus Llama using a `polar`/`complex` implementation. -- Llama calls `torch.set_default_dtype(torch.float16)` during initialization, which Megatron does not. - -### Big Bench - -Score type: multiple choice grade. 
- -| bigbench / standard | 7b | 13b | 70b | -| -- | -- | -- | -- | -| date_understanding | 0.29% | 0.13% | 0.12% | -| general_knowledge | 0.00% | 0.00% | 0.00% | -| human_organs_senses | 0.00% | 0.00% | 0.00% | -| intent_recognition | 0.00% | 0.11% | 0.00% | -| riddle_sense | 0.00% | 0.00% | 0.00% | -| similarities_abstraction | 0.00% | 0.58% | 0.00% | -| simple_arithmetic_json_multiple_choice | 0.00% | 0.00% | 0.00% | -| undo_permutation | 0.19% | 0.19% | 0.18% | - -### Multilingual - -Score type: multiple choice grade. - -| multilingual / xcopa | 7b | 13b | 70b | -| -- | -- | -- | -- | -| en-template-mGPT-remove-punctuation | 0.08% | 0.00% | 0.00% | -| et-template-mGPT-remove-punctuation | 0.00% | 0.13% | 0.25% | -| ht-template-mGPT-remove-punctuation | 0.26% | 0.13% | 0.26% | -| id-template-mGPT-remove-punctuation | 0.11% | 0.00% | 0.19% | -| it-template-mGPT-remove-punctuation | 0.00% | 0.10% | 0.09% | -| qu-template-mGPT-remove-punctuation | 0.00% | 0.00% | 0.27% | -| sw-template-mGPT-remove-punctuation | 0.14% | 0.13% | 0.13% | -| th-template-mGPT-remove-punctuation | 0.25% | 0.13% | 0.13% | -| tr-template-mGPT-remove-punctuation | 0.26% | 0.00% | 0.34% | -| vi-template-mGPT-remove-punctuation | 0.00% | 0.11% | 0.00% | -| zh-template-mGPT-remove-punctuation | 0.00% | 0.10% | 0.09% | - -### LM Evaluation Harness - -Score type: multiple choice grade. - -| lm-eval | 7b | 13b | 70b | -| -- | -- | -- | -- | -| boolq | 0.04% | 0.04% | 0.07% | -| hellaswag | 0.02% | 0.03% | 0.03% | -| piqa | 0.00% | 0.00% | 0.07% | -| winogrande | 0.00% | 0.11% | 0.20% | - -### MMLU - -Score type: multiple choice grade. - -Note: the number in brackets is the number of sub-tasks for each supercategory. - -| mmlu | 7b | 13b | 70b | -| -- | -- | -- | -- | -| stem [18] | 0.79% | 0.05% | 0.01% | -| humanities [13] | 0.19% | 0.01% | 0.02% | -| other (business, health, misc.) [14] | 0.08% | 0.06% | 0.12% | -| social sciences [12] | 0.37% | 0.21% | 0.01% | diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md new file mode 100644 index 0000000000..0e3d4b2fb8 --- /dev/null +++ b/docs/llama_mistral.md @@ -0,0 +1,391 @@ +# Llama and Mistral support in Megatron-LM + +NOTE: Llama-3 and Mistral support in Megatron is currently experimental and we are still evaluting benchmark results to confirm model conversion, training and inference correctness. + +The [Llama-2](https://ai.meta.com/llama/) and [Llama-3](https://llama.meta.com/) family of models are an open-source set of pretrained & finetuned (for chat) models that have achieved strong results across a wide set of benchmarks. At their times of release, both Llama-2 and Llama-3 models achieved among the best results for open-source models, and were competitive with leading closed-source models (see https://arxiv.org/pdf/2307.09288.pdf and https://ai.meta.com/blog/meta-llama-3/). + +Similarly, [Mistral-7b](https://mistral.ai/news/announcing-mistral-7b/) is an open-source model with pretrained and finetuned (for chat) variants that achieve strong benchmark results. + +Architecturally Llama-2, Llama-3 and Mistral-7b are very similar. As such Megatron can support loading checkpoints from all three for inference and finetuning. Converting the checkpoints and loading them is slightly different for each model and is detailed for each below. + +# Llama-2 + +Llama-2 checkpoints can be loaded into Megatron for inference and for finetuning. Loading these checkpoints consists of three steps: + +1. Get access to download the checkpoints. +2. 
Convert the checkpoints from Meta/Huggingface format to Megatron format. +3. Setup arguments for launching the model. + +The following sections detail these steps. The final section lists benchmark result comparisons between: 1) Llama-2 inference code running the Meta-format checkpoints, and 2) Megatron inference code running the converted checkpoints. + +## Contents + * [Download Meta or Huggingface checkpoints](#download-meta-or-huggingface-checkpoints) + * [Convert checkpoint format](#convert-checkpoint-format) + * [Meta format](#meta-format) + * [Huggingface format](#huggingface-format) + * [Launch model](#launch-model) + * [Megatron](#launch-megatron) + * [Meta](#launch-meta) + * [Huggingface](#launch-hf) + * [Benchmark results](#benchmark-results) + +## Download Meta or Huggingface checkpoints + +Users must first apply for access to download the Llama-2 checkpoints either directly from [Meta](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or through [Huggingface](https://huggingface.co/docs/transformers/main/model_doc/llama2) (HF). The checkpoints are available in two formats, Meta's native format (available from both the Meta and HF links), and HF's format (available only from HF). Either format can be converted to Megatron, as detailed next. + +## Convert checkpoint format + +We recommend passing `--dtype bf16` for training or finetuning. Inference can be done in bfloat16 or float16. + +### Meta format + +The Meta format checkpoints are converted to HF format as an intermediate step before converting to Megatron format. The `transformers` package is required, and must have version >=4.31.0 (e.g., `pip install transformers>=4.31.0`). (**Note**: we have specifically tested with versions `4.31.0` and `4.32.0`; your experience may vary with newer versions.) Assuming the downloaded checkpoints are in `$CHECKPOINT_DIR` (with separate sub-directories for 7B, 13B, 70B, etc.), the following example command can be used to convert from Llama-2 format to HF format in bfloat16: + +``` +python tools/checkpoint/convert.py --model-type GPT \ +> --loader llama_mistral \ +> --saver megatron \ +> --checkpoint-type meta \ +> --model-size llama2-7B \ +> --load-dir $LLAMA_META_FORMAT_DIR \ +> --save-dir ${MEGATRON_FORMAT_DIR} \ +> --tokenizer-model ${TOKENIZER_MODEL} \ +> --target-tensor-parallel-size ${TP} \ +> --target-pipeline-parallel-size ${PP} \ +> --bf16 +``` + +Valid values for `--model-size` are `llama2-7B`, `llama2-13B`, and `llama2-70B` (for pretrained-only models), and `llama2-7Bf`, `llama2-13Bf`, and `llama2-70Bf` (for chat-finetuned models). + +### Huggingface format + +The HF checkpoints can be converted to Megatron format by using Megatron's own Llama-2 checkpoint converter for HF format (see script `tools/checkpoint/loader_llama_mistral.py`). One important argument that must be set correctly is the tensor parallel size (`TP`) for each model. 
The following table shows these values:
+
+| Model size | Tensor parallel size (`TP`) |
+| ---------- | --------------------------- |
+| 7B         | 1                           |
+| 13B        | 2                           |
+| 70B        | 8                           |
+
+Using these values for `TP`, along with the path to the Llama-2 tokenizer model (automatically downloaded with original checkpoint download; see `${TOKENIZER_MODEL}` below), run the following command from the root of your Megatron source code to convert from HF format to Megatron format:
+
+```
+$>: python tools/checkpoint/convert.py \
+ >   --model-type GPT \
+ >   --loader llama_mistral \
+ >   --saver megatron \
+ >   --target-tensor-parallel-size ${TP} \
+ >   --checkpoint-type hf \
+ >   --load-dir ${HF_FORMAT_DIR} \
+ >   --save-dir ${MEGATRON_FORMAT_DIR} \
+ >   --tokenizer-model ${TOKENIZER_MODEL}
+```
+
+After this conversion, we are ready to load the checkpoints into a Megatron GPT model.
+
+## Launch model
+
+### Launch Megatron
+
+If loading for either inference or finetuning, use the following arguments:
+
+```
+--tensor-model-parallel-size ${TP} \
+--pipeline-model-parallel-size 1 \
+--seq-length 4096 \
+--max-position-embeddings 4096 \
+--tokenizer-type Llama2Tokenizer \
+--tokenizer-model ${TOKENIZER_MODEL} \
+--load ${CHECKPOINT_DIR} \
+--exit-on-missing-checkpoint \
+--use-checkpoint-args \
+--no-load-optim \
+--no-load-rng \
+--untie-embeddings-and-output-weights \
+--use-rotary-position-embeddings \
+--normalization RMSNorm \
+--no-position-embedding \
+--no-masked-softmax-fusion \
+--attention-softmax-in-fp32
+```
+
+### Launch Meta
+
+Meta checkpoints can be launched with: https://github.com/facebookresearch/llama
+
+### Launch Huggingface
+
+Huggingface checkpoints can be launched with: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
+
+## Benchmark results
+
+The tables below list the benchmark comparisons between native Llama-2 (using Meta's checkpoint and Meta's inference code) and Megatron (using a converted HF checkpoint and Megatron's inference code).
+
+The values are the percent error between Megatron and Llama-2, calculated using the formula: `|<llama_score> - <megatron_score>| / <llama_score>`, where the type of score is detailed before each table. Across all tests (80 total per model size), the mean error is 0.15%. The small difference in benchmark scores between the two models is due to minor arithmetic differences in implementation that alter the numerics slightly. Some of the factors that influence this difference include:
+
+- Megatron performs batch matrix multiplications in a couple places, such as within self attention and in SwiGLU, that Llama performs separately.
+- Megatron uses `torch.baddbmm` within self attention, versus Llama using `torch.matmul`.
+- Megatron uses a `sin`/`cos` implementation for rotary position embeddings, versus Llama using a `polar`/`complex` implementation.
+- Llama calls `torch.set_default_dtype(torch.float16)` during initialization, which Megatron does not.
+
+### Big Bench
+
+Score type: multiple choice grade.
+
+| bigbench / standard | 7b | 13b | 70b |
+| -- | -- | -- | -- |
+| date_understanding | 0.29% | 0.13% | 0.12% |
+| general_knowledge | 0.00% | 0.00% | 0.00% |
+| human_organs_senses | 0.00% | 0.00% | 0.00% |
+| intent_recognition | 0.00% | 0.11% | 0.00% |
+| riddle_sense | 0.00% | 0.00% | 0.00% |
+| similarities_abstraction | 0.00% | 0.58% | 0.00% |
+| simple_arithmetic_json_multiple_choice | 0.00% | 0.00% | 0.00% |
+| undo_permutation | 0.19% | 0.19% | 0.18% |
+
+### Multilingual
+
+Score type: multiple choice grade.
+ +| multilingual / xcopa | 7b | 13b | 70b | +| -- | -- | -- | -- | +| en-template-mGPT-remove-punctuation | 0.08% | 0.00% | 0.00% | +| et-template-mGPT-remove-punctuation | 0.00% | 0.13% | 0.25% | +| ht-template-mGPT-remove-punctuation | 0.26% | 0.13% | 0.26% | +| id-template-mGPT-remove-punctuation | 0.11% | 0.00% | 0.19% | +| it-template-mGPT-remove-punctuation | 0.00% | 0.10% | 0.09% | +| qu-template-mGPT-remove-punctuation | 0.00% | 0.00% | 0.27% | +| sw-template-mGPT-remove-punctuation | 0.14% | 0.13% | 0.13% | +| th-template-mGPT-remove-punctuation | 0.25% | 0.13% | 0.13% | +| tr-template-mGPT-remove-punctuation | 0.26% | 0.00% | 0.34% | +| vi-template-mGPT-remove-punctuation | 0.00% | 0.11% | 0.00% | +| zh-template-mGPT-remove-punctuation | 0.00% | 0.10% | 0.09% | + +### LM Evaluation Harness + +Score type: multiple choice grade. + +| lm-eval | 7b | 13b | 70b | +| -- | -- | -- | -- | +| boolq | 0.04% | 0.04% | 0.07% | +| hellaswag | 0.02% | 0.03% | 0.03% | +| piqa | 0.00% | 0.00% | 0.07% | +| winogrande | 0.00% | 0.11% | 0.20% | + +### MMLU + +Score type: multiple choice grade. + +Note: the number in brackets is the number of sub-tasks for each supercategory. + +| mmlu | 7b | 13b | 70b | +| -- | -- | -- | -- | +| stem [18] | 0.79% | 0.05% | 0.01% | +| humanities [13] | 0.19% | 0.01% | 0.02% | +| other (business, health, misc.) [14] | 0.08% | 0.06% | 0.12% | +| social sciences [12] | 0.37% | 0.21% | 0.01% | + +# Llama-3 + +Llama-3 checkpoints can be loaded into Megatron for inference and for finetuning. Loading these checkpoints consists of several steps: + +1. Get access to download the checkpoints (weights and tokenizer). +2. Clone the llama3 loading code from Meta. +3. Install the llama package from source. +4. Convert the checkpoints from Meta/Huggingface format to Megatron format. +5. Setup arguments for launching the model. + +The following sections detail these steps. + +## Contents + * [Download Meta or Huggingface checkpoints](#download-meta-or-huggingface-checkpoints) + * [Install tiktoken](#install-tiktoken) + * [Install llama package from Meta](#install-llama-package) + * [Convert checkpoint format](#convert-checkpoint-format) + * [Meta format](#meta-format) + * [Huggingface format](#huggingface-format) + * [Launch model](#launch-model) + * [Megatron](#launch-megatron) + * [Meta](#launch-meta) + * [Huggingface](#launch-hf) + * [Benchmark results](#benchmark-results) + +## Download Meta or Huggingface checkpoints + +Users must first apply for access to download the Llama-3 checkpoints either directly from [Meta](https://llama.meta.com/llama-downloads) or through [Huggingface](https://huggingface.co/meta-llama) (HF). The checkpoints are available in two formats, Meta's native format (available from both the Meta and HF links), and HF's format (available only from HF). Either format can be converted to Megatron, as detailed next. + +## Install tiktoken + +The Llama-3 tokenizer relies on the availability of the `tiktoken` module which can be installed through `pip`. + +## Install llama package from Meta + +1. In a location outside of the megatron-lm source directory, e.g `~`: `git clone https://github.com/meta-llama/llama3.git` +2. `cd $LLAMA3_SOURCE_DIR` +4. `pip install -e .` + +## Convert checkpoint format + +We recommend passing `--dtype bf16` for training or finetuning. Inference can be done in bfloat16 or float16. + +### Meta format + +The Meta format checkpoints are converted to HF format as an intermediate step before converting to Megatron format. 
The `transformers` package is required, and must have version >=4.31.0 (e.g., `pip install transformers>=4.31.0`). (**Note**: we have specifically tested with versions `4.31.0` and `4.32.0`; your experience may vary with newer versions.) Assuming the downloaded checkpoints are in `$CHECKPOINT_DIR` (with separate sub-directories for 8B, 70B, etc.), the following example command can be used to convert from Llama-3 format to HF format in bfloat16:
+
+```
+python tools/checkpoint/convert.py \
+> --model-type GPT \
+> --loader llama_mistral \
+> --saver mcore \
+> --checkpoint-type meta \
+> --model-size llama3-8B \
+> --load-dir $LLAMA_META_FORMAT_DIR \
+> --save-dir ${MEGATRON_FORMAT_DIR} \
+> --tokenizer-model ${TOKENIZER_MODEL} \
+> --target-tensor-parallel-size ${TP} \
+> --target-pipeline-parallel-size ${PP} \
+> --bf16
+```
+
+Valid values for `--model-size` are `llama3-8B` and `llama3-70B` (for pretrained-only models), and `llama3-8Bf` and `llama3-70Bf` (for chat-finetuned models).
+
+### Huggingface format
+
+The HF checkpoints can be converted to Megatron format by using Megatron's own Llama-3 checkpoint converter for HF format (see script `tools/checkpoint/loader_llama_mistral.py`). One important argument that must be set correctly is the tensor parallel size (`TP`) for each model. The following table shows these values:
+
+| Model size | Tensor parallel size (`TP`) |
+| ---------- | --------------------------- |
+| 8B         | 1                           |
+| 70B        | 8                           |
+
+Using these values for `TP`, along with the path to the Llama-3 tokenizer model (automatically downloaded with original checkpoint download; see `${TOKENIZER_MODEL}` below), run the following command from the root of your Megatron source code to convert from HF format to Megatron format:
+
+```
+$>: python tools/checkpoint/convert.py \
+ >   --model-type GPT \
+ >   --loader llama_mistral \
+ >   --saver mcore \
+ >   --target-tensor-parallel-size ${TP} \
+ >   --checkpoint-type hf \
+ >   --load-dir ${HF_FORMAT_DIR} \
+ >   --save-dir ${MEGATRON_FORMAT_DIR} \
+ >   --tokenizer-model ${TOKENIZER_MODEL} \
+ >   --model-size llama3-8B
+```
+
+Valid values for `--model-size` are `llama3-8B` and `llama3-70B` (for pretrained-only models), and `llama3-8Bf` and `llama3-70Bf` (for chat-finetuned models).
+
+After this conversion, we are ready to load the checkpoints into a Megatron GPT model.
+
+## Launch model
+
+### Launch Megatron
+
+If loading for either inference or finetuning, use the following arguments:
+
+```
+--tensor-model-parallel-size ${TP} \
+--pipeline-model-parallel-size 1 \
+--seq-length 4096 \
+--max-position-embeddings 4096 \
+--tokenizer-type Llama3Tokenizer \
+--tokenizer-model ${TOKENIZER_MODEL} \
+--load ${CHECKPOINT_DIR} \
+--exit-on-missing-checkpoint \
+--use-checkpoint-args \
+--no-load-optim \
+--no-load-rng \
+--untie-embeddings-and-output-weights \
+--normalization RMSNorm \
+--position-embedding-type rope \
+--no-masked-softmax-fusion \
+--attention-softmax-in-fp32
+```
+
+### Launch Meta
+
+Meta checkpoints can be launched with: https://github.com/meta-llama/llama3
+
+### Launch Huggingface
+
+Huggingface checkpoints can be launched by following the instructions here: https://huggingface.co/blog/llama3
+
+## Benchmark results
+
+Llama-3 support in Megatron is currently experimental and we are still carrying out benchmark evaluations.
+
+# Mistral-7b
+
+Megatron currently supports loading the v0.3 release of Mistral-7b (which does not use sliding window attention and offers a larger 32768 vocabulary) for inference and finetuning.
Loading these checkpoints consists of several steps:
+
+1. Get access to download the checkpoints (weights and tokenizer).
+2. Install the `mistral-common` package.
+3. Convert the checkpoints from HuggingFace format to Megatron format.
+4. Setup arguments for launching the model.
+
+The following sections detail these steps.
+
+## Contents
+  * [Download Huggingface checkpoints](#download-huggingface-checkpoints)
+  * [Install the mistral-common package](#install-the-mistral-common-package)
+  * [Convert checkpoint format](#convert-checkpoint-format)
+  * [Launch model](#launch-model)
+  * [Benchmark results](#benchmark-results)
+
+## Download Huggingface checkpoints
+
+Users must first apply for access to download the Mistral-7b checkpoints through [Huggingface](https://huggingface.co/mistralai/Mistral-7B-v0.3) (HF). Megatron does not currently support the v0.1 or v0.2 checkpoints; ensure you download v0.3. Megatron also does not currently support using the raw weights directly from [Mistral](https://docs.mistral.ai/getting-started/open_weight_models/).
+
+## Install the mistral-common package
+
+`pip install mistral-common`
+
+## Convert checkpoint format
+
+The HF checkpoints can be converted to Megatron format by using Megatron's own Mistral checkpoint converter for HF format (see script `tools/checkpoint/loader_llama_mistral.py`).
+
+Using the path to the Mistral tokenizer model (downloaded alongside the HF checkpoint), run the following command from the root of your Megatron source code to convert from HF format to mcore format:
+
+```
+$>: python tools/checkpoint/convert.py \
+ >   --model-type GPT \
+ >   --loader llama_mistral \
+ >   --saver mcore \
+ >   --target-tensor-parallel-size ${TP} \
+ >   --checkpoint-type hf \
+ >   --load-dir ${HF_FORMAT_DIR} \
+ >   --save-dir ${MEGATRON_FORMAT_DIR} \
+ >   --tokenizer-model ${TOKENIZER_MODEL} \
+ >   --model-size mistral-7B
+```
+
+Valid values for `--model-size` are `mistral-7B` for the pretrained model and `mistral-7Bf` for the chat-finetuned model.
+
+After this conversion, we are ready to load the checkpoints into an mcore GPT model.
+
+## Launch model
+
+If loading for either inference or finetuning, use the following arguments:
+
+```
+--tensor-model-parallel-size ${TP} \
+--pipeline-model-parallel-size 1 \
+--seq-length 4096 \
+--max-position-embeddings 4096 \
+--tokenizer-type MistralTokenizer \
+--tokenizer-model ${TOKENIZER_MODEL} \
+--load ${CHECKPOINT_DIR} \
+--exit-on-missing-checkpoint \
+--use-checkpoint-args \
+--no-load-optim \
+--no-load-rng \
+--untie-embeddings-and-output-weights \
+--normalization RMSNorm \
+--position-embedding-type rope \
+--no-masked-softmax-fusion \
+--attention-softmax-in-fp32
+```
+
+## Benchmark results
+
+Mistral-7B support in Megatron is currently experimental and we are still carrying out benchmark evaluations.
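+
+# Quick tokenizer sanity check (optional)
+
+Before running a conversion, it can be helpful to confirm that the tokenizer packages (`tiktoken` and the `llama` package for Llama-3, `mistral-common` for Mistral) and the downloaded tokenizer files are set up correctly. The snippet below is only an illustrative sketch, not part of the Megatron tooling; the file paths are placeholders that must be replaced with the tokenizer files downloaded alongside your checkpoints.
+
+```
+# Llama-3: requires the `llama` package (and `tiktoken`) installed as described above.
+from llama.tokenizer import Tokenizer as Llama3Tokenizer
+
+llama3_tok = Llama3Tokenizer("/path/to/llama3/tokenizer.model")  # placeholder path
+ids = llama3_tok.encode("Hello world", bos=True, eos=False, allowed_special="all")
+print(ids, llama3_tok.decode(ids))
+
+# Mistral: requires the `mistral-common` package installed as described above.
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+
+mistral_tok = MistralTokenizer.from_file("/path/to/mistral/tokenizer.model.v3")  # placeholder path
+```
+
+If the imports succeed and the files load without errors, the corresponding `--tokenizer-model` paths can be used with the conversion and launch commands above.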
diff --git a/megatron/inference/text_generation/tokenization.py b/megatron/inference/text_generation/tokenization.py index 18cc077e2c..cab2d2ea5a 100644 --- a/megatron/inference/text_generation/tokenization.py +++ b/megatron/inference/text_generation/tokenization.py @@ -30,10 +30,13 @@ def detokenize_generations(tokens_gpu_tensor, if return_segments: words = [] for token in sequence_tokens: - if args.tokenizer_type in ['SentencePieceTokenizer', + if args.tokenizer_type in ['SentencePieceTokenizer', 'GPTSentencePieceTokenizer', - 'Llama2Tokenizer']: + 'Llama2Tokenizer', + 'MistralTokenizer']: word = tokenizer.decoder[token] + elif args.tokenizer_type == 'Llama3Tokenizer': + word = tokenizer.decode([token]) elif args.tokenizer_type == 'NullTokenizer': word = str(token) else: diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 483fd51380..2022ebc6a8 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1467,6 +1467,8 @@ def _add_data_args(parser): 'SentencePieceTokenizer', 'GPTSentencePieceTokenizer', 'Llama2Tokenizer', + 'Llama3Tokenizer', + 'MistralTokenizer', 'NullTokenizer'], help='What type of tokenizer to use.') group.add_argument('--tokenizer-model', type=str, default=None, diff --git a/megatron/training/tokenizer/tokenizer.py b/megatron/training/tokenizer/tokenizer.py index eaf9ec6670..b5953a5c6c 100644 --- a/megatron/training/tokenizer/tokenizer.py +++ b/megatron/training/tokenizer/tokenizer.py @@ -41,6 +41,12 @@ def build_tokenizer(args): elif args.tokenizer_type == 'Llama2Tokenizer': assert args.tokenizer_model is not None tokenizer = _Llama2Tokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'Llama3Tokenizer': + assert args.tokenizer_model is not None + tokenizer = create_llama3_tokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'MistralTokenizer': + assert args.tokenizer_model is not None + tokenizer = create_mistral_tokenizer(args.tokenizer_model) elif args.tokenizer_type == 'NullTokenizer': assert args.vocab_size is not None tokenizer = _NullTokenizer(args.vocab_size) @@ -488,6 +494,68 @@ def additional_special_tokens_ids(self): return None +def create_llama3_tokenizer(*args, **kwargs): + + try: + from llama.tokenizer import Tokenizer as Llama3Tokenizer + except ImportError: + raise ImportError("Module 'llama' is required but not installed.") + + class _Llama3Tokenizer(Llama3Tokenizer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def tokenize(self, s: str, bos=True, eos=False): + '''Default args for text completion, not chat/dialog.''' + + assert type(s) is str + + t = self.encode(s, bos=False, eos=eos, allowed_special='all') + return t + + def detokenize(self, ids): + return self.decode(ids) + + @property + def cls(self): + return -1 + + @property + def sep(self): + return -1 + + @property + def mask(self): + return -1 + + @property + def eod(self): + return self.eos_id + + @property + def additional_special_tokens_ids(self): + return None + + @property + def vocab_size(self): + return self.model.n_vocab + + return _Llama3Tokenizer(*args, **kwargs) + + +def create_mistral_tokenizer(*args, **kwargs): + try: + from mistral_common.tokens.tokenizers.mistral import MistralTokenizer + except ImportError: + raise ImportError("Module 'mistral-common' is required but not installed.") + + class _MistralTokenizer(MistralTokenizer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + return _MistralTokenizer.from_file(*args, **kwargs) + + class 
_NullTokenizer(MegatronTokenizer): def __init__(self, vocab_size): super().__init__(None, vocab_size=vocab_size) diff --git a/tools/checkpoint/convert.py b/tools/checkpoint/convert.py index 935613b143..7ead190046 100644 --- a/tools/checkpoint/convert.py +++ b/tools/checkpoint/convert.py @@ -112,7 +112,7 @@ def main(): allow_abbrev=False, conflict_handler='resolve') parser.add_argument('--model-type', type=str, required=True, - choices=['GPT', 'BERT'], + choice=['GPT', 'BERT'], help='Type of the model') parser.add_argument('--loader', type=str, default='megatron', help='Module name to load checkpoint, should be on python path') diff --git a/tools/checkpoint/loader_llama2.py b/tools/checkpoint/loader_llama_mistral.py similarity index 81% rename from tools/checkpoint/loader_llama2.py rename to tools/checkpoint/loader_llama_mistral.py index b7fd02f73a..ec222b4b37 100644 --- a/tools/checkpoint/loader_llama2.py +++ b/tools/checkpoint/loader_llama_mistral.py @@ -15,11 +15,13 @@ def add_arguments(parser): - group = parser.add_argument_group(title='Llama-2 HF loader.') + group = parser.add_argument_group(title='Llama/Mistral loader.') + # TODO(jbarker): Need assertion to make sure *exactly* one of these is used parser.add_argument('--model-size', type=str, required=True, - help='Model size can be `7B`, `13B`, and `70B` (for pretrained models), and `7Bf`, `13Bf`, ' - 'and `70Bf` (for chat-finetuned models).') + choices=['llama2-7B', 'llama2-13B', 'llama2-70B', 'llama2-7Bf', 'llama2-13Bf', 'llama2-70Bf', 'llama3-8B', 'llama3-70B', 'llama3-8Bf', 'llama3-70Bf', 'mistral-7B', 'mistral-7Bf'], + help='Model size can be `llama2-7B`, `llama2-13B`, `llama2-70B`, `llama3-8B`, `llama3-70B`, `mistral-7B` (for pretrained models), ' + 'and `llama2-7Bf`, `llama2-13Bf`, `llama2-70Bf`, `llama3-8Bf`, `llama3-70bf` and `mistral-7Bf` (for chat-finetuned models).') parser.add_argument('--checkpoint-type', type=str, required=True, help='Type of checkpoint to convert, options are "meta" or "hf"') parser.add_argument('--bf16', action='store_true', help='Whether to load weights in bf16.') @@ -30,7 +32,7 @@ def add_arguments(parser): help='Path to the vocab file. 
If specified will use this to get vocab size and ' 'trim padding from the embedding table.') group.add_argument('--tokenizer-model', required=True, - help='Sentencepiece tokenizer model.') + help='Tokenizer model file.') group.add_argument('--megatron-path', type=str, default=None, help='Base directory of Megatron repository') group.add_argument('--loader-transformer-impl', default='local', @@ -44,15 +46,18 @@ def verify_transformers_version(): NUM_SHARDS = { - "7B": 1, - "7Bf": 1, - "13B": 2, - "13Bf": 2, - "34B": 4, - "30B": 4, - "65B": 8, - "70B": 8, - "70Bf": 8, + "llama2-7B": 1, + "llama2-7Bf": 1, + "llama2-13B": 2, + "llama2-13Bf": 2, + "llama2-70B": 8, + "llama2-70Bf": 8, + "llama3-8B": 1, + "llama3-8Bf": 1, + "llama3-70B": 8, + "llama3-70Bf": 8, + "mistral-7B": 1, + "mistral-7Bf": 1, } @@ -74,7 +79,18 @@ def write_json(text, path): # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py def convert_to_hf(model_path, input_base_path, model_size, tokenizer_path): - from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, LlamaTokenizerFast + if "llama2" in model_size: + from transformers import LlamaConfig as ModelConfig + from transformers import LlamaTokenizer, LlamaTokenizerFast + elif "llama3" in model_size: + from transformers import LlamaConfig as ModelConfig + elif "mistral" in model_size: + from transformers import MistralConfig as ModelConfig + try: + from mistral_common.tokens.tokenizers.mistral import MistralTokenizer + except ImportError: + raise ImportError("Module 'mistral-common' is required but not installed.") + # for backward compatibility, before you needed the repo to be called `my_repo/model_size` if not os.path.isfile(os.path.join(input_base_path, "params.json")): @@ -93,15 +109,33 @@ def convert_to_hf(model_path, input_base_path, model_size, tokenizer_path): base = params.get("rope_theta", 10000.0) inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) if base > 10000.0: - max_position_embeddings = 16384 + max_position_embeddings = 32768 if "mistral" in model_size else 16384 else: - max_position_embeddings = 2048 - - tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast + max_position_embeddings = 4096 if "mistral" in model_size else 2048 + + if "llama2" in model_size: + tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast + elif "llama3" in model_size: + try: + from llama.tokenizer import Tokenizer as Llama3Tokenizer + except ImportError: + raise AssertionError("Module 'llama' is required but not installed.") + tokenizer_class = Llama3Tokenizer + elif "mistral" in model_size: + tokenizer_class = MistralTokenizer + else: + raise AttributeError(f"model_size={model_size} not supported") if tokenizer_path is not None: - tokenizer = tokenizer_class(tokenizer_path) - tokenizer.save_pretrained(model_path) - vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000 + if "llama" in model_size: + tokenizer = tokenizer_class(tokenizer_path) + if "llama2" in model_size: + tokenizer.save_pretrained(model_path) + vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000 + elif "mistral" in model_size: + tokenizer = tokenizer_class.from_file(tokenizer_path) + vocab_size = 32768 + else: + raise AttributeError(f"model_size={model_size} is not supported") if params.get("n_kv_heads", None) is not None: num_key_value_heads = params["n_kv_heads"] # for GQA / MQA @@ -134,13 
+168,14 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" if num_shards == 1: # Unsharded + q_proj = loaded[f"layers.{layer_i}.attention.wq.weight"] + k_proj = loaded[f"layers.{layer_i}.attention.wk.weight"] + if ("llama2" in model_size) or ("mistral" in model_size): + q_proj = permute(q_proj) + k_proj = permute(k_proj) state_dict = { - f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wq.weight"] - ), - f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wk.weight"] - ), + f"model.layers.{layer_i}.self_attn.q_proj.weight": q_proj, + f"model.layers.{layer_i}.self_attn.k_proj.weight": k_proj, f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], @@ -224,10 +259,11 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): "lm_head.weight": loaded["output.weight"], } else: + d = 0 if "llama3" in model_size else 1 state_dict = { "model.norm.weight": loaded[0]["norm.weight"], "model.embed_tokens.weight": torch.cat( - [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 + [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=d ), "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), } @@ -242,7 +278,7 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): write_json(index_dict, os.path.join(model_path, "pytorch_model.bin.index.json")) ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 multiple_of = params["multiple_of"] if "multiple_of" in params else 256 - config = LlamaConfig( + config = ModelConfig( hidden_size=dim, intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), num_attention_heads=params["n_heads"], @@ -266,33 +302,31 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): def load_args_from_checkpoint(args): # Read Llama args. - llama_args_path = os.path.join(args.load, "config.json") - with open(llama_args_path) as f: - llama_args = json.load(f) + model_args_path = os.path.join(args.load, "config.json") + with open(model_args_path) as f: + model_args = json.load(f) # Update Megatron args. 
args.seq_length = 4096 - args.max_position_embeddings = 4096 - args.hidden_size = llama_args["hidden_size"] - args.num_attention_heads = llama_args["num_attention_heads"] - args.num_layers = llama_args["num_hidden_layers"] + args.max_position_embeddings = model_args["max_position_embeddings"] + args.hidden_size = model_args["hidden_size"] + args.num_attention_heads = model_args["num_attention_heads"] + args.num_layers = model_args["num_hidden_layers"] args.global_batch_size = 1024 - args.norm_epsilon = llama_args["rms_norm_eps"] + args.norm_epsilon = model_args["rms_norm_eps"] args.iteration = 1 # '0', 'release' don't work args.add_position_embedding = False args.use_rotary_position_embeddings = True args.swiglu = True - args.tokenizer_type = "Llama2Tokenizer" args.normalization = "RMSNorm" args.add_bias_linear = False args.untie_embeddings_and_output_weights = True - args.vocab_size = llama_args["vocab_size"] - args.padded_vocab_size = llama_args["vocab_size"] - args.llama = llama_args - args.ffn_hidden_size = llama_args["intermediate_size"] + args.vocab_size = model_args["vocab_size"] + args.padded_vocab_size = model_args["vocab_size"] + args.ffn_hidden_size = model_args["intermediate_size"] - if "num_key_value_heads" in llama_args: + if "num_key_value_heads" in model_args: args.group_query_attention = True - args.num_query_groups = llama_args["num_key_value_heads"] + args.num_query_groups = model_args["num_key_value_heads"] def set_preprocess_state(args, model, hf_model): @@ -323,7 +357,7 @@ def set_attn_state(args, layer, hf_layer): assert nh % ng == 0 # Copy weights (re-order dimensions for Megatron). - attn.query_key_value.weight.data.copy_(torch.cat([ + attn.query_key_value.weight.data.copy_(torch.cat([ hf_attn.q_proj.weight.reshape((ng, dim*nh//ng, -1)), hf_attn.k_proj.weight.reshape((ng, dim, -1)), hf_attn.v_proj.weight.reshape((ng, dim, -1)), @@ -360,10 +394,15 @@ def load_checkpoint_to_model(args): '''Set model params.''' from pretrain_gpt import model_provider - from transformers import LlamaForCausalLM + if "llama" in args.model_size: + from transformers import LlamaForCausalLM as ModelForCausalLM + elif "mistral" in args.model_size: + from transformers import MistralForCausalLM as ModelForCausalLM + else: + raise AttributeError(f"args.model_size={args.model_size} not supported") # Load Huggingface model. - hf_model = LlamaForCausalLM.from_pretrained(args.load, torch_dtype=args.params_dtype, low_cpu_mem_usage=True, device_map="cpu") + hf_model = ModelForCausalLM.from_pretrained(args.load, torch_dtype=args.params_dtype, low_cpu_mem_usage=True, device_map="cpu") # Init Megatron model. model = model_provider(True, True).to(args.params_dtype) @@ -379,7 +418,6 @@ def load_checkpoint_to_model(args): def _load_checkpoint(queue, args): - # Llama-2 requires HF transformers >=4.31.0. verify_transformers_version() # Search in directory above this. @@ -427,6 +465,13 @@ def _load_checkpoint(queue, args): margs.tokenizer_model = args.tokenizer_model load_args_from_checkpoint(margs) + if "llama2" in args.model_size: + margs.tokenizer_type = "Llama2Tokenizer" + elif "llama3" in args.model_size: + margs.tokenizer_type = "Llama3Tokenizer" + elif "mistral" in args.model_size: + margs.tokenizer_type = "MistralTokenizer" + # Arguments do sanity checks on the world size, but we don't care, # so trick it into thinking we are plenty of processes. 
margs.world_size = margs.tensor_model_parallel_size * margs.pipeline_model_parallel_size @@ -454,7 +499,6 @@ def check_for_arg(arg_name, default=None): check_for_arg('num_attention_heads') check_for_arg('max_position_embeddings') check_for_arg('position_embedding_type') - check_for_arg('tokenizer_type') check_for_arg('iteration') check_for_arg('bert_binary_head') check_for_arg('disable_bias_linear', False) @@ -462,7 +506,7 @@ def check_for_arg(arg_name, default=None): check_for_arg('swiglu', False) # Determine how to make our models. - assert args.model_type == 'GPT', 'Llama-2 is a GPT model.' + assert args.model_type == 'GPT', 'Llama-2, Llama-3 and Mistral are GPT models.' margs.model_type = ModelType.encoder_or_decoder margs.params_dtype = torch.bfloat16 if args.bf16 else torch.float16 if args.fp16 else torch.float32 @@ -501,12 +545,24 @@ def check_for_arg(arg_name, default=None): md.swiglu = margs.swiglu md.previous_tensor_parallel_size = margs.tensor_model_parallel_size md.previous_pipeline_parallel_size = margs.pipeline_model_parallel_size - md.true_vocab_size = None # skips padding in saver md.make_vocab_size_divisible_by = None md.checkpoint_args = margs md.consumed_train_samples = 0 md.consumed_valid_samples = 0 + margs.model_size = args.model_size + + # Get true (non-padded) vocab size + if margs.tokenizer_model is not None and "llama3" in args.model_size: + try: + from llama.tokenizer import Tokenizer as Llama3Tokenizer + except ImportError: + raise AssertionError("Module 'llama' is required but not installed.") + tokenizer = Llama3Tokenizer(margs.tokenizer_model) + md.true_vocab_size = tokenizer.vocab_size + else: + md.true_vocab_size = None + # Get first pipe stage. mpu.set_tensor_model_parallel_rank(0) mpu.set_pipeline_model_parallel_rank(0) diff --git a/tools/checkpoint/saver_mcore.py b/tools/checkpoint/saver_mcore.py index 656103f360..6365b4ab43 100644 --- a/tools/checkpoint/saver_mcore.py +++ b/tools/checkpoint/saver_mcore.py @@ -592,7 +592,14 @@ def get_models(count, dtype, pre_process, post_process): if not hasattr(models[0], 'output_layer'): print("ERROR: got an output layer, but model does not have one") exit(1) - output_layer_weight = torch.chunk(msg.pop("weight"), args.target_tensor_parallel_size, dim=0) + output_layer_weight = msg.pop("weight") + orig_vocab_size = orig_word_embed.shape[0] + padding_size = margs.padded_vocab_size - orig_vocab_size + output_layer_weight = torch.cat(( + output_layer_weight, + output_layer_weight[-1].unsqueeze(0).expand(padding_size, -1) + )) + output_layer_weight = torch.chunk(output_layer_weight, args.target_tensor_parallel_size, dim=0) for tp_rank, model in enumerate(models): setter.set_output_layer(model, output_layer_weight[tp_rank]) del output_layer_weight diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 55d9d6c856..c1f258824f 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -201,7 +201,7 @@ def get_args(): choices=['BertWordPieceLowerCase','BertWordPieceCase', 'GPT2BPETokenizer', 'SentencePieceTokenizer', 'GPTSentencePieceTokenizer', 'Llama2Tokenizer', - 'NullTokenizer'], + 'Llama3Tokenizer', 'MistralTokenizer', 'NullTokenizer'], help='What type of tokenizer to use.') group.add_argument('--tokenizer-model', type=str, default=None, help='YTTM tokenizer model.') From 9aef9841456757816219d36dddd3c387135df725 Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Wed, 29 May 2024 09:56:33 -0700 Subject: [PATCH 65/92] Multimodal example - TextVQA and VQAv2 eval --- LICENSE | 2 +- 
examples/multimodal/Dockerfile | 9 +- examples/multimodal/README.md | 16 ++ examples/multimodal/evaluate_textvqa.py | 86 ++++++++++ examples/multimodal/run_text_generation.py | 176 +++++++++++++++------ examples/multimodal/text_generation_8b.sh | 27 +++- tools/run_vlm_text_generation.py | 49 +++--- 7 files changed, 286 insertions(+), 79 deletions(-) create mode 100644 examples/multimodal/evaluate_textvqa.py diff --git a/LICENSE b/LICENSE index 056220a445..b49c04ee33 100644 --- a/LICENSE +++ b/LICENSE @@ -35,7 +35,7 @@ organizations have notices at the top of each file. Below are licenses used in those files, as indicated. -------------- LICENSE FOR Facebook, huggingface and Google Research code -------------- +------------- LICENSE FOR Facebook, huggingface, Google Research and LLaVA code -------------- Apache License diff --git a/examples/multimodal/Dockerfile b/examples/multimodal/Dockerfile index 0ac8f91b75..18f0e659dc 100644 --- a/examples/multimodal/Dockerfile +++ b/examples/multimodal/Dockerfile @@ -14,9 +14,14 @@ RUN apt update && \ default-jre RUN pip install --upgrade pip -RUN pip install einops sentencepiece braceexpand webdataset +RUN pip install einops einops-exts sentencepiece braceexpand webdataset +RUN pip install transformers datasets RUN pip install pytest-cov pytest_mock nltk wrapt RUN pip install zarr "tensorstore==0.1.45" RUN pip install git+https://github.com/fanshiqing/grouped_gemm@main RUN pip install black==19.10b0 isort click==8.0.2 -RUN pip install pycocoevalcap megatron-energon \ No newline at end of file +RUN pip install pycocoevalcap megatron-energon +RUN pip install git+https://github.com/openai/CLIP.git +# Use --no-deps for the following to avoid outdated and unnecessary dependencies. +RUN pip install mmf --no-deps +RUN pip install open-flamingo[eval] --no-deps diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md index f3117d2533..6adbe5302b 100644 --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -49,3 +49,19 @@ First, run text generation using `--task captioning`. Then, run the following co ``` python examples/multimodal/evaluate_coco.py --input-path /output/directory/from/generation --groundtruth-path /path/to/groundtruth/file ``` + +### TextVQA + +First, run text generation using `--task TextVQA`. Then, run the following command: + +``` +python examples/multimodal/evaluate_textvqa.py --input-path /output/directory/from/generation --groundtruth-path /path/to/groundtruth/file +``` + +### VQAv2 + +First, run text generation using `--task VQAv2`. Then, run the following command: + +``` +python examples/multimodal/evaluate_textvqa.py --input-path /output/directory/from/generation --groundtruth-path /path/to/groundtruth/file --question-path /path/to/question/file +``` diff --git a/examples/multimodal/evaluate_textvqa.py b/examples/multimodal/evaluate_textvqa.py new file mode 100644 index 0000000000..08c6b08fe2 --- /dev/null +++ b/examples/multimodal/evaluate_textvqa.py @@ -0,0 +1,86 @@ +import argparse +import glob +import json +import re + +# This can help resolve an import error of an mmf dependency that is not needed. 
+try: + from mmf.utils.m4c_evaluators import TextVQAAccuracyEvaluator +except ModuleNotFoundError: + from mmf.utils.m4c_evaluators import TextVQAAccuracyEvaluator + + +def merge_input_files(input_path): + """Merge input files to a format compatible with the evaluator.""" + output_file_path = input_path + "-TextVQA-merged.json" + + pattern = input_path + "-TextVQA-[0-9].*jsonl" + input_file_paths = glob.glob(pattern) + + results = [] + + for input_file_path in input_file_paths: + with open(input_file_path, "r") as input_file: + for line in input_file: + res = json.loads(line) + results.append(res) + + with open(output_file_path, "w") as output_file: + json.dump(results, output_file) + + return output_file_path + + +# Note: This is based on https://github.com/haotian-liu/LLaVA/blob/c121f0432da27facab705978f83c4ada465e46fd/llava/eval/eval_textvqa.py#L17 +# and slightly modified. +def prompt_processor(prompt): + if prompt.startswith('OCR tokens: '): + pattern = r"Question: (.*?) Short answer:" + match = re.search(pattern, prompt, re.DOTALL) + question = match.group(1) + elif "Reference OCR token: " in prompt and len(prompt.split("\n")) == 3: + if prompt.startswith("Reference OCR token:"): + question = prompt.split("\n")[1] + else: + question = prompt.split("\n")[0] + elif len(prompt.split("\n")) == 2: + question = prompt.split("\n")[0] + else: + raise RuntimeError("unexpected prompt format") + + return question.lower() + + +# Note: This is based on https://github.com/haotian-liu/LLaVA/blob/c121f0432da27facab705978f83c4ada465e46fd/llava/eval/eval_textvqa.py#L35 +# and slightly modified. +def evaluate(result_file_path, groundtruth_path): + with open(groundtruth_path) as groundtruth_file: + groundtruth = json.load(groundtruth_file)["data"] + + groundtruth = {(gt["image_id"], gt["question"].lower()): gt["answers"] for gt in groundtruth} + + with open(result_file_path, "r") as result_file: + results = json.load(result_file) + + predictions = [] + for result in results: + gt_answers = groundtruth[(result["sample_id"], prompt_processor(result["prompt"]))] + predictions.append({"pred_answer": result["text"], "gt_answers": gt_answers}) + + evaluator = TextVQAAccuracyEvaluator() + print( + 'Samples: {}\nAccuracy: {:.2f}%\n'.format( + len(predictions), 100.0 * evaluator.eval_pred_list(predictions) + ) + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--input-path', type=str, help="Path to input file(s)") + parser.add_argument('--groundtruth-path', type=str, help="Path to groundtruth file") + args = parser.parse_args() + + result_file_path = merge_input_files(args.input_path) + + evaluate(result_file_path, args.groundtruth_path) diff --git a/examples/multimodal/run_text_generation.py b/examples/multimodal/run_text_generation.py index 9a912db6e0..564a9105e2 100644 --- a/examples/multimodal/run_text_generation.py +++ b/examples/multimodal/run_text_generation.py @@ -17,13 +17,13 @@ import torch from PIL import Image from torchvision.transforms import Compose, Resize, ToPILImage +from train import add_multimodal_extra_args, get_image_token_count, model_provider from megatron.inference.text_generation.api import generate_and_post_process from megatron.inference.text_generation.forward_step import ForwardStep from megatron.training import get_args, get_model, print_rank_0 from megatron.training.checkpointing import load_checkpoint from megatron.training.initialize import initialize_megatron -from train import model_provider, get_image_token_count, 
add_multimodal_extra_args def add_text_generation_args(parser): @@ -37,13 +37,15 @@ def add_text_generation_args(parser): "--out-seq-length", type=int, default=1024, help='Length of the output generated text.' ) group.add_argument("--output-path", type=str, required=True, help='Output file path') - group.add_argument('--input-path', type=str, required=True, help="Input directory") + group.add_argument('--input-image-path', type=str, required=True, help="Input image directory") + group.add_argument('--input-metadata-path', type=str, help="Input metadata path") group.add_argument( '--num-partitions', type=int, default=0, help="Number of partitions for inputs." ) group.add_argument('--partition-id', type=int, default=0, help="Partition index") group.add_argument("--drop-vision-class-token", action="store_true", default=False) group.add_argument("--gt-path", type=str, help="Optional ground truth file") + group.add_argument("--task", type=str, help="Generation task to run") # Add common multimodal arguments needed for e.g. building the model. parser = add_multimodal_extra_args(parser) @@ -51,77 +53,137 @@ def add_text_generation_args(parser): return parser -def _convert_image_to_rgb(image): - return image.convert("RGB") +def preprocess_image(target_h, target_w, img): + """Example image preprocessing. Resizes input image to target size. + Args: + target_h (int): Target height in pixels. + target_w (int): Target width in pixels + img (np.array [h, w, c]): Input image in a numpy array. -def _transform_test(img_h, img_w): - return Compose([ToPILImage(), Resize((img_h, img_w)), _convert_image_to_rgb]) - - -def preprocess(img_h, img_w, img): - # Example image preprocessing. - pixel_mean = [123.675, 116.28, 103.53] # Imagenet's mean. + Returns: + output_img (torch.Tensor [c, h, w]): Input image resized to target size. + """ + # Imagenet's mean and std for normalization. + pixel_mean = [123.675, 116.28, 103.53] pixel_std = [58.395, 57.12, 57.375] pixel_mean = torch.Tensor(pixel_mean).view(-1, 1, 1) pixel_std = torch.Tensor(pixel_std).view(-1, 1, 1) - raw_h, raw_w = img.shape[0], img.shape[1] - ratio = float(max(img_h, img_w)) / max(raw_h, raw_w) - H, W = int(raw_h * ratio + 0.5), int(raw_w * ratio + 0.5) - image_transform = _transform_test(H, W) + # Resize image considering ratio between input and target image sizes. + img_h, img_w = img.shape[0], img.shape[1] + ratio = float(max(target_h, target_w)) / max(img_h, img_w) + + scaled_h, scaled_w = int(img_h * ratio + 0.5), int(img_w * ratio + 0.5) + + image_transform = Compose( + [ToPILImage(), Resize((scaled_h, scaled_w)), lambda x: x.convert("RGB")] + ) img = image_transform(img) + + # Normalize pixel values. img = (torch.Tensor(np.array(img)).permute(2, 0, 1) - pixel_mean) / pixel_std - delta_h, delta_w = img_h - H, img_w - W - padded_img = torch.nn.functional.pad(img, (0, delta_w, 0, delta_h)) - return padded_img + # Pad to target size. + delta_h, delta_w = target_h - scaled_h, target_w - scaled_w + output_img = torch.nn.functional.pad(img, (0, delta_w, 0, delta_h)) + + return output_img def generate_samples(model): - """Text generation using a trained vision language model. This is an example for the COCO dataset.""" + """Text generation using a trained vision language model.""" args = get_args() - image_files = sorted(glob.glob(args.input_path + "/*")) - # Optionally, process only a subset of the input files. 
- if args.num_partitions > 0: - per_part = len(image_files) // args.num_partitions - image_files = image_files[per_part * args.partition_id : per_part * (args.partition_id + 1)] - - num_samples = len(image_files) images = [] + questions, answers = [], [] + samples, sample_ids = [], [] + + if args.task in ("TextVQA", "VQAv2"): + input_metadata_path = args.input_metadata_path + + if input_metadata_path.endswith(".json"): + samples = json.load(open(input_metadata_path)) + elif input_metadata_path.endswith(".jsonl"): + with open(input_metadata_path, 'r') as jsonl_file: + json_list = list(jsonl_file) + samples = [json.loads(json_str) for json_str in json_list] + else: + return NotImplementedError + + # Optionally, process only a subset of the input files. + if args.num_partitions > 0: + per_part = len(samples) // args.num_partitions + samples = samples[per_part * args.partition_id : per_part * (args.partition_id + 1)] - # Run image preprocessing. - for image_file in image_files: - img = np.array(Image.open(image_file)) - img = preprocess(args.img_h, args.img_w, img) + num_samples = len(samples) - images.append(img.reshape(-1, 3, args.img_h, args.img_w)) + for i in range(len(samples)): + sample = samples[i] - # Load optional ground truth. - gt_image_id_to_captions = defaultdict(list) - if args.gt_path: - gts = json.load(open(args.gt_path)) - for gt in gts["annotations"]: - gt_image_id_to_captions[gt["image_id"]].append(gt['caption']) + img_file = "{}/{}".format(args.input_image_path, sample["image"]) - num_image_tokens = get_image_token_count() + img_sample = np.array(Image.open(img_file)) + processed_img = preprocess_image(args.img_h, args.img_w, img_sample) + images.append(processed_img.reshape(-1, 3, args.img_h, args.img_w)) + + if args.task == "VQAv2": + questions.append(sample["question"]) + answers.append(sample["answer"]) + elif args.task == 'TextVQA': + questions.append(sample["text"]) + + sample_ids.append(sample["question_id"]) + + if len(images) == num_samples: + break + elif args.task == "captioning": + image_files = sorted(glob.glob(args.input_image_path + "/*")) + # Optionally, process only a subset of the input files. + if args.num_partitions > 0: + per_part = len(image_files) // args.num_partitions + image_files = image_files[ + per_part * args.partition_id : per_part * (args.partition_id + 1) + ] + + num_samples = len(image_files) + images = [] + + # Run image preprocessing. + for image_file in image_files: + img = np.array(Image.open(image_file)) + img = preprocess(args.img_h, args.img_w, img) + + images.append(img.reshape(-1, 3, args.img_h, args.img_w)) + + image_id = int(image_file.split("_")[-1].split(".")[0]) + sample_ids.append(image_id) + + # Load optional ground truth. + gt_sample_id_to_captions = defaultdict(list) + if args.gt_path: + gts = json.load(open(args.gt_path)) + for gt in gts["annotations"]: + gt_sample_id_to_captions[gt["image_id"]].append(gt['caption']) + else: + raise NotImplementedError("unsupported task") idx = 0 while idx < num_samples: - try: - image = images[idx].cuda() - except: - breakpoint() - pass + image = images[idx].cuda() + sample_id = sample_ids[idx] - image_id = int(image_files[idx].split("_")[-1].split(".")[0]) + if args.task == "captioning": + prompt = "Give a short and clear explanation of the subsequent image.\n" + elif args.task == "TextVQA": + prompt = questions[idx] + elif args.task == "VQAv2": + prompt = questions[idx] + prompt += "\nAnswer the question using a single word or phrase." 
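+        # Bind the current image and the image-token count into the forward step,
+        # so generation below is conditioned on this image.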
- forward_step = partial(VLMForwardStep, image, num_image_tokens) + forward_step = partial(VLMForwardStep, image, get_image_token_count()) if torch.distributed.get_rank() == 0: - prompt = "Give a short and clear explanation of the subsequent image.\n" - resp_sentences, _, _, _ = generate_and_post_process( model, forward_step=forward_step, @@ -137,12 +199,25 @@ def generate_samples(model): for prompt, generation in zip([prompt], resp_sentences): output = { - "question_id": image_id, + "sample_id": sample_id, "prompt": prompt, - "caption": generation[len(prompt) :], } - output["ground_truth"] = gt_image_id_to_captions[image_id] + output_name = "" + if args.task == "captioning": + output_name = "caption" + elif args.task == "VQAv2": + output_name = "answer" + elif args.task == "TextVQA": + output_name = "text" + + generated = generation[len(prompt) :] + output[output_name] = generated + + if args.task == "captioning": + output["ground_truth"] = gt_sample_id_to_captions[sample_id] + elif args.task == "VQAv2": + output["ground_truth"] = answers[idx] print_rank_0(output) @@ -150,6 +225,7 @@ def generate_samples(model): idx += 1 else: generate_and_post_process(model, forward_step=forward_step) + idx += 1 diff --git a/examples/multimodal/text_generation_8b.sh b/examples/multimodal/text_generation_8b.sh index b3b1deea8c..63c5beeefe 100755 --- a/examples/multimodal/text_generation_8b.sh +++ b/examples/multimodal/text_generation_8b.sh @@ -4,11 +4,23 @@ export NCCL_IB_SL=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 export NVTE_APPLY_QK_LAYER_SCALING=1 +INPUT_METADATA_PATH="placeholder" +GROUNDTRUTH_PATH="placeholder" while [[ $# -gt 0 ]]; do case $1 in - -i|--input-path) - INPUT_PATH="$2" + --input-image-path) + INPUT_IMAGE_PATH="$2" + shift + shift + ;; + --input-metadata-path) + INPUT_METADATA_PATH="$2" + shift + shift + ;; + -g|--groundtruth-path) + GROUNDTRUTH_PATH="$2" shift shift ;; @@ -27,15 +39,16 @@ while [[ $# -gt 0 ]]; do shift shift ;; + --task) + TASK="$2" + shift + shift + ;; -g|--gt-path) GROUNDTRUTH_PATH="$2" shift shift ;; - --default) - DEFAULT=YES - shift # past argument - ;; -*|--*) echo "Invalid option $1" exit 1 @@ -46,7 +59,7 @@ done # Please modify these as needed. NUM_PARTITIONS=100 START=0 -END=0 +END=2 for PARTITION_ID in $( eval echo {$START..$END} ) do diff --git a/tools/run_vlm_text_generation.py b/tools/run_vlm_text_generation.py index ab0a2df41d..b42196fa91 100644 --- a/tools/run_vlm_text_generation.py +++ b/tools/run_vlm_text_generation.py @@ -46,31 +46,42 @@ def add_text_generation_args(parser): return parser -def _convert_image_to_rgb(image): - return image.convert("RGB") - - -def _transform_test(img_h, img_w): - return Compose([ToPILImage(), Resize((img_h, img_w)), _convert_image_to_rgb]) - - -def preprocess(img_h, img_w, img): - # Example image preprocessing. - pixel_mean = [123.675, 116.28, 103.53] # Imagenet's mean. +def preprocess_image(target_h, target_w, img): + """Example image preprocessing. Resizes input image to target size. + + Args: + target_h (int): Target height in pixels. + target_w (int): Target width in pixels + img (np.array [h, w, c]): Input image in a numpy array. + + Returns: + output_img (torch.Tensor [c, h, w]): Input image resized to target size. + """ + # Imagenet's mean and std for normalization. 
+ pixel_mean = [123.675, 116.28, 103.53] pixel_std = [58.395, 57.12, 57.375] pixel_mean = torch.Tensor(pixel_mean).view(-1, 1, 1) pixel_std = torch.Tensor(pixel_std).view(-1, 1, 1) - raw_h, raw_w = img.shape[0], img.shape[1] - ratio = float(max(img_h, img_w)) / max(raw_h, raw_w) - H, W = int(raw_h * ratio + 0.5), int(raw_w * ratio + 0.5) - image_transform = _transform_test(H, W) + # Resize image considering ratio between input and target image sizes. + img_h, img_w = img.shape[0], img.shape[1] + ratio = float(max(target_h, target_w)) / max(img_h, img_w) + + scaled_h, scaled_w = int(img_h * ratio + 0.5), int(img_w * ratio + 0.5) + + image_transform = Compose( + [ToPILImage(), Resize((scaled_h, scaled_w)), lambda x: x.convert("RGB")] + ) img = image_transform(img) + + # Normalize pixel values. img = (torch.Tensor(np.array(img)).permute(2, 0, 1) - pixel_mean) / pixel_std - delta_h, delta_w = img_h - H, img_w - W - padded_img = torch.nn.functional.pad(img, (0, delta_w, 0, delta_h)) - return padded_img + # Pad to target size. + delta_h, delta_w = target_h - scaled_h, target_w - scaled_w + output_img = torch.nn.functional.pad(img, (0, delta_w, 0, delta_h)) + + return output_img def generate_samples(model): @@ -89,7 +100,7 @@ def generate_samples(model): # Run image preprocessing. for image_file in image_files: img = np.array(Image.open(image_file)) - img = preprocess(args.img_h, args.img_w, img) + img = preprocess_image(args.img_h, args.img_w, img) images.append(img.reshape(-1, 3, args.img_h, args.img_w)) From 24271cc96ae545c3f191b3c24ffa8df805b57339 Mon Sep 17 00:00:00 2001 From: Gao Deng Date: Tue, 4 Jun 2024 17:22:43 -0700 Subject: [PATCH 66/92] [MoE] Remove redundant H2D sync point for MoE when pipeline parallelism enabled --- megatron/core/pipeline_parallel/schedules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index 4e91d290ea..07fa018566 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -233,7 +233,7 @@ def forward_step( if hasattr(config, 'num_moe_experts') and config.num_moe_experts is not None: # Calculate the loss scale based on the grad_scale_func if available, else default to 1. 
loss_scale = ( - config.grad_scale_func(torch.tensor(1.0, device=output_tensor.device)) + config.grad_scale_func(torch.ones(1, device=output_tensor.device)) if config.grad_scale_func is not None else torch.tensor(1.0) ) From 427fdfd74e9cbae0e46797cda4f4023fee079221 Mon Sep 17 00:00:00 2001 From: Chenhan Yu Date: Wed, 5 Jun 2024 09:55:29 -0700 Subject: [PATCH 67/92] Add distributed checkpointing support to megatron.inference and megatron.training for Model Optimizer QAT --- examples/inference/text_generation_ptq.py | 53 +-------- megatron/inference/checkpointing.py | 135 ++++++++++++++++++++++ megatron/inference/gpt/model_provider.py | 10 +- megatron/training/checkpointing.py | 36 +++++- 4 files changed, 181 insertions(+), 53 deletions(-) create mode 100644 megatron/inference/checkpointing.py diff --git a/examples/inference/text_generation_ptq.py b/examples/inference/text_generation_ptq.py index b6c2b445b4..13b327b25a 100644 --- a/examples/inference/text_generation_ptq.py +++ b/examples/inference/text_generation_ptq.py @@ -16,12 +16,12 @@ # [ModelOpt]: changing the default model provider to the ModelOpt version from megatron.core import mpu -from megatron.core.dist_checkpointing import load from megatron.inference.arguments import add_modelopt_args +from megatron.inference.checkpointing import load_modelopt_checkpoint from megatron.inference.gpt.model_provider import model_provider from megatron.inference.text_generation import generate_and_post_process from megatron.training import get_args, get_model, initialize_megatron -from megatron.training.checkpointing import load_checkpoint, save_checkpoint +from megatron.training.checkpointing import save_checkpoint from megatron.training.utils import print_rank_0, unwrap_model QUANT_CFG_CHOICES = { @@ -103,53 +103,6 @@ def get_calib_dataloader( yield batch -def modelopt_load_checkpoint( - model, optimizer=None, opt_param_scheduler=None, strict=True, additional_sharded_prefix="model." -): - """Load a megatron checkpoint depending its format. - - Args: - model: MCoreGPTModel instance - optimizer: Megatron optimizer instance - opt_param_scheduler: Megatron scheduler instance - strict: if True, no extra or missing keys are allowed while loading the state_dict - additional_sharded_prefix (str): Append additional prefix to align the sharded checkpoint keys. When loading - an .nemo sharded checkpoint, this is usually `model.`. Otherwise, this is typically an empty string. 
- """ - - def _remove_prefix_state_dict_pre_hook( - state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs, - ): - """Pytorch _load_state_dict_pre_hook to remap the state_dict with the additional sharded prefix.""" - if additional_sharded_prefix is None: - return - key_rewrite_list = [] - for key, _ in state_dict.items(): - if key.startswith(additional_sharded_prefix): - key_rewrite_list.append(key) - for old_key in key_rewrite_list: - new_key = old_key[len(additional_sharded_prefix) :] - state_dict[new_key] = state_dict.pop(old_key) - - args = get_args() - load_dir = args.load - - shared_model_state_dir = "model_weights" - sharded_load_dir = Path(load_dir + "/" + shared_model_state_dir) - - if sharded_load_dir.exists() and optimizer is None and opt_param_scheduler is None: - unwrapped_model = unwrap_model(model) - shareded_state_dict = unwrapped_model[0].sharded_state_dict( - prefix=additional_sharded_prefix - ) - if additional_sharded_prefix: - unwrapped_model[0]._register_load_state_dict_pre_hook( - _remove_prefix_state_dict_pre_hook - ) - unwrapped_model[0].load_state_dict(load(shareded_state_dict, sharded_load_dir)) - else: - _ = load_checkpoint(model, optimizer, opt_param_scheduler, strict=strict) - if __name__ == "__main__": initialize_megatron( @@ -175,7 +128,7 @@ def _remove_prefix_state_dict_pre_hook( model = get_model(text_generation_model_provider, wrap_with_ddp=False) if args.load is not None: - modelopt_load_checkpoint(model) + load_modelopt_checkpoint(model, strict=not args.untie_embeddings_and_output_weights) print_rank_0("Done loading checkpoint") # Removing virtual pipeline parallel and other wrapper diff --git a/megatron/inference/checkpointing.py b/megatron/inference/checkpointing.py new file mode 100644 index 0000000000..f8d3e2dd59 --- /dev/null +++ b/megatron/inference/checkpointing.py @@ -0,0 +1,135 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import os +from pathlib import Path +from typing import Optional, Dict + +from megatron.core import dist_checkpointing +from megatron.training import get_args +from megatron.training.checkpointing import _load_base_checkpoint, load_checkpoint +from megatron.training.utils import print_rank_0, unwrap_model + +try: + from modelopt.torch.opt.plugins import ( + get_sharded_modelopt_state, + restore_modelopt_state_metadata, + ) +except ImportError as e: + raise ImportError("Required `\"nvidia-modelopt[torch]\"` is not installed!") from e + + +def load_modelopt_state(load_dir: Optional[str] = None) -> Dict: + """Loading modelopt_state without a model. + + If --use-dist-ckpt, we try to load from the sharded modelopt_state. This will not load the model + state_dict. Otherwise, if the checkpoint is not sharded, we load the base checkpoint (that + contains the model state as well) and extract the modelopt_state. + + Args: + load_dir: optionally provide a different loading path + """ + args = get_args() + + if load_dir is None: + load_dir = args.load + + if args.use_dist_ckpt: + # Read the tracker file and set the iteration. + tracker_filename = os.path.join(load_dir, 'latest_checkpointed_iteration.txt') + # If no tracker file, assuming that it is a .nemo checkpoint. 
+ if not os.path.isfile(tracker_filename): + sharded_load_dir = Path(load_dir) / "model_weights" + else: + with open(tracker_filename, 'r') as f: + metastring = f.read().strip() + try: + iteration = int(metastring) + sharded_load_dir = Path(load_dir) / 'iter_{:07d}'.format(iteration) + except ValueError: + sharded_load_dir = Path(load_dir) / metastring + modelopt_state_dir = sharded_load_dir / "modelopt_state" + if modelopt_state_dir.exists(): + print_rank_0("Loading sharded modelopt_state ({})".format(modelopt_state_dir)) + modelopt_state = restore_modelopt_state_metadata( + dist_checkpointing.load( + get_sharded_modelopt_state(args.num_layers), modelopt_state_dir, + ) + ) + return modelopt_state + else: + print_rank_0( + "sharded modelopt_state ({}) does not exist!".format(modelopt_state_dir) + ) + return {} + else: + print_rank_0("Loading modelopt_state from base checkpoint ({})".format(load_dir)) + try: + state_dict, _, _ = _load_base_checkpoint(args.load, rank0=False) + except Exception: + print_rank_0("Failed to load base checkpoint via megatron _load_base_checkpoint!") + return {} + if state_dict is None: + return {} + return state_dict.get("modelopt_state", {}) + + +def load_modelopt_checkpoint( + model, + optimizer=None, + opt_param_scheduler=None, + strict: bool = True, + additional_sharded_prefix: str = "model.", + load_arg: str = "load", +) -> None: + """Load a sharded (untar .nemo or megatron --use-dist-ckpt) or unsharded checkpoint. + + Essentially, the function is detecting whether the checkpoint is a .nemo sharded checkpoint. + If so, we load the sharded state_dict with additional_sharded_prefix `model.`. + This additional prefix is tha artifact of the lightning module wrapper. Once the sharded + state_dict is loaded, we use a state_dict pre_hook to pop this additional prefix (`model.`) + from all state_dict keys. + + If this is not a .nemo sharded checkpoint, then this function will simply call + load_checkpoint. See megatron.checkpointing.load_checkpoint for explanation. + + Args: + additional_sharded_prefix: append additional prefix to align the sharded checkpoint keys. + When loading an .nemo sharded checkpoint, this is usually `model.`. Otherwise, this is + typically an empty string. + """ + + def _remove_prefix_state_dict_pre_hook( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs, + ): + """Pytorch state_dict pre_hook to remove prefix of the state_dict keys.""" + if additional_sharded_prefix is None: + return + key_rewrite_list = [] + for key, _ in state_dict.items(): + if key.startswith(additional_sharded_prefix): + key_rewrite_list.append(key) + for old_key in key_rewrite_list: + new_key = old_key[len(additional_sharded_prefix) :] + state_dict[new_key] = state_dict.pop(old_key) + + args = get_args() + load_dir = getattr(args, load_arg) + + sharded_load_dir = Path(load_dir) / "model_weights" + + if sharded_load_dir.exists() and optimizer is None and opt_param_scheduler is None: + unwrapped_model = unwrap_model(model) + # Set this attribute will alter the sharded_offsets of transformer_block. 
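# A self-contained sketch of the prefix-stripping technique that load_modelopt_checkpoint
# uses above: register a load_state_dict pre-hook that pops the `model.` prefix added by the
# Lightning wrapper before keys are matched. The toy Linear module and the factory function
# are illustrative; `_register_load_state_dict_pre_hook` is the same private torch API used
# in the code above.
import torch

def make_strip_prefix_hook(sharded_prefix: str):
    def _hook(state_dict, prefix, local_metadata, strict,
              missing_keys, unexpected_keys, error_msgs):
        for key in [k for k in state_dict if k.startswith(sharded_prefix)]:
            state_dict[key[len(sharded_prefix):]] = state_dict.pop(key)
    return _hook

layer = torch.nn.Linear(4, 4)
layer._register_load_state_dict_pre_hook(make_strip_prefix_hook("model."))
nemo_style = {"model." + k: v for k, v in layer.state_dict().items()}
layer.load_state_dict(nemo_style)  # keys are remapped by the hook before strict matching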
+ unwrapped_model[0].decoder.config.non_homogeneous_layers = False + sharded_state_dict = unwrapped_model[0].sharded_state_dict(prefix=additional_sharded_prefix) + if additional_sharded_prefix: + unwrapped_model[0]._register_load_state_dict_pre_hook( + _remove_prefix_state_dict_pre_hook + ) + unwrapped_model[0].load_state_dict( + dist_checkpointing.load(sharded_state_dict, sharded_load_dir) + ) + # Set the attribute to True such that by-default we are storing the heterogenous arch. + unwrapped_model[0].decoder.config.non_homogeneous_layers = True + else: + _ = load_checkpoint(model, optimizer, opt_param_scheduler, strict=strict, load_arg=load_arg) diff --git a/megatron/inference/gpt/model_provider.py b/megatron/inference/gpt/model_provider.py index c6d3761de6..3c4c437f0d 100644 --- a/megatron/inference/gpt/model_provider.py +++ b/megatron/inference/gpt/model_provider.py @@ -2,6 +2,8 @@ """ModelOpt GPT model provider.""" +import modelopt.torch.opt as mto + from megatron.core.inference.gpt.model_specs import get_gpt_layer_modelopt_spec from megatron.core.inference.gpt.state_dict_hooks import ( mcore_gpt_load_legacy_state_dict_pre_hook, @@ -10,6 +12,7 @@ from megatron.core.models.gpt import GPTModel as MCoreGPTModel from megatron.core.parallel_state import get_tensor_model_parallel_rank from megatron.core.transformer.spec_utils import import_module +from megatron.inference.checkpointing import load_modelopt_state from megatron.training import get_args, print_rank_0 from megatron.training.arguments import core_transformer_config_from_args @@ -33,7 +36,7 @@ def model_provider(pre_process=True, post_process=True, parallel_output=True) -> print_rank_0("building GPT model ...") # ModelOpt by default assumes none homogenous layers. This affect the storage format of the sharded checkpoint. - config = core_transformer_config_from_args(get_args()) + config = core_transformer_config_from_args(args) config.non_homogeneous_layers = True if args.use_mcore_models: @@ -65,6 +68,11 @@ def model_provider(pre_process=True, post_process=True, parallel_output=True) -> model = model_type(**model_kwargs) + # Load modelopt_state + modelopt_state = load_modelopt_state() if args.load else {} + if modelopt_state: + model = mto.restore_from_modelopt_state(model, modelopt_state) + # Register some load_state_dict prehooks to handle some known state_dict key mismatch. 
# (legacy <-> modelopt) and (default te <-> modelopt) if args.export_legacy_megatron: diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index d5cc881fc8..35f74ee890 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -21,6 +21,18 @@ from ..core.dist_checkpointing.serialization import \ get_default_save_sharded_strategy +# [ModelOpt]: Import +try: + from modelopt.torch.opt.plugins import ( + save_modelopt_state, + save_sharded_modelopt_state, + restore_modelopt_state, + restore_sharded_modelopt_state, + ) + has_nvidia_modelopt = True +except Exception: + has_nvidia_modelopt = False + _CHECKPOINT_VERSION = None @@ -338,7 +350,15 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, checkpointing_context['save_strategy'] = save_strategy async_save_request = dist_checkpointing.save(state_dict, checkpoint_name, save_strategy, async_sharded_save=args.async_save) + + # [ModelOpt]: save sharded modelopt_state + if has_nvidia_modelopt: + save_sharded_modelopt_state(model, checkpoint_name, (args.dist_ckpt_format, 1)) else: + # [ModelOpt]: Inject modelopt_state into state_dict + if has_nvidia_modelopt: + save_modelopt_state(model, state_dict) + # Save. ensure_directory_exists(checkpoint_name) torch.save(state_dict, checkpoint_name) @@ -718,8 +738,13 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri optim_sd_kwargs['sharding_type'] = ('fully_sharded_bucket_space' if getattr(state_dict['args'], 'ckpt_fully_parallel_save', False) else 'dp_zero_gather_scatter') - load_kwargs['sharded_state_dict'] = generate_state_dict(args, model, optimizer, opt_param_scheduler, - rng_state, args.use_dist_ckpt, optim_sd_kwargs=optim_sd_kwargs) + # [ModelOpt]: remedy for finetune + if args.finetune or args.no_load_optim: + load_kwargs['sharded_state_dict'] = generate_state_dict(args, model, None, None, + rng_state, args.use_dist_ckpt, optim_sd_kwargs=optim_sd_kwargs) + else: + load_kwargs['sharded_state_dict'] = generate_state_dict(args, model, optimizer, opt_param_scheduler, + rng_state, args.use_dist_ckpt, optim_sd_kwargs=optim_sd_kwargs) load_kwargs['exit_on_missing_checkpoint'] = args.exit_on_missing_checkpoint state_dict, checkpoint_name, release = _load_base_checkpoint(load_dir, rank0=False, **load_kwargs) @@ -760,6 +785,13 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri 'consumed_valid_samples', 0) else: print_rank_0('could not find arguments in the checkpoint ...') + + # [ModelOpt]: loading modelopt_state (sharded or not) + if has_nvidia_modelopt: + if args.use_dist_ckpt: + restore_sharded_modelopt_state(model, checkpoint_name) + else: + restore_modelopt_state(model, state_dict) # Model. 
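# A minimal sketch of the optional-dependency pattern added to megatron/training/checkpointing.py
# above: import the ModelOpt hooks once at module scope, record availability in a flag, and
# guard every call site with it so checkpointing behaves as before when `nvidia-modelopt` is
# not installed. The wrapper function below is illustrative and not part of the diff.
try:
    from modelopt.torch.opt.plugins import save_modelopt_state
    has_nvidia_modelopt = True
except Exception:
    has_nvidia_modelopt = False

def maybe_save_modelopt_state(model, state_dict):
    """Inject modelopt_state into an unsharded checkpoint dict only when the plugin exists."""
    if has_nvidia_modelopt:
        save_modelopt_state(model, state_dict)
    return state_dict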
strict = False if args.retro_add_retriever else strict From 26d6a3e3b8fb9f4769385dd01bdad9801c2c8a8d Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Wed, 29 May 2024 13:06:26 -0700 Subject: [PATCH 68/92] Multimodal example - MMMU eval --- examples/multimodal/README.md | 15 +++- examples/multimodal/clip_converter.py | 3 +- examples/multimodal/evaluate_mmmu.py | 66 +++++++++++++++ examples/multimodal/run_text_generation.py | 99 ++++++++++++++++++++-- 4 files changed, 173 insertions(+), 10 deletions(-) create mode 100644 examples/multimodal/evaluate_mmmu.py diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md index 6adbe5302b..b14d988faf 100644 --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -39,7 +39,8 @@ examples/multimodal/sft_8b.sh Run the following script: ``` -examples/multimodal/text_generation_8b.sh --input-image-path /path/to/input/images --output-path /some/output/directory --model-path /path/to/model.pt --tokenizer-path /path/to/tokenizer.model --gt-path /path/to/groundtruth/file --task generation-task-name +examples/multimodal/text_generation_8b.sh --input-image-path /path/to/input/images --output-path /some/output/directory \ + --model-path /path/to/model.pt --tokenizer-path /path/to/tokenizer.model --gt-path /path/to/groundtruth/file --task generation-task-name ``` ### COCO captioning @@ -65,3 +66,15 @@ First, run text generation using `--task VQAv2`. Then, run the following command ``` python examples/multimodal/evaluate_textvqa.py --input-path /output/directory/from/generation --groundtruth-path /path/to/groundtruth/file --question-path /path/to/question/file ``` + +### MMMU + +The official MMMU repository is not pip installable currently so please clone their code in `examples/multimodal` by running `git clone https://github.com/MMMU-Benchmark/MMMU.git`. + +The MMMU dataset is loaded from HuggingFace. + +Run text generation using `--task MMMU`. Then, run the following command: + +``` +python examples/multimodal/evaluate_mmmu.py --input-path /output/directory/from/generation +``` diff --git a/examples/multimodal/clip_converter.py b/examples/multimodal/clip_converter.py index e6c0fd8cc5..35c8b2306e 100644 --- a/examples/multimodal/clip_converter.py +++ b/examples/multimodal/clip_converter.py @@ -111,7 +111,8 @@ def convert(download_root, output_path, tensor_parallel_size, use_te_layernorm_l new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim) for i in range(tensor_parallel_size): - new_state_dicts[i]["model"][new_name] = new_tensors[i] + # chunk() creates a view of a bigger tensor. clone() is used here to avoid excessive storage. 
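# A small standalone illustration of the comment above (shapes are arbitrary): chunk()
# returns views that keep the whole source tensor's storage alive, so serializing a chunk
# without clone() would typically write the full tensor into every per-rank file, while
# clone() copies only the slice that is actually kept.
import torch

full = torch.zeros(1024, 1024)
part = torch.chunk(full, 4, dim=0)[0]       # a view: shares storage with `full`
assert part.data_ptr() == full.data_ptr()
copied = part.clone()                       # owns a buffer 4x smaller than `full`
assert copied.data_ptr() != full.data_ptr()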
+ new_state_dicts[i]["model"][new_name] = new_tensors[i].clone() for i in range(tensor_parallel_size): output_path_tp = os.path.join(output_path, f"state_dict_tp_{i}.pt") diff --git a/examples/multimodal/evaluate_mmmu.py b/examples/multimodal/evaluate_mmmu.py new file mode 100644 index 0000000000..1f609fc809 --- /dev/null +++ b/examples/multimodal/evaluate_mmmu.py @@ -0,0 +1,66 @@ +import argparse +import glob +import json +import subprocess + + +def convert_to_mmmu_format(input_path): + """Convert input files to MMMU compatible format.""" + output_file_path = input_path + "-MMMU-merged.json" + + pattern = input_path + "-MMMU-[0-9].*jsonl" + input_file_paths = glob.glob(pattern) + + output = dict() + + for input_file_path in input_file_paths: + with open(input_file_path, "r") as input_file: + for line in input_file: + res = json.loads(line) + + sample_id = res["sample_id"] + prediction = res["prediction"] + + output[sample_id] = prediction + + with open(output_file_path, "w") as output_file: + json.dump(output, output_file) + + return output_file_path + + +def main(): + # Using the validation groundtruth file from the MMMU repo by default. This assumes you have cloned the MMMU github repo here. + default_groundtruth_path = "examples/multimodal/MMMU/eval/answer_dict_val.json" + + parser = argparse.ArgumentParser() + parser.add_argument("--input-path", type=str, required=True, help="Path to input file(s)") + parser.add_argument( + "--groundtruth-path", + type=str, + default=default_groundtruth_path, + help="Path to groundtruth file. Defaults to the validation file in the MMMU repo.", + ) + args = parser.parse_args() + + result_file = convert_to_mmmu_format(args.input_path) + + # The MMMU repo has a script for running the actual evaluation but no API. So launching the script here. + output = subprocess.run( + [ + "python", + "examples/multimodal/MMMU/eval/main_eval_only.py", + "--output_path", + result_file, + "--answer_path", + default_groundtruth_path, + ], + capture_output=True, + text=True, + ) + + print(output.stdout) + + +if __name__ == "__main__": + main() diff --git a/examples/multimodal/run_text_generation.py b/examples/multimodal/run_text_generation.py index 564a9105e2..b06bd368e3 100644 --- a/examples/multimodal/run_text_generation.py +++ b/examples/multimodal/run_text_generation.py @@ -91,6 +91,11 @@ def preprocess_image(target_h, target_w, img): return output_img +def _get_partition_bounds(total_num_samples, num_partitions, partition_id): + samples_per_partition = total_num_samples // num_partitions + return samples_per_partition * partition_id, samples_per_partition * (partition_id + 1) + + def generate_samples(model): """Text generation using a trained vision language model.""" args = get_args() @@ -113,8 +118,8 @@ def generate_samples(model): # Optionally, process only a subset of the input files. if args.num_partitions > 0: - per_part = len(samples) // args.num_partitions - samples = samples[per_part * args.partition_id : per_part * (args.partition_id + 1)] + lb, ub = _get_partition_bounds(len(samples), args.num_partitions, args.partition_id) + samples = samples[lb:ub] num_samples = len(samples) @@ -141,10 +146,8 @@ def generate_samples(model): image_files = sorted(glob.glob(args.input_image_path + "/*")) # Optionally, process only a subset of the input files. 
if args.num_partitions > 0: - per_part = len(image_files) // args.num_partitions - image_files = image_files[ - per_part * args.partition_id : per_part * (args.partition_id + 1) - ] + lb, ub = _get_partition_bounds(len(image_files), args.num_partitions, args.partition_id) + image_files = image_files[lb:ub] num_samples = len(image_files) images = [] @@ -152,7 +155,7 @@ def generate_samples(model): # Run image preprocessing. for image_file in image_files: img = np.array(Image.open(image_file)) - img = preprocess(args.img_h, args.img_w, img) + img = preprocess_image(args.img_h, args.img_w, img) images.append(img.reshape(-1, 3, args.img_h, args.img_w)) @@ -165,6 +168,70 @@ def generate_samples(model): gts = json.load(open(args.gt_path)) for gt in gts["annotations"]: gt_sample_id_to_captions[gt["image_id"]].append(gt['caption']) + elif args.task == 'MMMU': + # The following downloads the MMMU dataset from HuggingFace and uses the API from the MMMU github repo to run MMMU evaluation. + import datasets + + from evaluation.MMMU.eval.utils.data_utils import ( + CAT_SHORT2LONG, + construct_prompt, + load_yaml, + process_single_sample, + ) + + all_mmmu_datasets = [] + + hf_datasets_cache = os.environ["HF_DATASETS_CACHE"] + assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE." + + for subject in CAT_SHORT2LONG.values(): + subject_dataset = datasets.load_dataset( + "MMMU/MMMU", subject, split=datasets.Split.VALIDATION, cache_dir=hf_datasets_cache + ) + all_mmmu_datasets.append(subject_dataset) + + dataset = datasets.concatenate_datasets(all_mmmu_datasets) + + # Optionally, process only a subset of the input files. + start_idx = 0 + end_idx = len(dataset) + if args.num_partitions > 0: + start_idx, end_idx = _get_partition_bounds( + len(dataset), args.num_partitions, args.partition_id + ) + + # Using the LLaVA config from the MMMU repo. + config = load_yaml("evaluation/MMMU/eval/configs/llava1.5.yaml") + for k, v in config.items(): + if isinstance(v, list): + assert len(v) == 1, "only one value supported." + config[k] = v[0] + + for idx in range(start_idx, end_idx): + sample = dataset[idx] + sample = process_single_sample(sample) + sample = construct_prompt(sample, config) + + # Skip samples with no images or multiple images. Not supported yet. + if "image" not in sample or "" in sample['final_input_prompt']: + continue + + img = np.array(sample['image'].convert("RGB")) + img = preprocess_image(args.img_h, args.img_w, img) + images.append(img.reshape(-1, 3, args.img_h, args.img_w)) + + sample_ids.append(sample['id']) + + # TODO: Support different image positions. + prompt = sample['final_input_prompt'] + prompt = prompt.replace("", "") + questions.append(prompt.strip()) + + answers.append(sample['answer']) + + samples.append(sample) + + num_samples = len(samples) else: raise NotImplementedError("unsupported task") @@ -180,6 +247,8 @@ def generate_samples(model): elif args.task == "VQAv2": prompt = questions[idx] prompt += "\nAnswer the question using a single word or phrase." 
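# A minimal sketch of the MMMU loading pattern used in the new branch below, runnable on its
# own when the `datasets` package is installed. The two subject names are only an example;
# the actual code iterates over CAT_SHORT2LONG from the MMMU repo and reads the cache
# location from HF_DATASETS_CACHE.
import datasets

subject_sets = [
    datasets.load_dataset("MMMU/MMMU", subject, split="validation")
    for subject in ("Art", "Math")
]
mmmu_val = datasets.concatenate_datasets(subject_sets)
print(len(mmmu_val), mmmu_val[0]["id"])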
+ elif args.task == "MMMU": + prompt = questions[idx] forward_step = partial(VLMForwardStep, image, get_image_token_count()) @@ -208,7 +277,7 @@ def generate_samples(model): output_name = "caption" elif args.task == "VQAv2": output_name = "answer" - elif args.task == "TextVQA": + elif args.task in ("TextVQA", "MMMU"): output_name = "text" generated = generation[len(prompt) :] @@ -218,6 +287,20 @@ def generate_samples(model): output["ground_truth"] = gt_sample_id_to_captions[sample_id] elif args.task == "VQAv2": output["ground_truth"] = answers[idx] + elif args.task == "MMMU": + sample = samples[idx] + + prediction = generated + if sample["question_type"] == "multiple-choice": + from evaluation.MMMU.eval.utils.eval_utils import ( + parse_multi_choice_response, + ) + + prediction = parse_multi_choice_response( + generated, sample["all_choices"], sample["index2ans"] + ) + + output["prediction"] = prediction print_rank_0(output) From 3321ddee2769ac242486e3edb3e4273a145f6ba4 Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Thu, 6 Jun 2024 13:55:49 -0700 Subject: [PATCH 69/92] Multimodal example - VQAv2 eval --- examples/multimodal/evaluate_vqav2.py | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 examples/multimodal/evaluate_vqav2.py diff --git a/examples/multimodal/evaluate_vqav2.py b/examples/multimodal/evaluate_vqav2.py new file mode 100644 index 0000000000..6c767826ce --- /dev/null +++ b/examples/multimodal/evaluate_vqav2.py @@ -0,0 +1,41 @@ +import argparse +import glob +import json + +from open_flamingo.eval.vqa_metric import compute_vqa_accuracy + + +def merge_input_files(input_path): + """Merge input files to a format compatible with the evaluator.""" + output_file_path = input_path + "-VQAv2-merged.json" + + pattern = input_path + "-VQAv2-[0-9].*jsonl" + input_file_paths = glob.glob(pattern) + + results = [] + + for input_file_path in input_file_paths: + with open(input_file_path, "r") as input_file: + for line in input_file: + res = json.loads(line) + res["question_id"] = res["sample_id"] + + results.append(res) + + with open(output_file_path, "w") as output_file: + json.dump(results, output_file) + + return output_file_path + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--input-path', type=str, help="Path to input file(s)") + parser.add_argument('--groundtruth-path', type=str, help="Path to groundtruth file") + parser.add_argument('--question-path', type=str, help="Path to questions file") + args = parser.parse_args() + + result_file = merge_input_files(args.input_path) + + accuracy = compute_vqa_accuracy(result_file, args.question_path, args.groundtruth_path) + print(accuracy) From edbcaf4a87c846845fbfe56bf8b01725ccf17169 Mon Sep 17 00:00:00 2001 From: Deepak Narayanan Date: Thu, 6 Jun 2024 14:19:07 -0700 Subject: [PATCH 70/92] Re-name gold value file, and remove seemingly unused gold value files --- ...t3-345m-weekly-dgx-h100-1n8g-mcore-tp1-pp1-bf16-baseline.json | 1 - ..._gpus-8_bs-32_steps-50_tp-1_pp-4_args--sequence-parallel.json | 1 - ...t_dgx_a100_1N8G_mcore_tp2_pp2_ddp_average_in_collective.json} | 0 3 files changed, 2 deletions(-) delete mode 100644 tests/functional_tests/test_results/jet/gpt3-345m-weekly-dgx-h100-1n8g-mcore-tp1-pp1-bf16-baseline.json delete mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_mcore-pyt_func-train_bf16_nodes-1_gpus-8_bs-32_steps-50_tp-1_pp-4_args--sequence-parallel.json rename 
tests/functional_tests/test_results/jet/{gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-ddp-average-in-collective.json => gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_ddp_average_in_collective.json} (100%) diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-weekly-dgx-h100-1n8g-mcore-tp1-pp1-bf16-baseline.json b/tests/functional_tests/test_results/jet/gpt3-345m-weekly-dgx-h100-1n8g-mcore-tp1-pp1-bf16-baseline.json deleted file mode 100644 index c01f8187f9..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3-345m-weekly-dgx-h100-1n8g-mcore-tp1-pp1-bf16-baseline.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.89295, 10.89965, 10.88696, 10.83149, 10.67503, 10.64746, 10.43169, 10.14739, 9.93477, 9.83962, 9.58592, 9.85376, 9.88462, 9.62937, 9.78698, 9.51021, 9.4569, 9.64899, 9.38548, 9.33112, 9.24126, 9.14483, 9.17481, 8.99429, 9.1888, 9.05871, 9.15474, 9.16387, 9.29609, 8.98403, 8.92803, 9.04321, 9.04304, 8.65413, 8.71637, 8.75308, 8.68316, 8.73418, 8.65925, 8.76497, 8.6606, 8.84921, 8.83147, 8.49916, 8.38803, 8.43069, 8.49215, 8.38391, 8.43104, 8.57865, 8.36634, 8.19162, 8.22542, 8.22189, 8.26703, 7.91344, 8.09517, 7.89087, 8.2465, 8.23048, 8.00464, 7.96563, 7.91956, 7.74022, 7.74076, 7.64376, 7.51581, 7.90794, 7.69917, 7.45259, 7.74036, 7.76918, 7.54534, 7.30294, 7.45712, 7.33965, 7.46571, 7.22688, 7.64027, 7.2821, 7.35551, 7.21573, 7.21764, 7.42508, 7.179, 7.28301, 7.00235, 7.00525, 7.04089, 7.13801, 6.82455, 6.98719, 7.08954, 7.00194, 6.87671, 6.75964, 6.9945, 7.06114, 6.70771, 6.58536, 6.73211, 6.74421, 6.73693, 6.74041, 6.66046, 6.40939, 6.64151, 6.62177, 6.44766, 6.63091, 6.74583, 6.61004, 6.72608, 6.69453, 6.62642, 6.50811, 6.60009, 6.40567, 6.66319, 6.24928, 6.25243, 6.30153, 6.38864, 6.34843, 6.44573, 6.28621, 6.33582, 6.23394, 6.19542, 6.39288, 6.31922, 6.31522, 6.16159, 6.15281, 6.23723, 6.3793, 6.19561, 6.14539, 6.17533, 6.11707, 6.06229, 6.07306, 6.25712, 6.4088, 6.25922, 6.30041, 6.0985, 6.18078, 6.00348, 6.02831, 5.95765, 6.24835, 6.1907, 5.96332, 5.78393, 6.1227, 5.85174, 6.10686, 5.78936, 6.1611, 6.14934, 6.08933, 5.93437, 6.11627, 5.94931, 6.1959, 5.89728, 5.79696, 5.77985, 5.69106, 6.01797, 5.99702, 6.06684, 5.89233, 6.03992, 5.96984, 5.99144, 5.99084, 5.94926, 5.84, 5.94964, 5.61688, 5.70056, 5.88641, 5.84093, 5.86486, 5.76475, 5.83288, 5.72552, 5.55908, 5.71981, 5.62871, 5.83246, 5.60363, 5.70859, 5.71489, 5.89876, 5.64683, 5.85067, 5.74152, 5.87173, 5.3315, 5.89859, 5.87336, 5.85278, 5.41294, 5.41022, 5.62717, 5.59521, 5.48446, 5.5786, 5.67523, 5.47521, 5.74638, 5.50816, 5.59243, 5.62022, 5.61724, 5.51366, 5.60999, 5.67263, 5.68168, 5.58403, 5.65969, 5.37394, 5.6801, 5.62369, 5.42207, 5.58245, 5.62504, 5.54833, 5.33874, 5.53339, 5.47745, 5.48125, 5.37476, 5.54873, 5.59774, 5.38087, 5.51862, 5.48462, 5.32929, 5.49691, 5.4034, 5.43743, 5.31257, 5.06222, 5.47631, 5.56354, 5.70783, 5.41218, 5.59425, 5.63333, 5.23192, 5.26844, 5.39089, 5.38947, 5.32309, 5.49039, 5.18431, 5.29599, 5.24133, 5.37232, 5.25139, 5.44291, 5.53376, 5.30953, 5.43213, 5.3326, 5.06934, 5.31017, 5.2456, 5.30007, 5.10712, 5.26888, 5.25997, 5.46469, 5.15309, 5.265, 5.20089, 5.35182, 4.97744, 4.91128, 5.3191, 5.38342, 5.22158, 5.31482, 5.10055, 5.15062, 5.25425, 5.05933, 5.25916, 5.0681, 5.33434, 5.23801, 5.14332, 5.23365, 5.03027, 5.31092, 5.04297, 5.01922, 5.13459, 5.10233, 5.2615, 5.14369, 5.27474, 5.08794, 5.08712, 5.24364, 5.31268, 5.2473, 5.17894, 5.12937, 5.27707, 4.94263, 5.20017, 5.07864, 5.29574, 
5.16763, 5.17788, 5.10299, 4.97517, 4.98936, 5.21665, 5.30115, 5.09159, 5.04444, 4.90885, 5.11544, 5.11275, 4.91946, 5.33019, 5.01514, 5.09862, 5.15512, 4.99686, 5.05374, 5.05884, 4.983, 5.0736, 5.15293, 4.97049, 5.17335, 4.92251, 4.91308, 5.061, 4.9877, 4.89966, 4.76814, 4.93873, 5.10814, 5.01176, 5.00849, 5.32387, 4.95456, 4.98476, 5.03739, 4.79615, 4.73207, 4.98707, 5.02855, 4.86434, 4.94355, 5.03402, 5.01752, 4.81092, 4.88429, 4.89489, 4.82181, 4.73641, 5.00109, 4.74233, 5.19651, 4.77623, 4.98947, 4.7294, 4.77668, 4.80796, 4.64252, 4.64775, 4.83341, 4.79729, 4.7938, 4.92003, 4.87251, 4.9153, 4.76085, 4.86782, 4.72453, 4.90116, 4.95015, 4.8665, 4.69742, 4.77375, 4.88912, 4.70003, 4.85456, 4.68245, 4.67576, 4.63947]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [66.0, 80.0, 86.0, 78.0, 96.0, 83.0, 100.0, 114.0, 112.0, 111.0, 117.0, 164.0, 139.0, 181.0, 200.0, 179.0, 152.0, 209.0, 186.0, 180.0, 193.0, 184.0, 199.0, 173.0, 200.0, 164.0, 179.0, 176.0, 188.0, 165.0, 179.0, 174.0, 139.0, 195.0, 147.0, 169.0, 183.0, 221.0, 161.0, 188.0, 183.0, 196.0, 160.0, 178.0, 186.0, 170.0, 223.0, 195.0, 181.0, 224.0, 232.0, 197.0, 221.0, 170.0, 185.0, 183.0, 164.0, 148.0, 216.0, 260.0, 203.0, 220.0, 215.0, 198.0, 212.0, 286.0, 232.0, 203.0, 223.0, 167.0, 267.0, 275.0, 176.0, 250.0, 220.0, 195.0, 230.0, 211.0, 282.0, 232.0, 237.0, 220.0, 171.0, 238.0, 240.0, 207.0, 182.0, 235.0, 229.0, 221.0, 247.0, 203.0, 231.0, 216.0, 224.0, 149.0, 225.0, 230.0, 174.0, 181.0, 192.0, 215.0, 185.0, 170.0, 169.0, 129.0, 155.0, 166.0, 163.0, 212.0, 172.0, 166.0, 208.0, 190.0, 152.0, 165.0, 143.0, 119.0, 188.0, 172.0, 154.0, 133.0, 154.0, 146.0, 169.0, 153.0, 165.0, 150.0, 137.0, 136.0, 162.0, 157.0, 119.0, 143.0, 133.0, 116.0, 138.0, 128.0, 118.0, 114.0, 107.0, 112.0, 137.0, 141.0, 143.0, 117.0, 131.0, 146.0, 112.0, 122.0, 103.0, 122.0, 114.0, 145.0, 119.0, 110.0, 108.0, 100.0, 107.0, 139.0, 116.0, 106.0, 108.0, 140.0, 108.0, 132.0, 131.0, 125.0, 148.0, 106.0, 109.0, 123.0, 104.0, 110.0, 130.0, 97.0, 141.0, 110.0, 117.0, 117.0, 148.0, 101.0, 131.0, 149.0, 126.0, 106.0, 92.0, 131.0, 128.0, 123.0, 117.0, 82.0, 129.0, 90.0, 95.0, 101.0, 135.0, 102.0, 129.0, 91.0, 118.0, 80.0, 130.0, 108.0, 115.0, 140.0, 111.0, 124.0, 146.0, 167.0, 119.0, 105.0, 112.0, 135.0, 106.0, 134.0, 118.0, 112.0, 110.0, 123.0, 108.0, 121.0, 113.0, 98.0, 126.0, 83.0, 105.0, 93.0, 107.0, 110.0, 123.0, 113.0, 117.0, 110.0, 100.0, 106.0, 106.0, 110.0, 115.0, 120.0, 127.0, 108.0, 112.0, 103.0, 119.0, 107.0, 100.0, 123.0, 124.0, 125.0, 123.0, 121.0, 127.0, 106.0, 112.0, 111.0, 136.0, 120.0, 137.0, 84.0, 143.0, 105.0, 131.0, 137.0, 95.0, 108.0, 99.0, 95.0, 121.0, 120.0, 111.0, 139.0, 101.0, 107.0, 111.0, 126.0, 88.0, 109.0, 130.0, 121.0, 107.0, 115.0, 92.0, 118.0, 112.0, 101.0, 115.0, 103.0, 101.0, 113.0, 135.0, 120.0, 130.0, 142.0, 124.0, 127.0, 118.0, 98.0, 113.0, 119.0, 121.0, 114.0, 141.0, 129.0, 112.0, 116.0, 129.0, 129.0, 143.0, 140.0, 114.0, 132.0, 137.0, 143.0, 108.0, 111.0, 130.0, 102.0, 109.0, 139.0, 129.0, 111.0, 104.0, 129.0, 139.0, 103.0, 125.0, 108.0, 122.0, 109.0, 119.0, 99.0, 123.0, 125.0, 121.0, 122.0, 148.0, 133.0, 100.0, 135.0, 133.0, 128.0, 154.0, 115.0, 125.0, 112.0, 151.0, 115.0, 119.0, 138.0, 123.0, 103.0, 120.0, 128.0, 135.0, 119.0, 128.0, 133.0, 118.0, 124.0, 130.0, 154.0, 148.0, 150.0, 145.0, 106.0, 127.0, 135.0, 122.0, 109.0, 117.0, 136.0, 117.0, 119.0, 121.0, 105.0, 109.0, 131.0, 103.0, 113.0, 122.0, 114.0, 120.0, 128.0, 129.0, 121.0, 99.0, 142.0, 140.0, 138.0, 119.0, 112.0, 125.0, 117.0, 112.0, 
126.0, 104.0, 142.0, 152.0, 126.0]}, "iteration_timing_avg": 0.2665040554722642} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_mcore-pyt_func-train_bf16_nodes-1_gpus-8_bs-32_steps-50_tp-1_pp-4_args--sequence-parallel.json b/tests/functional_tests/test_results/jet/gpt3_345m_mcore-pyt_func-train_bf16_nodes-1_gpus-8_bs-32_steps-50_tp-1_pp-4_args--sequence-parallel.json deleted file mode 100644 index 838a4b1285..0000000000 --- a/tests/functional_tests/test_results/jet/gpt3_345m_mcore-pyt_func-train_bf16_nodes-1_gpus-8_bs-32_steps-50_tp-1_pp-4_args--sequence-parallel.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 42, "step_interval": 5, "values": [10.85961, 10.88449, 10.89225, 10.82282, 10.69062, 10.59772, 10.06389, 10.18065, 10.10744]}, "num-zeros": {"start_step": 0, "end_step": 42, "step_interval": 5, "values": [1496.0, 1874.0, 1801.0, 1784.0, 1841.0, 1655.0, 1517.0, 1873.0, 2260.0]}, "iteration_timing_avg": 0.12682214285714286} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-ddp-average-in-collective.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_ddp_average_in_collective.json similarity index 100% rename from tests/functional_tests/test_results/jet/gpt3-345m-merge-request-dgx-a100-1n8g-mcore-tp2-pp2-ddp-average-in-collective.json rename to tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_ddp_average_in_collective.json From f2e72c8a16124dc98af19a2cfe36ba8fac5758df Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy Date: Thu, 6 Jun 2024 16:48:08 -0700 Subject: [PATCH 71/92] Some preliminary refactoring --- .../detxoify_lm/README.md | 0 .../annotations/filter-selfgeneration.py | 0 .../annotations/perspective_api_annotate.py | 0 .../detxoify_lm/annotations/preprocess.sh | 0 .../detxoify_lm/finetune_gpt.py | 0 .../finetune_gpt_distributed-1.3b.sh | 0 .../detxoify_lm/generate-1.3b.sh | 0 .../detxoify_lm/generate_samples_gpt.py | 0 .../detxoify_lm/perspective_api.py | 0 .../selfgenerate-1.3b-unconditional.sh | 0 .../msdp/README.md | 0 .../msdp/data_processing.sh | 0 .../msdp/eval_knwl_generation.sh | 0 .../msdp/eval_resp_generation.sh | 0 .../msdp/prep_resp_gen.sh | 0 .../msdp/prompt_knwl_gen.sh | 0 .../msdp/prompt_resp_gen.sh | 0 .../sc21/CONFIG.sh | 0 .../sc21/README.md | 0 .../sc21/SBATCH.sh | 0 .../{ => academic_paper_scripts}/sc21/SRUN.sh | 0 .../sc21/run_figure_11.sh | 0 .../sc21/run_figure_12.sh | 0 .../sc21/run_figure_13.sh | 0 .../sc21/run_figure_14.sh | 0 .../sc21/run_figure_15.sh | 0 .../sc21/run_figure_16.sh | 0 .../sc21/run_figure_17.sh | 0 .../sc21/run_figure_18.sh | 0 .../sc21/run_table_1.sh | 0 examples/evaluate_retriever_nq.sh | 37 --------- examples/evaluate_zeroshot_gpt.sh | 37 --------- examples/finetune_mnli_distributed.sh | 43 ----------- examples/finetune_race_distributed.sh | 46 ----------- examples/finetune_retriever_distributed.sh | 56 -------------- examples/inference/{ => ammo_ptq}/README.md | 0 .../{ => ammo_ptq}/ptq_trtllm_llama_7b.sh | 0 .../{ => ammo_ptq}/ptq_trtllm_nemotron3_8b.sh | 0 .../{ => ammo_ptq}/text_generation_ptq.py | 0 .../{ => ammo_ptq}/trtllm_text_generation.py | 0 .../run_text_generation_server_345M.sh | 0 ...eneration_server_345M_8_tensor_parallel.sh | 0 examples/merge_mp_bert.sh | 18 ----- examples/pretrain_bert.sh | 46 ----------- examples/pretrain_bert_distributed.sh | 63 --------------- 
examples/pretrain_bert_distributed_with_mp.sh | 65 ---------------- examples/pretrain_gpt.sh | 50 ------------ examples/pretrain_gpt3_175B.sh | 64 ---------------- examples/pretrain_gpt_distributed.sh | 67 ---------------- examples/pretrain_gpt_distributed_with_mp.sh | 71 ----------------- examples/pretrain_ict.sh | 44 ----------- examples/pretrain_t5.sh | 50 ------------ examples/pretrain_t5_distributed.sh | 67 ---------------- examples/pretrain_t5_distributed_with_mp.sh | 68 ----------------- examples/pretrain_vision_classify.sh | 64 ---------------- examples/pretrain_vision_dino.sh | 67 ---------------- examples/pretrain_vision_inpaint.sh | 65 ---------------- examples/pretrain_vlm.sh | 76 ------------------- pretrain_ict.py | 1 + .../report_theoretical_memory.py | 0 60 files changed, 1 insertion(+), 1164 deletions(-) rename examples/{ => academic_paper_scripts}/detxoify_lm/README.md (100%) rename examples/{ => academic_paper_scripts}/detxoify_lm/annotations/filter-selfgeneration.py (100%) rename examples/{ => academic_paper_scripts}/detxoify_lm/annotations/perspective_api_annotate.py (100%) rename examples/{ => academic_paper_scripts}/detxoify_lm/annotations/preprocess.sh (100%) rename examples/{ => academic_paper_scripts}/detxoify_lm/finetune_gpt.py (100%) rename examples/{ => academic_paper_scripts}/detxoify_lm/finetune_gpt_distributed-1.3b.sh (100%) rename examples/{ => academic_paper_scripts}/detxoify_lm/generate-1.3b.sh (100%) rename examples/{ => academic_paper_scripts}/detxoify_lm/generate_samples_gpt.py (100%) rename examples/{ => academic_paper_scripts}/detxoify_lm/perspective_api.py (100%) rename examples/{ => academic_paper_scripts}/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh (100%) rename examples/{ => academic_paper_scripts}/msdp/README.md (100%) rename examples/{ => academic_paper_scripts}/msdp/data_processing.sh (100%) rename examples/{ => academic_paper_scripts}/msdp/eval_knwl_generation.sh (100%) rename examples/{ => academic_paper_scripts}/msdp/eval_resp_generation.sh (100%) rename examples/{ => academic_paper_scripts}/msdp/prep_resp_gen.sh (100%) rename examples/{ => academic_paper_scripts}/msdp/prompt_knwl_gen.sh (100%) rename examples/{ => academic_paper_scripts}/msdp/prompt_resp_gen.sh (100%) rename examples/{ => academic_paper_scripts}/sc21/CONFIG.sh (100%) rename examples/{ => academic_paper_scripts}/sc21/README.md (100%) rename examples/{ => academic_paper_scripts}/sc21/SBATCH.sh (100%) rename examples/{ => academic_paper_scripts}/sc21/SRUN.sh (100%) rename examples/{ => academic_paper_scripts}/sc21/run_figure_11.sh (100%) rename examples/{ => academic_paper_scripts}/sc21/run_figure_12.sh (100%) rename examples/{ => academic_paper_scripts}/sc21/run_figure_13.sh (100%) rename examples/{ => academic_paper_scripts}/sc21/run_figure_14.sh (100%) rename examples/{ => academic_paper_scripts}/sc21/run_figure_15.sh (100%) rename examples/{ => academic_paper_scripts}/sc21/run_figure_16.sh (100%) rename examples/{ => academic_paper_scripts}/sc21/run_figure_17.sh (100%) rename examples/{ => academic_paper_scripts}/sc21/run_figure_18.sh (100%) rename examples/{ => academic_paper_scripts}/sc21/run_table_1.sh (100%) delete mode 100644 examples/evaluate_retriever_nq.sh delete mode 100755 examples/evaluate_zeroshot_gpt.sh delete mode 100755 examples/finetune_mnli_distributed.sh delete mode 100755 examples/finetune_race_distributed.sh delete mode 100755 examples/finetune_retriever_distributed.sh rename examples/inference/{ => ammo_ptq}/README.md (100%) rename 
examples/inference/{ => ammo_ptq}/ptq_trtllm_llama_7b.sh (100%) rename examples/inference/{ => ammo_ptq}/ptq_trtllm_nemotron3_8b.sh (100%) rename examples/inference/{ => ammo_ptq}/text_generation_ptq.py (100%) rename examples/inference/{ => ammo_ptq}/trtllm_text_generation.py (100%) rename examples/{ => inference}/run_text_generation_server_345M.sh (100%) rename examples/{ => inference}/run_text_generation_server_345M_8_tensor_parallel.sh (100%) delete mode 100755 examples/merge_mp_bert.sh delete mode 100755 examples/pretrain_bert.sh delete mode 100755 examples/pretrain_bert_distributed.sh delete mode 100755 examples/pretrain_bert_distributed_with_mp.sh delete mode 100755 examples/pretrain_gpt.sh delete mode 100755 examples/pretrain_gpt3_175B.sh delete mode 100755 examples/pretrain_gpt_distributed.sh delete mode 100755 examples/pretrain_gpt_distributed_with_mp.sh delete mode 100755 examples/pretrain_ict.sh delete mode 100644 examples/pretrain_t5.sh delete mode 100755 examples/pretrain_t5_distributed.sh delete mode 100644 examples/pretrain_t5_distributed_with_mp.sh delete mode 100755 examples/pretrain_vision_classify.sh delete mode 100755 examples/pretrain_vision_dino.sh delete mode 100755 examples/pretrain_vision_inpaint.sh delete mode 100755 examples/pretrain_vlm.sh rename report_theoretical_memory.py => tools/report_theoretical_memory.py (100%) diff --git a/examples/detxoify_lm/README.md b/examples/academic_paper_scripts/detxoify_lm/README.md similarity index 100% rename from examples/detxoify_lm/README.md rename to examples/academic_paper_scripts/detxoify_lm/README.md diff --git a/examples/detxoify_lm/annotations/filter-selfgeneration.py b/examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py similarity index 100% rename from examples/detxoify_lm/annotations/filter-selfgeneration.py rename to examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py diff --git a/examples/detxoify_lm/annotations/perspective_api_annotate.py b/examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py similarity index 100% rename from examples/detxoify_lm/annotations/perspective_api_annotate.py rename to examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py diff --git a/examples/detxoify_lm/annotations/preprocess.sh b/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh similarity index 100% rename from examples/detxoify_lm/annotations/preprocess.sh rename to examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh diff --git a/examples/detxoify_lm/finetune_gpt.py b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py similarity index 100% rename from examples/detxoify_lm/finetune_gpt.py rename to examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py diff --git a/examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh similarity index 100% rename from examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh rename to examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh diff --git a/examples/detxoify_lm/generate-1.3b.sh b/examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh similarity index 100% rename from examples/detxoify_lm/generate-1.3b.sh rename to examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh diff --git a/examples/detxoify_lm/generate_samples_gpt.py b/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py similarity index 
100% rename from examples/detxoify_lm/generate_samples_gpt.py rename to examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py diff --git a/examples/detxoify_lm/perspective_api.py b/examples/academic_paper_scripts/detxoify_lm/perspective_api.py similarity index 100% rename from examples/detxoify_lm/perspective_api.py rename to examples/academic_paper_scripts/detxoify_lm/perspective_api.py diff --git a/examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh b/examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh similarity index 100% rename from examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh rename to examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh diff --git a/examples/msdp/README.md b/examples/academic_paper_scripts/msdp/README.md similarity index 100% rename from examples/msdp/README.md rename to examples/academic_paper_scripts/msdp/README.md diff --git a/examples/msdp/data_processing.sh b/examples/academic_paper_scripts/msdp/data_processing.sh similarity index 100% rename from examples/msdp/data_processing.sh rename to examples/academic_paper_scripts/msdp/data_processing.sh diff --git a/examples/msdp/eval_knwl_generation.sh b/examples/academic_paper_scripts/msdp/eval_knwl_generation.sh similarity index 100% rename from examples/msdp/eval_knwl_generation.sh rename to examples/academic_paper_scripts/msdp/eval_knwl_generation.sh diff --git a/examples/msdp/eval_resp_generation.sh b/examples/academic_paper_scripts/msdp/eval_resp_generation.sh similarity index 100% rename from examples/msdp/eval_resp_generation.sh rename to examples/academic_paper_scripts/msdp/eval_resp_generation.sh diff --git a/examples/msdp/prep_resp_gen.sh b/examples/academic_paper_scripts/msdp/prep_resp_gen.sh similarity index 100% rename from examples/msdp/prep_resp_gen.sh rename to examples/academic_paper_scripts/msdp/prep_resp_gen.sh diff --git a/examples/msdp/prompt_knwl_gen.sh b/examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh similarity index 100% rename from examples/msdp/prompt_knwl_gen.sh rename to examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh diff --git a/examples/msdp/prompt_resp_gen.sh b/examples/academic_paper_scripts/msdp/prompt_resp_gen.sh similarity index 100% rename from examples/msdp/prompt_resp_gen.sh rename to examples/academic_paper_scripts/msdp/prompt_resp_gen.sh diff --git a/examples/sc21/CONFIG.sh b/examples/academic_paper_scripts/sc21/CONFIG.sh similarity index 100% rename from examples/sc21/CONFIG.sh rename to examples/academic_paper_scripts/sc21/CONFIG.sh diff --git a/examples/sc21/README.md b/examples/academic_paper_scripts/sc21/README.md similarity index 100% rename from examples/sc21/README.md rename to examples/academic_paper_scripts/sc21/README.md diff --git a/examples/sc21/SBATCH.sh b/examples/academic_paper_scripts/sc21/SBATCH.sh similarity index 100% rename from examples/sc21/SBATCH.sh rename to examples/academic_paper_scripts/sc21/SBATCH.sh diff --git a/examples/sc21/SRUN.sh b/examples/academic_paper_scripts/sc21/SRUN.sh similarity index 100% rename from examples/sc21/SRUN.sh rename to examples/academic_paper_scripts/sc21/SRUN.sh diff --git a/examples/sc21/run_figure_11.sh b/examples/academic_paper_scripts/sc21/run_figure_11.sh similarity index 100% rename from examples/sc21/run_figure_11.sh rename to examples/academic_paper_scripts/sc21/run_figure_11.sh diff --git a/examples/sc21/run_figure_12.sh 
b/examples/academic_paper_scripts/sc21/run_figure_12.sh similarity index 100% rename from examples/sc21/run_figure_12.sh rename to examples/academic_paper_scripts/sc21/run_figure_12.sh diff --git a/examples/sc21/run_figure_13.sh b/examples/academic_paper_scripts/sc21/run_figure_13.sh similarity index 100% rename from examples/sc21/run_figure_13.sh rename to examples/academic_paper_scripts/sc21/run_figure_13.sh diff --git a/examples/sc21/run_figure_14.sh b/examples/academic_paper_scripts/sc21/run_figure_14.sh similarity index 100% rename from examples/sc21/run_figure_14.sh rename to examples/academic_paper_scripts/sc21/run_figure_14.sh diff --git a/examples/sc21/run_figure_15.sh b/examples/academic_paper_scripts/sc21/run_figure_15.sh similarity index 100% rename from examples/sc21/run_figure_15.sh rename to examples/academic_paper_scripts/sc21/run_figure_15.sh diff --git a/examples/sc21/run_figure_16.sh b/examples/academic_paper_scripts/sc21/run_figure_16.sh similarity index 100% rename from examples/sc21/run_figure_16.sh rename to examples/academic_paper_scripts/sc21/run_figure_16.sh diff --git a/examples/sc21/run_figure_17.sh b/examples/academic_paper_scripts/sc21/run_figure_17.sh similarity index 100% rename from examples/sc21/run_figure_17.sh rename to examples/academic_paper_scripts/sc21/run_figure_17.sh diff --git a/examples/sc21/run_figure_18.sh b/examples/academic_paper_scripts/sc21/run_figure_18.sh similarity index 100% rename from examples/sc21/run_figure_18.sh rename to examples/academic_paper_scripts/sc21/run_figure_18.sh diff --git a/examples/sc21/run_table_1.sh b/examples/academic_paper_scripts/sc21/run_table_1.sh similarity index 100% rename from examples/sc21/run_table_1.sh rename to examples/academic_paper_scripts/sc21/run_table_1.sh diff --git a/examples/evaluate_retriever_nq.sh b/examples/evaluate_retriever_nq.sh deleted file mode 100644 index a579b5fd94..0000000000 --- a/examples/evaluate_retriever_nq.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -# Evaluate natural question test data given Wikipedia embeddings and pretrained -# ICT model or a finetuned model for Natural Question task - -# Datasets can be downloaded from the following link: -# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py - -EVIDENCE_DATA_DIR= -EMBEDDING_PATH= -CHECKPOINT_PATH= - -QA_FILE= - -python tasks/main.py \ - --task RETRIEVER-EVAL \ - --tokenizer-type BertWordPieceLowerCase \ - --num-layers 12 \ - --hidden-size 768 \ - --num-attention-heads 12 \ - --tensor-model-parallel-size 1 \ - --micro-batch-size 128 \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --load ${CHECKPOINT_PATH} \ - --evidence-data-path ${EVIDENCE_DATA_DIR} \ - --embedding-path ${EMBEDDING_PATH} \ - --retriever-seq-length 256 \ - --vocab-file bert-vocab.txt\ - --qa-data-test ${QA_FILE} \ - --faiss-use-gpu \ - --retriever-report-topk-accuracies 1 5 20 100 \ - --fp16 \ - --indexer-log-interval 1000 \ - --indexer-batch-size 128 - - diff --git a/examples/evaluate_zeroshot_gpt.sh b/examples/evaluate_zeroshot_gpt.sh deleted file mode 100755 index 2cc1c5a760..0000000000 --- a/examples/evaluate_zeroshot_gpt.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -WORLD_SIZE=8 - -DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr localhost \ - --master_port 6000" - -TASK="LAMBADA" - -VALID_DATA= -VOCAB_FILE=gpt2-vocab.json -MERGE_FILE=gpt2-merges.txt -CHECKPOINT=checkpoints/gpt2_345m - - -python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ - 
--task $TASK \ - --valid-data $VALID_DATA \ - --tokenizer-type GPT2BPETokenizer \ - --strict-lambada \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --load $CHECKPOINT \ - --tensor-model-parallel-size 1 \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --batch-size 8 \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --log-interval 10 \ - --fp16 \ - --no-load-optim \ - --no-load-rng diff --git a/examples/finetune_mnli_distributed.sh b/examples/finetune_mnli_distributed.sh deleted file mode 100755 index a3f9accbcc..0000000000 --- a/examples/finetune_mnli_distributed.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -WORLD_SIZE=8 - -DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr localhost \ - --master_port 6000" - -TRAIN_DATA="data/glue_data/MNLI/train.tsv" -VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ - data/glue_data/MNLI/dev_mismatched.tsv" -PRETRAINED_CHECKPOINT=checkpoints/bert_345m -VOCAB_FILE=bert-vocab.txt -CHECKPOINT_PATH=checkpoints/bert_345m_mnli - -python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ - --task MNLI \ - --seed 1234 \ - --train-data $TRAIN_DATA \ - --valid-data $VALID_DATA \ - --tokenizer-type BertWordPieceLowerCase \ - --vocab-file $VOCAB_FILE \ - --epochs 5 \ - --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ - --tensor-model-parallel-size 1 \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --micro-batch-size 8 \ - --lr 5.0e-5 \ - --lr-decay-style linear \ - --lr-warmup-fraction 0.065 \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --save-interval 500000 \ - --save $CHECKPOINT_PATH \ - --log-interval 10 \ - --eval-interval 100 \ - --eval-iters 50 \ - --weight-decay 1.0e-1 \ - --fp16 diff --git a/examples/finetune_race_distributed.sh b/examples/finetune_race_distributed.sh deleted file mode 100755 index 3d92253388..0000000000 --- a/examples/finetune_race_distributed.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -WORLD_SIZE=8 - -DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr localhost \ - --master_port 6000" - -TRAIN_DATA="data/RACE/train/middle" -VALID_DATA="data/RACE/dev/middle \ - data/RACE/dev/high" -VOCAB_FILE=bert-vocab.txt -PRETRAINED_CHECKPOINT=checkpoints/bert_345m -CHECKPOINT_PATH=checkpoints/bert_345m_race - -python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ - --task RACE \ - --seed 1234 \ - --train-data $TRAIN_DATA \ - --valid-data $VALID_DATA \ - --tokenizer-type BertWordPieceLowerCase \ - --vocab-file $VOCAB_FILE \ - --epochs 3 \ - --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ - --tensor-model-parallel-size 1 \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --micro-batch-size 4 \ - --lr 1.0e-5 \ - --lr-decay-style linear \ - --lr-warmup-fraction 0.06 \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --save-interval 100000 \ - --save $CHECKPOINT_PATH \ - --log-interval 10 \ - --eval-interval 100 \ - --eval-iters 50 \ - --weight-decay 1.0e-1 \ - --clip-grad 1.0 \ - --hidden-dropout 0.1 \ - --attention-dropout 0.1 \ - --fp16 diff --git a/examples/finetune_retriever_distributed.sh b/examples/finetune_retriever_distributed.sh deleted file mode 100755 index 535a2e053d..0000000000 --- a/examples/finetune_retriever_distributed.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash - -# Finetune a BERT or pretrained ICT model using Google natural question data -# Datasets can be downloaded from the 
following link: -# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py - -WORLD_SIZE=8 - -DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr localhost \ - --master_port 6000" - -CHECKPOINT_PATH= - -# Load either of the below -BERT_LOAD_PATH= -PRETRAINED_CHECKPOINT= - -python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ - --task RET-FINETUNE-NQ \ - --train-with-neg \ - --train-hard-neg 1 \ - --pretrained-checkpoint ${PRETRAINED_CHECKPOINT} \ - --num-layers 12 \ - --hidden-size 768 \ - --num-attention-heads 12 \ - --tensor-model-parallel-size 1 \ - --tokenizer-type BertWordPieceLowerCase \ - --train-data nq-train.json \ - --valid-data nq-dev.json \ - --save ${CHECKPOINT_PATH} \ - --load ${CHECKPOINT_PATH} \ - --vocab-file bert-vocab.txt \ - --bert-load ${BERT_LOAD_PATH} \ - --save-interval 5000 \ - --log-interval 10 \ - --eval-interval 20000 \ - --eval-iters 100 \ - --indexer-log-interval 1000 \ - --faiss-use-gpu \ - --DDP-impl torch \ - --fp16 \ - --retriever-report-topk-accuracies 1 5 10 20 100 \ - --seq-length 512 \ - --retriever-seq-length 256 \ - --max-position-embeddings 512 \ - --retriever-score-scaling \ - --epochs 80 \ - --micro-batch-size 8 \ - --eval-micro-batch-size 16 \ - --indexer-batch-size 128 \ - --lr 2e-5 \ - --lr-warmup-fraction 0.01 \ - --weight-decay 1e-1 diff --git a/examples/inference/README.md b/examples/inference/ammo_ptq/README.md similarity index 100% rename from examples/inference/README.md rename to examples/inference/ammo_ptq/README.md diff --git a/examples/inference/ptq_trtllm_llama_7b.sh b/examples/inference/ammo_ptq/ptq_trtllm_llama_7b.sh similarity index 100% rename from examples/inference/ptq_trtllm_llama_7b.sh rename to examples/inference/ammo_ptq/ptq_trtllm_llama_7b.sh diff --git a/examples/inference/ptq_trtllm_nemotron3_8b.sh b/examples/inference/ammo_ptq/ptq_trtllm_nemotron3_8b.sh similarity index 100% rename from examples/inference/ptq_trtllm_nemotron3_8b.sh rename to examples/inference/ammo_ptq/ptq_trtllm_nemotron3_8b.sh diff --git a/examples/inference/text_generation_ptq.py b/examples/inference/ammo_ptq/text_generation_ptq.py similarity index 100% rename from examples/inference/text_generation_ptq.py rename to examples/inference/ammo_ptq/text_generation_ptq.py diff --git a/examples/inference/trtllm_text_generation.py b/examples/inference/ammo_ptq/trtllm_text_generation.py similarity index 100% rename from examples/inference/trtllm_text_generation.py rename to examples/inference/ammo_ptq/trtllm_text_generation.py diff --git a/examples/run_text_generation_server_345M.sh b/examples/inference/run_text_generation_server_345M.sh similarity index 100% rename from examples/run_text_generation_server_345M.sh rename to examples/inference/run_text_generation_server_345M.sh diff --git a/examples/run_text_generation_server_345M_8_tensor_parallel.sh b/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh similarity index 100% rename from examples/run_text_generation_server_345M_8_tensor_parallel.sh rename to examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh diff --git a/examples/merge_mp_bert.sh b/examples/merge_mp_bert.sh deleted file mode 100755 index 1383433284..0000000000 --- a/examples/merge_mp_bert.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -TENSOR_MODEL_PARALLEL_SIZE=2 - -VOCAB_FILE=bert-vocab.txt -CHECKPOINT_PATH=checkpoints/bert_345m - -WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ - 
--model-type BERT \ - --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \ - --tokenizer-type BertWordPieceLowerCase \ - --vocab-file $VOCAB_FILE \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --load $CHECKPOINT_PATH diff --git a/examples/pretrain_bert.sh b/examples/pretrain_bert.sh deleted file mode 100755 index 3877b1a5f4..0000000000 --- a/examples/pretrain_bert.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -CHECKPOINT_PATH= -VOCAB_FILE=/bert-vocab.txt -DATA_PATH=_text_sentence - -BERT_ARGS=" - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --micro-batch-size 4 \ - --global-batch-size 8 \ - --lr 0.0001 \ - --train-iters 2000000 \ - --lr-decay-iters 990000 \ - --lr-decay-style linear \ - --min-lr 0.00001 \ - --weight-decay 1e-2 \ - --lr-warmup-fraction .01 \ - --clip-grad 1.0 \ - --fp16 -" - -DATA_ARGS=" - --data-path $DATA_PATH \ - --vocab-file $VOCAB_FILE \ - --split 949,50,1 -" - -OUTPUT_ARGS=" - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 -" - -torchrun pretrain_bert.py \ - $BERT_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH diff --git a/examples/pretrain_bert_distributed.sh b/examples/pretrain_bert_distributed.sh deleted file mode 100755 index 2e0209ae6b..0000000000 --- a/examples/pretrain_bert_distributed.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -GPUS_PER_NODE=8 -# Change for multinode config -MASTER_ADDR=localhost -MASTER_PORT=6000 -NNODES=1 -NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) - -CHECKPOINT_PATH= -VOCAB_FILE=/bert-vocab.txt -DATA_PATH=_text_sentence - -DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ - --nnodes $NNODES \ - --node_rank $NODE_RANK \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT -" - -BERT_ARGS=" - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --micro-batch-size 4 \ - --global-batch-size 32 \ - --lr 0.0001 \ - --train-iters 1000000 \ - --lr-decay-iters 990000 \ - --lr-decay-style linear \ - --min-lr 1.0e-5 \ - --weight-decay 1e-2 \ - --lr-warmup-fraction .01 \ - --clip-grad 1.0 \ - --fp16 -" - -DATA_ARGS=" - --data-path $DATA_PATH \ - --vocab-file $VOCAB_FILE \ - --split 949,50,1 -" - -OUTPUT_ARGS=" - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 -" - -torchrun $DISTRIBUTED_ARGS pretrain_bert.py \ - $BERT_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --distributed-backend nccl \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH diff --git a/examples/pretrain_bert_distributed_with_mp.sh b/examples/pretrain_bert_distributed_with_mp.sh deleted file mode 100755 index 93a22c95a9..0000000000 --- a/examples/pretrain_bert_distributed_with_mp.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -GPUS_PER_NODE=8 -# Change for multinode config -MASTER_ADDR=localhost -MASTER_PORT=6000 -NNODES=1 -NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) - -CHECKPOINT_PATH= -VOCAB_FILE=/bert-vocab.txt -DATA_PATH=_text_sentence - -DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ - --nnodes $NNODES \ - --node_rank $NODE_RANK \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT -" - -BERT_ARGS=" - --tensor-model-parallel-size 2 \ - 
--pipeline-model-parallel-size 2 \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --micro-batch-size 2 \ - --global-batch-size 16 \ - --lr 0.0001 \ - --train-iters 1000000 \ - --lr-decay-iters 990000 \ - --lr-decay-style linear \ - --min-lr 1.0e-5 \ - --weight-decay 1e-2 \ - --lr-warmup-fraction .01 \ - --clip-grad 1.0 \ - --fp16 -" - -DATA_ARGS=" - --data-path $DATA_PATH \ - --vocab-file $VOCAB_FILE \ - --split 949,50,1 -" - -OUTPUT_ARGS=" - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 -" - -torchrun $DISTRIBUTED_ARGS pretrain_bert.py \ - $BERT_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --distributed-backend nccl \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH diff --git a/examples/pretrain_gpt.sh b/examples/pretrain_gpt.sh deleted file mode 100755 index 1d4b20f004..0000000000 --- a/examples/pretrain_gpt.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -# Runs the "345M" parameter model - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -CHECKPOINT_PATH= -VOCAB_FILE=/gpt2-vocab.json -MERGE_FILE=/gpt2-merges.txt -DATA_PATH=_text_document - -GPT_ARGS=" - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --micro-batch-size 4 \ - --global-batch-size 8 \ - --lr 0.00015 \ - --train-iters 500000 \ - --lr-decay-iters 320000 \ - --lr-decay-style cosine \ - --min-lr 1.0e-5 \ - --weight-decay 1e-2 \ - --lr-warmup-fraction .01 \ - --clip-grad 1.0 \ - --fp16 -" - -DATA_ARGS=" - --data-path $DATA_PATH \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --split 949,50,1 -" - -OUTPUT_ARGS=" - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 -" - -torchrun pretrain_gpt.py \ - $GPT_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH diff --git a/examples/pretrain_gpt3_175B.sh b/examples/pretrain_gpt3_175B.sh deleted file mode 100755 index 98886e1f19..0000000000 --- a/examples/pretrain_gpt3_175B.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - - -#SBATCH --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b - - -DIR=`pwd` -DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` -mkdir -p $DIR/logs - - -DATASET_1="" -DATASET_2="" -DATASET_3="" -DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" - - -options=" \ - --tensor-model-parallel-size 8 \ - --pipeline-model-parallel-size 16 \ - --num-layers 96 \ - --hidden-size 12288 \ - --num-attention-heads 96 \ - --seq-length 2048 \ - --max-position-embeddings 2048 \ - --micro-batch-size 1 \ - --global-batch-size 1536 \ - --rampup-batch-size 16 16 5859375 \ - --train-samples 146484375 \ - --lr-decay-samples 126953125 \ - --lr-warmup-samples 183105 \ - --lr 6.0e-5 \ - --min-lr 6.0e-6 \ - --lr-decay-style cosine \ - --log-interval 10 \ - --eval-iters 40 \ - --eval-interval 1000 \ - --data-path ${DATASET} \ - --vocab-file \ - --merge-file \ - --save-interval 1000 \ - --save \ - --load \ - --split 98,2,0 \ - --clip-grad 1.0 \ - --weight-decay 0.1 \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --init-method-std 0.006 \ - --tensorboard-dir \ - --fp16 " - - -run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}" - - -srun -l \ - --container-image "nvcr.io/nvidia/pytorch:24.01-py3" \ - --container-mounts "" \ - --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}" - - -set +x - diff --git a/examples/pretrain_gpt_distributed.sh 
b/examples/pretrain_gpt_distributed.sh deleted file mode 100755 index effce206d3..0000000000 --- a/examples/pretrain_gpt_distributed.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash - -# Runs the "345M" parameter model - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -GPUS_PER_NODE=8 -# Change for multinode config -MASTER_ADDR=localhost -MASTER_PORT=6000 -NNODES=1 -NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) - -CHECKPOINT_PATH= -VOCAB_FILE=/gpt2-vocab.json -MERGE_FILE=/gpt2-merges.txt -DATA_PATH=_text_document - -DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ - --nnodes $NNODES \ - --node_rank $NODE_RANK \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT -" - -GPT_ARGS=" - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --micro-batch-size 8 \ - --global-batch-size 64 \ - --lr 0.00015 \ - --train-iters 500000 \ - --lr-decay-iters 320000 \ - --lr-decay-style cosine \ - --min-lr 1.0e-5 \ - --weight-decay 1e-2 \ - --lr-warmup-fraction .01 \ - --clip-grad 1.0 \ - --fp16 -" - -DATA_ARGS=" - --data-path $DATA_PATH \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --split 949,50,1 -" - -OUTPUT_ARGS=" - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 -" - -torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ - $GPT_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --distributed-backend nccl \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH diff --git a/examples/pretrain_gpt_distributed_with_mp.sh b/examples/pretrain_gpt_distributed_with_mp.sh deleted file mode 100755 index 470a2560d3..0000000000 --- a/examples/pretrain_gpt_distributed_with_mp.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -# Runs the "345M" parameter model - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -GPUS_PER_NODE=8 -# Change for multinode config -MASTER_ADDR=localhost -MASTER_PORT=6000 -NNODES=1 -NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) - -CHECKPOINT_PATH= -VOCAB_FILE=/gpt2-vocab.json -MERGE_FILE=/gpt2-merges.txt -DATA_PATH=_text_document - -DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ - --nnodes $NNODES \ - --node_rank $NODE_RANK \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT -" - -GPT_ARGS=" - --tensor-model-parallel-size 2 \ - --pipeline-model-parallel-size 2 \ - --sequence-parallel \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --micro-batch-size 4 \ - --global-batch-size 16 \ - --lr 0.00015 \ - --train-iters 500000 \ - --lr-decay-iters 320000 \ - --lr-decay-style cosine \ - --min-lr 1.0e-5 \ - --weight-decay 1e-2 \ - --lr-warmup-fraction .01 \ - --clip-grad 1.0 \ - --fp16 -" - -DATA_ARGS=" - --data-path $DATA_PATH \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --split 949,50,1 -" - -OUTPUT_ARGS=" - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 -" - -torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ - $GPT_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --distributed-backend nccl \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH - diff --git a/examples/pretrain_ict.sh b/examples/pretrain_ict.sh deleted file mode 100755 index 8cba0f08ba..0000000000 --- a/examples/pretrain_ict.sh +++ /dev/null @@ -1,44 +0,0 @@ -#! 
/bin/bash - -# Runs the "217M" parameter biencoder model for ICT retriever - -RANK=0 -WORLD_SIZE=1 - -PRETRAINED_BERT_PATH= -TEXT_DATA_PATH= -TITLE_DATA_PATH= -CHECKPOINT_PATH= - - -python pretrain_ict.py \ - --num-layers 12 \ - --hidden-size 768 \ - --num-attention-heads 12 \ - --tensor-model-parallel-size 1 \ - --micro-batch-size 32 \ - --seq-length 256 \ - --max-position-embeddings 512 \ - --train-iters 100000 \ - --vocab-file bert-vocab.txt \ - --tokenizer-type BertWordPieceLowerCase \ - --DDP-impl torch \ - --bert-load ${PRETRAINED_BERT_PATH} \ - --log-interval 100 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --retriever-report-topk-accuracies 1 5 10 20 100 \ - --retriever-score-scaling \ - --load $CHECKPOINT_PATH \ - --save $CHECKPOINT_PATH \ - --data-path ${TEXT_DATA_PATH} \ - --titles-data-path ${TITLE_DATA_PATH} \ - --lr 0.0001 \ - --lr-decay-style linear \ - --weight-decay 1e-2 \ - --clip-grad 1.0 \ - --lr-warmup-fraction 0.01 \ - --save-interval 4000 \ - --exit-interval 8000 \ - --query-in-block-prob 0.1 \ - --fp16 diff --git a/examples/pretrain_t5.sh b/examples/pretrain_t5.sh deleted file mode 100644 index c44cc5763c..0000000000 --- a/examples/pretrain_t5.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -CHECKPOINT_PATH= -VOCAB_FILE=/t5-vocab.txt -DATA_PATH=_text_sentence - -T5_ARGS=" - --num-layers 12 \ - --hidden-size 768 \ - --num-attention-heads 12 \ - --kv-channels 64 \ - --ffn-hidden-size 3072 \ - --encoder-seq-length 512 \ - --decoder-seq-length 128 \ - --max-position-embeddings 512 \ - --micro-batch-size 16 \ - --global-batch-size 16 \ - --lr 0.0001 \ - --train-iters 1000000 \ - --lr-decay-iters 1000000 \ - --lr-decay-style linear \ - --min-lr 0.00001 \ - --weight-decay 1e-2 \ - --lr-warmup-fraction .01 \ - --clip-grad 1.0 \ - --fp16 \ - --vocab-extra-ids 100 -" - -DATA_ARGS=" - --data-path $DATA_PATH \ - --vocab-file $VOCAB_FILE \ - --split 949,50,1 -" - -OUTPUT_ARGS=" - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 -" - -torchrun pretrain_t5.py \ - $T5_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH diff --git a/examples/pretrain_t5_distributed.sh b/examples/pretrain_t5_distributed.sh deleted file mode 100755 index 03bbf189cf..0000000000 --- a/examples/pretrain_t5_distributed.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -GPUS_PER_NODE=8 -# Change for multinode config -MASTER_ADDR=localhost -MASTER_PORT=6000 -NNODES=1 -NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) - -CHECKPOINT_PATH= -VOCAB_FILE=/t5-vocab.txt -DATA_PATH=_text_sentence - -DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ - --nnodes $NNODES \ - --node_rank $NODE_RANK \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT -" - -T5_ARGS=" - --num-layers 12 \ - --hidden-size 768 \ - --num-attention-heads 12 \ - --kv-channels 64 \ - --ffn-hidden-size 3072 \ - --encoder-seq-length 512 \ - --decoder-seq-length 128 \ - --max-position-embeddings 512 \ - --micro-batch-size 16 \ - --global-batch-size 128 \ - --lr 0.0001 \ - --train-iters 1000000 \ - --lr-decay-iters 1000000 \ - --lr-decay-style linear \ - --min-lr 0.00001 \ - --weight-decay 1e-2 \ - --lr-warmup-fraction .01 \ - --clip-grad 1.0 \ - --fp16 \ - --vocab-extra-ids 100 -" - -DATA_ARGS=" - --data-path $DATA_PATH \ - --vocab-file $VOCAB_FILE \ - --split 949,50,1 -" - -OUTPUT_ARGS=" - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - 
--eval-iters 10 -" - -torchrun $DISTRIBUTED_ARGS pretrain_t5_core.py \ - $T5_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --distributed-backend nccl \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH diff --git a/examples/pretrain_t5_distributed_with_mp.sh b/examples/pretrain_t5_distributed_with_mp.sh deleted file mode 100644 index 9802866263..0000000000 --- a/examples/pretrain_t5_distributed_with_mp.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -GPUS_PER_NODE=8 -# Change for multinode config -MASTER_ADDR=localhost -MASTER_PORT=6000 -NNODES=1 -NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) - -CHECKPOINT_PATH= -VOCAB_FILE=/t5-vocab.txt -DATA_PATH=_text_sentence - -DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ - --nnodes $NNODES \ - --node_rank $NODE_RANK \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT -" - -T5_ARGS=" - --tensor-model-parallel-size 2 \ - --num-layers 12 \ - --hidden-size 768 \ - --num-attention-heads 12 \ - --kv-channels 64 \ - --ffn-hidden-size 3072 \ - --encoder-seq-length 512 \ - --decoder-seq-length 128 \ - --max-position-embeddings 512 \ - --micro-batch-size 16 \ - --global-batch-size 128 \ - --lr 0.0001 \ - --train-iters 1000000 \ - --lr-decay-iters 1000000 \ - --lr-decay-style linear \ - --min-lr 0.00001 \ - --weight-decay 1e-2 \ - --lr-warmup-fraction .01 \ - --clip-grad 1.0 \ - --fp16 \ - --vocab-extra-ids 100 -" - -DATA_ARGS=" - --data-path $DATA_PATH \ - --vocab-file $VOCAB_FILE \ - --split 949,50,1 -" - -OUTPUT_ARGS=" - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 -" - -torchrun $DISTRIBUTED_ARGS pretrain_t5.py \ - $T5_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --distributed-backend nccl \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH diff --git a/examples/pretrain_vision_classify.sh b/examples/pretrain_vision_classify.sh deleted file mode 100755 index 5fcdd6e6ef..0000000000 --- a/examples/pretrain_vision_classify.sh +++ /dev/null @@ -1,64 +0,0 @@ -#! /bin/bash - -# Pre-trains ViT based image classificaation model - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NCCL_IB_SL=1 - -# Training and validation paths should each point to a folder where each -# sub-folder contains a collection of images in jpg or png format -# e.g. 
If using imagenet, one train image might be, train_data/n01688243/n01688243_11301.JPEG -DATA_PATH_TRAIN= -DATA_PATH_VAL= - -CHECKPOINT_PATH= - -CLASSIFIER_ARGS=" - --tensor-model-parallel-size 1 \ - --num-layers 12 \ - --hidden-size 768 \ - --num-attention-heads 12 \ - --patch-dim 4 \ - --seq-length 3136 \ - --max-position-embeddings 3136 \ - --img-h 224 \ - --img-w 224 \ - --mask-factor 1.0 \ - --fp16 \ - --train-iters 750000 \ - --lr-decay-style cosine \ - --micro-batch-size 4 \ - --global-batch-size 1024 \ - --lr 0.0005 \ - --min-lr 0.00001 \ - --attention-dropout 0.0 \ - --weight-decay 0.05 \ - --lr-warmup-iters 12500 \ - --clip-grad 1.0 \ - --no-gradient-accumulation-fusion \ - --num-workers 4 \ - --DDP-impl torch " - -DATA_ARGS=" - --tokenizer-type NullTokenizer \ - --vocab-size 0 \ - --data-path $DATA_PATH_TRAIN $DATA_PATH_VAL \ - --no-data-sharding \ - --split 949,50,1 \ -" - -OUTPUT_ARG=" - --log-interval 32 \ - --save-interval 10000 \ - --eval-interval 2500 \ - --eval-iters 100 \ - --tensorboard-dir ${CHECKPOINT_PATH} \ -" - -torchrun pretrain_vision_classification.py \ - $CLASSIFIER_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH - diff --git a/examples/pretrain_vision_dino.sh b/examples/pretrain_vision_dino.sh deleted file mode 100755 index b047e4e340..0000000000 --- a/examples/pretrain_vision_dino.sh +++ /dev/null @@ -1,67 +0,0 @@ -#! /bin/bash - -# Pre-trains Dino V1 model -# For model details: https://arxiv.org/abs/2104.14294 -# For original author implementation: https://github.com/facebookresearch/dino/tree/main - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NCCL_IB_SL=1 - -# Training and validation paths should each point to a folder where each -# sub-folder contains a collection of images in jpg or png format -# e.g. If using imagenet, one train image might be, train_data/n01688243/n01688243_11301.JPEG -DATA_PATH_TRAIN= -DATA_PATH_VAL= - -CHECKPOINT_PATH= - -DINO_ARGS=" - --vision-pretraining-type dino \ - --tensor-model-parallel-size 1 \ - --num-layers 12 \ - --hidden-size 768 \ - --num-attention-heads 12 \ - --patch-dim 4 \ - --seq-length 3136 \ - --max-position-embeddings 3136 \ - --img-h 224 \ - --img-w 224 \ - --mask-factor 1.0 \ - --fp16 \ - --train-iters 750000 \ - --lr-decay-style cosine \ - --micro-batch-size 4 \ - --global-batch-size 1024 \ - --lr 0.0005 \ - --min-lr 0.00001 \ - --attention-dropout 0.0 \ - --weight-decay 0.05 \ - --lr-warmup-iters 12500 \ - --clip-grad 1.0 \ - --no-gradient-accumulation-fusion \ - --num-workers 4 \ - --DDP-impl torch " - -DATA_ARGS=" - --tokenizer-type NullTokenizer \ - --vocab-size 0 \ - --data-path $DATA_PATH_TRAIN $DATA_PATH_VAL \ - --no-data-sharding \ - --split 949,50,1 \ -" - -OUTPUT_ARG=" - --log-interval 32 \ - --save-interval 10000 \ - --eval-interval 2500 \ - --eval-iters 100 \ - --tensorboard-dir ${CHECKPOINT_PATH} \ -" - -torchrun pretrain_vision_dino.py \ - $DINO_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH - diff --git a/examples/pretrain_vision_inpaint.sh b/examples/pretrain_vision_inpaint.sh deleted file mode 100755 index 01c7e71a9e..0000000000 --- a/examples/pretrain_vision_inpaint.sh +++ /dev/null @@ -1,65 +0,0 @@ -#! /bin/bash - -# Pre-trains ViT based image inpainting model - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NCCL_IB_SL=1 - -# Training and validation paths should each point to a folder where each -# sub-folder contains a collection of images in jpg or png format -# e.g. 
If using imagenet, one train image might be, train_data/n01688243/n01688243_11301.JPEG -DATA_PATH_TRAIN= -DATA_PATH_VAL= - -CHECKPOINT_PATH= - -INPAINT_ARGS=" - --vision-pretraining-type inpaint \ - --tensor-model-parallel-size 1 \ - --num-layers 12 \ - --hidden-size 768 \ - --num-attention-heads 12 \ - --patch-dim 4 \ - --seq-length 3136 \ - --max-position-embeddings 3136 \ - --img-h 224 \ - --img-w 224 \ - --mask-factor 1.0 \ - --fp16 \ - --train-iters 750000 \ - --lr-decay-style cosine \ - --micro-batch-size 4 \ - --global-batch-size 1024 \ - --lr 0.0005 \ - --min-lr 0.00001 \ - --attention-dropout 0.0 \ - --weight-decay 0.05 \ - --lr-warmup-iters 12500 \ - --clip-grad 1.0 \ - --no-gradient-accumulation-fusion \ - --num-workers 4 \ - --DDP-impl torch " - -DATA_ARGS=" - --tokenizer-type NullTokenizer \ - --vocab-size 0 \ - --data-path $DATA_PATH_TRAIN $DATA_PATH_VAL \ - --no-data-sharding \ - --split 949,50,1 \ -" - -OUTPUT_ARG=" - --log-interval 32 \ - --save-interval 10000 \ - --eval-interval 2500 \ - --eval-iters 100 \ - --tensorboard-dir ${CHECKPOINT_PATH} \ -" - -torchrun pretrain_vision_inpaint.py \ - $INPAINT_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH - diff --git a/examples/pretrain_vlm.sh b/examples/pretrain_vlm.sh deleted file mode 100755 index c74cf1eff6..0000000000 --- a/examples/pretrain_vlm.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash - -# Train a vision language model. -# Default arguments here use a mock dataset. Please edit the arguments to your liking. - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -# Check that the user has set an output path for model checkpoints. -if [[ -z $CHECKPOINT_PATH ]]; then - echo "Please set CHECKPOINT_PATH for storing your model checkpoints." - exit 1 -fi - -DISTRIBUTED_ARGS=" - --nproc_per_node 8 \ -" - -# Note: the learning rate and other hyperparameters used here are just examples and not optimized in any way. -GPT_ARGS=" - --num-layers 24 \ - --hidden-size 512 \ - --num-attention-heads 16 \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --micro-batch-size 2 \ - --global-batch-size 16 \ - --lr 0.00015 \ - --train-iters 10000 \ - --lr-decay-iters 3200 \ - --lr-decay-style cosine \ - --min-lr 1.0e-5 \ - --weight-decay 1e-2 \ - --lr-warmup-fraction .01 \ - --clip-grad 1.0 \ - --fp16 -" - -IMG_ARGS=" - --img-h 336 \ - --img-w 336 \ - --patch-dim 14 -" - -DATA_ARGS=" - --split 949,50,1 - --tokenizer-type NullTokenizer - --vocab-size=8192 -" - -OUTPUT_ARGS=" - --log-interval 100 \ - --save-interval 5000 \ - --eval-interval 1000 \ - --eval-iters 10 -" - -# Select one of the cases below. - -# Multi GPU -# torchrun $DISTRIBUTED_ARGS \ - -# Single GPU -# CUDA_VISIBLE_DEVICES=0 python -u \ - -# Single GPU with a debugger -# CUDA_VISIBLE_DEVICES=0 python -u -m debugpy --listen 0.0.0.0:5678 --wait-for-client \ - -torchrun $DISTRIBUTED_ARGS \ - pretrain_vlm.py \ - $GPT_ARGS \ - $IMG_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --distributed-backend nccl \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH diff --git a/pretrain_ict.py b/pretrain_ict.py index 0ae9059273..205588b5e9 100644 --- a/pretrain_ict.py +++ b/pretrain_ict.py @@ -158,6 +158,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): if __name__ == "__main__": + print_rank_0("WARNING : This script is DEPRECATED. 
Will be removed in mcore release 0.9") pretrain(train_valid_test_datasets_provider, pretrain_ict_model_provider, ModelType.encoder_or_decoder, diff --git a/report_theoretical_memory.py b/tools/report_theoretical_memory.py similarity index 100% rename from report_theoretical_memory.py rename to tools/report_theoretical_memory.py From 32363998af1cf8c37ea7f8633743e1e2d5d79f8c Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Fri, 7 Jun 2024 10:12:13 -0700 Subject: [PATCH 72/92] Multimodal example - Dataset files --- examples/multimodal/dataloader_provider.py | 131 ++++++ examples/multimodal/dataset_helpers.py | 513 +++++++++++++++++++++ examples/multimodal/manual_prompts.json | 29 ++ examples/multimodal/pretrain_8b.sh | 4 +- examples/multimodal/pretrain_dataset.yaml | 15 + examples/multimodal/sft_8b.sh | 9 +- examples/multimodal/sft_dataset.yaml | 15 + examples/multimodal/train.py | 5 +- 8 files changed, 713 insertions(+), 8 deletions(-) create mode 100644 examples/multimodal/dataloader_provider.py create mode 100644 examples/multimodal/dataset_helpers.py create mode 100644 examples/multimodal/manual_prompts.json create mode 100644 examples/multimodal/pretrain_dataset.yaml create mode 100644 examples/multimodal/sft_dataset.yaml diff --git a/examples/multimodal/dataloader_provider.py b/examples/multimodal/dataloader_provider.py new file mode 100644 index 0000000000..5fcdb458bf --- /dev/null +++ b/examples/multimodal/dataloader_provider.py @@ -0,0 +1,131 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import torch +from dataset_helpers import TaskEncoder, print_error_handler + +from megatron.core import mpu +from megatron.energon import ( + LimitDataset, + RepeatDataset, + WorkerConfig, + get_loader, + get_savable_loader, + get_train_dataset, + get_val_datasets, +) +from megatron.training import get_args, get_num_microbatches, print_rank_0 +from megatron.training.checkpointing import get_checkpoint_name + + +def datasets_provider(worker_config=None): + """Create multimodal train, validation and test datasets.""" + args = get_args() + dname = args.data_path[0] if type(args.data_path) is list else args.data_path + train_dataset = get_train_dataset( + dname, + batch_size=args.micro_batch_size, + task_encoder=TaskEncoder(), + worker_config=worker_config, + virtual_epoch_length=1000, + max_samples_per_sequence=100, + shuffle_buffer_size=100, + handler=print_error_handler, + image_decode="pil", + ) + + val_datasets = get_val_datasets( + dname, + batch_size=args.micro_batch_size, + # This is the total number over all workers + # limit=args.eval_iters * get_num_microbatches(), + task_encoder=TaskEncoder(), + worker_config=worker_config, + handler=print_error_handler, + image_decode="pil", + ) + val_datasets_without_source_datasets = [ + # Limit the dataset to eval_iters * num_microbatches + LimitDataset( + # Repeat the inner dataset in case it's too short + RepeatDataset(val_ds, worker_config=worker_config), + length=args.eval_iters * get_num_microbatches(), + worker_config=worker_config, + reset_after_epoch=True, + ) + for val_ds, _src_ds in val_datasets + ] + + return train_dataset, val_datasets_without_source_datasets, None + + +def train_valid_test_dataloaders_provider(train_val_test_num_samples): + """Build multimodal train, validation and test dataloaders.""" + args = get_args() + + worker_debug_path = None + worker_log_level = 0 + + rank = mpu.get_data_parallel_rank() + world_size = mpu.get_data_parallel_world_size() + data_parallel_group = mpu.get_data_parallel_group() + 
+ worker_config = WorkerConfig( + rank=rank, + world_size=world_size, + num_workers=args.num_workers, + data_parallel_group=data_parallel_group, + worker_debug_path=worker_debug_path, + worker_log_level=worker_log_level, + ) + train_ds, valid_ds1, test_ds = datasets_provider(worker_config) + + train_dataloader = get_savable_loader(train_ds, worker_config=worker_config) + if args.load is not None: + if hasattr(args, "dataloader_path"): + dp_rank = ( + mpu.get_data_parallel_rank() + if torch.distributed.is_initialized() + else 0 + ) + data_save_name = get_checkpoint_name( + args.dataloader_path, + args.iteration, + save_basename=f"train_dataloader_dprank{dp_rank:03d}.pt", + ) + try: + dataset_state_dict = torch.load( + data_save_name, map_location="cpu" + ) + if ( + "dataset_state_dict" in dataset_state_dict.keys() + and dataset_state_dict["train_data_path"] + != args.train_data_path + ): + print_rank_0( + f"Not restoring dataset state from {data_save_name}, path to dataset changed from {dataset_state_dict['train_data_path']} to {args.train_data_path}" + ) + else: + train_dataloader.restore_state_rank( + dataset_state_dict["dataloader_state_dict"] + ) + print_rank_0( + f"restoring dataset state from {data_save_name}" + ) + except Exception as e: + print_rank_0( + "loading dataloader checkpoint failed. Skipping. " + str(e) + ) + + valid_dataloader = [ + iter(cyclic_iter(get_loader(valid_ds, worker_config=worker_config))) + for valid_ds in valid_ds1 + ] + test_dataloader = None + + return iter(cyclic_iter(train_dataloader)), valid_dataloader, iter(cyclic_iter(test_dataloader)) + + + +def cyclic_iter(iter): + while True: + for x in iter: + yield x diff --git a/examples/multimodal/dataset_helpers.py b/examples/multimodal/dataset_helpers.py new file mode 100644 index 0000000000..74d7aa990e --- /dev/null +++ b/examples/multimodal/dataset_helpers.py @@ -0,0 +1,513 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import dataclasses +import json +import random +import re +import sys +import traceback +from dataclasses import dataclass +from typing import Any, List, Dict, Optional, Tuple, Union + +import numpy as np +import torch +from PIL import Image, ImageDraw +from torchvision import transforms as T +from torchvision.transforms import Compose, RandAugment, RandomResizedCrop, Resize, ToPILImage + +from megatron.core import mpu +from megatron.energon import Batch, CaptioningSample, DefaultTaskEncoder, OCRSample, VQASample +from megatron.energon.transforms import CustomTransform, MergeTransform +from megatron.training import get_args +from megatron.training.tokenizer import build_tokenizer + +try: + from torchvision.transforms import InterpolationMode + BICUBIC = InterpolationMode.BICUBIC +except ImportError: + BICUBIC = Image.BICUBIC + + +# Imagenet's mean and std. 
+pixel_mean = [123.675, 116.28, 103.53] +pixel_std = [58.395, 57.12, 57.375] + + +def convert_to_rgb(image): + return image.convert("RGB") + +def _transform_train(img_h, img_w): + return Compose([ + ToPILImage(), + RandomResizedCrop((img_h, img_w), scale=(0.5, 1.0)), + convert_to_rgb, + ]) + +def _transform_train_aug(img_h, img_w): + return Compose([ + ToPILImage(), + RandomResizedCrop((img_h, img_w), scale=(0.5, 1.0)), + convert_to_rgb, + RandAugment(2, 5, isPIL=True, augs=['Identity', 'AutoContrast', 'Brightness', 'Sharpness', 'Equalize', + 'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate']), + ]) + +def _transform_test(img_h, img_w): + return Compose([ + ToPILImage(), + Resize((img_h, img_w)), + convert_to_rgb, + ]) + +class RandomResize(CustomTransform): + """Resizes the image by a random scale factor in the given interval, but at most max_size""" + + def __init__(self, min_scale: float, max_scale: float, max_size: int): + self._min_scale = min_scale + self._max_scale = max_scale + self._max_size = max_size + + def apply_transform(self, matrix: np.ndarray, dst_size: np.ndarray) -> Tuple[Any, Any, Any]: + scale = random.uniform(self._min_scale, self._max_scale) + new_size = tuple(int(x * scale) for x in dst_size) + + if max(new_size) > self._max_size: + scale = self._max_size / max(new_size) + new_size = tuple(int(x * scale) for x in dst_size) + + matrix = self.scale(scale, scale) @ matrix + dst_size = np.array(new_size, dtype=dst_size.dtype) + + return matrix, dst_size, (self.__class__.__name__, scale) + + +class RandomResizeLongEdge(CustomTransform): + """Resizes the image's longer edge to a random length between min_size and max_size pixels.""" + + def __init__(self, min_size: int, max_size: int): + self._min_size = min_size + self._max_size = max_size + + def apply_transform(self, matrix: np.ndarray, dst_size: np.ndarray) -> Tuple[Any, Any, Any]: + new_long = random.randint(self._min_size, self._max_size) + if dst_size[0] > dst_size[1]: # h > w + new_w, new_h = int(new_long * dst_size[1] / dst_size[0]), new_long + else: # w > h + new_w, new_h = new_long, int(new_long * dst_size[0] / dst_size[1]) + + new_size = (new_h, new_w) + matrix = self.scale(new_w / dst_size[1], new_h / dst_size[0]) @ matrix + dst_size = np.array(new_size, dtype=dst_size.dtype) + + return matrix, dst_size, (self.__class__.__name__, new_size) + + +class RandomPad(CustomTransform): + """Pads the image to the given size, randomly choosing the position of the image within the new larger image. 
+ If the image is already larger than the given size, it will not be padded in that direction(s).""" + + def __init__(self, size: Tuple[int, int]): + self._new_size = size # h, w + + def apply_transform(self, matrix: np.ndarray, dst_size: np.ndarray) -> Tuple[Any, Any, Any]: + h_pad = max(self._new_size[0] - dst_size[0], 0) + w_pad = max(self._new_size[1] - dst_size[1], 0) + + if h_pad == 0 and w_pad == 0: + return matrix, dst_size, (self.__class__.__name__, None) + else: + # TODO: fix me + # top = random.randint(0, h_pad) + # left = random.randint(0, w_pad) + top = 0 + left = 0 + + matrix = self.translate(left, top) @ matrix + dst_size = np.array(self._new_size, dtype=dst_size.dtype) + return matrix, dst_size, (self.__class__.__name__, (top, left)) + + +def _get_ocr_document_visual_transform(IMG_H=1024, IMG_W=1024): + document_visual_transform = T.Compose( + [ + MergeTransform( + [ + # T.RandomResizedCrop(size=FINAL_SIZE, scale=(0.5, 1.0), ratio=(0.8, 1.2)), + RandomResizeLongEdge(960, 1008), # Note: 1008 comes from list(range(960, 1024, 16))[-1] + T.RandomRotation(5, interpolation=T.InterpolationMode.BILINEAR), + T.RandomPerspective(distortion_scale=0.1, p=0.1), + RandomPad((IMG_H, IMG_W)), + ] + ), + T.ColorJitter(brightness=(0.8, 1.2), contrast=(0.7, 1.0)), + T.RandomGrayscale(p=0.5), + T.RandomInvert(p=0.5), + T.RandomAdjustSharpness(sharpness_factor=0.0, p=0.5), + T.RandomAdjustSharpness(sharpness_factor=2.0, p=0.5), + # LogImage(), + # T.ToTensor(), + # T.Normalize(IMAGE_MEAN, IMAGE_STD), + ] + ) + return document_visual_transform + +def _get_ocr_document_identity_transform(IMG_H=1024, IMG_W=1024): + long_edge = max(IMG_H, IMG_W) + document_identity_transform = T.Compose( + [ + MergeTransform( + [ + RandomResizeLongEdge(long_edge, long_edge), + RandomPad((long_edge, long_edge)), + ] + ) + ] + ) + return document_identity_transform + +def _get_ocr_paragraph_visual_transform(IMG_H=1024, IMG_W=1024): + paragraph_visual_transform = T.Compose( + [ + MergeTransform( + [ + # T.RandomResizedCrop(size=FINAL_SIZE, scale=(0.5, 1.0), ratio=(0.8, 1.2)), + RandomResize(0.5, 2.0, min(IMG_H, IMG_W)), #FINAL_SIZE), + T.RandomRotation(1, interpolation=T.InterpolationMode.BILINEAR), + T.RandomPerspective(distortion_scale=0.1, p=0.1), + RandomPad((IMG_H, IMG_W)), + ] + ), + T.ColorJitter(brightness=(0.8, 1.2), contrast=(0.7, 1.0)), + T.RandomGrayscale(p=0.5), + T.RandomInvert(p=0.5), + # T.RandomAdjustSharpness(sharpness_factor=0.0, p=0.5), + # T.RandomAdjustSharpness(sharpness_factor=2.0, p=0.5), + # LogImage(), + # T.ToTensor(), + # T.Normalize(IMAGE_MEAN, IMAGE_STD), + ] + ) + return paragraph_visual_transform + +# Type for intermediate batch, after batch() +@dataclass +class ImageTaskSample: + __key__: str + __subflavors__: Dict + # (c, h, w) + img: torch.Tensor + text: np.ndarray + prompt_len: np.int64 + img_clip: Optional[torch.Tensor] = None + + +# Typing for the resulting batch data after encode_batch() +@dataclass +class ImageTaskBatch(Batch): + __keys__: List[str] + __subflavors__: List[Dict] + # (n, c, h, w) + img: torch.Tensor + # (n, seq_len) + text: torch.Tensor + # (n, 1) + prompt_len: torch.Tensor + # (n, c, h, w) + img_clip: Optional[torch.Tensor] = None + + +class IdentitySplitter(object): + def tokenize(self, *text): + return text + + +class Tokenizer: + def __init__(self): + + args = get_args() + self.args = args + + self.IMAGE_TOKEN_INDEX = -200 + self.initializer() + + def initializer(self): + # Use Encoder class as a container for global data + Tokenizer.tokenizer = 
build_tokenizer(self.args) + self.eod_token = Tokenizer.tokenizer.eod + self.split_token = 313131 + + if ( + hasattr(self.args, "split_sentences") and self.args.split_sentences + ): # default false + if not nltk_available: + print("NLTK is not available to split sentences.") + exit() + library = "tokenizers/punkt/{}.pickle".format("english") + # print("loading: " + library) + splitter = nltk.load(library) + if self.args.keep_newlines: + # this prevents punkt from eating newlines after sentences + Tokenizer.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer( + train_text=splitter._params, lang_vars=CustomLanguageVars() + ) + else: + Tokenizer.splitter = splitter + else: + Tokenizer.splitter = IdentitySplitter() + + def __call__(self, text: str, padded: bool = True): # -> torch.Tensor: + sentence = Tokenizer.splitter.tokenize(text)[0] + sentence = Tokenizer.tokenizer.tokenize(sentence) + return sentence + + def pad(self, content, seq_len=1024): + out = np.pad(content, pad_width=(0,max(0,seq_len-len(content))), mode='constant', constant_values=self.eod_token) + + return out + + +class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatch, dict]): + """A simple task encoder for captioning.""" + + def __init__( + self + ): + # Specify the batch_type for default batching (batching is performed here "manually" by + # overwriting the `batch` method) + super().__init__() + + self.args = get_args() + + self.tokenizer = Tokenizer() + self.manual_prompts = json.load(open(self.args.prompt_path)) + self.seq_len = self.args.seq_length + + self.txt_to_token_dict = {} + + self.img_h, self.img_w = self.args.img_h, self.args.img_w + + self.pixel_mean = torch.Tensor(pixel_mean).view(-1, 1, 1) + self.pixel_std = torch.Tensor(pixel_std).view(-1, 1, 1) + + self.ocr_document_visual_transform = _get_ocr_document_visual_transform(self.img_h, self.img_w) + self.ocr_document_identity_transform = _get_ocr_document_identity_transform(self.img_h, self.img_w) + self.ocr_paragraph_visual_transform = _get_ocr_paragraph_visual_transform(self.img_h, self.img_w) + + + def get_visual_transform(self, img_sample, sample_augmentation=False): + raw_h, raw_w = img_sample.shape[0], img_sample.shape[1] + ratio = float(max(self.img_h, self.img_w)) / max(raw_h, raw_w) + scaled_h, scaled_w = int(raw_h * ratio + 0.5), int(raw_w * ratio + 0.5) + + # if the sample needs augmentation or not + if sample_augmentation: + # further check if augmentation is a global flag in args + if self.args.aug: + visual_transform = _transform_train_aug(scaled_h, scaled_w) + else: + visual_transform = _transform_train(scaled_h, scaled_w) + else: + visual_transform = _transform_test(scaled_h, scaled_w) + + img = visual_transform(img_sample) + + # Normalize pixel values. + img = (torch.Tensor(np.array(img)).permute(2, 0, 1) - self.pixel_mean) / self.pixel_std + + # Pad to target image size. 
+ delta_h, delta_w = self.img_h - scaled_h, self.img_w - scaled_w + img = torch.nn.functional.pad(img, (0, delta_w, 0, delta_h)) + + return img + + def encode_sample(self, sample: Union[ + CaptioningSample, OCRSample, VQASample] + ): + + if isinstance(sample, OCRSample): + yield self.encode_ocr(sample) + + elif isinstance(sample, CaptioningSample): + yield self.encode_captioning(sample) + + elif isinstance(sample, VQASample): + yield self.encode_vqa(sample) + + else: + raise NotImplementedError('Sample format not supported') + yield None + + def encode_captioning(self, sample: CaptioningSample): + sample_augmentation = sample.__subflavors__["augmentation"] == True + + img = self.get_visual_transform(np.array(sample.image), sample_augmentation=sample_augmentation) + + # randomly select a prompt + if 'CaptioningDetailed' in sample.__subflavors__["type"]: + prompt_idx = np.random.randint(len(self.manual_prompts["CaptioningDetailed"]["raw"])) + cur_prompt = self.manual_prompts["CaptioningDetailed"]["raw"][prompt_idx] + else: + prompt_idx = np.random.randint(len(self.manual_prompts["Captioning"]["raw"])) + cur_prompt = self.manual_prompts["Captioning"]["raw"][prompt_idx] + + if cur_prompt not in self.txt_to_token_dict: + self.txt_to_token_dict[cur_prompt] = self.tokenizer(cur_prompt) + cur_prompt = self.txt_to_token_dict[cur_prompt] + + prompt_len = len(cur_prompt) + + caption = sample.caption + if 'SplitByLine' in sample.__subflavors__["type"]: + # caption = re.sub(r"\n+", "\n", caption) + caption_list = caption.split('\n') + caption_list = [caption for caption in caption_list if caption.strip() != ''] + caption = np.random.choice(caption_list) + caption_token = self.tokenizer(caption.strip()) + + if len(caption.strip()) == 0: + raise RuntimeError('Empty string in caption!') + + seq_len = self.seq_len + 4 + text_sample = np.concatenate([[self.tokenizer.IMAGE_TOKEN_INDEX], cur_prompt, caption_token]) + text_sample = self.tokenizer.pad(text_sample, seq_len) + text_sample = text_sample[:seq_len] + + return ImageTaskSample( + __key__=sample.__key__, + __subflavors__=sample.__subflavors__, + img=img, + text=text_sample, + prompt_len=prompt_len + ) + + def encode_vqa(self, sample: VQASample): + task_name = None + + no_image_flag = True if '-noimage' in sample.__key__ else False + + if 'pretrain' in sample.__key__: + task_name = 'pretrain' + else: + task_name = sample.__key__.split("/")[0] + + sample_augmentation = sample.__subflavors__["augmentation"] == True + + if no_image_flag: + img = torch.from_numpy(np.array([0]).astype(np.float32)) + else: + img = self.get_visual_transform(np.array(sample.image), sample_augmentation=sample_augmentation) + + if "" in sample.context: + sample.context = sample.context.replace("","") + + if task_name != 'pretrain' and sample.context[-1:] != "\n": + sample.context = sample.context + "\n" + + question_token = self.tokenizer(sample.context) + if isinstance(sample.answers, list): + answer_list = sample.answers + weight_list = np.array(sample.answer_weights).astype(np.float32) + weight_list = weight_list / np.sum(weight_list) + answer_idx = np.random.choice(weight_list.shape[0], 1, p=weight_list)[0] + answer = answer_list[answer_idx] + answer_token = self.tokenizer(answer) + else: + answer_token = self.tokenizer(sample.answers) + + prompt_len = len(question_token) + + seq_len = self.seq_len + 4 + + text_sample = np.concatenate([[self.tokenizer.IMAGE_TOKEN_INDEX], question_token, answer_token]) + text_sample = self.tokenizer.pad(text_sample, seq_len) + + return 
ImageTaskSample( + __key__=sample.__key__, + __subflavors__=sample.__subflavors__, + img=img, + text=text_sample, + prompt_len=prompt_len + ) + + def encode_ocr(self, sample: OCRSample) -> ImageTaskSample: + if sample.__subflavors__["type"] == "document": + visual_transform = self.ocr_document_visual_transform + elif sample.__subflavors__["type"] == "paragraph": + visual_transform = self.ocr_paragraph_visual_transform + elif sample.__subflavors__["augmentation"] == False: + visual_transform = self.ocr_document_identity_transform + else: + raise ValueError(f"Unknown subflavor {sample.__subflavors__}") + + if sample.words_boxes is not None and sample.words_boxes.shape[1] >= 5: + # Boxes with conf below 0.9 are skipped + filter_words_mask = sample.words_boxes[:, 4] < 0.9 + filter_boxes = sample.words_boxes[filter_words_mask, :4] + for x, y, x2, y2 in filter_boxes: + if isinstance(sample.image, Image.Image): + draw = ImageDraw.Draw(sample.image) + draw.rectangle([int(x), int(y), (int(x2), int(y2))], fill=0) + else: + sample.image[:, int(y) : int(y2) + 1, int(x) : int(x2) + 1] = 0 + + text = " ".join( + text for skip, text in zip(filter_words_mask, sample.words_text) if not skip + ) + else: + text = " ".join(sample.text.splitlines()) + + match = re.search(r'"text_sequence": "(.*?)"', text) + if match: + text = match.group(1) + + img = visual_transform(sample.image) + img_clip = None + img = (torch.Tensor(np.array(img)).permute(2, 0, 1) - self.pixel_mean) / self.pixel_std + img = torch.nn.functional.pad(img, (0, self.img_w - img.shape[2], 0, self.img_h - img.shape[1])) + + # randomly select a prompt + prompt_idx = np.random.randint(len(self.manual_prompts["OCR"]["raw"])) + cur_prompt = self.manual_prompts["OCR"]["raw"][prompt_idx] + + if cur_prompt not in self.txt_to_token_dict: + self.txt_to_token_dict[cur_prompt] = self.tokenizer(cur_prompt) + cur_prompt = self.txt_to_token_dict[cur_prompt] + + text_sample = self.tokenizer(text) + prompt_len = len(cur_prompt) + seq_len = self.seq_len + 4 + text_sample = np.concatenate([cur_prompt, text_sample]) + text_sample = self.tokenizer.pad(text_sample, seq_len=seq_len) + text_sample = text_sample[:seq_len] + + return ImageTaskSample( + __key__=sample.__key__, + __subflavors__=sample.__subflavors__, + img=img, + img_clip=img_clip, + text=text_sample, + prompt_len=prompt_len + ) + + def batch(self, samples: List[ImageTaskSample]) -> ImageTaskBatch: + batch = ImageTaskBatch( + __keys__=[s.__key__ for s in samples], + __subflavors__=[s.__subflavors__ for s in samples], + img=torch.stack([s.img for s in samples]), + text=torch.from_numpy(np.stack([s.text for s in samples], axis=0).astype(np.int64)), + prompt_len=torch.from_numpy(np.array([s.prompt_len for s in samples], dtype=np.int64)) + ) + + return batch + + def encode_batch(self, batch: ImageTaskBatch) -> dict: + raw = dataclasses.asdict(batch) + del raw["__subflavors__"] + return raw + + +def print_error_handler(exc: Exception, key: Optional[str]): + print( + f"The following exception occurred in the dataloader for sample {key} and is skipped", + file=sys.stderr, + ) + traceback.print_exc() diff --git a/examples/multimodal/manual_prompts.json b/examples/multimodal/manual_prompts.json new file mode 100644 index 0000000000..e4bf3e493a --- /dev/null +++ b/examples/multimodal/manual_prompts.json @@ -0,0 +1,29 @@ +{ + "Captioning": { + "raw": [ + "Can you briefly explain what you see in the image?", + "Describe what's happening in this image in one short sentence.", + "Write a short caption that accurately 
represents the content of this image.", + "Please generate a descriptive caption for the image provided.", + "How would you summarize the scene depicted in the picture in short?" + ] + }, + "OCR": { + "raw": [ + "Can you read the text from image and output here?", + "Extract and document the text from the provided image.", + "Converting the text embedded in this image into a readable document.", + "Transcribe all the text you find.", + "Can you extract all visible text from the image here?" + ] + }, + "VQA": { + "raw": [ + "Given the image, answer the following question with few words.", + "Answer the following question: ", + "What is the answer to this question?", + "Write the answer: ", + "Please answer this question: " + ] + } +} diff --git a/examples/multimodal/pretrain_8b.sh b/examples/multimodal/pretrain_8b.sh index efa638360e..dc1f5ce89c 100755 --- a/examples/multimodal/pretrain_8b.sh +++ b/examples/multimodal/pretrain_8b.sh @@ -48,7 +48,7 @@ else BZ=256 NW=2 HD=0.1 - LI=1 + LI=10 EXTRA_ARGS="" NONDETERMINISTIC_ATTN=1 fi @@ -88,7 +88,6 @@ OPTIONS=" \ --data-path ${DATA_TRAIN} \ --valid-path ${DATA_VALID} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ - --dataset-config ${SOURCE}/examples/multimodal/dataset_config.yaml \ --save-interval 1000 \ --save ${FINETUNE_DIR} \ --load ${CHECKPOINT_DIR} \ @@ -115,6 +114,7 @@ OPTIONS=" \ ${EXTRA_ARGS} \ --distributed-timeout-minutes 60 \ --allow-missing-vision-projection-checkpoint \ + --use-te " export NVTE_APPLY_QK_LAYER_SCALING=1 diff --git a/examples/multimodal/pretrain_dataset.yaml b/examples/multimodal/pretrain_dataset.yaml new file mode 100644 index 0000000000..5c6660b95e --- /dev/null +++ b/examples/multimodal/pretrain_dataset.yaml @@ -0,0 +1,15 @@ +__module__: megatron.energon +__class__: Metadataset +splits: + train: + datasets: + - weight: 1. + path: /workspace/data/pretrain/train/dataset + subflavors: + augmentation: false + val: + datasets: + - weight: 1. + path: /workspace/data/pretrain/validation/dataset + subflavors: + augmentation: false \ No newline at end of file diff --git a/examples/multimodal/sft_8b.sh b/examples/multimodal/sft_8b.sh index a88c51870e..4c026a7de0 100755 --- a/examples/multimodal/sft_8b.sh +++ b/examples/multimodal/sft_8b.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Run SFT on a multimodal model. +# Run SFT on a pretrained multimodal model. 
export NCCL_IB_SL=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 @@ -41,11 +41,13 @@ DEBUG=0 if [[ $DEBUG -eq 1 ]]; then BZ=8 NW=1 + LI=1 HD=0.0 EXTRA_ARGS="" else BZ=128 NW=1 + LI=10 HD=0.1 EXTRA_ARGS="" fi @@ -76,7 +78,7 @@ OPTIONS=" \ --lr 1e-6 \ --min-lr 1e-7 \ --lr-decay-style cosine \ - --log-interval 10 \ + --log-interval ${LI} \ --eval-iters 10 \ --eval-interval 1000 \ --tokenizer-type GPTSentencePieceTokenizer \ @@ -84,7 +86,6 @@ OPTIONS=" \ --data-path ${DATA_TRAIN} \ --valid-path ${DATA_VALID} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ - --dset-config ${SOURCE}/examples/multimodal/dataset_config.yaml \ --save-interval 1000 \ --exit-duration-in-mins 230 \ --save ${FINETUNE_DIR} \ @@ -115,4 +116,4 @@ OPTIONS=" \ export NVTE_APPLY_QK_LAYER_SCALING=1 # MULTI GPU -torchrun --nproc_per_node 8 pretrain_multimodal.py ${OPTIONS} +torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS} diff --git a/examples/multimodal/sft_dataset.yaml b/examples/multimodal/sft_dataset.yaml new file mode 100644 index 0000000000..83230a9cd2 --- /dev/null +++ b/examples/multimodal/sft_dataset.yaml @@ -0,0 +1,15 @@ +__module__: megatron.energon +__class__: Metadataset +splits: + train: + datasets: + - weight: 1. + path: /workspace/data/sft/train/dataset + subflavors: + augmentation: false + val: + datasets: + - weight: 1. + path: /workspace/data/sft/validation/dataset + subflavors: + augmentation: false \ No newline at end of file diff --git a/examples/multimodal/train.py b/examples/multimodal/train.py index 2a448f248b..d20f469602 100644 --- a/examples/multimodal/train.py +++ b/examples/multimodal/train.py @@ -19,6 +19,7 @@ from layer_specs import get_layer_spec, get_mlp_module_spec, get_layer_spec_te from megatron.training import pretrain from megatron.training.utils import average_losses_across_data_parallel_group +from dataloader_provider import train_valid_test_dataloaders_provider def model_provider(pre_process=True, post_process=True, parallel_output=True) -> LLaVAModel: @@ -291,10 +292,10 @@ def add_multimodal_extra_args(parser): if __name__ == "__main__": - train_valid_test_datasets_provider.is_distributed = True + train_valid_test_dataloaders_provider.is_distributed = True pretrain( - train_valid_test_datasets_provider, + train_valid_test_dataloaders_provider, model_provider, ModelType.encoder_or_decoder, forward_step, From c241c617bc6175abb888468992a03eff90da733f Mon Sep 17 00:00:00 2001 From: Markus Kliegl Date: Fri, 7 Jun 2024 10:37:59 -0700 Subject: [PATCH 73/92] Change the default for --split to None --- megatron/training/arguments.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index dc23152889..ae0e2b599c 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -231,6 +231,13 @@ def validate_args(args, defaults={}): else: setattr(args, key, defaults[key]) + if args.data_path is not None and args.split is None: + legacy_default_split_value = '969, 30, 1' + if args.rank == 0: + print('WARNING: Please specify --split when using --data-path. Using legacy default value ' + f'of "{legacy_default_split_value}"') + args.split = legacy_default_split_value + # Batch size. assert args.micro_batch_size is not None assert args.micro_batch_size > 0 @@ -1411,7 +1418,7 @@ def _add_data_args(parser): '(3) a list of prefixes e.g. prefix1 prefix2. ' 'For (3), weights are inferred from the lengths of the contributing datasets. 
' 'This argument is exclusive to the other independent --*-data-path arguments.') - group.add_argument('--split', type=str, default='969, 30, 1', + group.add_argument('--split', type=str, default=None, help='Comma-separated list of proportions for training,' ' validation, and test split. For example the split ' '`90,5,5` will use 90%% of data for training, 5%% for ' From e8ad5be08c7eb0ce9a8611ebfda18d03d4e27f70 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 7 Jun 2024 11:11:50 -0700 Subject: [PATCH 74/92] Updates jet token used for summaries to one pulled from vault --- jet-tests.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/jet-tests.yml b/jet-tests.yml index 1a5bc3e1ae..4737a62050 100644 --- a/jet-tests.yml +++ b/jet-tests.yml @@ -5,6 +5,11 @@ - if: $JET_CUSTOM_FILTER != "" && $CI_PIPELINE_SOURCE != 'merge_request_event' - when: never +default: + id_tokens: + VAULT_JWT_TOKEN: + aud: https://stg.vault.nvidia.com + include: - project: dl/jet/gitlab-templates ref: main @@ -62,7 +67,7 @@ jet-results-summary: - os/linux needs: [ jet-trigger ] before_script: - - jet secrets jwt-login jwt/nvidia/gitlab-master adlr-megatron-lm-ci $CI_JOB_JWT + - jet secrets jwt-login jwt/nvidia/gitlab-master adlr-megatron-lm-ci $VAULT_JWT_TOKEN script: - python -m pip install -U --no-cache-dir prettytable - rc=0 From d0513c1e6ff46eb3b015b3aca3358eb3264c6a39 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy Date: Fri, 7 Jun 2024 17:05:33 -0700 Subject: [PATCH 75/92] Fix tests. --- .../pipeline_parallel/test_schedules.py | 58 +++++++++++-------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/tests/unit_tests/pipeline_parallel/test_schedules.py b/tests/unit_tests/pipeline_parallel/test_schedules.py index 02bdd2882b..5dd6605d68 100644 --- a/tests/unit_tests/pipeline_parallel/test_schedules.py +++ b/tests/unit_tests/pipeline_parallel/test_schedules.py @@ -25,7 +25,7 @@ def test_deallocate_output_tensor(): out = torch.tensor([[1, 2, 3], [4, 5, 6]]) schedule.deallocate_output_tensor(out) assert(out.nelement() == 6) -""" + def test_forward_backward_func_without_pipeline_parallel(mocker): from megatron.core.pipeline_parallel import get_forward_backward_func @@ -56,19 +56,22 @@ def set_input_tensor(input_tensor): losses_reduced = forward_backward_func( forward_step_func=forward_step_func, - data_iterator=None, + data_iterator=range(0,100), model=[model], num_microbatches=4, seq_length=None, micro_batch_size=None, - forward_only=False) + forward_only=True) + loss_reduced_expected = [{'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}] + for i,j in zip(losses_reduced, loss_reduced_expected): print(losses_reduced) assert(i['loss_reduced'] == j['loss_reduced']) Utils.destroy_model_parallel() + def test_forward_backward_func_with_pipeline_parallel(mocker): from megatron.core.pipeline_parallel import get_forward_backward_func @@ -96,14 +99,15 @@ def set_input_tensor(input_tensor): config = ModelParallelConfig( pipeline_model_parallel_size = 4, - sequence_parallel = False + sequence_parallel = False, + pipeline_dtype=torch.float, ) + config.hidden_size = hidden_size model.config = config losses_reduced = forward_backward_func( forward_step_func=forward_step_func, data_iterator=None, - dtype=torch.float32, model=[model], num_microbatches= micro_batch_size, seq_length=sequence_length, @@ -142,57 +146,62 @@ def set_input_tensor(input_tensor): micro_batch_size = 8 hidden_size = 256 + config = ModelParallelConfig( + 
pipeline_model_parallel_size = 4, + sequence_parallel = False, + pipeline_dtype=torch.float, + ) + config.hidden_size = hidden_size + model.config = config + mocker.patch("megatron.core.pipeline_parallel.schedules.custom_backward", return_value=2) with pytest.raises(RuntimeError): model.model_type = ModelType.encoder_and_decoder forward_backward_func( forward_step_func=forward_step_func, - data_iterator=range(0,100), - dtype=torch.float32, + data_iterator=[range(0,100)], model=[model, model], num_microbatches= micro_batch_size, - tensor_shape=[sequence_length, micro_batch_size, hidden_size], + seq_length=sequence_length, + micro_batch_size=micro_batch_size, decoder_seq_length=sequence_length, - sequence_parallel=False, forward_only=True) - + with pytest.raises(RuntimeError): model.model_type = ModelType.encoder_or_decoder forward_backward_func( forward_step_func=forward_step_func, - data_iterator=range(0,100), - dtype=torch.float32, + data_iterator=[range(0,100)], model=[model, model], num_microbatches= micro_batch_size, - tensor_shape=[sequence_length, micro_batch_size, hidden_size], + seq_length=sequence_length, + micro_batch_size=micro_batch_size, decoder_seq_length=256, - sequence_parallel=False, forward_only=True) - + with pytest.raises(RuntimeError): model.model_type = ModelType.encoder_or_decoder forward_backward_func( forward_step_func=forward_step_func, - data_iterator=range(0,100), - dtype=torch.float32, + data_iterator=[range(0,100)], model=[model, model], num_microbatches= 7, - tensor_shape=[sequence_length, micro_batch_size, hidden_size], + seq_length=sequence_length, + micro_batch_size=micro_batch_size, decoder_seq_length=512, - sequence_parallel=False, forward_only=True) + model.model_type = ModelType.encoder_or_decoder losses_reduced = forward_backward_func( forward_step_func=forward_step_func, - data_iterator=range(0,100), - dtype=torch.float32, + data_iterator=[range(0,100), range(0,100)], model=[model, model], num_microbatches= micro_batch_size, - tensor_shape=[sequence_length, micro_batch_size, hidden_size], + seq_length=sequence_length, + micro_batch_size=micro_batch_size, decoder_seq_length=sequence_length, - sequence_parallel=True, forward_only=True) loss_reduced_expected = [{'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}] @@ -200,5 +209,4 @@ def set_input_tensor(input_tensor): print(losses_reduced) assert(i['loss_reduced'] == j['loss_reduced']) - Utils.destroy_model_parallel() -""" + Utils.destroy_model_parallel() From 5dcd9956f0e4e3600b7faaa76e0dacd0fe45b9ff Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Tue, 11 Jun 2024 11:10:28 -0700 Subject: [PATCH 76/92] Multimodal functional test improvements --- tests/functional_tests/jet_recipes/MR-multimodal.yaml | 2 ++ ...a_merge-request_dgx_a100_1N8G_mcore_te_tp1_pp1.json | 2 +- .../multimodal/pretrain_llava_distributed_test.sh | 10 ++++++---- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/functional_tests/jet_recipes/MR-multimodal.yaml b/tests/functional_tests/jet_recipes/MR-multimodal.yaml index a93e840b9f..64ffd79585 100644 --- a/tests/functional_tests/jet_recipes/MR-multimodal.yaml +++ b/tests/functional_tests/jet_recipes/MR-multimodal.yaml @@ -27,6 +27,7 @@ spec: time_limit: 1200 ckpt_format: torch ckpt_resume: 0 + allow_nondeterministic: 0 script: |- ls cd /workspace/megatron-lm @@ -46,6 +47,7 @@ spec: MOE_GROUPED_GEMM={moe_grouped_gemm} \ CKPT_FORMAT={ckpt_format} \ CHECKPOINT_RESUME_TEST={ckpt_resume} \ + 
ALLOW_NONDETERMINISTIC={allow_nondeterministic} \ JOB_NAME={name} \ ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: diff --git a/tests/functional_tests/test_results/jet/multimodal_llava_merge-request_dgx_a100_1N8G_mcore_te_tp1_pp1.json b/tests/functional_tests/test_results/jet/multimodal_llava_merge-request_dgx_a100_1N8G_mcore_te_tp1_pp1.json index a3efbeb21e..64780812b5 100644 --- a/tests/functional_tests/test_results/jet/multimodal_llava_merge-request_dgx_a100_1N8G_mcore_te_tp1_pp1.json +++ b/tests/functional_tests/test_results/jet/multimodal_llava_merge-request_dgx_a100_1N8G_mcore_te_tp1_pp1.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13475, 9.1392, 9.13457, 9.12454, 9.09413, 9.07808, 9.02886, 9.00177, 8.96967, 8.92995]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2594425.0, 2527253.0, 2602008.0, 2497235.0, 2554616.0, 2677868.0, 2491787.0, 2610638.0, 2656468.0, 2684047.0]}, "iteration_timing_avg": 0.2253964705882353} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.14052, 9.14041, 9.13223, 9.12307, 9.07696, 9.06413, 9.00897, 8.96969, 8.93509, 8.85701]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2557220.0, 2644506.0, 2554848.0, 2479331.0, 2739591.0, 2557907.0, 2491851.0, 2537345.0, 2513770.0, 2645270.0]}, "iteration_timing_avg": 0.21943264705882357} \ No newline at end of file diff --git a/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh b/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh index 68a572d3b2..ea4969a0c8 100755 --- a/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh +++ b/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh @@ -33,7 +33,7 @@ TRANSFORMER_IMPL=local if [[ $ALLOW_NONDETERMINISTIC -eq 1 ]]; then command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=1;" else - command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0; export NCCL_ALGO=^NVLS;" + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0; export NCCL_ALGO=Tree;" ADDITIONAL_PARAMS+=" --deterministic-mode" fi @@ -70,9 +70,9 @@ else __SAVE_INTERVAL=10000 # inf fi if [[ -n "$CKPT_FORMAT" ]] && [[ "$CKPT_FORMAT" != 'torch' ]]; then - echo "Using distributed checkpoint format..." - command="$command pip install zarr tensorstore==0.1.45;" - ADDITIONAL_PARAMS+=" --use-dist-ckpt --dist-ckpt-format $CKPT_FORMAT" + echo "Using distributed checkpoint format $CKPT_FORMAT..." 
+ [[ "$CKPT_FORMAT" == 'zarr' ]] && command="$command pip install zarr tensorstore==0.1.45;" + ADDITIONAL_PARAMS+=" --use-dist-ckpt --dist-ckpt-format $CKPT_FORMAT --use-mcore-models" fi set +x @@ -83,6 +83,8 @@ build_torch_run_cmd() { pretrain_vlm.py \ --num-layers 12 \ --hidden-size 512 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ --num-attention-heads 8 \ --log-params-norm \ --log-num-zeros-in-grad \ From 3fa97d41d5c597dcf786c0c0feb0749ca285af0c Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 11 Jun 2024 11:12:54 -0700 Subject: [PATCH 77/92] Add terryk to test code owner section --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index cf30f9c148..f9b05a66b3 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,5 +2,5 @@ megatron/core/ @shanmugamr @maanug @jcasper @eharper [TESTS] -tests/ @shanmugamr @maanug +tests/ @shanmugamr @maanug @terryk From c0293d898d8985a6a09d5a65c86e3c94e0510d54 Mon Sep 17 00:00:00 2001 From: Mikolaj Blaz Date: Tue, 11 Jun 2024 14:33:13 -0700 Subject: [PATCH 78/92] Fix optimizer loading for finetuning --- megatron/training/checkpointing.py | 46 ++++---- .../dist_checkpointing/test_optimizer.py | 105 +++++++++++++++++- 2 files changed, 129 insertions(+), 22 deletions(-) diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index 35f74ee890..22e3912c50 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -723,28 +723,36 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri run_tp_pp = (mpu.get_tensor_model_parallel_world_size(), mpu.get_pipeline_model_parallel_world_size()) mismatch_msg = "(TP, PP) mismatch after resume ({} vs {} from checkpoint)".format(ckpt_tp_pp, run_tp_pp) - if ckpt_tp_pp == run_tp_pp and not getattr(state_dict['args'], 'no_save_rng', False): - rng_state = get_rng_state(True) # we can load the rng state + # Determine if RNG state will be loaded + if (ckpt_tp_pp == run_tp_pp and not release and not args.finetune and not args.no_load_rng + and not getattr(state_dict['args'], 'no_save_rng', False)): + gen_sd_rng_state = get_rng_state(True) # we can load the rng state else: - rng_state = None - print_rank_0("{}: RNG state will be ignored".format(mismatch_msg)) - - # TODO: add DistributedOptimizer support for differing TPxPP - if ckpt_tp_pp != run_tp_pp and not release and not args.finetune and not args.no_load_optim and args.use_distributed_optimizer: - raise RuntimeError("{}: not supported for DistributedOptimizer".format(mismatch_msg)) + gen_sd_rng_state = None + if ckpt_tp_pp != run_tp_pp: + print_rank_0("{}: RNG state will be ignored".format(mismatch_msg)) optim_sd_kwargs = dict(is_loading=True) - if args.use_distributed_optimizer: - optim_sd_kwargs['sharding_type'] = ('fully_sharded_bucket_space' - if getattr(state_dict['args'], 'ckpt_fully_parallel_save', False) - else 'dp_zero_gather_scatter') - # [ModelOpt]: remedy for finetune - if args.finetune or args.no_load_optim: - load_kwargs['sharded_state_dict'] = generate_state_dict(args, model, None, None, - rng_state, args.use_dist_ckpt, optim_sd_kwargs=optim_sd_kwargs) + # Determine if optimizer state will be loaded + if (not release and not args.finetune and not args.no_load_optim + and not getattr(state_dict['args'], 'no_save_optim', False)): + gen_sd_optim = optimizer + gen_sd_opt_param_scheduler = opt_param_scheduler + + # TODO: add DistributedOptimizer support for differing TPxPP + if ckpt_tp_pp != run_tp_pp and 
args.use_distributed_optimizer: + raise RuntimeError("{}: not supported for DistributedOptimizer".format(mismatch_msg)) + + + if args.use_distributed_optimizer: + optim_sd_kwargs['sharding_type'] = ('fully_sharded_bucket_space' + if getattr(state_dict['args'], 'ckpt_fully_parallel_save', False) + else 'dp_zero_gather_scatter') else: - load_kwargs['sharded_state_dict'] = generate_state_dict(args, model, optimizer, opt_param_scheduler, - rng_state, args.use_dist_ckpt, optim_sd_kwargs=optim_sd_kwargs) + gen_sd_optim = None + gen_sd_opt_param_scheduler = None + load_kwargs['sharded_state_dict'] = generate_state_dict(args, model, gen_sd_optim, gen_sd_opt_param_scheduler, + gen_sd_rng_state, True, optim_sd_kwargs=optim_sd_kwargs) load_kwargs['exit_on_missing_checkpoint'] = args.exit_on_missing_checkpoint state_dict, checkpoint_name, release = _load_base_checkpoint(load_dir, rank0=False, **load_kwargs) @@ -785,7 +793,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri 'consumed_valid_samples', 0) else: print_rank_0('could not find arguments in the checkpoint ...') - + # [ModelOpt]: loading modelopt_state (sharded or not) if has_nvidia_modelopt: if args.use_dist_ckpt: diff --git a/tests/unit_tests/dist_checkpointing/test_optimizer.py b/tests/unit_tests/dist_checkpointing/test_optimizer.py index 82daa24d67..a0fb3bd58b 100644 --- a/tests/unit_tests/dist_checkpointing/test_optimizer.py +++ b/tests/unit_tests/dist_checkpointing/test_optimizer.py @@ -1,6 +1,8 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +from copy import deepcopy from functools import partial from time import sleep +from types import SimpleNamespace from unittest import mock import numpy as np @@ -26,6 +28,7 @@ from megatron.core.tensor_parallel import model_parallel_cuda_manual_seed from megatron.core.transformer import TransformerConfig from megatron.core.utils import get_model_config +from megatron.training.checkpointing import load_checkpoint, save_checkpoint from megatron.training.training import get_model from megatron.training.utils import unwrap_model from pretrain_gpt import model_provider @@ -105,20 +108,53 @@ def initialize_gpt_model(pre_process=True, post_process=True, seed=0, **config_k return model -def init_mock_args(args, bf16=True): +def init_basic_mock_args(args, bf16=True): args.data_parallel_random_init = False args.virtual_pipeline_model_parallel_size = None + args.fp16 = False args.bf16 = bf16 args.accumulate_allreduce_grads_in_fp32 = False args.overlap_grad_reduce = False args.use_distributed_optimizer = True args.ddp_bucket_size = None + args.check_for_nan_in_loss_and_grad = False + args.ddp_average_in_collective = False return args +def init_checkpointing_mock_args(args, ckpt_dir, fully_parallel=False): + args.save = ckpt_dir + args.load = ckpt_dir + args.pretrained_checkpoint = None + args.ckpt_fully_parallel_save = fully_parallel + args.ckpt_fully_parallel_load = fully_parallel + args.async_save = False + args.use_dist_ckpt = True + args.dist_ckpt_format = 'torch_dist' + args.no_save_optim = False + args.no_save_rng = False + args.ckpt_assume_constant_structure = False + args.log_progress = False + args.auto_detect_ckpt_format = False + args.exit_on_missing_checkpoint = False + args.finetune = False + args.consumed_train_samples = 0 + args.consumed_valid_samples = 0 + args.retro_add_retriever = False + args.no_load_optim = False + args.no_load_rng = False + + +def load_checkpoint_no_arg_checks(*args, **kwargs): + with 
mock.patch('megatron.training.checkpointing.check_checkpoint_args'): + with mock.patch('megatron.training.checkpointing.update_num_microbatches'): + return load_checkpoint(*args, **kwargs) + + def setup_model_and_optimizer(seed, bf16=True): - with mock.patch('megatron.training.training.get_args', data_parallel_random_init=False) as mock_args: - init_mock_args(mock_args.return_value, bf16) + mock_args = SimpleNamespace() + with mock.patch('megatron.training.training.get_args', new=lambda: mock_args): + init_basic_mock_args(mock_args, bf16=bf16) model = get_model(partial(initialize_gpt_model, seed=seed)) config = OptimizerConfig(bf16=bf16, params_dtype=torch.bfloat16 if bf16 else torch.float, use_distributed_optimizer=bf16) @@ -204,6 +240,69 @@ def test_full_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_ finally: Utils.set_world_size() + @pytest.mark.parametrize( + ('src_tp_pp', 'dest_tp_pp',), + [ + ((2, 2), (2, 4)), + ((1, 8), (4, 1)), + ((2, 4), (4, 2)), + ] + ) + def test_finetune_doesnt_load_optimizer(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp,): + with TempNamedDir(tmp_path_dist_ckpt / 'test_finetune_doesnt_load_optimizer') as ckpt_dir: + mock_args = SimpleNamespace() + with mock.patch('megatron.training.checkpointing.get_args', new=lambda: mock_args): + init_basic_mock_args(mock_args) + init_checkpointing_mock_args(mock_args, ckpt_dir, False) + + Utils.initialize_model_parallel(*src_tp_pp) + model, optimizer = setup_model_and_optimizer(seed=2) + + # We need to save the TPxPP of the source model + mock_args.tensor_model_parallel_size = src_tp_pp[0] + mock_args.pipeline_model_parallel_size = src_tp_pp[1] + save_checkpoint(10, model, optimizer, None, 0) + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(*dest_tp_pp) + model, optimizer = setup_model_and_optimizer(seed=3) + model_unloaded_state_dict = deepcopy(model[0].state_dict()) + optim_unloaded_state_dict = deepcopy(optimizer.state_dict()) + + # Load with different TPxPP should raise DistributeOptimizer error + with pytest.raises(RuntimeError) as exc_info: + load_checkpoint_no_arg_checks(model, optimizer, None) + assert "(TP, PP) mismatch" in str(exc_info.value) + + ## Check that the state didn't change + assert not any(diff(model[0].state_dict(), model_unloaded_state_dict)) + assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict)) + + # Now test the same with a `finetune` flag + mock_args.finetune = True + load_checkpoint_no_arg_checks(model, optimizer, None) + + ## Model weights should be different, but optimizer state is unchanged + diffs = diff(model[0].state_dict(), model_unloaded_state_dict) + # diffs[0] and diffs[1] is structural diff, diffs[2] is values diff - we expect only values diff + assert not diffs[0] and not diffs[1] and diffs[2] + assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict)) + + # ... 
or `no_load_optim` flag + model, optimizer = setup_model_and_optimizer(seed=3) + mock_args.finetune = False + mock_args.no_load_optim = True + mock_args.no_load_rng = True + load_checkpoint_no_arg_checks(model, optimizer, None) + + ## Model weights should be different, but optimizer state is unchanged + diffs = (diff(model[0].state_dict(), model_unloaded_state_dict)) + # diffs[0] and diffs[1] is structural diff, diffs[2] is values diff - we expect only values diff + assert not diffs[0] and not diffs[1] and diffs[2] + assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict)) + + Utils.destroy_model_parallel() + class TestFP32Optimizer: @pytest.mark.parametrize( From 4537bbeb3faf4af1e138e03bbc1a1225df8d9d12 Mon Sep 17 00:00:00 2001 From: Pallab Bhattacharya Date: Tue, 11 Jun 2024 16:55:15 -0700 Subject: [PATCH 79/92] use cuevents for get_batch, type hardening --- megatron/core/README_STRAGGLER.md | 9 +- megatron/core/utils.py | 241 ++++++++++++++++-------------- tests/unit_tests/test_utils.py | 18 +++ 3 files changed, 151 insertions(+), 117 deletions(-) diff --git a/megatron/core/README_STRAGGLER.md b/megatron/core/README_STRAGGLER.md index de399f7fe0..fe9062c851 100644 --- a/megatron/core/README_STRAGGLER.md +++ b/megatron/core/README_STRAGGLER.md @@ -1,13 +1,16 @@ -## StragglerDetector +## StragglerDetector for a TP Group -The file `megatron/core/utils.py` has a class named `StragglerDetector` which supports Python Contexts +The file `megatron/core/utils.py` has a class named `StragglerDetector` which supports Python Contexts. +It can be used to find straggling TP group based on the RTT of the ranks in the TP Group. It also collects +Power/Temp/Utilization for GPUs, which can additionally be used to narrow down to the exact GPU in the TP Group, +assuming the straggling was caused by hardware anomaly in a given GPU.
This class supports collecting timing events for various steps of a given iteration. It keeps collecting such timing events on a per-rank basis, and when the reporter is invoked during a logging interval, it computes the min and max of certain metrics across all ranks and logs the observed metric and the rank as follows ``` - 0: INFO:megatron.core.utils:[2024-03-14 23:07:56] | MnRtt/Rnk: 3453.08ms/8 | MxRtt/Rnk: 3468.20ms/0 | MnPwr/Rnk: 601796W/8 | MxPwr/Rnk: 683801W/18 | MnTmp/Rnk: 52C/0 | MxTmp/Rnk: 65C/21 | MnUtl/Rnk: 97%/8 | MxUtl/Rnk: 100%/6 | MnClk/Rnk: 1950MHz/28 | MxClk/Rnk: 1980MHz/0 | MnDRtt/Rnk: 14.27us/23 | MxDRtt/Rnk: 34.65us/3 | MnEtpt/Rnk: 296.02TF/0 | MxEtpt/Rnk: 297.32TF/8 + 0: INFO:megatron.core.utils:[2024-03-14 23:07:56] | MnRtt/Rnk: 3453.08ms/8 | MxRtt/Rnk: 3468.20ms/0 | MnPwr/Rnk: 601796W/8 | MxPwr/Rnk: 683801W/18 | MnTmp/Rnk: 52C/0 | MxTmp/Rnk: 65C/21 | MnUtl/Rnk: 97%/8 | MxUtl/Rnk: 100%/6 | MnClk/Rnk: 1950MHz/28 | MxClk/Rnk: 1980MHz/0 | MnDRtt/Rnk: 14.27ms/23 | MxDRtt/Rnk: 34.65ms/3 | MnEtpt/Rnk: 296.02TF/0 | MxEtpt/Rnk: 297.32TF/8 ```
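A minimal usage sketch of the detector, inferred from this patch (the context-manager calls and `report()` signature match the unit test changes later in this commit); the `enabled` keyword and the dummy workload are assumptions for illustration only, not part of the patch.

```python
import torch
from megatron.core.utils import StragglerDetector

# Assumes torch.distributed is already initialized by the training script.
stimer = StragglerDetector()  # the class is used as a singleton ("global visibility")
stimer.configure(
    torch.distributed.get_world_size(),
    torch.distributed.get_rank(),
    enabled=True,  # assumption: enable collection at startup instead of via the port toggle
)

weight = torch.randn(1024, 1024, device='cuda')
for _ in range(10):
    with stimer(bdata=True):          # times the get_batch / data-loading section
        batch = torch.randn(1024, 1024, device='cuda')
    with stimer():                    # times the compute (GEMM) section
        out = weight @ batch
    # Rank 0 logs the Mn*/Mx* line shown above once per logging interval.
    stimer.report(total_flops=2 * 1024 ** 3, log_interval=1)
```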
diff --git a/megatron/core/utils.py b/megatron/core/utils.py index 159bbf1163..9895a9f822 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -16,7 +16,7 @@ from datetime import datetime from functools import reduce from types import TracebackType -from typing import Any, List, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union import torch @@ -546,7 +546,7 @@ class _StragglerData: # clock min_clock = _ValueWithRank(sys.float_info.max, 0, "MHz") max_clock = _ValueWithRank(sys.float_info.min, 0, "MHz") - aflops: List[_ValueWithRank] = None + aflops: Union[List[_ValueWithRank], None] = None class StragglerDetector: @@ -575,15 +575,15 @@ class StragglerDetector: toggle (bool): whether to start/stop detector collection bdata (bool): when true, just collect get_batch dev (int): cuda device - idx (int): index into the list below - idx_q (LifoQueue): queue of index evt_q (LifoQueue): cuda event queue - start_events (list[torch.cuda.Event]): cuda start event - stop_events (list[torch.cuda.Event]): cuda stop event - start_time (list[int]): start time (wallclock) - stop_time (list[int]): stop time (wallclock) - start_batch (list[int]): start time for get_batch - stop_batch (list[int]): stop time for get_batch + start_gemm_ev (list[torch.cuda.Event]): cuda start event + stop_gemm_ev (list[torch.cuda.Event]): cuda stop event + start_data_ev (list[torch.cuda.Event]): cuda start event + stop_data_ev (list[torch.cuda.Event]): cuda stop event + start_gemm_tm (list[int]): start time (wallclock) + stop_gemm_tm (list[int]): stop time (wallclock) + start_data_tm (list[int]): start time for get_batch + stop_data_tm (list[int]): stop time for get_batch sock (socket): the controller socket ctrlr (Thread): the controller thread """ @@ -614,28 +614,28 @@ def __init__(self) -> None: The enabled state is indicated using self._off member variable and the proerty enabled. 
""" - self._off = True + self._off: bool = True self.start = self.null_method self.stop = self.null_method - self.world = 0 - self.rank = 0 - self.mmcnt = 1 - self.port = 0 - self.amp = 3.0 - self.toggle = False - self.bdata = False - self.dev = None - self.idx = 0 - self.idx_q = None - self.evt_q = None - self.start_events = None - self.stop_events = None - self.start_time = None - self.stop_time = None - self.start_batch = None - self.stop_batch = None - self.sock = None - self.ctrlr = None + self.world: int = 0 + self.rank: int = 0 + self.mmcnt: int = 1 + self.port: int = 0 + self.amp: float = 3.0 + self.toggle: bool = False + self.bdata: bool = False + self.dev: Union[torch.device, int, None] = None + self.evt_q: Union[queue.LifoQueue, None] = None + self.start_gemm_ev: List[torch.cuda.Event] = [] + self.stop_gemm_ev: List[torch.cuda.Event] = [] + self.start_data_ev: List[torch.cuda.Event] = [] + self.stop_data_ev: List[torch.cuda.Event] = [] + self.start_gemm_tm: List[int] = [] + self.stop_gemm_tm: List[int] = [] + self.start_data_tm: List[int] = [] + self.stop_data_tm: List[int] = [] + self.sock: Union[socket.socket, None] = None + self.ctrlr: Union[threading.Thread, None] = None def configure( self, @@ -688,15 +688,15 @@ def configure( self.port = port self.toggle = False self.bdata = False - self.idx = 0 - self.idx_q = queue.LifoQueue() self.evt_q = queue.LifoQueue() - self.start_events = [] - self.stop_events = [] - self.start_time = [] - self.stop_time = [] - self.start_batch = [] - self.stop_batch = [] + self.start_gemm_ev = [] + self.stop_gemm_ev = [] + self.start_data_ev = [] + self.stop_data_ev = [] + self.start_gemm_tm = [] + self.stop_gemm_tm = [] + self.start_data_tm = [] + self.stop_data_tm = [] backend = torch.distributed.get_backend() if backend == "nccl": self.dev = torch.cuda.current_device() @@ -719,18 +719,21 @@ def reset(self) -> None: """ if self._off: return - self.idx = 0 - self.idx_q = queue.LifoQueue() # Pool them - _ = [self.evt_q.put(ev) for ev in self.start_events] - _ = [self.evt_q.put(ev) for ev in self.stop_events] - self.start_events = [] - self.stop_events = [] + if self.evt_q is not None: + _ = [self.evt_q.put(ev) for ev in self.start_gemm_ev] + _ = [self.evt_q.put(ev) for ev in self.stop_gemm_ev] + _ = [self.evt_q.put(ev) for ev in self.start_data_ev] + _ = [self.evt_q.put(ev) for ev in self.stop_data_ev] + self.start_gemm_ev = [] + self.stop_gemm_ev = [] + self.start_data_ev = [] + self.stop_data_ev = [] # Use regular timers - self.start_time = [] - self.stop_time = [] - self.start_batch = [] - self.stop_batch = [] + self.start_gemm_tm = [] + self.stop_gemm_tm = [] + self.start_data_tm = [] + self.stop_data_tm = [] self.bdata = False def start_method(self) -> None: @@ -742,26 +745,30 @@ def start_method(self) -> None: CPU - generally useful for timing get_batch() """ # Not reentrant - # First check if this start is for data - if self.bdata: - self.start_batch.append(time.perf_counter_ns()) - self.stop_batch.append(0) # this indicate we need to add timer - self.bdata = False - return - if self.evt_q.qsize() > 1: + if self.evt_q is not None and self.evt_q.qsize() > 1: sev = self.evt_q.get() # no try-catch eev = self.evt_q.get() # no try-catch else: sev = torch.cuda.Event(enable_timing=True) eev = torch.cuda.Event(enable_timing=True) - self.start_events.append(sev) - self.stop_events.append(eev) - self.start_time.append(0) - self.stop_time.append(0) - self.idx_q.put(self.idx) - self.start_time[self.idx] = time.perf_counter_ns() - 
self.start_events[self.idx].record() - self.idx += 1 + # First check if this start is for data + if self.bdata: + self.start_data_ev.append(sev) + self.stop_data_ev.append(eev) + self.start_data_tm.append(0) + self.stop_data_tm.append(0) + idx = len(self.stop_data_tm) - 1 + self.start_data_tm[idx] = time.perf_counter_ns() + self.start_data_ev[idx].record() + self.bdata = False + return + self.start_gemm_ev.append(sev) + self.stop_gemm_ev.append(eev) + self.start_gemm_tm.append(0) + self.stop_gemm_tm.append(0) + idx = len(self.stop_gemm_tm) - 1 + self.start_gemm_tm[idx] = time.perf_counter_ns() + self.start_gemm_ev[idx].record() def stop_method(self) -> None: """This method adds the stop timers. @@ -772,13 +779,15 @@ def stop_method(self) -> None: """ # Not reentrant # First check if this stop is for data - dle = len(self.stop_batch) - 1 - if dle >= 0 and self.stop_batch[dle] == 0: - self.stop_batch[dle] = time.perf_counter_ns() + idx = len(self.stop_data_tm) - 1 + if idx >= 0 and self.stop_data_tm[idx] == 0: + self.stop_data_tm[idx] = time.perf_counter_ns() + self.stop_data_ev[idx].record() return - idx = self.idx_q.get() - self.stop_time[idx] = time.perf_counter_ns() - self.stop_events[idx].record() + idx = len(self.stop_gemm_tm) - 1 + if idx >= 0 and self.stop_gemm_tm[idx] == 0: + self.stop_gemm_tm[idx] = time.perf_counter_ns() + self.stop_gemm_ev[idx].record() def elapsed(self) -> Tuple[float, float, int, int, int, int]: """This method is called from report(), or can be called directly @@ -798,10 +807,10 @@ def elapsed(self) -> Tuple[float, float, int, int, int, int]: if self._off: # match with return below return 0, 0, 0, 0, 0, 0 - ls_ev = len(self.start_events) - le_ev = len(self.stop_events) - ls_bs = len(self.start_batch) - ls_be = len(self.stop_batch) + ls_ev = len(self.start_gemm_ev) + le_ev = len(self.stop_gemm_ev) + ls_bs = len(self.start_data_ev) + ls_be = len(self.stop_data_ev) delta = 0.0 batch_delta = 0.0 temp = 0 @@ -819,15 +828,18 @@ def elapsed(self) -> Tuple[float, float, int, int, int, int]: torch.cuda.synchronize() # Process Events for i in range(ls_ev): - e_ev = self.start_events[i].elapsed_time(self.stop_events[i]) - e_tm = (self.stop_time[i] - self.start_time[i]) / 1e6 # ns to ms + e_ev = self.start_gemm_ev[i].elapsed_time(self.stop_gemm_ev[i]) + e_tm = (self.stop_gemm_tm[i] - self.start_gemm_tm[i]) / 1e6 # ns to ms # Pick the larger of Event and perf_counter time? 
delta += max(e_ev, e_tm) # Process get_batch for i in range(ls_bs): - batch_delta = (self.stop_batch[i] - self.start_batch[i]) / 1e3 # us + b_ev = self.start_data_ev[i].elapsed_time(self.stop_data_ev[i]) + b_tm = (self.stop_data_tm[i] - self.start_data_tm[i]) / 1e6 # ns to ms + # data fetching has prefetch, hence take the max, instead of avg + batch_delta = max(batch_delta, max(b_ev, b_tm)) self.reset() # Prepare for next round - # time in ms, batch_delta in us, check return above + # time in ms, batch_delta in ms, check return above return delta, batch_delta, temp, power, util, clock def report(self, total_flops: float = 0.0, log_interval: int = 0) -> bool: @@ -848,9 +860,9 @@ def report(self, total_flops: float = 0.0, log_interval: int = 0) -> bool: """ ret = False if not self._off and total_flops > 0.0 and log_interval > 0: - elapsed, btime_us, temp, power, util, clock = self.elapsed() # get raw time + elapsed, btime, temp, power, util, clock = self.elapsed() # get raw time + # btime (get_batch time is max in the iteration) ptime = elapsed / (log_interval * 1.0) # avg per iteration elapsed time, ms - btime = btime_us / (log_interval * 1.0) # avg per iteration get_batch time, us api_flops = total_flops / (log_interval * 1.0) # avg per iteration flops, ms apir_flops = api_flops / ( ptime * 10 ** 9 * self.world @@ -860,7 +872,7 @@ def report(self, total_flops: float = 0.0, log_interval: int = 0) -> bool: o_dt = self._min_max( ptime, btime, float(temp), float(power), float(util), float(clock), et_flops, ) - if self.rank == 0: + if self.rank == 0 and o_dt is not None and o_dt.aflops is not None: now = f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]" min_flops, min_frank, _ = o_dt.aflops[0]() max_flops, max_frank, _ = o_dt.aflops[-1]() @@ -910,19 +922,22 @@ def _check_toggle(self) -> None: if self.rank == 0 and self.toggle: off = not self._off self.toggle = False - state = torch.tensor(off, dtype=torch.bool, device=self.dev) - torch.distributed.broadcast(state, 0) # Blocking - self._off = state.item() - if not self._off: - self.start = self.start_method - self.stop = self.stop_method - state = "ON" - else: - self.start = self.null_method - self.stop = self.null_method - state = "OFF" - if self.rank == 0 and off is not self._off: - logger.info(f"Toggling StragglerDetector State {state}") + st = torch.tensor(off, dtype=torch.bool, device=self.dev) + torch.distributed.broadcast(st, 0) # Blocking + # save old switch + off = self._off + self._off = bool(st.item()) + if off != self._off: + if not self._off: + self.start = self.start_method + self.stop = self.stop_method + state = "ON" + else: + self.start = self.null_method + self.stop = self.null_method + state = "OFF" + if self.rank == 0: + logger.info(f"Toggling StragglerDetector State {state}") def _handler(self) -> None: """Thread function for the controller. @@ -939,7 +954,7 @@ def _handler(self) -> None: logger.info( f"Controller ready to recv " f"commands on port {self.port}. 
Current state {state}" ) - while True: + while True and self.sock is not None: try: conn, _ = self.sock.accept() _ = conn.recv(1024) @@ -1007,7 +1022,8 @@ def _min_max( # initialize output data object o_dt = _StragglerData() - prof_data = {} + prof_data: Dict[str, Union[int, float]] = {} + data_list: List[Dict[str, Union[int, float]]] = [] prof_data["rank"] = self.rank prof_data["time"] = ptime prof_data["btime"] = btime @@ -1019,8 +1035,6 @@ def _min_max( if self.rank == 0: data_list = [prof_data] * self.world - else: - data_list = None # this is blocking by default torch.distributed.gather_object(prof_data, object_gather_list=data_list, dst=0) @@ -1048,46 +1062,47 @@ def _min_max( min_rank = min_ctime["rank"] max_val = max_ctime["time"] max_rank = max_ctime["rank"] - o_dt.min_elapsed = _ValueWithRank(min_val, min_rank, "ms") - o_dt.max_elapsed = _ValueWithRank(max_val, max_rank, "ms") + o_dt.min_elapsed = _ValueWithRank(min_val, int(min_rank), "ms") + o_dt.max_elapsed = _ValueWithRank(max_val, int(max_rank), "ms") min_val = min_cbatch["btime"] min_rank = min_cbatch["rank"] max_val = max_cbatch["btime"] max_rank = max_cbatch["rank"] - o_dt.min_btime = _ValueWithRank(min_val, min_rank, "us") - o_dt.max_btime = _ValueWithRank(max_val, max_rank, "us") + o_dt.min_btime = _ValueWithRank(min_val, int(min_rank), "ms") + o_dt.max_btime = _ValueWithRank(max_val, int(max_rank), "ms") min_val = min_ctemp["temp"] min_rank = min_ctemp["rank"] max_val = max_ctemp["temp"] max_rank = max_ctemp["rank"] - o_dt.min_temp = _ValueWithRank(min_val, min_rank, "C") - o_dt.max_temp = _ValueWithRank(max_val, max_rank, "C") + o_dt.min_temp = _ValueWithRank(min_val, int(min_rank), "C") + o_dt.max_temp = _ValueWithRank(max_val, int(max_rank), "C") min_val = min_cpower["power"] min_rank = min_cpower["rank"] max_val = max_cpower["power"] max_rank = max_cpower["rank"] - o_dt.min_power = _ValueWithRank(min_val, min_rank, "W") - o_dt.max_power = _ValueWithRank(max_val, max_rank, "W") + o_dt.min_power = _ValueWithRank(min_val, int(min_rank), "W") + o_dt.max_power = _ValueWithRank(max_val, int(max_rank), "W") min_val = min_cutil["util"] min_rank = min_cutil["rank"] max_val = max_cutil["util"] max_rank = max_cutil["rank"] - o_dt.min_util = _ValueWithRank(min_val, min_rank, "%") - o_dt.max_util = _ValueWithRank(max_val, max_rank, "%") + o_dt.min_util = _ValueWithRank(min_val, int(min_rank), "%") + o_dt.max_util = _ValueWithRank(max_val, int(max_rank), "%") min_val = min_cclock["clock"] min_rank = min_cclock["rank"] max_val = max_cclock["clock"] max_rank = max_cclock["rank"] - o_dt.min_clock = _ValueWithRank(min_val, min_rank, "MHz") - o_dt.max_clock = _ValueWithRank(max_val, max_rank, "MHz") + o_dt.min_clock = _ValueWithRank(min_val, int(min_rank), "MHz") + o_dt.max_clock = _ValueWithRank(max_val, int(max_rank), "MHz") o_dt.aflops = [ - _ValueWithRank(d.get("flops"), d.get("rank")) for _, d in enumerate(data_list) + _ValueWithRank(d.get("flops", 0.0), int(d.get("rank", -1))) + for _, d in enumerate(data_list) ] o_dt.aflops.sort(key=lambda val_with_rank: val_with_rank()[0]) # wait for everyone here @@ -1177,13 +1192,11 @@ def __exit__( bool: True if the exception was handled """ # Should not suppress errors even if turned off - ret = False if ex_type is not None: - err = traceback.format_exception(ex_tb) + err = traceback.format_exception(ex_type, ex_val, ex_tb) logger.warning(f"{str(ex_val)}\n{err}") - ret = True self.stop() - return ret + return False # Singleton, global visibility diff --git a/tests/unit_tests/test_utils.py 
b/tests/unit_tests/test_utils.py index e8b8416f84..509b33b325 100644 --- a/tests/unit_tests/test_utils.py +++ b/tests/unit_tests/test_utils.py @@ -80,6 +80,7 @@ def test_check_param_hashes_across_dp_replicas(): # Teardown. _deinit_distributed() + def test_straggler_detector(): world = int(os.getenv('WORLD_SIZE', '1')) rank = int(os.getenv('RANK', '0')) @@ -120,6 +121,21 @@ def straggler_detector_timeit(): assert delta > 0.0 assert batch_delta >= s + # Test function to raise ValueError + def straggler_value_error(): + raise ValueError("Exception value raised") + + # Check that exception is not suppressed. + def straggler_detector_exception_propagate(): + # batch_data + with pytest.raises(ZeroDivisionError): + with stimer(bdata=True): + x = 1 / 0 + # non-batch-data + with pytest.raises(ValueError, match=r".* value .*"): + with stimer(): + straggler_value_error() + # Reporting. def straggler_detector_report(): s = 2 # Sleep for 2 seconds. @@ -160,6 +176,8 @@ def straggler_detector_report(): straggler_detector_timeit() # Report only from rank 0. straggler_detector_report() + # Check that exception is not suppressed. + straggler_detector_exception_propagate() # Teardown. _deinit_distributed() From 1af20dd139bd06b37173d3bd3d2cbcc7ba4e7921 Mon Sep 17 00:00:00 2001 From: Mikolaj Blaz Date: Wed, 12 Jun 2024 09:40:35 -0700 Subject: [PATCH 80/92] Implement "model space" DistOpt checkpoint format --- megatron/core/dist_checkpointing/mapping.py | 163 +++++++++++++- megatron/core/dist_checkpointing/optimizer.py | 4 +- .../core/dist_checkpointing/serialization.py | 16 +- .../dist_checkpointing/strategies/torch.py | 212 ++++++++++++++---- megatron/core/optimizer/distrib_optimizer.py | 150 +++++++++++-- megatron/core/transformer/mlp.py | 142 ++++++++---- megatron/training/checkpointing.py | 11 +- .../functional_tests/jet_recipes/MR-gpt.yaml | 2 +- .../test_flattened_resharding.py | 99 ++++++++ .../dist_checkpointing/test_mapping.py | 55 ++++- .../dist_checkpointing/test_optimizer.py | 116 +++++++--- .../dist_checkpointing/test_serialization.py | 3 +- 12 files changed, 807 insertions(+), 166 deletions(-) create mode 100644 tests/unit_tests/dist_checkpointing/test_flattened_resharding.py diff --git a/megatron/core/dist_checkpointing/mapping.py b/megatron/core/dist_checkpointing/mapping.py index 3001c20f6c..bd5fd2236c 100644 --- a/megatron/core/dist_checkpointing/mapping.py +++ b/megatron/core/dist_checkpointing/mapping.py @@ -7,7 +7,7 @@ """ import logging -from abc import ABC +from abc import ABC, abstractmethod from dataclasses import dataclass, replace from itertools import chain from typing import Any, Callable, Dict, Optional, Tuple, Union @@ -33,6 +33,10 @@ class ShardedBase(ABC): data: object replica_id: ReplicaId + @abstractmethod + def validate_metadata_integrity(self): + """Codifies the constraints on metadata attributes.""" + @dataclass class ShardedTensor(ShardedBase): @@ -67,6 +71,62 @@ class ShardedTensor(ShardedBase): allow_shape_mismatch: bool = False flattened_range: Optional[slice] = None + def __post_init__(self): + self.validate_metadata_integrity() + + def validate_metadata_integrity(self) -> None: + """Codifies the constraints on metadata attributes. + + Meeting those constraints is guaranteed when instantiating a ShardedTensor + class with `from_rank_offsets` or `from_rank_offsets_flat` constructors. 
+ + Returns: + None + """ + has_flattened_range = self.flattened_range is not None + if self.data is not None: + if self.data.dtype != self.dtype: + raise CheckpointingException( + f'Data dtype should match `dtype` attribute for {self}' + ) + if not has_flattened_range and self.data.shape != self.local_shape: + raise CheckpointingException( + f'Data shape should match `local_shape` attribute for {self}' + ) + if has_flattened_range: + if self.data.ndim != 1: + raise CheckpointingException(f'Data should be 1D for a flattened {self}') + real_data = self.data + try: + self.data = None + self.init_data(device='meta') + if self.data.shape != real_data.shape: + raise CheckpointingException( + f'Data shape doesnt match expected {self.data.shape} for {self}' + ) + finally: + self.data = real_data + + if len(self.global_shape) != len(self.global_offset): + raise CheckpointingException( + f'Global offset dimensions should be equal to global shape dimensions for {self}' + ) + if len(self.local_shape) + self.prepend_axis_num != len(self.global_shape): + raise CheckpointingException( + f'Local shape together with `prepend_axis_num` dimensions should be equal to global shape dimensions for {self}' + ) + + for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape): + if off % sh != 0: + raise CheckpointingException( + f'Global offset ({off}) must be divisible by local shape ({sh}) for {self}.' + ) + + if has_flattened_range and self.flattened_range.step is not None: + raise CheckpointingException( + f'`step` argument in the flattened range of a ShardedTensor is not supported.' + ) + def global_slice(self) -> Tuple[Union[int, slice], ...]: assert len(self.global_offset) == len(self.local_shape) + self.prepend_axis_num return tuple( @@ -111,12 +171,25 @@ def local_coordinates(self) -> Tuple[np.ndarray, ...]: mask[self.flattened_range] = True return np.nonzero(mask.reshape(self.local_shape)) + def local_chunk_offset_in_global(self) -> Tuple[int, ...]: + """Offset of a local chunk in a global array of chunks. + + Returns: + Tuple[int, ...]: the offset of the whole local chunk in a global array of chunks. + """ + assert len(self.global_offset) == len(self.local_shape) + self.prepend_axis_num + chunk_offset = list(self.global_offset[: self.prepend_axis_num]) + for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape): + assert off % sh == 0, str(self) + chunk_offset.append(off // sh) + return tuple(chunk_offset) + def max_allowed_chunks(self) -> Tuple[int, ...]: chunks = [] for axis_sh, axis_fragm in zip(self.global_shape, self.axis_fragmentations): if not self.allow_shape_mismatch and axis_sh % axis_fragm != 0: raise CheckpointingException( - f'Axis shape ({axis_sh}) not divisible' f' by axis fragmentation ({axis_fragm}' + f'Axis shape ({axis_sh}) not divisible by axis fragmentation ({axis_fragm}' ) axis_chunk_size = axis_sh // axis_fragm chunks.append(axis_chunk_size) @@ -133,18 +206,25 @@ def from_rank_offsets( *rank_offsets: Tuple[int, int, int], replica_id: ReplicaId = 0, prepend_axis_num: int = 0, + flattened_range: None = None, **init_kwargs, ): """Allows to construct the ShardedTensor given offset specified in process ranks. Args: - key: unique key - data: local tensor data - rank_offsets: each tuple (axis, axis_rank_offset, axis_fragm) says that if global tensor is divided into `axis_fragm` fragment along `axis` axis, then local tensor data corresponds to the `axis_rank_offset` chunk. 
- replica_id: see ShardedTensor - prepend_axis_num: see ShardedTensor + key (str): unique key + data (torch.Tensor): local tensor data + rank_offsets (Tuple[int, int, int]): each tuple (axis, axis_rank_offset, axis_fragm) says that if global tensor is divided into `axis_fragm` fragment along `axis` axis, then local tensor data corresponds to the `axis_rank_offset` chunk. + replica_id (ReplicaId): see ShardedTensor + prepend_axis_num (int): see ShardedTensor + flattened_range (None): must be None when using this constructor init_kwargs: passed to ShardedTensor.__init__ """ + if flattened_range is not None: + raise ValueError( + 'Cannot instantiate a flat ShardedTensor with `from_rank_offsets` method.' + ' Use `from_rank_offsets_flat` instead' + ) global_offset = [0] * (data.ndim + prepend_axis_num) global_shape = ([1] * prepend_axis_num) + list(data.shape) axis_fragmentations = [1] * (data.ndim + prepend_axis_num) @@ -177,10 +257,55 @@ def from_rank_offsets( tuple(axis_fragmentations), replica_id, prepend_axis_num, + flattened_range=flattened_range, **init_kwargs, ) - def init_data(self, device: torch.device, init_fn=torch.empty): + @classmethod + def from_rank_offsets_flat( + cls, + key: str, + data: torch.Tensor, + non_flat_local_shape: Tuple[int, ...], + *args, + flattened_range: Optional[slice] = None, + **kwargs, + ): + """Allows to construct a *flattened* ShardedTensor given offset specified in process ranks. + + Args: + key (str): + data (torch.Tensor): this should be a flattened data tensor + non_flat_local_shape (Tuple[int, ...]): expected local shape of a non-flat chunk + *args: passed unchanged to the `from_rank_offsets` constructor + flattened_range (slice): see ShardedTensor. Defaults to None, but must be set to + a non-None slice. + **kwargs: + + Returns: + ShardedTensor: constructed ShardedTensor instance + """ + if flattened_range is None: + raise CheckpointingException( + 'Cannot instantiate a non-flat ShardedTensor with `from_rank_offsets_flat` method.' + ' Use `from_rank_offsets` instead' + ) + if data.ndim != 1: + raise CheckpointingException( + f'Flattened ShardedTensor requires 1D data, got shape: {data.shape}' + ) + if flattened_range.stop - flattened_range.start != data.numel(): + raise CheckpointingException( + f'Flattened ShardedTensor data length ({data.numel()}) must meet the slice length: {flattened_range.stop - flattened_range.start}' + ) + + non_flat_data_meta = torch.empty(*non_flat_local_shape, dtype=data.dtype, device='meta') + sh_ten = cls.from_rank_offsets(key, non_flat_data_meta, *args, **kwargs) + instance = replace(sh_ten, data=data, flattened_range=flattened_range) + instance.validate_metadata_integrity() + return instance + + def init_data(self, device: Union[str, torch.device], init_fn=torch.empty): if self.data is not None: return self.data = init_fn(self.local_shape, dtype=self.dtype, device=device) @@ -252,6 +377,15 @@ class ShardedObject(ShardedBase): global_offset: Tuple[int, ...] replica_id: ReplicaId = 0 + def __post_init__(self): + self.validate_metadata_integrity() + + def validate_metadata_integrity(self): + if len(self.global_shape) != len(self.global_offset): + raise CheckpointingException( + f'Global offset dimensions should be equal to global shape dimensions for {self}' + ) + def without_data(self): return replace(self, data=None) @@ -269,6 +403,9 @@ class ShardedTensorFactory(ShardedBase): The essence of those transformations is that they can be applied to optimizer states the same way they are applied to the model params. 
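As a usage illustration of the constructors documented above, here is a minimal sketch of building a flattened ShardedTensor with `from_rank_offsets_flat`; the key, shapes, and slice values are made up for the example.

```python
import torch
from megatron.core.dist_checkpointing import ShardedTensor

# Hypothetical setup: this rank owns fragment 0 of 2 along axis 0 of a global
# (16, 4) tensor, i.e. a non-flat local chunk of shape (8, 4), but it only holds
# the flattened slice [5, 20) of that chunk (as a DistributedOptimizer shard would).
flat_slice = torch.arange(8 * 4, dtype=torch.float32)[5:20]   # 15 elements

sh_ten = ShardedTensor.from_rank_offsets_flat(
    'decoder.layers.0.weight',    # made-up key
    flat_slice,                   # 1D data; numel must equal the slice length
    (8, 4),                       # non-flat local shape of the chunk
    (0, 0, 2),                    # (axis, axis_rank_offset, axis_fragm)
    flattened_range=slice(5, 20),
)
# __post_init__ runs validate_metadata_integrity(), which would reject e.g. a data
# length that does not match flattened_range, or a non-1D data tensor.
```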
+ The ultimate state dict with sharded tensors must depend functionally on + `build_fn` arguments (key, data, replica_id, flattened_range), + which will be provided by the optimizer. Builder creates a sub-state-dict out of a tensor before saving, and merger merges the corresponding state dict after loading. @@ -279,16 +416,22 @@ class ShardedTensorFactory(ShardedBase): build_fn (callable): function that transforms the original tensor to a sharded state dict merge_fn (callable): function that transforms loaded subtree back into a single tensor (inverse of `build_fn`) replica_id (ReplicaId): indicates factory replication wrt. factories in different processes + flattened_range (slice, optional): indicates additional flattening applied to the ShardedTensors produced by the factory """ key: str data: torch.Tensor - build_fn: Callable[[str, torch.Tensor, ReplicaId], ShardedStateDict] + build_fn: Callable[[str, torch.Tensor, ReplicaId, Optional[slice]], ShardedStateDict] merge_fn: Callable[[StateDict], torch.Tensor] replica_id: ReplicaId = 0 + flattened_range: Optional[slice] = None def build(self): - return self.build_fn(self.key, self.data, self.replica_id) + return self.build_fn(self.key, self.data, self.replica_id, self.flattened_range) + + def validate_metadata_integrity(self): + """No reasonable checks can be applied""" + pass def apply_factories(sharded_state_dict: ShardedStateDict): diff --git a/megatron/core/dist_checkpointing/optimizer.py b/megatron/core/dist_checkpointing/optimizer.py index bec174209e..ed9b5b5069 100644 --- a/megatron/core/dist_checkpointing/optimizer.py +++ b/megatron/core/dist_checkpointing/optimizer.py @@ -82,9 +82,11 @@ def make_sharded_optimizer_tensor( assert ( tuple(optim_param.shape) == model_param.local_shape ), f'Optimizer shape ({tuple(optim_param.shape)} does not match model shape ({model_param.local_shape})' - return replace( + sh_ten = replace( model_param, key=f'{prefix}.{model_param.key}', data=optim_param, dtype=optim_param.dtype ) + sh_ten.validate_metadata_integrity() + return sh_ten def optim_state_to_sharding_state( diff --git a/megatron/core/dist_checkpointing/serialization.py b/megatron/core/dist_checkpointing/serialization.py index d19aa71908..79541f4526 100644 --- a/megatron/core/dist_checkpointing/serialization.py +++ b/megatron/core/dist_checkpointing/serialization.py @@ -476,26 +476,12 @@ def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]): def _compute_shards_access(rank_sharding): - def chunk_offset(sharding): - assert len(sharding.global_offset) == len(sharding.local_shape) + sharding.prepend_axis_num - return tuple( - chain( - (off for off in sharding.global_offset[: sharding.prepend_axis_num]), - ( - off // sh - for off, sh in zip( - sharding.global_offset[sharding.prepend_axis_num :], sharding.local_shape - ) - ), - ) - ) - shard_access_cnt = torch.zeros( rank_sharding[0][1].axis_fragmentations, dtype=torch.int, device='cpu' ) for rank, sharding in rank_sharding: if is_main_replica(sharding.replica_id): - shard_access_cnt[chunk_offset(sharding)] += 1 + shard_access_cnt[sharding.local_chunk_offset_in_global()] += 1 # TODO: consider validating different replicas too return shard_access_cnt diff --git a/megatron/core/dist_checkpointing/strategies/torch.py b/megatron/core/dist_checkpointing/strategies/torch.py index 5cddfc5cd6..b290007457 100644 --- a/megatron/core/dist_checkpointing/strategies/torch.py +++ b/megatron/core/dist_checkpointing/strategies/torch.py @@ -4,11 +4,15 @@ import dataclasses import io 
import itertools -from collections import defaultdict +import math +from collections import ChainMap, defaultdict +from dataclasses import dataclass +from itertools import product from logging import getLogger from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast +import numpy as np import torch from torch.distributed import checkpoint from torch.distributed._shard.metadata import ShardMetadata @@ -19,6 +23,7 @@ DefaultSavePlanner, FileSystemReader, LoadPlan, + Metadata, SavePlan, TensorStorageMetadata, WriteItem, @@ -87,14 +92,24 @@ def sharded_tensor_to_torch_sharded_tensor( ) -> TorchShardedTensor: """Convert MCore ShardedTensor to PyT ShardedTensor. PyT requires information about all chunks. + On high-level, this function follows the logic of torch.distributed.fsdp._shard_utils._create_chunk_sharded_tensor. + Additionally, it saves `prepend_axis_num` and `has_flattened_range` (specific to MCore) as attributes + for further restoration in `_unwrap_pyt_sharded_tensor`. + NOTE: this function assumes regular (grid) sharding of the MCore ShardedTensor. The only local irregularities could be introduced with a `flattened_range` attribute. - NOTE: `flattened_range` is currently supported only for 1D tensors. + This function handles 3 different type of ShardedTensors: + 1. Non-flat regular ShardedTensors (`not has_flattened_range`) + 2. 1D flattened ShardedTensors (`is_flattened_range_1d`) + 3. N-D flattened ShardedTensors (`has_flattened_range`) - This function follows the logic of torch.distributed.fsdp._shard_utils._create_chunk_sharded_tensor. - Additionally, it saves `prepend_axis_num` (specific to MCore) as an attribute - for further restoration in `_unwrap_pyt_sharded_tensor`. + (1) and (2) type are saved according to their original shape. + Type (3) however requires global shape adjustment for efficiency: + we treat [X, Y, Z] global shape tensor with local shape [x, y, z] + as a [X // x, Y // y, Z // z, x * y * z] tensor with last axis + partitioned according to `flattened_range` slices. + This will need special handling while resharding. 
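For concreteness, a small numeric sketch of the type (3) global-shape reformulation described above; the shapes are made up for illustration.

```python
import numpy as np

# A [X, Y, Z] = [8, 16, 4] global tensor sharded into regular [x, y, z] = [4, 8, 2]
# local chunks, each chunk flattened (and possibly sliced further by flattened_range).
global_shape = (8, 16, 4)
local_shape = (4, 8, 2)

# One leading axis per grid dimension of chunks, plus a trailing axis holding the
# flattened chunk; that last axis is then partitioned according to flattened_range.
axis_fragmentations = tuple(g // l for g, l in zip(global_shape, local_shape))
reformulated_global_shape = axis_fragmentations + (int(np.prod(local_shape)),)
assert reformulated_global_shape == (2, 2, 2, 64)   # [X // x, Y // y, Z // z, x * y * z]
```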
Args: sh_tens (List[ShardedTensor]): list of sharded tensors to convert @@ -109,42 +124,82 @@ def sharded_tensor_to_torch_sharded_tensor( some_sh_ten = sh_tens[0] has_flattened_range = some_sh_ten.flattened_range is not None + is_flattened_range_1d = has_flattened_range and len(some_sh_ten.global_shape) == 1 + + for sh_ten in sh_tens: + assert (sh_ten.flattened_range is not None) == has_flattened_range, sh_tens + if not sh_ten.data.is_contiguous(): + sh_ten.data = sh_ten.data.contiguous() + + local_global_offsets = {} prepend_axis_num = sh_tens[0].prepend_axis_num - # Determine local shards - if has_flattened_range: - if prepend_axis_num: - raise NotImplementedError( - '`prepend_axis_num` attribute of ShardedTensor not supported' - 'together with `flattened_range` for PyT Distributed format' - ) + # Determine local shards according to tensor type (see docs) + if is_flattened_range_1d: + # Type (2) case: 1D flattened ShardedTensors for sh_ten in sh_tens: - assert sh_ten.flattened_range is not None assert len(sh_ten.global_offset) == 1, sh_ten + assert sh_ten.prepend_axis_num == 0, sh_ten + local_global_offsets.setdefault(sh_ten.global_offset, []).append(sh_ten) + + global_shape = some_sh_ten.global_shape + offsets_shape = ( + some_sh_ten.local_shape + ) # local shape is not flattened, we need it for chunk offsets local_shards = [ Shard.from_tensor_and_offsets( - sh_ten.data, [sh_ten.global_offset[0] + sh_ten.flattened_range.start], rank + sh_ten.data, + [ + sh_ten.global_offset[0] + sh_ten.flattened_range.start + ], # additional flattened offset + rank, ) for sh_ten in sh_tens ] - offsets_shape = some_sh_ten.local_shape # used to determine local offsets - else: - # Apply extra axes `prepend_axis_num` with a view + + elif has_flattened_range: + # Type (3) case: N-D flattened ShardedTensors for sh_ten in sh_tens: - assert sh_ten.flattened_range is None, sh_ten.flattened_range - if prepend_axis_num: - sh_ten.data = sh_ten.data.view((1,) * prepend_axis_num + sh_ten.local_shape) + local_global_offsets.setdefault(sh_ten.local_chunk_offset_in_global(), []).append( + sh_ten + ) + assert sh_ten.data.ndim == 1, sh_ten + sh_ten.data = sh_ten.data.view((1,) * len(sh_ten.global_shape) + (-1,)) + + # Global shape reformulation: + global_shape = some_sh_ten.axis_fragmentations + (int(np.prod(some_sh_ten.local_shape)),) + offsets_shape = (1,) * len( + some_sh_ten.global_shape + ) # reformulated global shape has shape equal ti number of local chunks local_shards = [ - Shard.from_tensor_and_offsets(sh_ten.data, list(sh_ten.global_offset), rank) + Shard.from_tensor_and_offsets( + sh_ten.data, + list( + sh_ten.local_chunk_offset_in_global() + (sh_ten.flattened_range.start,) + ), # additional flattened offset + rank, + ) for sh_ten in sh_tens ] + else: + # Type (1) case: non-flat regular ShardedTensors + for sh_ten in sh_tens: + local_global_offsets.setdefault(sh_ten.global_offset, []).append(sh_ten) + sh_ten.data = sh_ten.data.view( + (1,) * prepend_axis_num + sh_ten.local_shape + ) # adjust to prepended_axis_num + + global_shape = some_sh_ten.global_shape offsets_shape = some_sh_ten.data.shape # includes prepended axes - local_global_offsets = {} - for sh_ten in sh_tens: - local_global_offsets.setdefault(sh_ten.global_offset, []).append(sh_ten) + local_shards = [ + Shard.from_tensor_and_offsets( + sh_ten.data, list(sh_ten.global_offset), rank # simple case + ) + for sh_ten in sh_tens + ] # Create a ShardedTensor without invoking communication. 
Determine global shards shard_metadata = [] @@ -155,20 +210,33 @@ def sharded_tensor_to_torch_sharded_tensor( # local shard placement = f"rank:{rank}/cuda" for sh_ten in local_global_offsets[offset]: - if has_flattened_range: + if is_flattened_range_1d: offset = (sh_ten.global_offset[0] + sh_ten.flattened_range.start,) - size = sh_ten.data.shape + size = sh_ten.data.shape + elif has_flattened_range: + assert offset == sh_ten.local_chunk_offset_in_global() + # This is not an actual offset, but an offset of the whole shard + # This is needed for a PyT Dist internal integrity check + offset = sh_ten.local_chunk_offset_in_global() + (0,) + size = (1,) * len(offsets_shape) + global_shape[-1:] + else: + size = sh_ten.data.shape shard_metadata.append(ShardMetadata(offset, size, placement)) else: # for shards from other ranks we provide simplistic data - this information will be discarded # during TorchShardedTensor._init_from_local_shards_and_global_metadata call - shard_metadata.append(ShardMetadata(offset, offsets_shape, "cuda")) + if has_flattened_range and not is_flattened_range_1d: + offset = offset + (0,) + size = (1,) * len(offsets_shape) + global_shape[-1:] + else: + size = offsets_shape + shard_metadata.append(ShardMetadata(offset, size, "cuda")) tensor = some_sh_ten.data sharded_tensor_metadata = ShardedTensorMetadata( shards_metadata=shard_metadata, - size=torch.Size(some_sh_ten.global_shape), + size=torch.Size(global_shape), tensor_properties=TensorProperties( dtype=tensor.dtype, layout=tensor.layout, @@ -180,7 +248,11 @@ def sharded_tensor_to_torch_sharded_tensor( pyt_sh_ten = TorchShardedTensor._init_from_local_shards_and_global_metadata( local_shards, sharded_tensor_metadata=sharded_tensor_metadata, process_group=None ) - pyt_sh_ten.prepend_axis_num = prepend_axis_num + # Store MCore related data as PyTShardedTensor attribute. This won't be stored in the checkpoint, only for runtime purposes + pyt_sh_ten.mcore_sh_ten = sh_ten.without_data() + pyt_sh_ten.mcore_metadata = {} + if has_flattened_range and not is_flattened_range_1d: + pyt_sh_ten.mcore_metadata['nd_reformulated_orig_global_shape'] = sh_ten.global_shape return pyt_sh_ten @@ -258,14 +330,16 @@ def _unwrap_pyt_sharded_tensor(sh_ten: TorchShardedTensor) -> List[torch.Tensor] If `prepend_axis_num` was non-zero (which is specific to MCore ShardedTensor) then the tensor has additional singleton dimensions which should be squeezed. """ - prepend_axis_num = getattr(sh_ten, 'prepend_axis_num', 0) - if prepend_axis_num == 0: - return [sh.tensor for sh in sh_ten.local_shards()] + mcore_sh_ten = sh_ten.mcore_sh_ten ret_tensors = [] for sh in sh_ten.local_shards(): ten = sh.tensor - for _ in range(prepend_axis_num): - ten = ten.squeeze(0) + if mcore_sh_ten.flattened_range is not None: + assert ten.shape[:-1] == (1,) * (len(ten.shape) - 1), ten.shape + ten = ten.view(-1) + else: + for _ in range(mcore_sh_ten.prepend_axis_num): + ten = ten.squeeze(0) ret_tensors.append(ten) return ret_tensors @@ -316,6 +390,11 @@ def _restore_dict_types(x: Union[dict, list, Any], keys_template: Union[dict, li _restore_dict_types(x_val, templ_val) +@dataclass(frozen=True) +class MCoreSavePlan(SavePlan): + mcore_data: Dict[str, Dict[str, Any]] = None # Mcore related data about each tensor + + class MCoreSavePlanner(DefaultSavePlanner): """Differs with the default planner by saving BytesIO objects on all ranks. @@ -327,15 +406,39 @@ class MCoreSavePlanner(DefaultSavePlanner): in transform_object. 
""" + def __init__( + self, + *args, + nd_flattened_global_shapes: Optional[Dict[str, Tuple[int, ...]]] = None, + **kwargs, + ) -> None: + super().__init__(*args, **kwargs) + self.nd_flattened_global_shapes = nd_flattened_global_shapes or {} + def create_local_plan(self) -> SavePlan: plan = create_default_local_save_plan(self.state_dict, self.is_coordinator) self._add_non_coordinator_iobytes_request(plan) if self.flatten_state_dict: plan = dataclasses.replace(plan, planner_data=self.mappings) + plan = MCoreSavePlan( + items=plan.items, + storage_data=plan.storage_data, + planner_data=plan.planner_data, + mcore_data={ + k: sh_ten.mcore_metadata + for k, sh_ten in self.state_dict.items() + if isinstance(sh_ten, TorchShardedTensor) + }, + ) self.plan = plan return self.plan + def create_global_plan(self, all_plans: List[MCoreSavePlan]) -> Tuple[List[SavePlan], Metadata]: + global_plan, metadata = super().create_global_plan(all_plans) + metadata.mcore_data = dict(ChainMap(*(plan.mcore_data for plan in all_plans))) + return global_plan, metadata + def _add_non_coordinator_iobytes_request(self, plan): if self.is_coordinator: return @@ -363,10 +466,14 @@ def __init__( def _validate_global_shapes(self, metadata, sharded_tensors): for sh_ten in sharded_tensors: loaded_shape = metadata.state_dict_metadata[sh_ten.key].size - if loaded_shape != sh_ten.global_shape: + if sh_ten.flattened_range is None or len(sh_ten.global_shape) == 1: + expected_shape = sh_ten.global_shape + else: + expected_shape = sh_ten.axis_fragmentations + (int(np.prod(sh_ten.local_shape)),) + if loaded_shape != expected_shape: _msg = ( f'Global shape mismatch for loaded ({loaded_shape})' - f' and expected ({sh_ten.global_shape}) tensor' + f' and expected ({expected_shape}) tensor' f' for key {sh_ten.key}' ) raise CheckpointingException(_msg) @@ -500,13 +607,32 @@ def load_tensors_metadata(self, checkpoint_dir: Path): fs_reader = FileSystemReader(checkpoint_dir) metadata = fs_reader.read_metadata() - return { - k: ShardedTensor.from_rank_offsets( - k, torch.empty(tp.size, **tp.properties.__dict__, device='meta') - ).without_data() - for k, tp in metadata.state_dict_metadata.items() - if isinstance(tp, TensorStorageMetadata) - } + mcore_data = getattr(metadata, 'mcore_data', {}) + sharded_metadata = {} + for k, tp in metadata.state_dict_metadata.items(): + if not isinstance(tp, TensorStorageMetadata): + continue # load only tensors + + nd_orig_global_shape = mcore_data.get(k, {}).get('nd_reformulated_orig_global_shape') + if nd_orig_global_shape is None: + # Regular tensor + sharded_metadata[k] = ShardedTensor.from_rank_offsets( + k, torch.empty(tp.size, **tp.properties.__dict__, device='meta'), + ).without_data() + else: + # N-D flattened tensor + unflat_ten = torch.empty( + nd_orig_global_shape, **tp.properties.__dict__, device='meta' + ) + flat_ten = unflat_ten.flatten() + sharded_metadata[k] = ShardedTensor.from_rank_offsets_flat( + k, + flat_ten, + unflat_ten.shape, + flattened_range=slice(0, unflat_ten.numel()), # whole slice + ).without_data() + + return sharded_metadata def can_handle_sharded_objects(self): return True diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py index c297f4ef4d..2add1f5090 100644 --- a/megatron/core/optimizer/distrib_optimizer.py +++ b/megatron/core/optimizer/distrib_optimizer.py @@ -4,6 +4,7 @@ import itertools +from dataclasses import replace from logging import getLogger from typing import Callable, Dict, List, Optional, Tuple @@ -12,7 +13,15 
@@ from .. import parallel_state, tensor_parallel from ..dist_checkpointing import ShardedTensor -from ..dist_checkpointing.mapping import LocalNonpersitentObject, ShardedObject, ShardedStateDict +from ..dist_checkpointing.dict_utils import nested_values +from ..dist_checkpointing.mapping import ( + LocalNonpersitentObject, + ShardedObject, + ShardedStateDict, + ShardedTensorFactory, +) +from ..dist_checkpointing.optimizer import get_param_id_to_sharded_param_map +from ..dist_checkpointing.utils import extract_sharded_tensors_and_factories from ..distributed import ParamAndGradBuffer, shard_buffer from .grad_scaler import MegatronGradScaler from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper @@ -651,6 +660,8 @@ def load_state_dict(self, state_dict): self.load_parameter_state_from_dp_zero(param_state) elif sharding_type == 'fully_sharded_bucket_space': self.load_parameter_state_from_fs_bucket_space(param_state) + elif sharding_type == 'fully_sharded_model_space': + self.load_parameter_state_from_fs_model_space(param_state) else: raise NotImplementedError(f'Unknown sharding_type: {sharding_type}') @@ -828,24 +839,33 @@ def sharded_state_dict( self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False, - sharding_type: str = 'fully_sharded_bucket_space', + sharding_type: str = 'fully_sharded_model_space', ): """ Chooses between 3 param state sharding implementations as requested by `sharding_type`. Regular state dict parameters are saved on DP rank 0 and loaded on all ranks. """ - - state_dict = { - k: ShardedObject( - f'optimizer.distributed.dp_group_idx_{self.data_parallel_group_idx}.{k}', - v, - (1,), - (0,), - replica_id=torch.distributed.get_rank(self.data_parallel_group), + if not is_loading and sharding_type == 'fully_sharded_bucket_space': + logger.warning( + '`fully_sharded_bucket_space` sharding for DistributedOptimizer' + ' checkpoint is deprecated and will be removed in the future.' + ' Please switch to `full_sharded_model_space`.' ) - for k, v in self.state_dict().items() - } + + state_dict = self.state_dict() + if sharding_type != 'fully_sharded_model_space': + # State dict differs between different model parallel groups + state_dict = { + k: ShardedObject( + f'optimizer.distributed.dp_group_idx_{self.data_parallel_group_idx}.{k}', + v, + (1,), + (0,), + replica_id=torch.distributed.get_rank(self.data_parallel_group), + ) + for k, v in state_dict.items() + } if is_loading: self.init_state_fn(self.optimizer) @@ -857,14 +877,8 @@ def sharded_state_dict( elif sharding_type == 'dp_zero_gather_scatter': param_state = self.sharded_param_state_dp_zero(model_sharded_state_dict, is_loading) elif sharding_type == 'fully_sharded_model_space': - # In this approach the tensors could be directly related to model parameters - # by linking them with metadata from `model_sharded_state_dict`. - # This would allow changing TP and PP while using DistOpt (as with other optimizers). - # This implementation is more involved and left out for now. - raise NotImplementedError( - f'The fully sharded model space version for' - f' {self.__class__.__name__}.sharded_state_dict' - f' not implemented.' 
+ param_state = self.sharded_param_state_fs_model_space( + model_sharded_state_dict, is_loading ) else: raise NotImplementedError(f'Unknown sharding_type: {sharding_type}') @@ -985,11 +999,81 @@ def sharded_param_state_fs_bucket_space( ) return state + def sharded_param_state_fs_model_space( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ): + """Sharded state dict where each buffer is mapped to corresponding model param. + + In this approach the optimizer state tensors are directly related to model parameters + by linking them with metadata from `model_sharded_state_dict`. + This will allow changing TP and PP while using DistOpt (as with other optimizers). + """ + + param_to_sharded_metadata = {} + model_sharded_state_dict, _ = extract_sharded_tensors_and_factories( + model_sharded_state_dict + ) + for sh_base in nested_values(model_sharded_state_dict): + param_to_sharded_metadata[sh_base.data] = sh_base + + prefix = 'optimizer.state' + state = {} + param_idx = 0 # this is not stored in the checkpoint, used only to identify params in `sharded_param_state_fs_model_space` + for gbuf_range_maps in self.gbuf_ranges: + for gbuf_range_map_for_all_buckets in gbuf_range_maps.values(): + for gbuf_range_map in gbuf_range_map_for_all_buckets: + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + group_index, group_order = self.model_param_group_index_map[model_param] + param_range = param_range_map['param'] + + main_param = self.optimizer.param_groups[group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + tensors = { + "fp32_param": main_param, + **optim_state, + } + # Match optimizer parameter with model ShardedTensor (or ShardedTensorFactory) + try: + sharded_metadata = param_to_sharded_metadata[model_param] + except KeyError as e: + raise ValueError( + f'Model param {model_param} not in model_sharded_state_dict' + ) from e + + # Set DP corresponding replica_id coordinate to 0 + assert ( + len(sharded_metadata.replica_id) == 3 + ), f'Expected replica_id format (PP, TP, DP), got: {sharded_metadata}' + replica_id = (*sharded_metadata.replica_id[:2], 0) + + # Instantiate ShardedTensor (or ShardedTensorFactory) for optimizer params + for state_key, state_ten in tensors.items(): + replace_kwargs = dict( + key=f'{prefix}.{state_key}.{sharded_metadata.key}', + data=state_ten, + dtype=state_ten.dtype, + flattened_range=slice(param_range.start, param_range.end), + replica_id=replica_id, + ) + if isinstance(sharded_metadata, ShardedTensorFactory): + replace_kwargs.pop('dtype') + tensors[state_key] = replace(sharded_metadata, **replace_kwargs) + tensors[state_key].validate_metadata_integrity() + state[param_idx] = tensors + param_idx += 1 + return state + def load_parameter_state_from_fs_bucket_space(self, state_dict): """ Loads the parameter state from an internal representation. - Inverse of the `get_parameter_state_internal_repr` method. + Inverse of the `get_parameter_state_fs_bucket_space` method. """ + logger.warning( + '`fully_sharded_bucket_space` sharding for DistributedOptimizer' + 'checkpoint is deprecated. 
Please switch to `full_sharded_model_space`' + ) + if state_dict is not None and "per_bucket_numel_unpadded" in state_dict: per_bucket_numel_unpadded_in_checkpoint = state_dict["per_bucket_numel_unpadded"] assert self.per_bucket_numel_unpadded == per_bucket_numel_unpadded_in_checkpoint, ( @@ -1024,6 +1108,30 @@ def load_parameter_state_from_fs_bucket_space(self, state_dict): for key in dst_tensors: dst_tensors[key].copy_(src_tensors[key]) + def load_parameter_state_from_fs_model_space(self, state_dict): + """Loads the parameter state from a "model space" representation. + + Inverse of the `sharded_param_state_fs_model_space` method. + """ + param_idx = 0 # matching order with `sharded_param_state_fs_model_space` + for gbuf_range_maps in self.gbuf_ranges: + for gbuf_range_map_for_all_buckets in gbuf_range_maps.values(): + for gbuf_range_map in gbuf_range_map_for_all_buckets: + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + group_index, group_order = self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups[group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + src_tensors = state_dict[param_idx] + dst_tensors = { + "fp32_param": main_param, + **optim_state, + } + for key in dst_tensors: + dst_tensors[key].copy_(src_tensors[key]) + + param_idx += 1 + def load_parameter_state_from_dp_zero(self, state_dict): """Load parameter state (i.e., parameter & optimizer tensors) from DP 0 rank, using the new checkpoint format with coalesced state across buckets. diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py index 426ef92ff2..e82d6ecd20 100644 --- a/megatron/core/transformer/mlp.py +++ b/megatron/core/transformer/mlp.py @@ -3,6 +3,7 @@ from dataclasses import dataclass from typing import Optional, Tuple, Union +import numpy as np import torch import torch.nn.functional as F @@ -134,44 +135,35 @@ def sharded_state_dict( ) -> ShardedStateDict: sharded_state_dict = {} for name, module in self._modules.items(): - if name == 'linear_fc1' and self.config.gated_linear_unit: - sub_sd = self._sharded_state_dict_for_glu( - name, module, prefix, sharded_offsets, metadata - ) - else: - sub_sd = module.sharded_state_dict(f'{prefix}{name}.', sharded_offsets, metadata) + sub_sd = module.sharded_state_dict(f'{prefix}{name}.', sharded_offsets, metadata) + if self.config.gated_linear_unit and name == 'linear_fc1': + assert f'{prefix}{name}.weight' in sub_sd, sub_sd.keys() + for k, v in sub_sd.items(): + if k in (f'{prefix}{name}.weight', f'{prefix}{name}.bias'): + sub_sd[k] = apply_swiglu_sharded_factory(v, sharded_offsets) sharded_state_dict.update(sub_sd) return sharded_state_dict - def _sharded_state_dict_for_glu( - self, - module_name: str, - module: torch.nn.Module, - prefix: str, - sharded_offsets: Tuple[Tuple[int, int, int]], - metadata: Optional[dict] = None, + +def apply_swiglu_sharded_factory(original_sh_ten, sharded_offsets): + # We must split the tensor into 2 parts, each sharded separately. 
+ # This requires a ShardedTensorFactory which `chunk`s during saving + # and `cat`s during loading + tp_rank = parallel_state.get_tensor_model_parallel_rank() + tp_size = parallel_state.get_tensor_model_parallel_world_size() + swiglu_shard_axis = 0 + prepend_axis_num = len(sharded_offsets) + original_shape = original_sh_ten.local_shape + original_numel = int(np.prod(original_shape)) + + @torch.no_grad() + def sh_ten_build_fn( + key: str, t: torch.Tensor, replica_id: ReplicaId, flattened_range: Optional[slice] ): - assert module_name == 'linear_fc1', module_name - sharded_state_dict = module.sharded_state_dict( - f'{prefix}{module_name}.', sharded_offsets, metadata - ) - weight_key = f'{prefix}{module_name}.weight' - prev_sh_ten = sharded_state_dict[weight_key] - - # We must split the tensor into 2 parts, each sharded separately. - # This requires a ShardedTensorFactory which `chunk`s during saving - # and `cat`s during loading - tp_rank = parallel_state.get_tensor_model_parallel_rank() - tp_size = parallel_state.get_tensor_model_parallel_world_size() - - tp_shard_axis = 0 - prepend_axis_num = len(sharded_offsets) - - def sh_ten_build_fn(key: str, t: torch.Tensor, replica_id: ReplicaId): - offset_w = (tp_shard_axis + prepend_axis_num, tp_rank, tp_size * 2) - offset_v = (tp_shard_axis + prepend_axis_num, tp_size + tp_rank, tp_size * 2) - with torch.no_grad(): - tensor_w, tensor_v = torch.chunk(t, 2, dim=tp_shard_axis) + offset_w = (swiglu_shard_axis + prepend_axis_num, tp_rank, tp_size * 2) + offset_v = (swiglu_shard_axis + prepend_axis_num, tp_size + tp_rank, tp_size * 2) + if flattened_range is None: + tensor_w, tensor_v = torch.chunk(t, 2, dim=swiglu_shard_axis) return [ ShardedTensor.from_rank_offsets( key, @@ -190,16 +182,74 @@ def sh_ten_build_fn(key: str, t: torch.Tensor, replica_id: ReplicaId): prepend_axis_num=prepend_axis_num, ), ] + else: + # Here we need to map a slice `t` (`flattened_range` specifies slice start and stop) + # of the *original* flattened tensor into slices `w` and `v` of chunked + # and flattened tensor. 
+ # Example: + # If original tensor has (16, 5) shape and flattened_range is `slice(8, 64)`, + # then `t` has shape `(56,)` and we need to create 2 tensors: + # w: first 32 elements of `t` with flattened_range slice(8, 40) + # v: last 24 elements of `t` with flattened_range slice(0, 24) + # Global offsets are the same as in the non-flattened case + assert t.ndim == 1, (key, t.shape) + non_flat_local_shape = (original_shape[0] // 2, *original_shape[1:]) + chunk_numel = original_numel // 2 + result = [] + if flattened_range.start < chunk_numel: + # Non-empty `w` chunk + tensor_w = t[: chunk_numel - flattened_range.start] + flattened_range_w = slice( + flattened_range.start, min(chunk_numel, flattened_range.stop) + ) + assert len(tensor_w) == flattened_range_w.stop - flattened_range_w.start + result.append( + ShardedTensor.from_rank_offsets_flat( + key, + tensor_w, + non_flat_local_shape, + *sharded_offsets, + offset_w, + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + flattened_range=flattened_range_w, + ) + ) + if flattened_range.stop > chunk_numel: + # Non-empty `v` chunk + tensor_v = t[-(flattened_range.stop - chunk_numel) :] + flattened_range_v = slice( + max(chunk_numel, flattened_range.start) - chunk_numel, + flattened_range.stop - chunk_numel, + ) + assert len(tensor_v) == flattened_range_v.stop - flattened_range_v.start, ( + len(tensor_v), + flattened_range_v, + ) - def sh_ten_merge_fn(sub_state_dict): - with torch.no_grad(): - return torch.cat(sub_state_dict) - - sharded_state_dict[weight_key] = ShardedTensorFactory( - prev_sh_ten.key, - prev_sh_ten.data, - sh_ten_build_fn, - sh_ten_merge_fn, - prev_sh_ten.replica_id, - ) - return sharded_state_dict + result.append( + ShardedTensor.from_rank_offsets_flat( + key, + tensor_v, + non_flat_local_shape, + *sharded_offsets, + offset_v, + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + flattened_range=flattened_range_v, + ) + ) + assert sum(sh_ten.data.numel() for sh_ten in result) == t.numel(), (result, t.shape) + return result + + def sh_ten_merge_fn(sub_state_dict): + with torch.no_grad(): + return torch.cat(sub_state_dict) + + return ShardedTensorFactory( + original_sh_ten.key, + original_sh_ten.data, + sh_ten_build_fn, + sh_ten_merge_fn, + original_sh_ten.replica_id, + ) diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index 22e3912c50..2d9f455a23 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -324,7 +324,7 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, optim_sd_kwargs = {} if args.use_dist_ckpt and args.use_distributed_optimizer: - optim_sd_kwargs['sharding_type'] = ('fully_sharded_bucket_space' + optim_sd_kwargs['sharding_type'] = ('fully_sharded_model_space' if args.ckpt_fully_parallel_save else 'dp_zero_gather_scatter') print_rank_0(f'Storing distributed optimizer sharded state of type {optim_sd_kwargs["sharding_type"]}') @@ -745,9 +745,16 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri if args.use_distributed_optimizer: - optim_sd_kwargs['sharding_type'] = ('fully_sharded_bucket_space' + optim_sd_kwargs['sharding_type'] = ('fully_sharded_model_space' if getattr(state_dict['args'], 'ckpt_fully_parallel_save', False) else 'dp_zero_gather_scatter') + # This is for backwards-compatibility. 
Can be removed once 'fully_sharded_bucket_space' loading is removed + for maybe_dist_opt_optim_state in (state_dict['optimizer'], *state_dict['optimizer'].values()): + if 'param_state_sharding_type' in maybe_dist_opt_optim_state: + if maybe_dist_opt_optim_state['param_state_sharding_type'] == 'fully_sharded_bucket_space': + print_rank_0('Detected deprecated `fully_sharded_bucket_space` DistributedOptimizer checkpoint format') + optim_sd_kwargs['sharding_type'] = maybe_dist_opt_optim_state['param_state_sharding_type'] + break else: gen_sd_optim = None gen_sd_opt_param_scheduler = None diff --git a/tests/functional_tests/jet_recipes/MR-gpt.yaml b/tests/functional_tests/jet_recipes/MR-gpt.yaml index 63dc00c20a..edee11b287 100644 --- a/tests/functional_tests/jet_recipes/MR-gpt.yaml +++ b/tests/functional_tests/jet_recipes/MR-gpt.yaml @@ -73,7 +73,7 @@ products: - {tp_size: [1], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--recompute-granularity full --recompute-method uniform --recompute-num-layers 1"'], args_meta: ["uniform_full_recompute"]} - {tp_size: [2], pp_size: [1,2], ckpt_resume: [0, 1], allow_nondeterministic: [1], extra_args: ['"--context-parallel-size 2 --sequence-parallel --hidden-dropout 0.0 --attention-dropout 0.0"'], args_meta: ["cp2_nondeterministic"]} - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1 --ckpt-fully-parallel-save --ckpt-fully-parallel-load"'], args_meta: ["te_8experts2parallel"]} - - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --use-distributed-optimizer --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["te_8experts2parallel_dist_optimizer"]} + - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --use-distributed-optimizer --moe-router-load-balancing-type sinkhorn --moe-router-topk 1 --ckpt-fully-parallel-save --ckpt-fully-parallel-load"'], args_meta: ["te_8experts2parallel_dist_optimizer"]} ## TODO: MoE GroupedMLP dist-ckpt not supported, so must use 'torch' ckpt format - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch], extra_args: ['"--moe-grouped-gemm --disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --use-distributed-optimizer --moe-router-load-balancing-type sinkhorn --moe-router-topk 1 --overlap-grad-reduce --overlap-param-gather"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM"]} - {tp_size: [2], pp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch], extra_args: ['"--moe-grouped-gemm --disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_groupedGEMM"]} diff --git a/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py b/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py new file mode 100644 index 0000000000..7378b0535e --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py @@ -0,0 +1,99 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
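+"""Tests for saving and loading flattened (`flattened_range`) ShardedTensors.
+
+Rough sketch of the sharding pattern exercised below (names such as `local_tensor`,
+`tp_rank`, `tp_size` and `dp_slice` are placeholders, not the exact test values):
+
+    flat = local_tensor.flatten()[dp_slice]
+    sh_ten = ShardedTensor.from_rank_offsets_flat(
+        'key', flat, local_tensor.shape, (0, tp_rank, tp_size), flattened_range=dp_slice
+    )
+"""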
+ +import io + +import numpy as np +import pytest +import torch +from torch.distributed.checkpoint import CheckpointException + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import ShardedTensor, save, load +from megatron.core.dist_checkpointing.core import CheckpointingException, \ + maybe_load_config +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.mapping import ShardedTensorFactory, \ + ShardedObject +from megatron.core.dist_checkpointing.serialization import load_tensors_metadata + +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +class TestFlattenedResharding: + @pytest.mark.parametrize( + ('src_tp_pp', 'dest_tp_pp',), + [ + ((2, 4), (2, 4)), + # TODO: uncomment after implementing flattened resharding + # ((2, 4), (2, 2)), + # ((8, 1), (1, 2)), + ] + ) + def test_partition_change_save_load(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp): + with TempNamedDir(tmp_path_dist_ckpt / 'test_flattened_partition_change_save_load') as ckpt_dir: + Utils.initialize_model_parallel(*src_tp_pp) + state_dict = self._build_state_dict() + + save(state_dict, ckpt_dir) + + # change TPxPP + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(*dest_tp_pp) + loaded_state_dict = load(self._build_state_dict(random=True), ckpt_dir) + expected_state_dict = {k: v.data for k, v in self._build_state_dict().items()} + + diffs = diff(expected_state_dict, loaded_state_dict) + assert not any(diffs), diffs + Utils.destroy_model_parallel() + + + def _build_state_dict(self, random=False): + tp_rank = parallel_state.get_tensor_model_parallel_rank() + tp_size = parallel_state.get_tensor_model_parallel_world_size() + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + dp_rank = parallel_state.get_data_parallel_rank() + dp_size = parallel_state.get_data_parallel_world_size() + + init_fn = torch.rand if random else torch.arange + global_ten = init_fn(4 * 5 * 80).reshape(4, 5, 80) + local_ten = global_ten + local_ten = local_ten.chunk(tp_size, dim=0)[tp_rank] + local_ten = local_ten.chunk(pp_size, dim=2)[pp_rank] + assert local_ten.shape == (4 // tp_size, 5, 80 // pp_size) + + local_ten_size_by_dp = local_ten.numel() + assert local_ten_size_by_dp % dp_size == 0, (local_ten_size_by_dp, dp_size) + local_ten_size_by_dp = local_ten_size_by_dp // dp_size + # make a bit shifted DP slices so that they are not equal + start_jitter = dp_rank + end_jitter = dp_rank + 1 if dp_rank + 1 < dp_size else 0 + local_dp_slice = slice( + local_ten_size_by_dp * dp_rank + start_jitter, + local_ten_size_by_dp * (dp_rank + 1) + end_jitter + ) + local_flat_ten = local_ten.flatten()[local_dp_slice] + if dp_rank == dp_size - 1: + assert local_flat_ten.numel() == local_ten_size_by_dp - dp_rank + else: + assert local_flat_ten.numel() == local_ten_size_by_dp + 1 + + state_dict = { + 'sd_key_unflat': ShardedTensor.from_rank_offsets( + 'unflat', + local_ten, + (0, tp_rank, tp_size), + (2, pp_rank, pp_size), + replica_id=dp_rank, + ), + 'sd_key_flat': ShardedTensor.from_rank_offsets_flat( + 'flat', + local_flat_ten, + local_ten.shape, + (0, tp_rank, tp_size), + (2, pp_rank, pp_size), + flattened_range=local_dp_slice + ), + } + return state_dict diff --git a/tests/unit_tests/dist_checkpointing/test_mapping.py b/tests/unit_tests/dist_checkpointing/test_mapping.py index fcd742ee65..ebd0d1ed15 100644 --- 
a/tests/unit_tests/dist_checkpointing/test_mapping.py +++ b/tests/unit_tests/dist_checkpointing/test_mapping.py @@ -5,6 +5,7 @@ import torch from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.core import CheckpointingException from megatron.core.dist_checkpointing.mapping import is_main_replica, \ ShardedTensorFactory, ShardedObject, apply_factories, apply_factory_merges from megatron.core.transformer.transformer_config import TransformerConfig @@ -36,9 +37,61 @@ def test_from_rank_offsets_constructor(self, dtype=torch.float, device='cuda'): assert sh_ten.global_offset == (0, 0, shape[2] * 3, 0) assert sh_ten.axis_fragmentations == (10, 1, 6, 1) + def test_from_rank_offsets_flat_constructor(self, dtype=torch.float, device='cuda'): + data = torch.arange(28, dtype=dtype, device=device).reshape((1, 4, 7)) + shape = data.shape + rank_offsets = [ + (1, 0, 2), + (2, 3, 5) + ] + flattened_range = slice(4, 9) + flat_data = data.flatten()[flattened_range] + sh_ten = ShardedTensor.from_rank_offsets_flat('keyA', flat_data, data.shape, *rank_offsets, flattened_range=flattened_range) + + # The main attributes properties are unchanged + assert isinstance(sh_ten, ShardedTensor) + assert sh_ten.dtype is dtype + assert sh_ten.local_shape == shape + assert sh_ten.global_shape == (shape[0], shape[1] * 2, shape[2] * 5) + assert sh_ten.global_offset == (0, 0, shape[2] * 3) + assert sh_ten.axis_fragmentations == (1, 2, 5) + + assert torch.all(sh_ten.data == torch.arange(4, 9, device=device)) + + def test_metadata_integrity_violation(self): + data = torch.ones((1, 3, 7, 9), device='meta') + rank_offsets = [ + (0, 0, 10), + (2, 3, 6) + ] + sh_ten = ShardedTensor.from_rank_offsets('keyA', data, *rank_offsets) + sh_ten.validate_metadata_integrity() + with pytest.raises(CheckpointingException): + sh_ten.local_shape = (1, 2, 7, 9) + sh_ten.validate_metadata_integrity() + + sh_ten = ShardedTensor.from_rank_offsets('keyA', data, *rank_offsets) + with pytest.raises(CheckpointingException): + sh_ten.global_offset = (0, 1, 0) + sh_ten.validate_metadata_integrity() + + with pytest.raises(CheckpointingException): + sh_ten = ShardedTensor.from_rank_offsets_flat('keyA', data, data.shape, *rank_offsets, + flattened_range=slice(4, 9)) + + sh_ten = ShardedTensor.from_rank_offsets_flat('keyA', data.flatten()[4:9], data.shape, *rank_offsets, + flattened_range=slice(4, 9)) + assert sh_ten.local_shape == (1, 3, 7, 9) + with pytest.raises(CheckpointingException): + sh_ten.local_shape = (5,) + sh_ten.validate_metadata_integrity() + + + class TestShardedTensorFactory: def test_build_and_merge(self): - def build_fn(key, tensor, replica_id): + def build_fn(key, tensor, replica_id, flattened_range): + assert flattened_range is None return { 'level2_a': ShardedTensor.from_rank_offsets(key + 'part1', tensor + 1, replica_id=replica_id), 'level2_b': ShardedTensor.from_rank_offsets(key + 'part2', tensor + 2, replica_id=replica_id) diff --git a/tests/unit_tests/dist_checkpointing/test_optimizer.py b/tests/unit_tests/dist_checkpointing/test_optimizer.py index a0fb3bd58b..038bacc5b9 100644 --- a/tests/unit_tests/dist_checkpointing/test_optimizer.py +++ b/tests/unit_tests/dist_checkpointing/test_optimizer.py @@ -2,8 +2,9 @@ from copy import deepcopy from functools import partial from time import sleep -from types import SimpleNamespace +from types import MethodType, SimpleNamespace from unittest import mock +from unittest.mock import MagicMock import numpy as np import pytest @@ -12,7 +13,7 @@ from 
megatron.core import parallel_state, DistributedDataParallel as DDP from megatron.core.dist_checkpointing import ShardedTensor, save, load, \ - load_plain_tensors + load_tensors_metadata, load_plain_tensors from megatron.core.dist_checkpointing.dict_utils import nested_values, diff from megatron.core.dist_checkpointing.optimizer import \ get_param_id_to_sharded_param_map, optim_state_to_sharding_state @@ -27,6 +28,7 @@ get_megatron_optimizer from megatron.core.tensor_parallel import model_parallel_cuda_manual_seed from megatron.core.transformer import TransformerConfig +from megatron.core.transformer.mlp import apply_swiglu_sharded_factory from megatron.core.utils import get_model_config from megatron.training.checkpointing import load_checkpoint, save_checkpoint from megatron.training.training import get_model @@ -41,7 +43,9 @@ class Model(torch.nn.Module): def __init__(self): super().__init__() self.conv = torch.nn.Conv1d(8, 16, 3) - self.proj = torch.nn.Linear(32, 7) + self.proj = torch.nn.Linear(8, 5) + self.config = TransformerConfig(hidden_size=8, num_attention_heads=1, num_layers=1) + def sharded_state_dict(self): sharded_state_dict = self.state_dict(keep_vars=True) # conv @@ -64,6 +68,23 @@ def sharded_state_dict(self): return sharded_state_dict +class SwigluFactoryModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(5, 64 // parallel_state.get_tensor_model_parallel_world_size(), bias=False) + self.config = TransformerConfig(hidden_size=8, num_attention_heads=1, num_layers=1) + + def sharded_state_dict(self): + sharded_state_dict = self.state_dict(keep_vars=True) + sharded_state_dict['linear.weight'] = ShardedTensor.from_rank_offsets( + 'linear.weight', sharded_state_dict['linear.weight'], + ((0, parallel_state.get_tensor_model_parallel_rank(), parallel_state.get_tensor_model_parallel_world_size())), + replica_id=((parallel_state.get_pipeline_model_parallel_rank(), 0, parallel_state.get_data_parallel_rank(with_context_parallel=True))) + ) + sharded_state_dict['linear.weight'] = apply_swiglu_sharded_factory(sharded_state_dict['linear.weight'], ()) + return sharded_state_dict + + class TestOptimizer: def test_optimizer_params(self, tmp_path_dist_ckpt): Utils.initialize_model_parallel(1,1) @@ -89,15 +110,13 @@ def test_optimizer_params(self, tmp_path_dist_ckpt): ]) -def initialize_gpt_model(pre_process=True, post_process=True, seed=0, **config_kwargs): +def initialize_gpt_model(pre_process=True, post_process=True, seed=0, use_glu=True, **config_kwargs): torch.manual_seed(seed) model_parallel_cuda_manual_seed(seed) default_config_kwargs=dict(num_layers=8, hidden_size=16, num_attention_heads=8, use_cpu_initialization=True) default_config_kwargs.update(**config_kwargs) - transformer_config = TransformerConfig(**default_config_kwargs) - # pre_process = parallel_state.is_pipeline_first_stage() - # post_process = parallel_state.is_pipeline_last_stage() + transformer_config = TransformerConfig(**default_config_kwargs, gated_linear_unit=use_glu) model = GPTModel(config=transformer_config, transformer_layer_spec=get_gpt_layer_local_spec(), vocab_size=128, max_sequence_length=4, pre_process=pre_process, post_process=post_process) @@ -108,6 +127,13 @@ def initialize_gpt_model(pre_process=True, post_process=True, seed=0, **config_k return model +def initialize_small_model(pre_process=True, post_process=True, seed=0, **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + return SwigluFactoryModel() + + def 
init_basic_mock_args(args, bf16=True): args.data_parallel_random_init = False args.virtual_pipeline_model_parallel_size = None @@ -151,11 +177,11 @@ def load_checkpoint_no_arg_checks(*args, **kwargs): return load_checkpoint(*args, **kwargs) -def setup_model_and_optimizer(seed, bf16=True): +def setup_model_and_optimizer(seed, initialize_fn, bf16=True): mock_args = SimpleNamespace() with mock.patch('megatron.training.training.get_args', new=lambda: mock_args): init_basic_mock_args(mock_args, bf16=bf16) - model = get_model(partial(initialize_gpt_model, seed=seed)) + model = get_model(partial(initialize_fn, seed=seed)) config = OptimizerConfig(bf16=bf16, params_dtype=torch.bfloat16 if bf16 else torch.float, use_distributed_optimizer=bf16) optimizer = get_megatron_optimizer(config, model) @@ -175,27 +201,30 @@ def setup_model_and_optimizer(seed, bf16=True): class TestDistributedOptimizer: + @pytest.mark.parametrize("initialize_fn", [initialize_small_model, initialize_gpt_model]) @pytest.mark.parametrize("use_fpsl", [False, True]) @pytest.mark.parametrize("tp_pp,src_dp,dest_dp", [ ((4, 1), 2, 2), - # ((1, 1), 8, 1), # TODO: changing DP doesn't work for now + # ((1, 1), 8, 1), # TODO: changing DP doesn't work in unit tests because of NCCL crashes # ((1, 1), 1, 8), # ((2, 1), 2, 1), # ((2, 1), 2, 2), ]) - def test_full_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_fpsl): + def test_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_fpsl, initialize_fn): src_world_size = tp_pp[0] * tp_pp[1] * src_dp dest_world_size = tp_pp[0] * tp_pp[1] * dest_dp assert src_world_size <= Utils.world_size, (tp_pp, src_dp) assert dest_world_size <= Utils.world_size, (tp_pp, dest_dp) + sharding_type = 'fully_sharded_model_space' if use_fpsl else 'dp_zero_gather_scatter' + with TempNamedDir(tmp_path_dist_ckpt / 'test_dp_sharding', sync=False) as ckpt_dir: try: Utils.set_world_size(src_world_size) if Utils.rank >= 0: # Save checkpoint A Utils.initialize_model_parallel(*tp_pp) - model, optimizer_A = setup_model_and_optimizer(seed=2) + model, optimizer_A = setup_model_and_optimizer(seed=2, initialize_fn=initialize_fn) save_strategy = get_default_save_sharded_strategy() if use_fpsl: @@ -204,7 +233,7 @@ def test_full_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_ parallel_state.get_data_parallel_group(with_context_parallel=True), True ) - save(optimizer_A.sharded_state_dict(model[0].sharded_state_dict()), ckpt_dir, save_strategy) + save(optimizer_A.sharded_state_dict(model[0].sharded_state_dict(), sharding_type=sharding_type), ckpt_dir, save_strategy) optim_param_state_A = optimizer_A.get_parameter_state_dp_zero() Utils.destroy_model_parallel() else: @@ -218,14 +247,19 @@ def test_full_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_ if Utils.rank >= 0: Utils.initialize_model_parallel(*tp_pp) - model, optimizer_B = setup_model_and_optimizer(seed=3) + model, optimizer_B = setup_model_and_optimizer(seed=3, initialize_fn=initialize_fn) optim_param_state_B = optimizer_B.get_parameter_state_dp_zero() diffs = diff(optim_param_state_A, optim_param_state_B) # Expect a mismatch in values - diffs[2] nonempty if parallel_state.get_data_parallel_rank(with_context_parallel=True) == 0: assert not diffs[0] and not diffs[1] and diffs[2], diffs - optim_state_dict = load(optimizer_B.sharded_state_dict(model[0].sharded_state_dict()), ckpt_dir) + sharded_state_dict = optimizer_B.sharded_state_dict( + model[0].sharded_state_dict(), + is_loading=True, + 
sharding_type=sharding_type, + ) + optim_state_dict = load(sharded_state_dict, ckpt_dir) optimizer_B.load_state_dict(optim_state_dict) optim_param_state_B = optimizer_B.get_parameter_state_dp_zero() @@ -241,14 +275,14 @@ def test_full_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_ Utils.set_world_size() @pytest.mark.parametrize( - ('src_tp_pp', 'dest_tp_pp',), + ('src_tp_pp', 'dest_tp_pp', 'use_glu'), [ - ((2, 2), (2, 4)), - ((1, 8), (4, 1)), - ((2, 4), (4, 2)), + ((2, 2), (2, 4), False,), + ((1, 8), (4, 1), True), + ((2, 4), (4, 2), False), ] ) - def test_finetune_doesnt_load_optimizer(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp,): + def test_finetune_doesnt_load_optimizer(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_glu): with TempNamedDir(tmp_path_dist_ckpt / 'test_finetune_doesnt_load_optimizer') as ckpt_dir: mock_args = SimpleNamespace() with mock.patch('megatron.training.checkpointing.get_args', new=lambda: mock_args): @@ -256,7 +290,7 @@ def test_finetune_doesnt_load_optimizer(self, tmp_path_dist_ckpt, src_tp_pp, des init_checkpointing_mock_args(mock_args, ckpt_dir, False) Utils.initialize_model_parallel(*src_tp_pp) - model, optimizer = setup_model_and_optimizer(seed=2) + model, optimizer = setup_model_and_optimizer(seed=2, initialize_fn=partial(initialize_gpt_model, use_glu=use_glu)) # We need to save the TPxPP of the source model mock_args.tensor_model_parallel_size = src_tp_pp[0] @@ -265,7 +299,7 @@ def test_finetune_doesnt_load_optimizer(self, tmp_path_dist_ckpt, src_tp_pp, des Utils.destroy_model_parallel() Utils.initialize_model_parallel(*dest_tp_pp) - model, optimizer = setup_model_and_optimizer(seed=3) + model, optimizer = setup_model_and_optimizer(seed=3, initialize_fn=partial(initialize_gpt_model, use_glu=use_glu)) model_unloaded_state_dict = deepcopy(model[0].state_dict()) optim_unloaded_state_dict = deepcopy(optimizer.state_dict()) @@ -289,7 +323,7 @@ def test_finetune_doesnt_load_optimizer(self, tmp_path_dist_ckpt, src_tp_pp, des assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict)) # ... 
or `no_load_optim` flag - model, optimizer = setup_model_and_optimizer(seed=3) + model, optimizer = setup_model_and_optimizer(seed=3, initialize_fn=partial(initialize_gpt_model, use_glu=use_glu)) mock_args.finetune = False mock_args.no_load_optim = True mock_args.no_load_rng = True @@ -303,6 +337,38 @@ def test_finetune_doesnt_load_optimizer(self, tmp_path_dist_ckpt, src_tp_pp, des Utils.destroy_model_parallel() + def test_can_load_deprecated_bucket_space_format(self, tmp_path_dist_ckpt): + with TempNamedDir(tmp_path_dist_ckpt / 'test_can_load_deprecated_bucket_space_format') as ckpt_dir: + mock_args = SimpleNamespace() + with mock.patch('megatron.training.checkpointing.get_args', new=lambda: mock_args): + init_basic_mock_args(mock_args) + init_checkpointing_mock_args(mock_args, ckpt_dir, True) + + Utils.initialize_model_parallel(4, 2) + model, optimizer = setup_model_and_optimizer(seed=2, initialize_fn=initialize_gpt_model) + + mock_args.tensor_model_parallel_size = 4 + mock_args.pipeline_model_parallel_size = 2 + + # Mock optimizer sharded_state_dict so that it ignores the externally passed sharding_type and uses 'fully_sharded_bucket_space' instead + orig_optim_sharded_state_dict_fn = optimizer.sharded_state_dict + def sharded_state_dict_bucket_space(self, *args, sharding_type: str = 'fully_sharded_model_space', **kwargs): + return orig_optim_sharded_state_dict_fn(*args, sharding_type='fully_sharded_bucket_space', **kwargs) + + optimizer.sharded_state_dict = MethodType(sharded_state_dict_bucket_space, optimizer) + save_checkpoint(10, model, optimizer, None, 0) + + torch.distributed.barrier() + if Utils.rank == 0: + sharded_metadata = load_tensors_metadata(ckpt_dir / 'iter_0000010') + # Check if actually using `fully_parallel_bucket_space` format + assert 'optimizer.distributed.dp_group_idx_0.gbuf_idx_0.dtype_(torch.bfloat16, torch.bfloat16).bucket_idx_0.exp_avg_sq' in sharded_metadata, sharded_metadata.keys() + + optimizer.sharded_state_dict = orig_optim_sharded_state_dict_fn + load_checkpoint_no_arg_checks(model, optimizer, None) + + Utils.destroy_model_parallel() + class TestFP32Optimizer: @pytest.mark.parametrize( @@ -317,14 +383,14 @@ def test_fp32_optimizer_resharding(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_ with TempNamedDir(tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_A', sync=False) as ckpt_dir_A: with TempNamedDir(tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_B', sync=False) as ckpt_dir_B: Utils.initialize_model_parallel(*src_tp_pp) - model_A, optimizer_A = setup_model_and_optimizer(seed=2, bf16=False) + model_A, optimizer_A = setup_model_and_optimizer(seed=2, initialize_fn=initialize_small_model, bf16=False) save(optimizer_A.sharded_state_dict(model_A[0].sharded_state_dict()), ckpt_dir_A) Utils.destroy_model_parallel() # Load checkpoint A with different TP/PP and save as checkpoint B Utils.initialize_model_parallel(*dest_tp_pp) - model_B, optimizer_B = setup_model_and_optimizer(seed=3, bf16=False) + model_B, optimizer_B = setup_model_and_optimizer(seed=3, initialize_fn=initialize_small_model, bf16=False) load_sharded_state_dict = optimizer_B.sharded_state_dict(model_B[0].sharded_state_dict()) state_dict = load(load_sharded_state_dict, ckpt_dir_A) diff --git a/tests/unit_tests/dist_checkpointing/test_serialization.py b/tests/unit_tests/dist_checkpointing/test_serialization.py index 5384c592a5..fe6eb04258 100644 --- a/tests/unit_tests/dist_checkpointing/test_serialization.py +++ b/tests/unit_tests/dist_checkpointing/test_serialization.py @@ -198,7 +198,8 @@ 
def test_load_tensors_metadata(self, tmp_path_dist_ckpt): def test_can_mix_sharded_tensors_and_factories(self, tmp_path_dist_ckpt): Utils.initialize_model_parallel(1, 1) - def _build_fn(key, tensor, replica_id): + def _build_fn(key, tensor, replica_id, flattened_range): + assert flattened_range is None return [ ShardedTensor.from_rank_offsets(key + 'part1', tensor, replica_id=replica_id), ShardedTensor.from_rank_offsets(key + 'part2', tensor, replica_id=replica_id), From 3fe53de4cf11feddb3c6ec9c2cdae88687ff1584 Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Wed, 12 Jun 2024 11:38:59 -0700 Subject: [PATCH 81/92] Added torch native embedding --- megatron/core/tensor_parallel/layers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index 199170e9ec..3b62356de4 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -186,6 +186,7 @@ def __init__( self.num_embeddings, get_tensor_model_parallel_rank(), self.tensor_model_parallel_size ) self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index + self.deterministic_mode = config.deterministic_mode # Allocate weights and initialize. if config.use_cpu_initialization: @@ -226,7 +227,11 @@ def forward(self, input_): else: masked_input = input_ # Get the embeddings. - output_parallel = self.weight[masked_input] + if self.deterministic_mode: + output_parallel = self.weight[masked_input] + else: + # F.embedding currently has a non-deterministic backward function + output_parallel = F.embedding(masked_input, self.weight) # Mask the output embedding. if self.tensor_model_parallel_size > 1: output_parallel[input_mask, :] = 0.0 From 9634c0e4a332875bea4fca5e280764cdde4eae80 Mon Sep 17 00:00:00 2001 From: Duncan Riach Date: Thu, 13 Jun 2024 09:22:06 -0700 Subject: [PATCH 82/92] Reduce logit memory pressure by using in-place operation --- megatron/core/tensor_parallel/cross_entropy.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/core/tensor_parallel/cross_entropy.py b/megatron/core/tensor_parallel/cross_entropy.py index 1614dbb45e..e1b3a68025 100644 --- a/megatron/core/tensor_parallel/cross_entropy.py +++ b/megatron/core/tensor_parallel/cross_entropy.py @@ -20,10 +20,10 @@ def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): torch.distributed.all_reduce( logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group() ) - # Subtract the maximum value. - vocab_parallel_logits = vocab_parallel_logits - logits_max.unsqueeze(dim=-1) + # In-place subtraction reduces memory pressure. 
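+ # (The removed out-of-place subtraction materialized a second logits-sized buffer;
+ # reusing `vocab_parallel_logits` avoids that extra allocation.)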
+ vocab_parallel_logits -= logits_max.unsqueeze(dim=-1) - # Get the partition's vocab indecies + # Get the partition's vocab indices get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size partition_vocab_size = vocab_parallel_logits.size()[-1] rank = get_tensor_model_parallel_rank() @@ -132,7 +132,7 @@ def vocab_parallel_cross_entropy(vocab_parallel_logits, target, label_smoothing= Args: vocab_parallel_logits: logits split across tensor parallel ranks - dimension is [sequence_length, batch_size, hidden_size] + dimension is [sequence_length, batch_size, vocab_size/num_parallel_ranks] target: correct vocab ids of dimseion [sequence_length, micro_batch_size] From d9c3fc79431cf0c5018d25d0da970e78092bd544 Mon Sep 17 00:00:00 2001 From: Keval Morabia Date: Thu, 13 Jun 2024 10:37:31 -0700 Subject: [PATCH 83/92] Update ModelOpt PTQ example version to 0.13 and rename path --- examples/inference/{ammo_ptq => modelopt}/README.md | 6 +++--- .../inference/{ammo_ptq => modelopt}/ptq_trtllm_llama_7b.sh | 0 .../{ammo_ptq => modelopt}/ptq_trtllm_nemotron3_8b.sh | 0 .../inference/{ammo_ptq => modelopt}/text_generation_ptq.py | 0 .../{ammo_ptq => modelopt}/trtllm_text_generation.py | 0 5 files changed, 3 insertions(+), 3 deletions(-) rename examples/inference/{ammo_ptq => modelopt}/README.md (97%) rename examples/inference/{ammo_ptq => modelopt}/ptq_trtllm_llama_7b.sh (100%) rename examples/inference/{ammo_ptq => modelopt}/ptq_trtllm_nemotron3_8b.sh (100%) rename examples/inference/{ammo_ptq => modelopt}/text_generation_ptq.py (100%) rename examples/inference/{ammo_ptq => modelopt}/trtllm_text_generation.py (100%) diff --git a/examples/inference/ammo_ptq/README.md b/examples/inference/modelopt/README.md similarity index 97% rename from examples/inference/ammo_ptq/README.md rename to examples/inference/modelopt/README.md index a70ff84cc2..c825b76ce6 100644 --- a/examples/inference/ammo_ptq/README.md +++ b/examples/inference/modelopt/README.md @@ -7,7 +7,7 @@ and proceed with a containerized environment (`docker.io/tensorrt_llm/release:la ```sh git clone https://github.com/NVIDIA/TensorRT-LLM.git cd TensorRT-LLM -git checkout v0.9.0 +git checkout v0.10.0 make -C docker release_build ``` @@ -17,7 +17,7 @@ make -C docker release_build Once the container is built, install `nvidia-modelopt` and additional dependencies for sharded checkpoint support: ```sh -pip install "nvidia-modelopt[all]~=0.11.0" --extra-index-url https://pypi.nvidia.com +pip install "nvidia-modelopt[all]~=0.13.0" --extra-index-url https://pypi.nvidia.com pip install zarr tensorstore==0.1.45 ``` TensorRT-LLM quantization functionalities are currently packaged in `nvidia-modelopt`. @@ -69,7 +69,7 @@ git lfs install git clone git@hf.co:nvidia/nemotron-3-8b-base-4k cd nemotron-3-8b-base-4k tar -xvf Nemotron-3-8B-Base-4k.nemo -mv 586f3f51a9cf43bc9369bd53fa08868c_a934dc7c3e1e46a6838bb63379916563_3feba89c944047c19d5a1d0c07a85c32_mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model +mv 586f3f51a9cf43bc9369bd53fa08868c_a934dc7c3e1e46a6838bb63379916563_3feba89c944047c19d5a1d0c07a85c32_mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model tokenizer.model cd .. 
``` diff --git a/examples/inference/ammo_ptq/ptq_trtllm_llama_7b.sh b/examples/inference/modelopt/ptq_trtllm_llama_7b.sh similarity index 100% rename from examples/inference/ammo_ptq/ptq_trtllm_llama_7b.sh rename to examples/inference/modelopt/ptq_trtllm_llama_7b.sh diff --git a/examples/inference/ammo_ptq/ptq_trtllm_nemotron3_8b.sh b/examples/inference/modelopt/ptq_trtllm_nemotron3_8b.sh similarity index 100% rename from examples/inference/ammo_ptq/ptq_trtllm_nemotron3_8b.sh rename to examples/inference/modelopt/ptq_trtllm_nemotron3_8b.sh diff --git a/examples/inference/ammo_ptq/text_generation_ptq.py b/examples/inference/modelopt/text_generation_ptq.py similarity index 100% rename from examples/inference/ammo_ptq/text_generation_ptq.py rename to examples/inference/modelopt/text_generation_ptq.py diff --git a/examples/inference/ammo_ptq/trtllm_text_generation.py b/examples/inference/modelopt/trtllm_text_generation.py similarity index 100% rename from examples/inference/ammo_ptq/trtllm_text_generation.py rename to examples/inference/modelopt/trtllm_text_generation.py From cfb0dcce6239f1bbe2ae0f9987f9ee22f02498d2 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 13 Jun 2024 20:22:45 -0700 Subject: [PATCH 84/92] Update owners --- CODEOWNERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index f9b05a66b3..afdc201f67 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,6 +1,6 @@ [MCORE][3] -megatron/core/ @shanmugamr @maanug @jcasper @eharper +megatron/core/ @shanmugamr @jcasper @eharper @terryk [TESTS] -tests/ @shanmugamr @maanug @terryk +tests/ @shanmugamr @terryk From dad83ad21405c20f04a04e91e9a77c5e47703be5 Mon Sep 17 00:00:00 2001 From: Jon Barker Date: Fri, 14 Jun 2024 05:10:25 -0700 Subject: [PATCH 85/92] Fix typo in convert.py --- tools/checkpoint/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/checkpoint/convert.py b/tools/checkpoint/convert.py index 7ead190046..935613b143 100644 --- a/tools/checkpoint/convert.py +++ b/tools/checkpoint/convert.py @@ -112,7 +112,7 @@ def main(): allow_abbrev=False, conflict_handler='resolve') parser.add_argument('--model-type', type=str, required=True, - choice=['GPT', 'BERT'], + choices=['GPT', 'BERT'], help='Type of the model') parser.add_argument('--loader', type=str, default='megatron', help='Module name to load checkpoint, should be on python path') From 022929d3bc5d58de34848c6619cb0a539cce673c Mon Sep 17 00:00:00 2001 From: John St John Date: Fri, 14 Jun 2024 09:52:27 -0700 Subject: [PATCH 86/92] Fix GPU device issue for FusedLayerNorm in nemo2 --- megatron/core/fusions/fused_layer_norm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/megatron/core/fusions/fused_layer_norm.py b/megatron/core/fusions/fused_layer_norm.py index 30fa5d4224..5189a75b0d 100644 --- a/megatron/core/fusions/fused_layer_norm.py +++ b/megatron/core/fusions/fused_layer_norm.py @@ -109,8 +109,9 @@ def __init__( hidden_size = (hidden_size,) self.hidden_size = torch.Size(hidden_size) self.eps = eps - self.weight = Parameter(torch.Tensor(*hidden_size)) - self.bias = Parameter(torch.Tensor(*hidden_size)) + # Parameters need to be initialized with torch.empty rather than torch.Tensor for correct device placement with nemo2. 
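+ # (torch.empty goes through the regular tensor-factory path and so respects the active
+ # default device / device context during model construction, which the legacy
+ # torch.Tensor(*sizes) constructor may not; reset_parameters() still fills in the values.)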
+ self.weight = Parameter(torch.empty(*hidden_size)) + self.bias = Parameter(torch.empty(*hidden_size)) self.reset_parameters() self.persist_layer_norm = persist_layer_norm self.sequence_parallel = self.config.sequence_parallel From 7b4a6d76c9ebdfef394e52a59f6362cde49f9346 Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Fri, 14 Jun 2024 10:24:54 -0700 Subject: [PATCH 87/92] Added cross entropy fusion --- docs/source/api-guide/fusions.rst | 10 ++ megatron/core/fusions/fused_cross_entropy.py | 139 ++++++++++++++++++ megatron/core/model_parallel_config.py | 5 + .../common/language_module/language_module.py | 6 +- .../core/tensor_parallel/cross_entropy.py | 139 ++++++++++++++---- megatron/core/tensor_parallel/utils.py | 5 + megatron/training/arguments.py | 3 + .../functional_tests/jet_recipes/MR-gpt.yaml | 1 + ...ore_tp2_pp2_cross_entropy_loss_fusion.json | 1 + 9 files changed, 280 insertions(+), 29 deletions(-) create mode 100644 megatron/core/fusions/fused_cross_entropy.py create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_cross_entropy_loss_fusion.json diff --git a/docs/source/api-guide/fusions.rst b/docs/source/api-guide/fusions.rst index 19e3ac0c5a..694ed129f4 100644 --- a/docs/source/api-guide/fusions.rst +++ b/docs/source/api-guide/fusions.rst @@ -53,3 +53,13 @@ This module provides wrappers around variations of Softmax in Apex. :undoc-members: :show-inheritance: +fusions.fused\_cross\_entropy\_loss module +------------------------------------------ + +This module uses PyTorch JIT to fuse the cross entropy loss calculation and batches communication calls. + +.. automodule:: core.fusions.fused_softmax + :members: + :undoc-members: + :show-inheritance: + diff --git a/megatron/core/fusions/fused_cross_entropy.py b/megatron/core/fusions/fused_cross_entropy.py new file mode 100644 index 0000000000..bf8d366f73 --- /dev/null +++ b/megatron/core/fusions/fused_cross_entropy.py @@ -0,0 +1,139 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
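+"""Fused vocab-parallel cross entropy loss.
+
+Intended call pattern, as wired up in `language_module.py` later in this change
+(`logits` are the vocab-parallel output logits, `labels` the [s, b] target ids):
+
+    from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy
+    loss = fused_vocab_parallel_cross_entropy(logits, labels)  # enabled via config.cross_entropy_loss_fusion
+"""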
+ +from typing import Tuple + +import torch + +from megatron.core.jit import jit_fuser +from megatron.core.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from megatron.core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy + + +@jit_fuser +def calculate_logits_max(vocab_parallel_logits: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + + vocab_parallel_logits, logits_max = VocabParallelCrossEntropy.calculate_logits_max( + vocab_parallel_logits + ) + + return vocab_parallel_logits, logits_max + + +@jit_fuser +def calculate_predicted_logits( + vocab_parallel_logits: torch.Tensor, target: torch.Tensor, logits_max: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + ( + target_mask, + masked_target_1d, + predicted_logits, + sum_exp_logits, + exp_logits, + ) = VocabParallelCrossEntropy.calculate_predicted_logits( + vocab_parallel_logits, target, logits_max + ) + + predicted_logits_sum_exp_logits = torch.cat((predicted_logits, sum_exp_logits)) + + return target_mask, masked_target_1d, predicted_logits_sum_exp_logits, exp_logits + + +@jit_fuser +def calculate_cross_entropy_loss( + exp_logits: torch.Tensor, predicted_logits_sum_exp_logits: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor]: + + split_val = predicted_logits_sum_exp_logits.size()[0] // 2 + predicted_logits, sum_exp_logits = torch.split(predicted_logits_sum_exp_logits, split_val) + + exp_logits, loss = VocabParallelCrossEntropy.calculate_cross_entropy_loss( + exp_logits, predicted_logits, sum_exp_logits + ) + + return exp_logits, loss + + +@jit_fuser +def calculate_gradients( + softmax: torch.Tensor, + grad_output: torch.Tensor, + target_mask: torch.Tensor, + masked_target_1d: torch.Tensor, +) -> torch.Tensor: + + ( + grad_2d, + arange_1d, + softmax_update, + grad_input, + ) = VocabParallelCrossEntropy.prepare_gradient_calculation_operands(softmax, target_mask) + + grad_input = VocabParallelCrossEntropy.calculate_gradients( + grad_2d, arange_1d, masked_target_1d, softmax_update, grad_input, grad_output + ) + + grad_input = grad_input.bfloat16() + + return grad_input + + +class _VocabParallelCrossEntropy(torch.autograd.Function): + @staticmethod + def forward(ctx, vocab_parallel_logits, target): + + vocab_parallel_logits, logits_max = calculate_logits_max(vocab_parallel_logits) + torch.distributed.all_reduce( + logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group() + ) + + ( + target_mask, + masked_target_1d, + predicted_logits_sum_exp_logits, + exp_logits, + ) = calculate_predicted_logits(vocab_parallel_logits, target, logits_max) + + # All reduce is needed to get the chunks from other GPUs. + # In the fused case, tensors are batched to invoke a single + # AllReduce call + torch.distributed.all_reduce( + predicted_logits_sum_exp_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_tensor_model_parallel_group(), + ) + + exp_logits, loss = calculate_cross_entropy_loss(exp_logits, predicted_logits_sum_exp_logits) + + # Store softmax, target-mask and masked-target for backward pass. + ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) + + return loss + + @staticmethod + def backward(ctx, grad_output): + + # Retrieve tensors from the forward path.
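+ # (These are the softmax probabilities saved in forward(), the mask of targets that
+ # fall outside this rank's vocab partition, and the flattened, masked target indices.)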
+ softmax, target_mask, masked_target_1d = ctx.saved_tensors + + grad_input = calculate_gradients(softmax, grad_output, target_mask, masked_target_1d) + + return grad_input, None + + +def fused_vocab_parallel_cross_entropy(vocab_parallel_logits, target): + """ + Performs cross entropy loss when logits are split across tensor parallel ranks + + Args: + vocab_parallel_logits: logits split across tensor parallel ranks + dimension is [sequence_length, batch_size, hidden_size] + + target: correct vocab ids of dimension [sequence_length, micro_batch_size] + + """ + return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py index 9be7cccedf..c54ff58317 100644 --- a/megatron/core/model_parallel_config.py +++ b/megatron/core/model_parallel_config.py @@ -198,6 +198,11 @@ class ModelParallelConfig: Reduce-Scatter both done atomically. Don't care if tp_comm_overlap is False. """ + cross_entropy_loss_fusion: bool = False + """If this is enabled, the fused cross entropy implementation would be used. + Defaults to False. + """ + ################### # Pipeline Parallel ################### diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py index 78d9f86aaa..fcd683cfb1 100644 --- a/megatron/core/models/common/language_module/language_module.py +++ b/megatron/core/models/common/language_module/language_module.py @@ -6,6 +6,7 @@ from megatron.core import parallel_state, tensor_parallel from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint @@ -33,7 +34,10 @@ def compute_language_model_loss(self, labels: Tensor, logits: Tensor) -> Tensor: """ # [b s] => [s b] labels = labels.transpose(0, 1).contiguous() - loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) + if self.config.cross_entropy_loss_fusion: + loss = fused_vocab_parallel_cross_entropy(logits, labels) + else: + loss = tensor_parallel.vocab_parallel_cross_entropy(logits, labels) # [s b] => [b, s] loss = loss.transpose(0, 1).contiguous() diff --git a/megatron/core/tensor_parallel/cross_entropy.py b/megatron/core/tensor_parallel/cross_entropy.py index e1b3a68025..294fc215c3 100644 --- a/megatron/core/tensor_parallel/cross_entropy.py +++ b/megatron/core/tensor_parallel/cross_entropy.py @@ -1,5 +1,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +from typing import Tuple + import torch from megatron.core.parallel_state import ( @@ -11,15 +13,27 @@ from .utils import VocabUtility -class _VocabParallelCrossEntropy(torch.autograd.Function): +class VocabParallelCrossEntropy: + """Computes the Cross Entropy Loss splitting the Vocab size across tensor parallel + ranks. This implementation is used in both fused and unfused cross entropy implementations + """ + + @staticmethod - def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): + def calculate_logits_max( + vocab_parallel_logits: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + vocab_parallel_logits = vocab_parallel_logits.float() # Maximum value along vocab dimension across all GPUs.
logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] - torch.distributed.all_reduce( - logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group() - ) + + return vocab_parallel_logits, logits_max + + @staticmethod + def calculate_predicted_logits( + vocab_parallel_logits: torch.Tensor, target: torch.Tensor, logits_max: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + # In-place subtraction reduces memory pressure. vocab_parallel_logits -= logits_max.unsqueeze(dim=-1) @@ -45,6 +59,83 @@ def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): predicted_logits_1d = predicted_logits_1d.clone().contiguous() predicted_logits = predicted_logits_1d.view_as(target) predicted_logits[target_mask] = 0.0 + + exp_logits = vocab_parallel_logits + torch.exp(vocab_parallel_logits, out=exp_logits) + sum_exp_logits = exp_logits.sum(dim=-1) + + return target_mask, masked_target_1d, predicted_logits, sum_exp_logits, exp_logits + + @staticmethod + def calculate_cross_entropy_loss( + exp_logits: torch.Tensor, predicted_logits: torch.Tensor, sum_exp_logits: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + + # Loss = log(sum(exp(logits))) - predicted-logit. + loss = torch.log(sum_exp_logits) - predicted_logits + + # Normalize and optionally smooth logits + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + + return exp_logits, loss + + @staticmethod + def prepare_gradient_calculation_operands( + softmax: torch.Tensor, target_mask: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + # All the inputs have softmax as their gradient. + grad_input = softmax + # For simplicity, work with the 2D gradient. + partition_vocab_size = softmax.size()[-1] + grad_2d = grad_input.view(-1, partition_vocab_size) + + # Add the gradient from matching classes. + arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) + + softmax_update = 1.0 - target_mask.view(-1).float() + + return grad_2d, arange_1d, softmax_update, grad_input + + @staticmethod + def calculate_gradients( + grad_2d: torch.Tensor, + arange_1d: torch.Tensor, + masked_target_1d: torch.Tensor, + softmax_update: torch.Tensor, + grad_input: torch.Tensor, + grad_output: torch.Tensor, + ) -> torch.Tensor: + + grad_2d[arange_1d, masked_target_1d] -= softmax_update + + # Finally elementwise multiplication with the output gradients. + grad_input.mul_(grad_output.unsqueeze(dim=-1)) + + return grad_input + + +class _VocabParallelCrossEntropy(torch.autograd.Function): + @staticmethod + def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): + + vocab_parallel_logits, logits_max = VocabParallelCrossEntropy.calculate_logits_max( + vocab_parallel_logits + ) + torch.distributed.all_reduce( + logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group() + ) + + ( + target_mask, + masked_target_1d, + predicted_logits, + sum_exp_logits, + exp_logits, + ) = VocabParallelCrossEntropy.calculate_predicted_logits( + vocab_parallel_logits, target, logits_max + ) + # All reduce is needed to get the chunks from other GPUs. torch.distributed.all_reduce( predicted_logits, @@ -52,21 +143,15 @@ def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): group=get_tensor_model_parallel_group(), ) - # Sum of exponential of logits along vocab dimension across all GPUs.
- exp_logits = vocab_parallel_logits - torch.exp(vocab_parallel_logits, out=exp_logits) - sum_exp_logits = exp_logits.sum(dim=-1) torch.distributed.all_reduce( sum_exp_logits, op=torch.distributed.ReduceOp.SUM, group=get_tensor_model_parallel_group(), ) - # Loss = log(sum(exp(logits))) - predicted-logit. - loss = torch.log(sum_exp_logits) - predicted_logits - - # Normalize and optionally smooth logits - exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + exp_logits, loss = VocabParallelCrossEntropy.calculate_cross_entropy_loss( + exp_logits, predicted_logits, sum_exp_logits + ) vocab_size = exp_logits.size(-1) if label_smoothing > 0: @@ -101,27 +186,25 @@ def backward(ctx, grad_output): softmax, target_mask, masked_target_1d = ctx.saved_tensors label_smoothing, vocab_size = ctx.label_smoothing, ctx.vocab_size - # All the inputs have softmax as thier gradient. - grad_input = softmax - # For simplicity, work with the 2D gradient. - partition_vocab_size = softmax.size()[-1] - grad_2d = grad_input.view(-1, partition_vocab_size) - - # Add the gradient from matching classes. - arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) - - softmax_update = 1.0 - target_mask.view(-1).float() + ( + grad_2d, + arange_1d, + softmax_update, + grad_input, + ) = VocabParallelCrossEntropy.prepare_gradient_calculation_operands(softmax, target_mask) if label_smoothing > 0: smoothing = label_smoothing * vocab_size / (vocab_size - 1) grad_2d[arange_1d, masked_target_1d] -= (1.0 - smoothing) * softmax_update average_grad = 1 / vocab_size grad_2d[arange_1d, :] -= smoothing * average_grad - else: - grad_2d[arange_1d, masked_target_1d] -= softmax_update - # Finally elementwise multiplication with the output gradients. - grad_input.mul_(grad_output.unsqueeze(dim=-1)) + # Finally elementwise multiplication with the output gradients. + grad_input.mul_(grad_output.unsqueeze(dim=-1)) + else: + grad_input = VocabParallelCrossEntropy.calculate_gradients( + grad_2d, arange_1d, masked_target_1d, softmax_update, grad_input, grad_output + ) return grad_input, None, None diff --git a/megatron/core/tensor_parallel/utils.py b/megatron/core/tensor_parallel/utils.py index fc0db15f88..53f0d60de0 100644 --- a/megatron/core/tensor_parallel/utils.py +++ b/megatron/core/tensor_parallel/utils.py @@ -5,6 +5,11 @@ import torch from megatron.core import parallel_state +from megatron.core.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from megatron.core.utils import divide diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index e0fe2e1dfa..a0fe8e0f4c 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1093,6 +1093,9 @@ def _add_training_args(parser): help='Disable rope fusion, the fusion is available ' 'only when using megatron-core.', dest='apply_rope_fusion') + group.add_argument('--cross-entropy-loss-fusion', action='store_true', + help='Enabled fusion of cross entropy loss calculation.', + dest='cross_entropy_loss_fusion') group.add_argument('--use-flash-attn', action='store_true', help='use FlashAttention implementation of attention. 
' 'https://arxiv.org/abs/2205.14135') diff --git a/tests/functional_tests/jet_recipes/MR-gpt.yaml b/tests/functional_tests/jet_recipes/MR-gpt.yaml index edee11b287..621791b322 100644 --- a/tests/functional_tests/jet_recipes/MR-gpt.yaml +++ b/tests/functional_tests/jet_recipes/MR-gpt.yaml @@ -86,6 +86,7 @@ products: - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce"'], args_meta: ["dist_optimizer_overlap_grad_reduce"]} - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce --untie-embeddings-and-output-weights"'], args_meta: ["dist_optimizer_overlap_grad_reduce_untied"]} - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather --check-weight-hash-across-dp-replicas-interval 10 --ckpt-fully-parallel-save --ckpt-fully-parallel-load"'], args_meta: ["dist_optimizer_overlap_grad_reduce_param_gather"]} + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--cross-entropy-loss-fusion"], args_meta: ["cross_entropy_loss_fusion"]} # Non-MCore, only legacy checkpoints supported - {use_mcore: [False], use_te: [False, True], tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], ckpt_format: [torch]} - {use_mcore: [False], tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch]} diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_cross_entropy_loss_fusion.json b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_cross_entropy_loss_fusion.json new file mode 100644 index 0000000000..98ff45e7db --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_merge-request_dgx_a100_1N8G_mcore_tp2_pp2_cross_entropy_loss_fusion.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93627, 10.89332, 10.87322, 10.74871, 10.65375, 10.15756, 10.24634, 10.15177, 9.83799]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 1885.0, 1986.0, 1760.0, 1773.0, 1859.0, 1598.0, 1965.0, 2199.0, 2316.0]}, "iteration_timing_avg": 0.20321264705882353} From 998e75b3ff7102a5ce80f88318f5781dfacbb782 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Fri, 14 Jun 2024 10:54:43 -0700 Subject: [PATCH 88/92] Small improvements around the CI --- .gitignore | 1 + .gitlab-ci.yml | 14 ++++++++++ jet-tests.yml | 3 ++- .../jet_recipes/build-pyt.yaml | 26 +++---------------- 4 files changed, 21 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 5955b349f1..900ab517d1 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ build slurm* logs .vscode +local/ \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f71be75984..f43e0f566d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,6 +6,9 @@ workflow: - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/ variables: JET_CUSTOM_FILTER: "type == 'build' or 'merge-request' in spec.scope" + - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Build only/ + variables: + JET_CUSTOM_FILTER: "type == 'build'" # always run MR pipelines - if: $CI_PIPELINE_SOURCE == "merge_request_event" # always run web pipelines @@ -70,6 +73,7 @@ unit_tests-data: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: 
always + interruptible: true unit_tests-dist-checkpointing: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 @@ -84,6 +88,7 @@ unit_tests-dist-checkpointing: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always + interruptible: true unit_tests-fusions: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 @@ -98,6 +103,7 @@ unit_tests-fusions: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always + interruptible: true unit_tests-inference: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 @@ -112,6 +118,7 @@ unit_tests-inference: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always + interruptible: true unit_tests-models: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 @@ -126,6 +133,7 @@ unit_tests-models: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always + interruptible: true unit_tests-pipeline-parallel: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 @@ -140,6 +148,7 @@ unit_tests-pipeline-parallel: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always + interruptible: true unit_tests-tensor-parallel: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 @@ -154,6 +163,7 @@ unit_tests-tensor-parallel: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always + interruptible: true unit_tests-transformer: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 @@ -168,6 +178,7 @@ unit_tests-transformer: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always + interruptible: true unit_tests-top-py: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3 @@ -182,6 +193,7 @@ unit_tests-top-py: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH when: never - when: always + interruptible: true docs_build_test: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1 @@ -197,6 +209,7 @@ docs_build_test: allow_failure: true except: - main + interruptible: true formatting: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1 @@ -208,3 +221,4 @@ formatting: - isort megatron/core --check rules: - when: always + interruptible: true diff --git a/jet-tests.yml b/jet-tests.yml index 4737a62050..ca23f16969 100644 --- a/jet-tests.yml +++ b/jet-tests.yml @@ -2,7 +2,8 @@ stage: jet rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' - - if: $JET_CUSTOM_FILTER != "" && $CI_PIPELINE_SOURCE != 'merge_request_event' + - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Build only/' + # If either $JET_CUSTOM_FILTER or both $CI_MODEL and $CI_TASK are provided - when: never default: diff --git a/tests/functional_tests/jet_recipes/build-pyt.yaml b/tests/functional_tests/jet_recipes/build-pyt.yaml index b42a39f178..9ea823d539 100644 --- a/tests/functional_tests/jet_recipes/build-pyt.yaml +++ b/tests/functional_tests/jet_recipes/build-pyt.yaml @@ -1,34 +1,15 @@ type: build format_version: 1 maintainers: [maanug] -spec: - name: pyt - platforms: [linux/amd64] - source: - image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci:24.01v3 - ---- -type: build -format_version: 1 -maintainers: [maanug] spec: name: mcore-pyt platforms: [linux/amd64] - parent: pyt source: repo: https://gitlab-master.nvidia.com/ADLR/megatron-lm.git ref: main dockerfile: Dockerfile.ci - ---- -type: build -format_version: 1 -maintainers: [maanug] 
-spec: - name: nemo - platforms: [linux/amd64] - source: - image: nvcr.io/nvidian/nemo:nightly + arguments: + FROM_IMAGE_NAME: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci:24.01v3 --- type: build @@ -37,8 +18,9 @@ maintainers: [maanug] spec: name: mcore-nemo platforms: [linux/amd64] - parent: nemo source: repo: https://gitlab-master.nvidia.com/ADLR/megatron-lm.git ref: main dockerfile: Dockerfile.ci + arguments: + FROM_IMAGE_NAME: nvcr.io/nvidian/nemo:nightly From 0c47d333a0cb7a252d3156c6697a28690cc9b8f3 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy Date: Fri, 14 Jun 2024 15:43:18 -0700 Subject: [PATCH 89/92] Update QuickStart.md --- megatron/core/QuickStart.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/megatron/core/QuickStart.md b/megatron/core/QuickStart.md index ed8fbfed60..44dfb23e86 100644 --- a/megatron/core/QuickStart.md +++ b/megatron/core/QuickStart.md @@ -21,7 +21,10 @@ The following steps will walk you through how you can create a sample GPT model
-**NOTE: All of the folowing steps needs to be put into a script and then run as explained in the last step** +**NOTE: All of the following steps are already put into a script [run_simple_mcore_train_loop.py](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/run_simple_mcore_train_loop.py) which you can run as follows** +``` +PYTHONPATH=$PYTHON_PATH:./megatron torchrun --nproc-per-node 2 examples/run_simple_mcore_train_loop.py +```
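For context on what the referenced script does before any model is built, the following is a minimal sketch of the distributed setup step, assuming `torchrun` has populated `RANK` and `WORLD_SIZE` and that `megatron.core` is importable; the helper name `initialize_distributed` is illustrative and not necessarily the exact code in `run_simple_mcore_train_loop.py`.

```
import os
import torch
from megatron.core import parallel_state

def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
    # torchrun exports RANK and WORLD_SIZE for every spawned worker.
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    torch.cuda.set_device(rank % torch.cuda.device_count())
    torch.distributed.init_process_group(backend="nccl", world_size=world_size, rank=rank)
    # Build Megatron-Core's tensor- and pipeline-parallel process groups.
    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size, pipeline_model_parallel_size
    )
```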
@@ -219,13 +222,7 @@ if __name__ == "__main__": ```
-**STEP 7 - Running the full example** -All the above steps are put to gether in a [run_simple_mcore_train_loop.py](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/run_simple_mcore_train_loop.py) script in examples folder in megatron . You can run it as follows after completing all steps in the Environment Setup section. -``` -PYTHONPATH=$PYTHON_PATH:./megatron torchrun --nproc-per-node 2 examples/run_simple_mcore_train_loop.py -``` -
### Extending Further The above example introduced you to a basic training loop in MCore. To see more advanced examples please look at [pretrain_gpt.py]. That will show you how you can write more complex training loops, involving pipeline parallel, context parallel, rope embeddings, mixture of experts and all other functionalities present in mcore. From df61e60bf5670b1196fcae2264311401d3bb82db Mon Sep 17 00:00:00 2001 From: Duncan Riach Date: Sat, 15 Jun 2024 18:14:48 -0700 Subject: [PATCH 90/92] Add mamba --- LICENSE | 9 +- README.md | 6 +- examples/mamba/.gitignore | 4 + examples/mamba/Dockerfile | 14 + examples/mamba/README.md | 91 ++++ examples/mamba/run_text_gen_server_8b.sh | 50 ++ examples/mamba/run_text_gen_server_8b_gpt3.sh | 46 ++ examples/mamba/train.sh | 105 ++++ megatron/core/datasets/indexed_dataset.py | 2 +- .../embeddings/language_model_embedding.py | 2 +- megatron/core/models/gpt/gpt_model.py | 2 +- megatron/core/models/mamba/__init__.py | 1 + .../core/models/mamba/mamba_layer_specs.py | 59 +++ megatron/core/models/mamba/mamba_model.py | 205 ++++++++ megatron/core/ssm/__init__.py | 0 megatron/core/ssm/mamba_block.py | 234 +++++++++ .../core/ssm/mamba_hybrid_layer_allocation.py | 191 +++++++ megatron/core/ssm/mamba_layer.py | 62 +++ megatron/core/ssm/mamba_mixer.py | 485 ++++++++++++++++++ megatron/core/ssm/triton_cache_manager.py | 44 ++ megatron/core/tensor_parallel/__init__.py | 5 +- megatron/core/tensor_parallel/mappings.py | 2 +- .../inference/text_generation/tokenization.py | 1 + megatron/training/arguments.py | 24 +- .../training/optimizer_param_scheduler.py | 21 +- megatron/training/tokenizer/tokenizer.py | 44 ++ megatron/training/training.py | 8 +- pretrain_mamba.py | 239 +++++++++ tools/checkpoint/hybrid_conversion.py | 398 ++++++++++++++ tools/run_mamba_text_generation_server.py | 121 +++++ 30 files changed, 2461 insertions(+), 14 deletions(-) create mode 100644 examples/mamba/.gitignore create mode 100644 examples/mamba/Dockerfile create mode 100644 examples/mamba/README.md create mode 100755 examples/mamba/run_text_gen_server_8b.sh create mode 100644 examples/mamba/run_text_gen_server_8b_gpt3.sh create mode 100755 examples/mamba/train.sh create mode 100644 megatron/core/models/mamba/__init__.py create mode 100755 megatron/core/models/mamba/mamba_layer_specs.py create mode 100644 megatron/core/models/mamba/mamba_model.py create mode 100644 megatron/core/ssm/__init__.py create mode 100644 megatron/core/ssm/mamba_block.py create mode 100644 megatron/core/ssm/mamba_hybrid_layer_allocation.py create mode 100644 megatron/core/ssm/mamba_layer.py create mode 100644 megatron/core/ssm/mamba_mixer.py create mode 100644 megatron/core/ssm/triton_cache_manager.py create mode 100644 pretrain_mamba.py create mode 100644 tools/checkpoint/hybrid_conversion.py create mode 100644 tools/run_mamba_text_generation_server.py diff --git a/LICENSE b/LICENSE index b49c04ee33..4782df586e 100644 --- a/LICENSE +++ b/LICENSE @@ -30,12 +30,13 @@ The following applies to all files unless otherwise noted: This repository also contains code from Hugging Face Inc., Google Research, Facebook (from their Fairseq, Dino, and ParlAI projects), Microsoft (from their -Swin-Transformer project) and Philip Popien. Files from these -organizations have notices at the top of each file. Below are -licenses used in those files, as indicated. +Swin-Transformer project), Philip Popien, and the Mamba project (Tri Dao and +Albert Gu). Files from these organizations have notices at the top of each file. 
+Below are licenses used in those files, as indicated. -------------- LICENSE FOR Facebook, huggingface, Google Research and LLaVA code -------------- +-------------------------------------------------------------------------------- +-- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, and Mamba code -- Apache License diff --git a/README.md b/README.md index f2e4fe84b1..ba678f94f3 100644 --- a/README.md +++ b/README.md @@ -247,7 +247,6 @@ In `examples/pretrain_gpt3_175B.sh` we have provided an example of how to config With full global batch size of 1536 on 1024 A100 GPUs, each iteration takes around 32 seconds resulting in 138 teraFLOPs per GPU which is 44% of the theoretical peak FLOPs. - ## Retro and InstructRetro @@ -270,6 +269,10 @@ In this repo, we provide an end-to-end reproduction guide to implement Retro and Please see [tools/retro/README.md](tools/retro/README.md) for a detailed overview. +## Mamba-based Language Models + +Please see [examples/mamba](./examples/mamba) for details. + b pd l + xBC = rearrange(xBC, "l b d -> b d l") + xBC = xBC.contiguous() + + # Compute short convolution + if conv_state is not None: + # If we just take x[:, :, -self.d_conv :], it will error if seqlen < self.d_conv + # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise. + conv_state.copy_(F.pad(xBC, (self.d_conv - xBC.shape[-1], 0))) # Update state (B D W) + + seqlen = xBC.size(2) + if causal_conv1d_fn is None: + xBC = self.act(self.conv1d(xBC)[..., :seqlen]) + else: + assert self.activation in ["silu", "swish"] + xBC = causal_conv1d_fn( + x=xBC, + weight=rearrange(self.conv1d.weight, "d 1 w -> d w"), + bias=self.conv1d.bias, + activation=self.activation, + ) + + # transpose b pd l --> l b pd + xBC = rearrange(xBC, "b d l -> l b d") + xBC = xBC.contiguous() + + x, B, C = torch.split( + xBC, + [ + self.d_inner_local, + self.ngroups_local * self.d_state, + self.ngroups_local * self.d_state, + ], + dim=-1, + ) + + # TODO Vijay: fuse most of the transposes with the GEMMS + x = rearrange(x, "l b (h p) -> b l h p", p=self.headdim).contiguous() + dt = rearrange(dt, "l b d -> b l d").contiguous() + B = rearrange(B, "l b (g n) -> b l g n", n=self.d_state).contiguous() + C = rearrange(C, "l b (g n) -> b l g n", n=self.d_state).contiguous() + z = rearrange(z, "l b (h p) -> b l h p", p=self.headdim).contiguous() + y = mamba_chunk_scan_combined( + x, + dt, + A, + B, + C, + self.chunk_size, + D=rearrange(self.D.float(), "(h p) -> h p", p=self.headdim) + if self.D_has_hdim + else self.D, + z=z if not self.rmsnorm else None, + dt_bias=self.dt_bias.float(), + dt_softplus=True, + return_final_states=ssm_state is not None, + ) + + if ssm_state is not None: + y, last_state = y + ssm_state.copy_(last_state) + + if self.rmsnorm: + y = rearrange(y, "b l h p -> b l (h p)").contiguous() + z = rearrange(z, "b l h p -> b l (h p)").contiguous() + y = self.norm(y, z) + y = rearrange(y, "b l d -> l b d").contiguous() + else: + y = rearrange(y, "b l h p -> l b (h p)").contiguous() + + # l b pd --> pl b d + out_full = y @ self.out_proj.weight.t() + if self.config.sequence_parallel: + out = reduce_scatter_to_sequence_parallel_region(out_full) + else: + out = reduce_from_tensor_model_parallel_region(out_full) + return out + + def step(self, hidden_states, conv_state, ssm_state): + # assert self.ngroups_local == 1, "Only support ngroups=1 for inference for now" + dtype = hidden_states.dtype + assert hidden_states.shape[0] == 1, "Only support decoding with 1 token at a time for now" + + # l b d 
--> b d + hidden_states = hidden_states.squeeze(0) + + # b d_model --> b p(2d) + xz = hidden_states @ self.in_proj.weight.t() + + z, xBC, dt = torch.split( + xz, + [ + self.d_inner_local, + self.d_inner_local + 2 * self.ngroups_local * self.d_state, + self.nheads_local, + ], + dim=-1, + ) + + # Conv step + if causal_conv1d_update is None: + conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W) + conv_state[:, :, -1] = xBC + xBC = torch.sum( + conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1 + ) # (B D) + if self.conv1d.bias is not None: + xBC = xBC + self.conv1d.bias + xBC = self.act(xBC).to(dtype=dtype) + else: + xBC = causal_conv1d_update( + xBC, + conv_state, + rearrange(self.conv1d.weight, "d 1 w -> d w"), + self.conv1d.bias, + self.activation, + ) + + x, B, C = torch.split( + xBC, + [ + self.d_inner_local, + self.ngroups_local * self.d_state, + self.ngroups_local * self.d_state, + ], + dim=-1, + ) + A = -torch.exp(self.A_log.float()) + + # SSM step + if selective_state_update is None: + if self.ngroups_local > 1: + B = rearrange(B, "b (g n) -> b g n", n=self.d_state) + C = rearrange(C, "b (g n) -> b g n", n=self.d_state) + B = repeat(B, "b g n -> b (g h) n", h=self.d_inner_local // self.ngroups_local) + C = repeat(C, "b g n -> b (g h) n", h=self.d_inner_local // self.ngroups_local) + + dt = repeat(dt, "b h -> b (h p)", p=self.headdim) + dt_bias = repeat(self.dt_bias, "h -> (h p)", p=self.headdim) + A = repeat(A, "h -> (h p) n", p=self.headdim, n=self.d_state) + D = repeat(self.D, "h -> (h p)", p=self.headdim) + + dt = F.softplus(dt + dt_bias.to(dtype=dt.dtype)) + dA = torch.exp(torch.einsum("bd,dn->bdn", dt, A)) + + dB_x = torch.einsum('bd,bdn,bd->bdn', dt, B, x) + ssm_state.copy_( + ssm_state * rearrange(dA, "b (h p) n -> b h p n", p=self.headdim) + + rearrange(dB_x, "b (h p) n -> b h p n", p=self.headdim) + ) + + y = torch.einsum( + "bdn,bdn->bd", + rearrange(ssm_state.to(dtype), "b h p n -> b (h p) n", p=self.headdim), + C, + ) + y = y + D.to(dtype) * x + if not self.rmsnorm: + y = y * self.act(z) # (B D) + else: + # Discretize A and B (b (g n)) + dt = F.softplus(dt + self.dt_bias.to(dtype=dt.dtype)) # (batch, nheads) + dA = torch.exp(dt * A) + x = rearrange(x, "b (h p) -> b h p", p=self.headdim) + dBx = torch.einsum("bh,bn,bhp->bhpn", dt, B, x) + ssm_state.copy_(ssm_state * rearrange(dA, "b h -> b h 1 1") + dBx) + y = torch.einsum("bhpn,bn->bhp", ssm_state.to(dtype), C) + y = y + rearrange(self.D.to(dtype), "h -> h 1") * x + y = rearrange(y, "b h p -> b (h p)") + if not self.rmsnorm: + y = y * self.act(z) # (B D) + else: + A = repeat(A, "h -> h p n", p=self.headdim, n=self.d_state).to(dtype=torch.float32) + dt = repeat(dt, "b h -> b h p", p=self.headdim) + dt_bias = repeat(self.dt_bias, "h -> h p", p=self.headdim) + D = repeat(self.D, "h -> h p", p=self.headdim) + B = rearrange(B, "b (g n) -> b g n", g=self.ngroups_local) + C = rearrange(C, "b (g n) -> b g n", g=self.ngroups_local) + x_reshaped = rearrange(x, "b (h p) -> b h p", p=self.headdim) + if not self.rmsnorm: + z = rearrange(z, "b (h p) -> b h p", p=self.headdim) + y = selective_state_update( + ssm_state, + x_reshaped, + dt, + A, + B, + C, + D, + z=z if not self.rmsnorm else None, + dt_bias=dt_bias, + dt_softplus=True, + ) + y = rearrange(y, "b h p -> b (h p)") + + if self.rmsnorm: + y = self.norm(y, z) + + # b pd --> b d + out = y @ self.out_proj.weight.t() + out = reduce_from_tensor_model_parallel_region(out) + return out.unsqueeze(0), conv_state, ssm_state + + def 
allocate_inference_cache(self, batch_size, max_seqlen, dtype=None): + device = self.out_proj.weight.device + conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype + conv_state = torch.zeros( + batch_size, self.conv1d.weight.shape[0], self.d_conv, device=device, dtype=conv_dtype + ) + ssm_dtype = self.in_proj.weight.dtype if dtype is None else dtype + # ssm_dtype = torch.float32 + ssm_state = torch.zeros( + batch_size, + self.nheads_local, + self.headdim, + self.d_state, + device=device, + dtype=ssm_dtype, + ) + return conv_state, ssm_state + + def _get_states_from_cache(self, inference_params, batch_size, initialize_states=False): + assert self.layer_idx is not None + if self.layer_idx not in inference_params.key_value_memory_dict: + conv_state = torch.zeros( + batch_size, + self.conv1d.weight.shape[0], + self.d_conv, + device=self.conv1d.weight.device, + dtype=self.conv1d.weight.dtype, + ) + ssm_state = torch.zeros( + batch_size, + self.nheads_local, + self.headdim, + self.d_state, + device=self.in_proj.weight.device, + dtype=self.in_proj.weight.dtype, + ) + inference_params.key_value_memory_dict[self.layer_idx] = (conv_state, ssm_state) + else: + conv_state, ssm_state = inference_params.key_value_memory_dict[self.layer_idx] + # TODO: What if batch size changes between generation, and we reuse the same states? + if initialize_states: + conv_state.zero_() + ssm_state.zero_() + return conv_state, ssm_state diff --git a/megatron/core/ssm/triton_cache_manager.py b/megatron/core/ssm/triton_cache_manager.py new file mode 100644 index 0000000000..43b5b34f39 --- /dev/null +++ b/megatron/core/ssm/triton_cache_manager.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import os +import socket +from pathlib import Path + +import torch + +try: + from triton.runtime.cache import FileCacheManager +except ImportError: + raise ImportError("triton is required by the Mamba model but cannot be imported") + + +def get_rank(): + return torch.distributed.get_rank() + + +def default_cache_dir(): + return os.path.join(Path.home(), ".triton", "cache") + + +class ParallelFileCacheManager(FileCacheManager): + + # See https://github.com/triton-lang/triton/blob/main/python/triton/runtime/cache.py + + # When running Triton with multiple ranks, they each create their own cache manager. Their input + # keys to that class are mostly (but not entirely) the same across ranks, which leads many ranks + # to write to the same 'key' directories in the cache dir at the same time during compilation, + # leading to conflicts. This works around that by making each cache dir be rank specific by + # adding "rank__" to the cache directory. 
+ + def __init__(self, key): + self.key = key + self.lock_path = None + # create cache directory if it doesn't exist + self.cache_dir = os.environ.get('TRITON_CACHE_DIR', default_cache_dir()) + self.cache_dir = os.path.join( + self.cache_dir, "rank_{}_{}".format(socket.gethostname(), os.getpid()) + ) + if self.cache_dir: + self.cache_dir = os.path.join(self.cache_dir, self.key) + self.lock_path = os.path.join(self.cache_dir, "lock") + os.makedirs(self.cache_dir, exist_ok=True) diff --git a/megatron/core/tensor_parallel/__init__.py b/megatron/core/tensor_parallel/__init__.py index 6b0aa59839..87f32a56a3 100644 --- a/megatron/core/tensor_parallel/__init__.py +++ b/megatron/core/tensor_parallel/__init__.py @@ -19,7 +19,9 @@ gather_from_sequence_parallel_region, gather_from_sequence_parallel_region_to_moe, gather_from_tensor_model_parallel_region, + reduce_from_tensor_model_parallel_region, reduce_scatter_last_dim_to_tensor_parallel_region, + reduce_scatter_to_sequence_parallel_region, reduce_scatter_to_sequence_parallel_region_from_moe, scatter_to_sequence_parallel_region, scatter_to_tensor_model_parallel_region, @@ -54,7 +56,8 @@ "copy_to_tensor_model_parallel_region", "gather_from_tensor_model_parallel_region", "gather_from_sequence_parallel_region", - # "reduce_from_tensor_model_parallel_region", + "reduce_from_tensor_model_parallel_region", + "reduce_scatter_to_sequence_parallel_region", "scatter_to_tensor_model_parallel_region", "scatter_to_sequence_parallel_region", # random.py diff --git a/megatron/core/tensor_parallel/mappings.py b/megatron/core/tensor_parallel/mappings.py index efc901fb0e..88e77541d1 100644 --- a/megatron/core/tensor_parallel/mappings.py +++ b/megatron/core/tensor_parallel/mappings.py @@ -22,7 +22,7 @@ def _reduce(input_): return input_ # All-reduce. - torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group()) + torch.distributed.all_reduce(input_.contiguous(), group=get_tensor_model_parallel_group()) return input_ diff --git a/megatron/inference/text_generation/tokenization.py b/megatron/inference/text_generation/tokenization.py index cab2d2ea5a..8532be9621 100644 --- a/megatron/inference/text_generation/tokenization.py +++ b/megatron/inference/text_generation/tokenization.py @@ -32,6 +32,7 @@ def detokenize_generations(tokens_gpu_tensor, for token in sequence_tokens: if args.tokenizer_type in ['SentencePieceTokenizer', 'GPTSentencePieceTokenizer', + 'HuggingFaceTokenizer', 'Llama2Tokenizer', 'MistralTokenizer']: word = tokenizer.decoder[token] diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index a0fe8e0f4c..47b6c9f7ef 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -749,7 +749,7 @@ def _add_network_size_args(parser): help='Maximum number of position embeddings to use. ' 'This is the size of position embedding.') group.add_argument('--position-embedding-type', type=str, default='learned_absolute', - choices=['learned_absolute', 'rope'], + choices=['learned_absolute', 'rope', 'none'], help='Position embedding type.') group.add_argument('--use-rotary-position-embeddings', action='store_true', help='Use rotary positional embeddings or not. 
' @@ -1186,14 +1186,21 @@ def _add_learning_rate_args(parser): 'and initial warmup, the learning rate at each ' 'iteration would be different.') group.add_argument('--lr-decay-style', type=str, default='linear', - choices=['constant', 'linear', 'cosine', 'inverse-square-root'], + choices=['constant', 'linear', 'cosine', 'inverse-square-root', 'WSD'], help='Learning rate decay function.') + group.add_argument('--lr-wsd-decay-style', type=str, default='exponential', + choices=['exponential', 'linear', 'cosine'], + help='Decay style for the annealing phase of WSD'), group.add_argument('--lr-decay-iters', type=int, default=None, help='number of iterations to decay learning rate over,' ' If None defaults to `--train-iters`') group.add_argument('--lr-decay-samples', type=int, default=None, help='number of samples to decay learning rate over,' ' If None defaults to `--train-samples`') + group.add_argument('--lr-wsd-decay-samples', type=int, default=None, + help='number of samples for the annealing phase in the wsd schedule') + group.add_argument('--lr-wsd-decay-iters', type=int, default=None, + help='number of iterations for the annealing phase in the wsd schedule') group.add_argument('--lr-warmup-fraction', type=float, default=None, help='fraction of lr-warmup-(iters/samples) to use ' 'for warmup (as a float)') @@ -1488,6 +1495,7 @@ def _add_data_args(parser): 'GPT2BPETokenizer', 'SentencePieceTokenizer', 'GPTSentencePieceTokenizer', + 'HuggingFaceTokenizer', 'Llama2Tokenizer', 'Llama3Tokenizer', 'MistralTokenizer', @@ -1700,6 +1708,18 @@ def _add_experimental_args(parser): 'To use local spec specify local as the argument.' 'For more details, see the model class, ' '`transformer_block.py`, or `transformer_layer.py`') + group.add_argument('--hybrid-attention-ratio', type=float, default=0.0, + help='Ratio of attention layers to total layers, in the ' + 'range [0.0, 1.0].') + group.add_argument('--hybrid-mlp-ratio', type=float, default=0.0, + help='Ratio of mlp layers to total layers, in the ' + 'range [0.0, 1.0].') + group.add_argument('--hybrid-override-pattern', type=str, default=None, + help='Force a specific hybrid layer pattern. If a value' + 'greater than 0.0 is supplied to any of the hybrid ratio' + 'arguments, then the number of each type of layer in the' + 'override pattern must match number in the overidden' + 'pattern') group.add_argument('--yaml-cfg', type=str, default=None, help = 'Config file to add additional arguments') diff --git a/megatron/training/optimizer_param_scheduler.py b/megatron/training/optimizer_param_scheduler.py index 54a45ef098..409e1dbc7d 100644 --- a/megatron/training/optimizer_param_scheduler.py +++ b/megatron/training/optimizer_param_scheduler.py @@ -13,7 +13,9 @@ def __init__(self, optimizer, init_lr, max_lr, min_lr, lr_warmup_steps, lr_decay_steps, lr_decay_style, start_wd, end_wd, wd_incr_steps, wd_incr_style, use_checkpoint_opt_param_scheduler=True, - override_opt_param_scheduler=False): + override_opt_param_scheduler=False, + wsd_decay_steps=None, + lr_wsd_decay_style=None): # Class values. 
self.optimizer = optimizer @@ -28,10 +30,14 @@ def __init__(self, optimizer, init_lr, max_lr, min_lr, self.lr_warmup_steps = lr_warmup_steps self.num_steps = 0 self.lr_decay_steps = lr_decay_steps + self.wsd_decay_steps = wsd_decay_steps + self.lr_wsd_decay_style = lr_wsd_decay_style assert self.lr_decay_steps > 0 assert self.lr_warmup_steps < self.lr_decay_steps self.lr_decay_style = lr_decay_style + if self.lr_decay_style == "WSD": + assert self.wsd_decay_steps is not None self.start_wd = start_wd self.end_wd = end_wd @@ -120,6 +126,19 @@ def get_lr(self, param_group): coeff = (1.0 - decay_ratio) elif self.lr_decay_style == 'cosine': coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) + elif self.lr_decay_style == 'WSD': + wsd_anneal_start_ = self.lr_decay_steps - self.wsd_decay_steps + if self.num_steps <= wsd_anneal_start_: + coeff = 1.0 + else: + wsd_steps = self.num_steps - wsd_anneal_start_ + wsd_decay_ratio = float(wsd_steps) / float(self.wsd_decay_steps) + if self.lr_wsd_decay_style == "linear": + coeff = (1.0 - wsd_decay_ratio) + elif self.lr_wsd_decay_style == "cosine": + coeff = 0.5 * (math.cos(math.pi * wsd_decay_ratio) + 1.0) + elif self.lr_wsd_decay_style == "exponential": + coeff = ((2.0 * math.pow(0.5, wsd_decay_ratio)) - 1.0) else: raise Exception('{} decay style is not supported.'.format( self.lr_decay_style)) diff --git a/megatron/training/tokenizer/tokenizer.py b/megatron/training/tokenizer/tokenizer.py index b5953a5c6c..b88909eea3 100644 --- a/megatron/training/tokenizer/tokenizer.py +++ b/megatron/training/tokenizer/tokenizer.py @@ -38,6 +38,8 @@ def build_tokenizer(args): elif args.tokenizer_type == 'GPTSentencePieceTokenizer': assert args.tokenizer_model is not None tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'HuggingFaceTokenizer': + tokenizer = _HuggingFaceTokenizer(args.tokenizer_model) elif args.tokenizer_type == 'Llama2Tokenizer': assert args.tokenizer_model is not None tokenizer = _Llama2Tokenizer(args.tokenizer_model) @@ -78,6 +80,48 @@ def _vocab_size_with_padding(orig_vocab_size, args): return after +class _HuggingFaceTokenizer(MegatronTokenizer): + def __init__(self, pretrained_model_name_or_path): + super().__init__(pretrained_model_name_or_path) + try: + import transformers + except ImportError: + raise EnvironmentError(f"The transformers library must be installed to use huggingface_tokenizer_provider") + + # TODO(bnorick): download tokenizer once to lustre and use force offline to make sure all tasks read it from there + self._tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path) + self._vocab = self._tokenizer.get_vocab() + self._inv_vocab = {token_id: token for token, token_id in self._vocab.items()} + + @property + def vocab_size(self): + return len(self._tokenizer) + + @property + def vocab(self): + """Dictionary from vocab text token to id token.""" + return self._vocab + + @property + def inv_vocab(self): + """Dictionary from vocab id token to text token.""" + return self._inv_vocab + + @property + def decoder(self): + return self._inv_vocab + + def tokenize(self, text): + return self._tokenizer(text).input_ids + + def detokenize(self, token_ids): + return self._tokenizer.decode(token_ids) + + @property + def eod(self): + return self._tokenizer.eos_token_id + + class _BertWordPieceTokenizer(MegatronTokenizer): """Original BERT wordpiece tokenizer.""" diff --git a/megatron/training/training.py b/megatron/training/training.py index 
8c12268d24..3b6c437be5 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -449,6 +449,9 @@ def get_optimizer_param_scheduler(optimizer): args.lr_decay_iters = args.train_iters lr_decay_steps = args.lr_decay_iters * args.global_batch_size wd_incr_steps = args.train_iters * args.global_batch_size + wsd_decay_steps = None + if args.lr_wsd_decay_iters is not None: + wsd_decay_steps = args.lr_wsd_decay_iters * args.global_batch_size if args.lr_warmup_fraction is not None: lr_warmup_steps = args.lr_warmup_fraction * lr_decay_steps else: @@ -463,6 +466,7 @@ def get_optimizer_param_scheduler(optimizer): args.lr_decay_samples = args.train_samples lr_decay_steps = args.lr_decay_samples wd_incr_steps = args.train_samples + wsd_decay_steps = args.lr_wsd_decay_samples if args.lr_warmup_fraction is not None: lr_warmup_steps = args.lr_warmup_fraction * lr_decay_steps else: @@ -484,7 +488,9 @@ def get_optimizer_param_scheduler(optimizer): wd_incr_steps=wd_incr_steps, wd_incr_style=args.weight_decay_incr_style, use_checkpoint_opt_param_scheduler=args.use_checkpoint_opt_param_scheduler, - override_opt_param_scheduler=args.override_opt_param_scheduler) + override_opt_param_scheduler=args.override_opt_param_scheduler, + wsd_decay_steps=wsd_decay_steps, + lr_wsd_decay_style=args.lr_wsd_decay_style) return opt_param_scheduler diff --git a/pretrain_mamba.py b/pretrain_mamba.py new file mode 100644 index 0000000000..f2dbb97e67 --- /dev/null +++ b/pretrain_mamba.py @@ -0,0 +1,239 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +"""Pretrain Mamba.""" + +import os +import torch +from functools import partial + +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.training import get_tokenizer +from megatron.core import mpu +# from megatron.core import parallel_state +from megatron.core.enums import ModelType +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core.datasets.gpt_dataset import GPTDatasetConfig +from megatron.core.datasets.gpt_dataset import MockGPTDataset, GPTDataset +from megatron.core.models.mamba import MambaModel +from megatron.training import pretrain +from megatron.core.utils import StragglerDetector +from megatron.core.transformer.spec_utils import import_module +from megatron.training.utils import ( + get_batch_on_this_cp_rank, + get_batch_on_this_tp_rank, +) +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec + + +stimer = StragglerDetector() + +def count_parameters_in_layer(model, layer_name): + num_params = 0 + for name, param in model.named_parameters(): + if layer_name in name: + num_params += param.numel() + print_rank_0(f" - {name}: {param.numel()}") + return num_params + + +def model_provider(pre_process=True, post_process=True) -> MambaModel: + """Builds the model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. 
+ + + Returns: + MambaModel: The returned model + """ + args = get_args() + + print_rank_0('building Mamba model ...') + config = core_transformer_config_from_args(get_args()) + + assert args.use_legacy_models == False, "Mamba only supported in Mcore!" + + if args.spec is not None: + mamba_stack_spec = import_module(args.spec) + else: + raise("You must provide a valid Mamba layer spec!") + + model = MambaModel( + config=config, + mamba_stack_spec=mamba_stack_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + hybrid_attention_ratio=args.hybrid_attention_ratio, + hybrid_mlp_ratio=args.hybrid_mlp_ratio, + hybrid_override_pattern=args.hybrid_override_pattern, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type + ) + + for l in range(model.decoder.num_layers_per_pipeline_rank): + layer_params = count_parameters_in_layer(model, f'decoder.layers.{l}.') + print_rank_0(f" == params layer {l}: {layer_params}") + + return model + + +def get_batch(data_iterator): + """Generate a batch.""" + + # TODO: this is pretty hacky, find a better way + if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()): + return None, None, None, None, None + + # get batches based on the TP rank you are on + batch = get_batch_on_this_tp_rank(data_iterator) + + # slice batch along sequence dimension for context parallelism + batch = get_batch_on_this_cp_rank(batch) + + return batch.values() + +def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor): + """Loss function. + + Args: + loss_mask (torch.Tensor): Used to mask out some portions of the loss + output_tensor (torch.Tensor): The tensor with the losses + + Returns: + the loss scalar for this micro-batch + the number of non-padded tokens in this microbatch + a dict containing reporting metrics on the loss and number of tokens across + the data parallel ranks + """ + args = get_args() + + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + total_tokens = loss_mask.sum() + loss = torch.cat([torch.sum(losses.view(-1) * loss_mask).view(1), total_tokens.view(1)]) + + if args.context_parallel_size > 1: + torch.distributed.all_reduce(loss, group=mpu.get_context_parallel_group()) + + # Check individual rank losses are not NaN prior to DP all-reduce. + if args.check_for_nan_in_loss_and_grad: + global_rank = torch.distributed.get_rank() + assert not loss[0].isnan(), ( + f'Rank {global_rank}: found NaN in local forward loss calculation. ' + f'Device: {torch.cuda.current_device()}, node: {os.uname()[1]}' + ) + + # Reduce loss for logging. + reporting_loss = loss.clone().detach() + torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group()) + + local_num_tokens = loss[1].clone().detach().to(torch.int) + return ( + loss[0] * args.context_parallel_size, + local_num_tokens, + {'lm loss': (reporting_loss[0], reporting_loss[1])}, + ) + + +def forward_step(data_iterator, model: MambaModel): + """Forward training step. + + Args: + data_iterator : Input data iterator + model (MambaModel): The GPT Model + """ + args = get_args() + timers = get_timers() + + # Get the batch. 
+ timers('batch-generator', log_level=2).start() + global stimer + with stimer(bdata=True): + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + with stimer: + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def is_dataset_built_on_rank(): + return ( + mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage() + ) and mpu.get_tensor_model_parallel_rank() == 0 + + +def core_gpt_dataset_config_from_args(args): + tokenizer = get_tokenizer() + + return GPTDatasetConfig( + random_seed=args.seed, + sequence_length=args.seq_length, + blend=get_blend_from_list(args.data_path), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + split=args.split, + num_dataset_builder_threads=args.num_dataset_builder_threads, + path_to_cache=args.data_cache_path, + mmap_bin_files=args.mmap_bin_files, + tokenizer=tokenizer, + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + create_attention_mask=args.create_attention_mask_in_dataloader, + ) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build the train test and validation datasets. + + Args: + train_val_test_num_samples : A list containing the number of samples in train test and validation. + """ + args = get_args() + + config = core_gpt_dataset_config_from_args(args) + + if args.mock_data: + dataset_type = MockGPTDataset + else: + dataset_type = GPTDataset + + print_rank_0("> building train, validation, and test datasets for GPT ...") + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + dataset_type, + train_val_test_num_samples, + is_dataset_built_on_rank, + config + ).build() + + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + # Temporary for transition to core datasets + train_valid_test_datasets_provider.is_distributed = True + + pretrain(train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) diff --git a/tools/checkpoint/hybrid_conversion.py b/tools/checkpoint/hybrid_conversion.py new file mode 100644 index 0000000000..737fac6b0f --- /dev/null +++ b/tools/checkpoint/hybrid_conversion.py @@ -0,0 +1,398 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +# Note (rwaleffe): This is a temporary file for hybrid mamba-transformer model checkpoint conversion. +# This functionality should be integrated with the megatron core checkpoint loader/saver. 
+ + +import copy +import os +import re +import shutil +from collections import OrderedDict + +import torch +import argparse + + +tp_split_dim = { + 'word_embeddings.weight': 0, + 'norm.weight': -1, + 'final_norm.weight': -1, + 'output_layer.weight': 0, + # mamba1/2 + 'A_log': 0, + 'D': 0, + 'dt_bias': 0, + 'in_proj.weight': 0, + 'conv1d.weight': 0, + 'conv1d.bias': 0, + 'x_proj.weight': 1, + 'dt_proj.weight': 0, + 'dt_proj.bias': 0, + 'out_proj.weight': 1, + 'mixer.norm.weight': 0, + # mlp + 'linear_fc1.layer_norm_weight': -1, + 'linear_fc1.weight': 0, + 'linear_fc2.weight': 1, + # attention + 'self_attention.linear_proj.weight': 1, + 'self_attention.linear_qkv.layer_norm_weight': -1, + 'self_attention.linear_qkv.weight': 0, +} + + +def get_split_dim(tensor_name): + # norm.weight will match tensor_name of mixer.norm.weight and norm.weight, need to distinguish + if 'norm.weight' in tensor_name: + if 'mixer.norm.weight' in tensor_name: + return tp_split_dim['mixer.norm.weight'] + else: + return tp_split_dim['norm.weight'] + + for key in tp_split_dim.keys(): + if key in tensor_name: + return tp_split_dim[key] + raise Exception("Unknown tensor name {}".format(tensor_name)) + + +def combine_tp_tensors(params, key, dim, tensors): + tp_size = len(tensors) + + if 'mixer.in_proj.weight' in key and params.mamba_version == 1: + xs = []; zs = [] + for tensor in tensors: + x, z = torch.split(tensor, [params.mamba_d_inner//tp_size, + params.mamba_d_inner//tp_size], dim=dim) + xs.append(x); zs.append(z) + return torch.cat([torch.cat(xs, dim=dim), torch.cat(zs, dim=dim)], dim=dim) + + elif 'mixer.in_proj.weight' in key and params.mamba_version == 2: + xs = []; zs = []; Bs = []; Cs = []; dts = [] + for tensor in tensors: + x, z, B, C, dt = torch.split(tensor, [params.mamba_d_inner // tp_size, + params.mamba_d_inner // tp_size, + (params.mamba2_n_groups // tp_size) * args.mamba_d_state, + (params.mamba2_n_groups // tp_size) * args.mamba_d_state, + params.mamba2_n_heads // tp_size], dim=dim) + xs.append(x); zs.append(z); Bs.append(B); Cs.append(C); dts.append(dt) + + for ii in range(len(Bs)): + Bs[ii] = torch.reshape(Bs[ii], (-1, params.mamba_d_state, Bs[ii].shape[-1])) + Cs[ii] = torch.reshape(Cs[ii], (-1, params.mamba_d_state, Cs[ii].shape[-1])) + B = torch.cat(Bs, dim=dim); C = torch.cat(Cs, dim=dim) + x = torch.cat(xs, dim=dim); z = torch.cat(zs, dim=dim); dt = torch.cat(dts, dim=dim) + + return torch.cat([x, z, B.flatten(0, 1), C.flatten(0, 1), dt], dim=dim) + + elif 'mixer.conv1d' in key and params.mamba_version == 2: + xs = []; Bs = []; Cs = [] + for tensor in tensors: + x, B, C = torch.split(tensor, [params.mamba_d_inner//tp_size, + (params.mamba2_n_groups // tp_size) * params.mamba_d_state, + (params.mamba2_n_groups // tp_size) * params.mamba_d_state], dim=dim) + xs.append(x); Bs.append(B); Cs.append(C) + + for ii in range(len(Bs)): + if 'weight' in key: + Bs[ii] = torch.reshape(Bs[ii], (-1, params.mamba_d_state, Bs[ii].shape[-2], Bs[ii].shape[-1])) + Cs[ii] = torch.reshape(Cs[ii], (-1, params.mamba_d_state, Cs[ii].shape[-2], Cs[ii].shape[-1])) + elif 'bias' in key: + Bs[ii] = torch.reshape(Bs[ii], (-1, params.mamba_d_state)) + Cs[ii] = torch.reshape(Cs[ii], (-1, params.mamba_d_state)) + else: + raise Exception("Unknown key") + B = torch.cat(Bs, dim=dim); C = torch.cat(Cs, dim=dim) + x = torch.cat(xs, dim=dim) + + return torch.cat([x, B.flatten(0, 1), C.flatten(0, 1)], dim=dim) + + else: + return torch.cat(tensors, dim=dim) + + +def split_tensor_for_tp(params, key, dim, tensor): + tp_size = 
params.target_tp_size + tensor_sliced = [] + + if 'mixer.in_proj.weight' in key and params.mamba_version == 1: + x, z = torch.split(tensor, [params.mamba_d_inner, params.mamba_d_inner], dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + z_sliced = torch.chunk(z, tp_size, dim=dim) + for (x, z) in zip(x_sliced, z_sliced): + tensor_sliced.append(torch.cat((x, z), dim=dim)) + + elif 'mixer.in_proj.weight' in key and params.mamba_version == 2: + x, z, B, C, dt = torch.split(tensor, [params.mamba_d_inner, params.mamba_d_inner, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_heads], dim=dim) + B = torch.reshape(B, (-1, params.mamba_d_state, B.shape[-1])) + C = torch.reshape(C, (-1, params.mamba_d_state, C.shape[-1])) + + B_sliced = torch.chunk(B, tp_size, dim=dim) + C_sliced = torch.chunk(C, tp_size, dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + z_sliced = torch.chunk(z, tp_size, dim=dim) + dt_sliced = torch.chunk(dt, tp_size, dim=dim) + + tensor_sliced = [] + for (x, z, B, C, dt) in zip(x_sliced, z_sliced, B_sliced, C_sliced, dt_sliced): + tensor_sliced.append(torch.cat((x, z, B.flatten(0, 1), C.flatten(0, 1), dt), dim=dim)) + + elif 'mixer.conv1d' in key and params.mamba_version == 2: + x, B, C = torch.split(tensor, [params.mamba_d_inner, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_groups * params.mamba_d_state], dim=dim) + if 'weight' in key: + B = torch.reshape(B, (-1, params.mamba_d_state, B.shape[-2], B.shape[-1])) + C = torch.reshape(C, (-1, params.mamba_d_state, C.shape[-2], C.shape[-1])) + elif 'bias' in key: + B = torch.reshape(B, (-1, params.mamba_d_state)) + C = torch.reshape(C, (-1, params.mamba_d_state)) + else: + raise Exception("Unknown key") + + B_sliced = torch.chunk(B, tp_size, dim=dim) + C_sliced = torch.chunk(C, tp_size, dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + + tensor_sliced = [] + for (x, B, C) in zip(x_sliced, B_sliced, C_sliced): + tensor_sliced.append(torch.cat((x, B.flatten(0, 1), C.flatten(0, 1)), dim=dim)) + + else: + tensor_sliced = torch.chunk(tensor, tp_size, dim=dim) + + return tensor_sliced + + +def finalize_checkpoint(sample_model, model, params, verbose=False): + # make sure the rest of the checkpoint is how we want it from the original (i.e., other than the 'model') + reset_iterations = params.reset_iterations + + # checkpoint 'args' + model['args'] = copy.deepcopy(sample_model['args']) + model['args'].tensor_model_parallel_size = params.target_tp_size + model['args'].pipeline_model_parallel_size = params.target_pp_size + if reset_iterations: + model['args'].iteration = 0 + model['args'].consumed_valid_samples = 0 + model['args'].consumed_train_samples = 0 + model['args'].train_iters = 0 + model['args'].train_samples = 0 + + # checkpoint 'checkpoint_version' + model['checkpoint_version'] = copy.deepcopy(sample_model['checkpoint_version']) + + # checkpoint 'iteration' + model['iteration'] = copy.deepcopy(sample_model['iteration']) + if reset_iterations: + model['iteration'] = 0 + + # checkpoint 'optimizer' + # ignore + + # checkpoint 'opt_param_scheduler' + if 'opt_param_scheduler' in sample_model.keys(): + model['opt_param_scheduler'] = copy.deepcopy(sample_model['opt_param_scheduler']) + + # checkpoint 'rng_state' + model['rng_state'] = copy.deepcopy(sample_model['rng_state']) + + # report on argument difference + if verbose: + original_args = sample_model['args'].__dict__ + final_args = model['args'].__dict__ + for key in 
original_args: + if key in final_args: + if final_args[key] != original_args[key]: + print("KEY MISMATCH: {}".format(key)) + print("\toriginal: {}\n\tfinal: {}".format(original_args[key], final_args[key])) + else: + print("KEY MISSING from final: {}, value {}".format(key, original_args[key])) + print("") + for key in final_args: + if key not in original_args: + print("KEY ADDED to final: {}, value {}".format(key, final_args[key])) + + return model + + +def main(args): + print("\n====RUNNING CHECKPOINT CONVERSION====\n") + + args.mamba_d_inner = args.d_model * 2 + args.mamba2_n_heads = args.mamba_d_inner // args.mamba2_head_dim + + # get the latest iteration + tracker_filename = os.path.join(args.load_dir, 'latest_checkpointed_iteration.txt') + with open(tracker_filename, 'r') as f: + metastring = f.read().strip() + try: + iteration = int(metastring) + except ValueError: + raise Exception("") + out_iteration = iteration if not args.reset_iterations else 0 + + # get model directory and model parallel ranks + input_model_dir = os.path.join(args.load_dir, 'iter_{:07d}'.format(iteration)) + input_sub_models = os.listdir(input_model_dir) + # input_sub_models = sorted(input_sub_models, key=lambda x: int(re.search(r'\d+', x).group())) + + # load one of the model parallel ranks to get arguments + sample_model_file = os.path.join(input_model_dir, input_sub_models[0], "model_optim_rng.pt") + sample_model = torch.load(sample_model_file) + print(f"Sample model {sample_model_file} is loaded.\n") + + # input tensor and pipeline parallel size + input_tp_rank = sample_model['args'].tensor_model_parallel_size + input_pp_rank = sample_model['args'].pipeline_model_parallel_size + num_layers_per_pipeline_rank = sample_model['args'].num_layers // input_pp_rank + + # construct full model + full_model = OrderedDict() + for pp in range(input_pp_rank): + print("[INFO] Processing input pipeline rank {}".format(pp)) + tp_models = [] + for tp in range(input_tp_rank): + dir_name = "mp_rank_{:02d}".format(tp) + if input_pp_rank > 1: + dir_name += "_{:03d}".format(pp) + model_file = os.path.join(input_model_dir, dir_name, "model_optim_rng.pt") + + tp_models.append(torch.load(model_file)) + print(f"Model {model_file} is loaded.") + + if input_tp_rank > 1: + combined_tp_model = OrderedDict() + for ii, (key, original_tensor) in enumerate(tp_models[0]['model'].items()): + if "_extra_state" in key: + combined_tp_model[key] = original_tensor + continue + + split_dim = get_split_dim(key) + original_shape = list(original_tensor.shape) + combined_shape = copy.deepcopy(original_shape) + combined_shape[split_dim] *= input_tp_rank + # print("{}, {}, {}".format(ii, key, split_dim)) + + if split_dim != -1: + # slice together model + # print("\tshape mismatch: original {}, combined {}".format(original_shape, combined_shape)) + combined_tensor = combine_tp_tensors(args, key, split_dim, + [tp_models[jj]['model'][key].cpu() for jj in range(input_tp_rank)]) + combined_tp_model[key] = combined_tensor + else: + # copy model + combined_tp_model[key] = original_tensor + else: + combined_tp_model = tp_models[0]['model'] + # print("Combined tp model: {}".format(combined_tp_model.keys())) + + for ii, (key, original_tensor) in enumerate(combined_tp_model.items()): + try: + layer_num = int(re.findall(r'\d+', key)[0]) + new_key = key.replace(str(layer_num), str(layer_num + pp*num_layers_per_pipeline_rank), 1) + except: + new_key = key + full_model[new_key] = original_tensor + # print("Combined model: {}".format(full_model.keys())) + print("\n[INFO] 
Loaded combined model\n") + + # sort by layer + # full_model_sorted = dict(sorted(people.items(), key=lambda item: item[1])) + + # create new split model + pp_offset = 0 + num_layers_per_pipeline_rank = sample_model['args'].num_layers // args.target_pp_size + + for pp in range(args.target_pp_size): + print("[INFO] Processing output pipeline rank {}".format(pp)) + tp_models = [] + for ii in range(args.target_tp_size): + tp_models.append({'model': OrderedDict()}) + + for ii, (key, original_tensor) in enumerate(full_model.items()): + try: + layer_num = int(re.findall(r'\d+', key)[0]) + if layer_num >= num_layers_per_pipeline_rank * (pp+1): + break + new_key = key.replace(str(layer_num), str(layer_num - (pp * num_layers_per_pipeline_rank)), 1) + except: + new_key = key + + if ii < pp_offset: + continue + else: + pp_offset += 1 + + if "_extra_state" in new_key: + # copy + for jj in range(args.target_tp_size): + tp_models[jj]['model'][new_key] = original_tensor + continue + + split_dim = get_split_dim(new_key) + original_shape = list(original_tensor.shape) + v0 = original_shape[split_dim] + split_size = v0 // args.target_tp_size + split_shape = copy.deepcopy(original_shape) + split_shape[split_dim] = split_size + # print("{}, {}, {}".format(ii, new_key, split_dim)) + + if split_dim != -1: + # split model + # print("\tshape mismatch: original {}, combined {}".format(original_shape, split_shape)) + tensor_sliced = split_tensor_for_tp(args, new_key, split_dim, original_tensor) + for jj in range(args.target_tp_size): + tp_models[jj]['model'][new_key] = tensor_sliced[jj] + else: + # copy model + for jj in range(args.target_tp_size): + tp_models[jj]['model'][new_key] = original_tensor + # print(tp_models[0]['model'].keys()) + + for tp in range(args.target_tp_size): + dir_name = "mp_rank_{:02d}".format(tp) + if args.target_pp_size > 1: + dir_name += "_{:03d}".format(pp) + + model = finalize_checkpoint(sample_model, tp_models[tp], args, verbose=False) + + save_dir = os.path.join(args.save_dir, 'iter_{:07d}'.format(out_iteration), dir_name) + os.makedirs(save_dir, exist_ok=True) + model_file = os.path.join(save_dir, "model_optim_rng.pt") + torch.save(model, model_file) + print(f"Model {model_file} is saved.") + + # shutil.copyfile(tracker_filename, os.path.join(args.save_dir, 'latest_checkpointed_iteration.txt')) + tracker_filename = os.path.join(args.save_dir, 'latest_checkpointed_iteration.txt') + with open(tracker_filename, 'w') as f: + f.write(str(out_iteration)) + + +if __name__ == "__main__": + # example run command: + # python hybrid_conversion.py + # --load-dir mamba2-840m-test/checkpoints/ + # --save-dir mamba2-840m-test-conversion/checkpoints/ + # --target-pp-size 1 + # --target-tp-size 1 + + parser = argparse.ArgumentParser() + parser.add_argument('--load-dir', type=str) + parser.add_argument('--save-dir', type=str) + parser.add_argument('--target-tp-size', type=int, default=1) + parser.add_argument('--target-pp-size', type=int, default=1) + parser.add_argument('--reset-iterations', action='store_true') + + parser.add_argument('--d-model', type=int, default=4096) + parser.add_argument('--mamba-version', type=int, default=2) + parser.add_argument('--mamba-d-state', type=int, default=128) + parser.add_argument('--mamba2-n-groups', type=int, default=8) + parser.add_argument('--mamba2-head-dim', type=int, default=64) + + args = parser.parse_args() + + main(args) \ No newline at end of file diff --git a/tools/run_mamba_text_generation_server.py b/tools/run_mamba_text_generation_server.py new file 
mode 100644 index 0000000000..844d018055 --- /dev/null +++ b/tools/run_mamba_text_generation_server.py @@ -0,0 +1,121 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Sample Generate Mamba""" +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir))) +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.core import mpu +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron +from megatron.core.models.mamba.mamba_model import MambaModel +from megatron.core.transformer.spec_utils import import_module +from megatron.training import get_model +from megatron.training.arguments import core_transformer_config_from_args +from megatron.inference.text_generation_server import MegatronServer +from megatron.inference.text_generation import generate_and_post_process +from megatron.inference.text_generation import beam_search_and_post_process + +import torch + +def count_parameters_in_layer(model, layer_name): + num_params = 0 + for name, param in model.named_parameters(): + if layer_name in name: + num_params += param.numel() + print_rank_0(f" - {name}: {param.numel()}") + return num_params + +# Taken from pretrain_mamba.py +def model_provider(pre_process=True, post_process=True) -> MambaModel: + """Builds the model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + + + Returns: + MambaModel: The returned model + """ + args = get_args() + + print_rank_0('building Mamba model ...') + config = core_transformer_config_from_args(get_args()) + + assert args.use_legacy_models == False, "Mamba only supported in Mcore!" 
+ + if args.spec is not None: + mamba_stack_spec = import_module(args.spec) + else: + raise("You must provide a valid Mamba layer spec!") + + model = MambaModel( + config=config, + mamba_stack_spec=mamba_stack_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + hybrid_attention_ratio=args.hybrid_attention_ratio, + hybrid_mlp_ratio=args.hybrid_mlp_ratio, + hybrid_override_pattern=args.hybrid_override_pattern, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type + ) + + for l in range(model.decoder.num_layers_per_pipeline_rank): + layer_params = count_parameters_in_layer(model, f'decoder.layers.{l}.') + print_rank_0(f" == params layer {l}: {layer_params}") + + return model + +def add_text_generate_args(parser): + group = parser.add_argument_group(title='text generation') + group.add_argument("--port", type=int, default=5000, + help='port for text generation server to run on') + return parser + + +if __name__ == "__main__": + initialize_megatron(extra_args_provider=add_text_generate_args, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer', + 'no_load_rng': True, + 'no_load_optim': True}) + + args = get_args() + if args.num_layers_per_virtual_pipeline_stage is not None: + print("Interleaved pipeline schedule is not yet supported for text generation.") + exit() + print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text " + "generation.") + args.exit_on_missing_checkpoint = True + # Set up model and load checkpoint + model = get_model(model_provider, wrap_with_ddp=False) + + if args.load is not None: + _ = load_checkpoint(model, None, None) + + assert len(model) == 1, "Above condition should have caught this" + model = model[0] + if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: + server = MegatronServer(model) + server.run("0.0.0.0",port=args.port) + + while True: + choice = torch.tensor(1, dtype=torch.long, device='cuda') + torch.distributed.broadcast(choice, 0) + if choice.item() == 0: + try: + generate_and_post_process(model) + except ValueError as ve: + pass + elif choice.item() == 1: + try: + beam_search_and_post_process(model) + except ValueError as ve: + pass From e4e5dd6ad3b7d63dc112762bd4e4a755bd1fa534 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sat, 15 Jun 2024 20:12:20 -0700 Subject: [PATCH 91/92] fix: Expose __version__ and other package metadata as package attributes --- megatron/core/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/megatron/core/__init__.py b/megatron/core/__init__.py index b4165eb23d..3ecae0d1b0 100644 --- a/megatron/core/__init__.py +++ b/megatron/core/__init__.py @@ -4,6 +4,19 @@ from megatron.core.distributed import DistributedDataParallel from megatron.core.inference_params import InferenceParams from megatron.core.model_parallel_config import ModelParallelConfig +from megatron.core.package_info import ( + __contact_emails__, + __contact_names__, + __description__, + __download_url__, + __homepage__, + __keywords__, + __license__, + __package_name__, + __repository_url__, + __shortversion__, + __version__, +) from megatron.core.timers import Timers # Alias parallel_state as mpu, its legacy name From 2b45e60ac359213387f863815e8b8a997fe16314 Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Mon, 17 Jun 2024 
10:04:33 -0700 Subject: [PATCH 92/92] Experimental Yi conversion support --- docs/llama_mistral.md | 10 ++++++++-- tools/checkpoint/loader_llama_mistral.py | 7 ++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md index 0e3d4b2fb8..dd96923974 100644 --- a/docs/llama_mistral.md +++ b/docs/llama_mistral.md @@ -1,4 +1,4 @@ -# Llama and Mistral support in Megatron-LM +# Llama, Mistral and other Llama-like model support in Megatron-LM NOTE: Llama-3 and Mistral support in Megatron is currently experimental and we are still evaluting benchmark results to confirm model conversion, training and inference correctness. @@ -386,6 +386,12 @@ If loading for either inference or finetuning, use the following arguments: --attention-softmax-in-fp32 ``` -# Benchmark results +## Benchmark results Mistral-7B support in Megatron is currently experimental and we are still carrying out benchmark evaluations. + +# Other Llama-like model support + +*Note: Experimental* + +Many models such as Yi-34B use the Llama architecture and may be converted from HuggingFace to Megatron using the commands in [Llama3](#llama-3). diff --git a/tools/checkpoint/loader_llama_mistral.py b/tools/checkpoint/loader_llama_mistral.py index 52a8df7925..cba0bd3e1b 100644 --- a/tools/checkpoint/loader_llama_mistral.py +++ b/tools/checkpoint/loader_llama_mistral.py @@ -19,7 +19,7 @@ def add_arguments(parser): # TODO(jbarker): Need assertion to make sure *exactly* one of these is used parser.add_argument('--model-size', type=str, required=True, - choices=['llama2-7B', 'llama2-13B', 'llama2-70B', 'llama2-7Bf', 'llama2-13Bf', 'llama2-70Bf', 'llama3-8B', 'llama3-70B', 'llama3-8Bf', 'llama3-70Bf', 'mistral-7B', 'mistral-7Bf'], + choices=['llama2-7B', 'llama2-13B', 'llama2-70B', 'llama2-7Bf', 'llama2-13Bf', 'llama2-70Bf', 'llama3-8B', 'llama3-70B', 'llama3-8Bf', 'llama3-70Bf', 'mistral-7B', 'mistral-7Bf', 'yi-34B'], help='Model size can be `llama2-7B`, `llama2-13B`, `llama2-70B`, `llama3-8B`, `llama3-70B`, `mistral-7B` (for pretrained models), ' 'and `llama2-7Bf`, `llama2-13Bf`, `llama2-70Bf`, `llama3-8Bf`, `llama3-70bf` and `mistral-7Bf` (for chat-finetuned models).') parser.add_argument('--checkpoint-type', type=str, required=True, @@ -58,6 +58,7 @@ def verify_transformers_version(): "llama3-70Bf": 8, "mistral-7B": 1, "mistral-7Bf": 1, + "yi-34B": 8, } @@ -394,7 +395,7 @@ def load_checkpoint_to_model(args): '''Set model params.''' from pretrain_gpt import model_provider - if "llama" in args.model_size: + if "llama" in args.model_size or "yi" in args.model_size: from transformers import LlamaForCausalLM as ModelForCausalLM elif "mistral" in args.model_size: from transformers import MistralForCausalLM as ModelForCausalLM @@ -465,7 +466,7 @@ def _load_checkpoint(queue, args): margs.tokenizer_model = args.tokenizer_model load_args_from_checkpoint(margs) - if "llama2" in args.model_size: + if "llama2" in args.model_size or "yi" in args.model_size: margs.tokenizer_type = "Llama2Tokenizer" elif "llama3" in args.model_size: margs.tokenizer_type = "Llama3Tokenizer"
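To make the intent of this last patch concrete, here is a condensed, hypothetical sketch of the dispatch it enables for Llama-like checkpoints such as Yi-34B; the helper name `select_model_and_tokenizer` is invented for illustration, the Mistral fallback is an assumption, and the authoritative logic remains in `tools/checkpoint/loader_llama_mistral.py`.

```
def select_model_and_tokenizer(model_size):
    # Yi models reuse the Llama architecture, so they load through the same
    # HuggingFace class as Llama-2/Llama-3 checkpoints.
    if "llama" in model_size or "yi" in model_size:
        from transformers import LlamaForCausalLM as ModelForCausalLM
    elif "mistral" in model_size:
        from transformers import MistralForCausalLM as ModelForCausalLM
    else:
        raise ValueError(f"unsupported --model-size: {model_size}")

    # Yi also follows Llama-2's tokenizer handling on the Megatron side.
    if "llama2" in model_size or "yi" in model_size:
        tokenizer_type = "Llama2Tokenizer"
    elif "llama3" in model_size:
        tokenizer_type = "Llama3Tokenizer"
    else:
        tokenizer_type = "MistralTokenizer"  # illustrative fallback
    return ModelForCausalLM, tokenizer_type
```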