From e83d3eaa968120cd22112215510af0c7fd7ccc90 Mon Sep 17 00:00:00 2001
From: Michal Futrega
Date: Sat, 23 Nov 2024 22:08:00 +0100
Subject: [PATCH] Enable packed dataset for validation; add a2a_experimental
 argument (#11378)

* Enable packed dataset for validation; add a2a_experimental argument

* Apply isort and black reformatting

Signed-off-by: michal2409

---------

Signed-off-by: michal2409
Co-authored-by: michal2409
---
 nemo/collections/llm/gpt/data/fine_tuning.py | 49 ++++++++++++++-----
 .../llm/gpt/data/packed_sequence.py          | 30 +++++++++---
 nemo/collections/llm/peft/lora.py            |  3 ++
 3 files changed, 63 insertions(+), 19 deletions(-)

diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py
index 8fcef72f3bd9..0d866bb600fe 100644
--- a/nemo/collections/llm/gpt/data/fine_tuning.py
+++ b/nemo/collections/llm/gpt/data/fine_tuning.py
@@ -117,17 +117,28 @@ def prepare_data(self) -> None:
         """
         Prepare packed sequence data
         """
-        if self.packed_sequence_size > 0 and not self.train_path_packed.is_file():
+        if self.packed_sequence_size > 0:
             from nemo.collections.llm.gpt.data.packed_sequence import prepare_packed_sequence_data
 
-            prepare_packed_sequence_data(
-                input_path=self.train_path,
-                output_path=self.train_path_packed,
-                packed_sequence_size=self.packed_sequence_size,
-                tokenizer=self.tokenizer,
-                max_seq_length=self.seq_length,
-                seed=self.seed,
-            )
+            if not self.train_path_packed.is_file():
+                prepare_packed_sequence_data(
+                    input_path=self.train_path,
+                    output_path=self.train_path_packed,
+                    packed_sequence_size=self.packed_sequence_size,
+                    tokenizer=self.tokenizer,
+                    max_seq_length=self.seq_length,
+                    seed=self.seed,
+                )
+
+            if not self.validation_path_packed.is_file():
+                prepare_packed_sequence_data(
+                    input_path=self.validation_path,
+                    output_path=self.validation_path_packed,
+                    packed_sequence_size=self.packed_sequence_size,
+                    tokenizer=self.tokenizer,
+                    max_seq_length=self.seq_length,
+                    seed=self.seed,
+                )
 
     def setup(self, stage: str):
         """Called by pytorch lightning in datamodule setup"""
@@ -195,7 +206,7 @@ def val_dataloader(self) -> DataLoader:
         # pylint: disable=C0115,C0116
         return self._create_dataloader(
             self._create_dataset(
-                self.validation_path,
+                self.validation_path if self.packed_sequence_size <= 0 else self.validation_path_packed,
                 is_test=True,
                 **self.dataset_kwargs,
             ),
@@ -249,8 +260,8 @@ def train_path_packed(self) -> Path:
         """Path to training dataset file for packed sequence. The file path contains a reference to the
         tokenizer/model name since packed sequence dataset consists of tokenized indices."""
         if self.packed_sequence_size > 0:
-            if self.packed_sequence_specs.packed_data_path is not None:
-                return self.packed_sequence_specs.packed_data_path
+            if self.packed_sequence_specs.packed_train_data_path is not None:
+                return self.packed_sequence_specs.packed_train_data_path
             tokenizer_model_name = self._extract_tokenizer_model_name()
             folder_name = self.dataset_root / "packed" / tokenizer_model_name
             folder_name.mkdir(parents=True, exist_ok=True)
@@ -258,6 +269,20 @@ def train_path_packed(self) -> Path:
         else:
             raise ValueError("`train_path_packed` invalid since packed sequence size is not specified.")
 
+    @property
+    def validation_path_packed(self) -> Path:
+        """Path to validation dataset file for packed sequence. The file path contains a reference to the
+        tokenizer/model name since packed sequence dataset consists of tokenized indices."""
+        if self.packed_sequence_size > 0:
+            if self.packed_sequence_specs.packed_val_data_path is not None:
+                return self.packed_sequence_specs.packed_val_data_path
+            tokenizer_model_name = self._extract_tokenizer_model_name()
+            folder_name = self.dataset_root / "packed" / tokenizer_model_name
+            folder_name.mkdir(parents=True, exist_ok=True)
+            return folder_name / f"validation_{self.packed_sequence_size}.npy"
+        else:
+            raise ValueError("`validation_path_packed` invalid since packed sequence size is not specified.")
+
     @property
     def validation_path(self) -> Path:
         """Path to validation dataset file"""
diff --git a/nemo/collections/llm/gpt/data/packed_sequence.py b/nemo/collections/llm/gpt/data/packed_sequence.py
index 153e79f94391..345489ea0b63 100644
--- a/nemo/collections/llm/gpt/data/packed_sequence.py
+++ b/nemo/collections/llm/gpt/data/packed_sequence.py
@@ -101,15 +101,31 @@ class PackedSequenceSpecs:
     This field is set by llm.finetune api.
     """
 
-    packed_data_path: str = None
+    packed_train_data_path: str = None
     """
-    If specified, use the packed dataset from this file instead of the default path.
+    If specified, use this file for the packed training dataset instead of the default path.
+    """
+
+    packed_val_data_path: str = None
+    """
+    If specified, use this file for the packed validation dataset instead of the default path.
     """
 
     def __post_init__(self):
-        if self.packed_data_path is not None:
-            self.packed_data_path = Path(self.packed_data_path)
+        if self.packed_train_data_path is not None:
+            self.packed_train_data_path = Path(self.packed_train_data_path)
+            assert (
+                self.packed_train_data_path.suffix == ".npy"
+            ), f"packed training data file must be a .npy file: {self.packed_train_data_path}"
+            assert (
+                self.packed_train_data_path.exists()
+            ), f"packed training data file does not exist: {self.packed_train_data_path}"
+
+        if self.packed_val_data_path is not None:
+            self.packed_val_data_path = Path(self.packed_val_data_path)
+            assert (
+                self.packed_val_data_path.suffix == ".npy"
+            ), f"packed validation data file must be a .npy file: {self.packed_val_data_path}"
             assert (
-                self.packed_data_path.suffix == ".npy"
-            ), f"packed data file must be a .npy file: {self.packed_data_path}"
-            assert self.packed_data_path.exists(), f"packed data file does not exist: {self.packed_data_path}"
+                self.packed_val_data_path.exists()
+            ), f"packed validation data file does not exist: {self.packed_val_data_path}"
diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py
index 57cdda3a2871..205cde071fa7 100644
--- a/nemo/collections/llm/peft/lora.py
+++ b/nemo/collections/llm/peft/lora.py
@@ -124,6 +124,7 @@ class LoRA(PEFT):
         dropout (float): Dropout rate for the low-rank projection. Defaults to 0.0.
         dropout_position (Literal['pre', 'post'], optional): Position for applying dropout. Can be 'pre'
             (before the low-rank projection) or 'post' (after). Defaults to 'post'.
+        a2a_experimental (bool): Enables the experimental All-to-All (A2A) communication strategy. Defaults to False.
 
     Example:
     --------
@@ -151,6 +152,7 @@ class LoRA(PEFT):
     dropout_position: Literal['pre', 'post'] = 'post'
     lora_A_init_method: str = "xavier"
    lora_B_init_method: str = "zero"
+    a2a_experimental: bool = False
 
     def transform(self, m: nn.Module, name=None, prefix=None):
         """
@@ -224,6 +226,7 @@ def wildcard_match(pattern, key):
                 model_parallel_config=getattr(m, "config", None),
                 alpha=self.alpha,
                 is_expert=is_expert_linear(full_name),
+                a2a_experimental=self.a2a_experimental,
             )
             return AdapterParallelAdd(m, adapter)
         return m
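
Usage note (not part of the committed patch): a minimal sketch of how the options introduced by this change might be wired together. The import paths and field names come from the diff above; `packed_sequence_size` is assumed to be an existing field of PackedSequenceSpecs, and the .npy paths are placeholder values.

# Hypothetical usage sketch -- field names taken from the diff above; the .npy
# paths are placeholders and must point at files that already exist, because
# PackedSequenceSpecs.__post_init__ asserts each configured path ends in ".npy"
# and is present on disk.
from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
from nemo.collections.llm.peft.lora import LoRA

# Point training and validation at pre-packed .npy files instead of the
# default "packed/<tokenizer>/{training,validation}_<size>.npy" locations.
packed_specs = PackedSequenceSpecs(
    packed_sequence_size=2048,  # assumed existing field; must be > 0 for packed paths to apply
    packed_train_data_path="/data/packed/training_2048.npy",
    packed_val_data_path="/data/packed/validation_2048.npy",
)

# Opt in to the experimental All-to-All (A2A) communication strategy; defaults to False.
peft = LoRA(a2a_experimental=True)

With `packed_sequence_size > 0`, `FineTuningDataModule.val_dataloader` now reads from `validation_path_packed`, and `prepare_data` generates both the training and validation packed files on first use when no explicit paths are supplied.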