From e83d3eaa968120cd22112215510af0c7fd7ccc90 Mon Sep 17 00:00:00 2001
From: Michal Futrega
Date: Sat, 23 Nov 2024 22:08:00 +0100
Subject: [PATCH] Enable packed dataset for validation; add a2a_experimental
 argument (#11378)

* Enable packed dataset for validation; add a2a_experimental argument

* Apply isort and black reformatting

Signed-off-by: michal2409

---------

Signed-off-by: michal2409
Co-authored-by: michal2409
---
 nemo/collections/llm/gpt/data/fine_tuning.py | 49 ++++++++++++++-----
 .../llm/gpt/data/packed_sequence.py          | 30 +++++++++---
 nemo/collections/llm/peft/lora.py            |  3 ++
 3 files changed, 63 insertions(+), 19 deletions(-)

diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py
index 8fcef72f3bd9..0d866bb600fe 100644
--- a/nemo/collections/llm/gpt/data/fine_tuning.py
+++ b/nemo/collections/llm/gpt/data/fine_tuning.py
@@ -117,17 +117,28 @@ def prepare_data(self) -> None:
         """
         Prepare packed sequence data
         """
-        if self.packed_sequence_size > 0 and not self.train_path_packed.is_file():
+        if self.packed_sequence_size > 0:
             from nemo.collections.llm.gpt.data.packed_sequence import prepare_packed_sequence_data
 
-            prepare_packed_sequence_data(
-                input_path=self.train_path,
-                output_path=self.train_path_packed,
-                packed_sequence_size=self.packed_sequence_size,
-                tokenizer=self.tokenizer,
-                max_seq_length=self.seq_length,
-                seed=self.seed,
-            )
+            if not self.train_path_packed.is_file():
+                prepare_packed_sequence_data(
+                    input_path=self.train_path,
+                    output_path=self.train_path_packed,
+                    packed_sequence_size=self.packed_sequence_size,
+                    tokenizer=self.tokenizer,
+                    max_seq_length=self.seq_length,
+                    seed=self.seed,
+                )
+
+            if not self.validation_path_packed.is_file():
+                prepare_packed_sequence_data(
+                    input_path=self.validation_path,
+                    output_path=self.validation_path_packed,
+                    packed_sequence_size=self.packed_sequence_size,
+                    tokenizer=self.tokenizer,
+                    max_seq_length=self.seq_length,
+                    seed=self.seed,
+                )
 
     def setup(self, stage: str):
         """Called by pytorch lightning in datamodule setup"""
@@ -195,7 +206,7 @@ def val_dataloader(self) -> DataLoader:
         # pylint: disable=C0115,C0116
         return self._create_dataloader(
             self._create_dataset(
-                self.validation_path,
+                self.validation_path if self.packed_sequence_size <= 0 else self.validation_path_packed,
                 is_test=True,
                 **self.dataset_kwargs,
             ),
@@ -249,8 +260,8 @@ def train_path_packed(self) -> Path:
         """Path to training dataset file for packed sequence. The file path contains a reference to the
         tokenizer/model name since packed sequence dataset consists of tokenized indices."""
         if self.packed_sequence_size > 0:
-            if self.packed_sequence_specs.packed_data_path is not None:
-                return self.packed_sequence_specs.packed_data_path
+            if self.packed_sequence_specs.packed_train_data_path is not None:
+                return self.packed_sequence_specs.packed_train_data_path
             tokenizer_model_name = self._extract_tokenizer_model_name()
             folder_name = self.dataset_root / "packed" / tokenizer_model_name
             folder_name.mkdir(parents=True, exist_ok=True)
@@ -258,6 +269,20 @@ def train_path_packed(self) -> Path:
         else:
             raise ValueError("`train_path_packed` invalid since packed sequence size is not specified.")
 
+    @property
+    def validation_path_packed(self) -> Path:
+        """Path to validation dataset file for packed sequence. The file path contains a reference to the
+        tokenizer/model name since packed sequence dataset consists of tokenized indices."""
+        if self.packed_sequence_size > 0:
+            if self.packed_sequence_specs.packed_val_data_path is not None:
+                return self.packed_sequence_specs.packed_val_data_path
+            tokenizer_model_name = self._extract_tokenizer_model_name()
+            folder_name = self.dataset_root / "packed" / tokenizer_model_name
+            folder_name.mkdir(parents=True, exist_ok=True)
+            return folder_name / f"validation_{self.packed_sequence_size}.npy"
+        else:
+            raise ValueError("`validation_path_packed` invalid since packed sequence size is not specified.")
+
     @property
     def validation_path(self) -> Path:
         """Path to validation dataset file"""
diff --git a/nemo/collections/llm/gpt/data/packed_sequence.py b/nemo/collections/llm/gpt/data/packed_sequence.py
index 153e79f94391..345489ea0b63 100644
--- a/nemo/collections/llm/gpt/data/packed_sequence.py
+++ b/nemo/collections/llm/gpt/data/packed_sequence.py
@@ -101,15 +101,31 @@ class PackedSequenceSpecs:
     This field is set by llm.finetune api.
     """
 
-    packed_data_path: str = None
+    packed_train_data_path: str = None
     """
-    If specified, use the packed dataset from this file instead of the default path.
+    If specified, use this file for the packed training dataset instead of the default path.
+    """
+
+    packed_val_data_path: str = None
+    """
+    If specified, use this file for the packed validation dataset instead of the default path.
     """
 
     def __post_init__(self):
-        if self.packed_data_path is not None:
-            self.packed_data_path = Path(self.packed_data_path)
+        if self.packed_train_data_path is not None:
+            self.packed_train_data_path = Path(self.packed_train_data_path)
+            assert (
+                self.packed_train_data_path.suffix == ".npy"
+            ), f"packed training data file must be a .npy file: {self.packed_train_data_path}"
+            assert (
+                self.packed_train_data_path.exists()
+            ), f"packed training data file does not exist: {self.packed_train_data_path}"
+
+        if self.packed_val_data_path is not None:
+            self.packed_val_data_path = Path(self.packed_val_data_path)
+            assert (
+                self.packed_val_data_path.suffix == ".npy"
+            ), f"packed validation data file must be a .npy file: {self.packed_val_data_path}"
             assert (
-                self.packed_data_path.suffix == ".npy"
-            ), f"packed data file must be a .npy file: {self.packed_data_path}"
-            assert self.packed_data_path.exists(), f"packed data file does not exist: {self.packed_data_path}"
+                self.packed_val_data_path.exists()
+            ), f"packed validation data file does not exist: {self.packed_val_data_path}"
diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py
index 57cdda3a2871..205cde071fa7 100644
--- a/nemo/collections/llm/peft/lora.py
+++ b/nemo/collections/llm/peft/lora.py
@@ -124,6 +124,7 @@ class LoRA(PEFT):
         dropout (float): Dropout rate for the low-rank projection. Defaults to 0.0.
         dropout_position (Literal['pre', 'post'], optional): Position for applying dropout. Can be 'pre'
             (before the low-rank projection) or 'post' (after). Defaults to 'post'.
+        a2a_experimental (bool): Enables the experimental All-to-All (A2A) communication strategy. Defaults to False.
 
     Example:
     --------
@@ -151,6 +152,7 @@ class LoRA(PEFT):
     dropout_position: Literal['pre', 'post'] = 'post'
     lora_A_init_method: str = "xavier"
    lora_B_init_method: str = "zero"
+    a2a_experimental: bool = False
 
     def transform(self, m: nn.Module, name=None, prefix=None):
         """
@@ -224,6 +226,7 @@ def wildcard_match(pattern, key):
                 model_parallel_config=getattr(m, "config", None),
                 alpha=self.alpha,
                 is_expert=is_expert_linear(full_name),
+                a2a_experimental=self.a2a_experimental,
             )
             return AdapterParallelAdd(m, adapter)
         return m
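
Usage note (not part of the committed patch): a minimal sketch of how the options introduced by this change might be wired together. The import paths and field names come from the diff above; `packed_sequence_size` is assumed to be an existing field of PackedSequenceSpecs, and the .npy paths are placeholder values.

# Hypothetical usage sketch -- field names taken from the diff above; the .npy
# paths are placeholders and must point at files that already exist, because
# PackedSequenceSpecs.__post_init__ asserts each configured path ends in ".npy"
# and is present on disk.
from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
from nemo.collections.llm.peft.lora import LoRA

# Point training and validation at pre-packed .npy files instead of the
# default "packed/<tokenizer>/{training,validation}_<size>.npy" locations.
packed_specs = PackedSequenceSpecs(
    packed_sequence_size=2048,  # assumed existing field; must be > 0 for packed paths to apply
    packed_train_data_path="/data/packed/training_2048.npy",
    packed_val_data_path="/data/packed/validation_2048.npy",
)

# Opt in to the experimental All-to-All (A2A) communication strategy; defaults to False.
peft = LoRA(a2a_experimental=True)

With `packed_sequence_size > 0`, `FineTuningDataModule.val_dataloader` now reads from `validation_path_packed`, and `prepare_data` generates both the training and validation packed files on first use when no explicit paths are supplied.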