diff --git a/src/modalities/checkpointing/fsdp/fsdp_checkpoint_saving.py b/src/modalities/checkpointing/fsdp/fsdp_checkpoint_saving.py index 2a774934..684847f0 100644 --- a/src/modalities/checkpointing/fsdp/fsdp_checkpoint_saving.py +++ b/src/modalities/checkpointing/fsdp/fsdp_checkpoint_saving.py @@ -1,6 +1,6 @@ from enum import Enum from pathlib import Path -from typing import Callable, List +from typing import List import torch import torch.distributed as dist @@ -41,7 +41,6 @@ def __init__( checkpoint_path: Path, experiment_id: str, global_rank: int, - get_num_tokens_from_num_steps_callable: Callable[[int], int], ): """ Initializes the FSDPCheckpointSaving class. @@ -50,8 +49,6 @@ def __init__( checkpoint_path (Path): folder path to the checkpoint experiment_id (str): ID of the experiment global_rank (int): global rank within the current process group - get_num_tokens_from_num_steps_callable (Callable[[int], int]): callable to get the number - of tokens for a given number of train steps Returns: None @@ -59,7 +56,6 @@ def __init__( self.checkpoint_path = checkpoint_path self.global_rank = global_rank self.experiment_id = experiment_id - self.get_num_tokens_from_num_steps_callable = get_num_tokens_from_num_steps_callable def _get_checkpointing_path( self, diff --git a/src/modalities/config/config.py b/src/modalities/config/config.py index 75e2455f..836ac7c2 100644 --- a/src/modalities/config/config.py +++ b/src/modalities/config/config.py @@ -1,7 +1,7 @@ import os from functools import partial from pathlib import Path -from typing import Annotated, Callable, Dict, List, Literal, Optional, Tuple +from typing import Annotated, Dict, List, Literal, Optional, Tuple import torch from omegaconf import OmegaConf @@ -112,7 +112,6 @@ class FSDPCheckpointSavingConfig(BaseModel): checkpoint_path: Path global_rank: Annotated[int, Field(strict=True, ge=0)] experiment_id: str - get_num_tokens_from_num_steps_callable: Callable[[int], int] class CheckpointSavingConfig(BaseModel): diff --git a/src/modalities/dataloader/create_packed_data.py b/src/modalities/dataloader/create_packed_data.py index 7f6dee98..33775fcd 100644 --- a/src/modalities/dataloader/create_packed_data.py +++ b/src/modalities/dataloader/create_packed_data.py @@ -337,7 +337,7 @@ def __init__(self, data_path: Path): self._data_path = data_path if not self._data_path.is_file(): raise FileNotFoundError( - f"Packed Data was not found at {self._data_path}." + f"Packed Data was not found at {self._data_path.absolute()}." f"Create on in advance by using `modalities data pack_encoded_data`." ) diff --git a/src/modalities/utils/number_conversion.py b/src/modalities/utils/number_conversion.py index a794ab6c..dbbf89e1 100644 --- a/src/modalities/utils/number_conversion.py +++ b/src/modalities/utils/number_conversion.py @@ -302,7 +302,6 @@ def get_num_tokens_from_packed_mem_map_dataset_continuous( Returns: int: Number of tokens that will be effectively used during training. 
""" - dataset = DatasetFactory.get_packed_mem_map_dataset_continuous( raw_data_path=dataset_path, sequence_length=sequence_length, sample_key="text" ) diff --git a/tests/checkpointing/test_fsdp_to_disc_checkpointing.py b/tests/checkpointing/test_fsdp_to_disc_checkpointing.py index daa2e66e..9a56d274 100644 --- a/tests/checkpointing/test_fsdp_to_disc_checkpointing.py +++ b/tests/checkpointing/test_fsdp_to_disc_checkpointing.py @@ -26,7 +26,7 @@ from modalities.registry.registry import Registry from modalities.running_env.cuda_env import CudaEnv from modalities.running_env.env_utils import MixedPrecisionSettings -from modalities.utils.number_conversion import NumberConversion +from modalities.training.training_progress import TrainingProgress # NOTE: We need to run the tests in a torch distributed environment with at least two GPUs. # CUDA_VISIBLE_DEVICES=0,1 torchrun --rdzv-endpoint localhost:29502 --nnodes 1 --nproc_per_node 2 \ @@ -199,16 +199,15 @@ def test_save_checkpoint_after_backward_pass( ): experiment_id = "0" num_train_steps_done = 1 - + num_ranks = 2 + local_micro_batch_size = 4 + gradient_accumulation_steps = 1 sequence_length = gpt2_model_config_dict["model_raw"]["config"]["sequence_length"] - get_num_tokens_from_num_steps_callable = NumberConversion.get_num_tokens_from_num_steps_callable( - num_ranks=2, local_micro_batch_size=4, sequence_length=sequence_length - ) + checkpoint_saving = FSDPCheckpointSaving( checkpoint_path=temporary_checkpoint_folder_path, experiment_id=experiment_id, global_rank=dist.get_rank(), - get_num_tokens_from_num_steps_callable=get_num_tokens_from_num_steps_callable, ) checkpoint_loading = FSDPCheckpointLoading( @@ -234,15 +233,33 @@ def test_save_checkpoint_after_backward_pass( updated_optimizer_state_dict = deepcopy(optimizer.state_dict()) # save model and optimizer before backward pass + training_progress = TrainingProgress( + num_seen_steps_current_run=num_train_steps_done, + num_seen_tokens_current_run=num_train_steps_done + * local_micro_batch_size + * sequence_length + * num_ranks + * gradient_accumulation_steps, + num_target_steps=num_train_steps_done * 2, + num_target_tokens=num_train_steps_done + * local_micro_batch_size + * sequence_length + * num_ranks + * gradient_accumulation_steps + * 2, + ) checkpoint_saving._save_checkpoint( - model=fsdp_wrapped_model, optimizer=optimizer, num_train_steps_done=num_train_steps_done + model=fsdp_wrapped_model, optimizer=optimizer, training_progress=training_progress ) # load the model checkpoint model_checkpointing_path = checkpoint_saving._get_checkpointing_path( experiment_id=experiment_id, - num_seen_steps=num_train_steps_done, entity_type=CheckpointingEntityType.MODEL, + num_seen_steps=training_progress.num_seen_steps_total, + num_seen_tokens=training_progress.num_seen_tokens_total, + num_target_steps=training_progress.num_target_steps, + num_target_tokens=training_progress.num_target_tokens, ) fsdp_wrapped_model_2 = checkpoint_loading.load_model_checkpoint( model=gpt2_model_2, file_path=model_checkpointing_path @@ -252,8 +269,11 @@ def test_save_checkpoint_after_backward_pass( optimizer_checkpointing_path = checkpoint_saving._get_checkpointing_path( experiment_id=experiment_id, - num_seen_steps=num_train_steps_done, entity_type=CheckpointingEntityType.OPTIMIZER, + num_seen_steps=training_progress.num_seen_steps_total, + num_seen_tokens=training_progress.num_seen_tokens_total, + num_target_steps=training_progress.num_target_steps, + num_target_tokens=training_progress.num_target_tokens, ) 
checkpoint_loading.load_optimizer_checkpoint( optimizer=optimizer_2, model=fsdp_wrapped_model_2, file_path=optimizer_checkpointing_path diff --git a/tests/dataloader/yaml_configs/dataloader_with_fixed_num_batches.yaml b/tests/dataloader/yaml_configs/dataloader_with_fixed_num_batches.yaml index beda569f..cb2385f5 100644 --- a/tests/dataloader/yaml_configs/dataloader_with_fixed_num_batches.yaml +++ b/tests/dataloader/yaml_configs/dataloader_with_fixed_num_batches.yaml @@ -1,4 +1,4 @@ -# NOTE, settings is not type checked in the instantiation model, as the settings are not used in the pydantic model. +# NOTE, settings is not type checked in the instantiation model (specified within the test), as the settings are not used in the pydantic model. # Therefore, we can place arbitrary values in the settings field. # Only train_dataloader and fixed_num_batches are type checked in the instantiation model. diff --git a/tests/dataloader/yaml_configs/skipped_dataloader.yaml b/tests/dataloader/yaml_configs/skipped_dataloader.yaml index 1b58c2e9..9dbd91c4 100644 --- a/tests/dataloader/yaml_configs/skipped_dataloader.yaml +++ b/tests/dataloader/yaml_configs/skipped_dataloader.yaml @@ -1,4 +1,4 @@ -# NOTE, settings is not type checked in the instantiation model, as the settings are not used in the pydantic model. +# NOTE, settings is not type checked in the instantiation model (specified within the test), as the settings are not used in the pydantic model. # Therefore, we can place arbitrary values in the settings field. settings: diff --git a/tests/end2end_tests/gpt2_train_num_steps_8.yaml b/tests/end2end_tests/gpt2_train_num_steps_8.yaml index f28790dd..897b6bb3 100644 --- a/tests/end2end_tests/gpt2_train_num_steps_8.yaml +++ b/tests/end2end_tests/gpt2_train_num_steps_8.yaml @@ -1,23 +1,53 @@ settings: experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} referencing_keys: sample_key: input_ids target_key: target_ids - training: - training_log_interval_in_steps: 1 - checkpointing_interval_in_steps: 4 - evaluation_interval_in_steps: 1 - global_num_seen_tokens: 0 - do_apply_activation_checkpointing: false - gradient_acc_steps: 1 - local_train_micro_batch_size: 1 - sequence_length: 256 + prediction_key: logits cuda_env: local_rank: ${cuda_env:LOCAL_RANK} global_rank: ${cuda_env:RANK} world_size: ${cuda_env:WORLD_SIZE} paths: - checkpointing_path: tmp/checkpoints + checkpoint_saving_path: tmp/checkpoints + train_dataset_path: tests/end2end_tests/lorem_ipsum.pbin + intervals: + training_log_interval_in_steps: 1 + checkpointing_interval_in_steps: 4 + evaluation_interval_in_steps: 1 + consistency_enforcement: + enforce_tokens_per_step_conistency: true + enforce_last_step_evaluated: false + enforce_last_step_checkpointed: false + step_profile: + gradient_accumulation_steps: 1 + local_train_micro_batch_size: 1 + sequence_length: 256 + training_target: + num_target_tokens: + component_key: number_conversion + variant_key: num_tokens_from_packed_mem_map_dataset_continuous + config: + dataset_path: ${settings.paths.train_dataset_path} + sequence_length: ${settings.step_profile.sequence_length} + num_ranks: ${settings.cuda_env.world_size} + local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size} + gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps} + num_target_steps: # for the batch progress subscriber + component_key: number_conversion + variant_key: num_steps_from_num_tokens + config: + num_ranks: 
${settings.cuda_env.world_size} + local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size} + global_num_tokens: ${settings.training_target.num_target_tokens} + sequence_length: ${settings.step_profile.sequence_length} + gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps} + training_progress: + global_num_seen_tokens: 0 + num_seen_steps: 0 + local_num_seen_batches: 0 + last_step: -1 collate_fn: component_key: collate_fn @@ -30,8 +60,8 @@ train_dataset: component_key: dataset variant_key: packed_mem_map_dataset_continuous config: - raw_data_path: lorem_ipsum.pbin - sequence_length: ${settings.training.sequence_length} + raw_data_path: ${settings.paths.train_dataset_path} + sequence_length: ${settings.step_profile.sequence_length} sample_key: ${settings.referencing_keys.sample_key} train_dataloader: @@ -40,17 +70,8 @@ train_dataloader: config: num_workers: 2 pin_memory: true - shuffle: false - dataloader_tag: "train" - skip_num_batches: - component_key: number_conversion - variant_key: local_num_batches_from_num_tokens - config: - num_ranks: ${settings.cuda_env.world_size} - global_num_tokens: ${settings.training.global_num_seen_tokens} - sequence_length: ${settings.training.sequence_length} - local_micro_batch_size: ${settings.training.local_train_micro_batch_size} - + dataloader_tag: train + skip_num_batches: ${settings.training_progress.local_num_seen_batches} dataset: instance_key: train_dataset pass_type: BY_REFERENCE @@ -58,7 +79,7 @@ train_dataloader: component_key: batch_sampler variant_key: default config: - batch_size: ${settings.training.local_train_micro_batch_size} + batch_size: ${settings.step_profile.local_train_micro_batch_size} drop_last: true sampler: component_key: sampler @@ -67,6 +88,8 @@ train_dataloader: rank: ${settings.cuda_env.global_rank} num_replicas: ${settings.cuda_env.world_size} shuffle: true + drop_last: true + seed: 42 dataset: instance_key: train_dataset pass_type: BY_REFERENCE @@ -89,25 +112,16 @@ checkpoint_saving: component_key: checkpoint_saving_execution variant_key: fsdp config: - checkpoint_path: ${settings.paths.checkpointing_path} # TODO + checkpoint_path: ${settings.paths.checkpoint_saving_path} global_rank: ${settings.cuda_env.global_rank} - experiment_id: ${settings.experiment_id} - get_num_tokens_from_num_steps_callable: - component_key: number_conversion - variant_key: num_tokens_from_num_steps_callable - config: - num_ranks: ${settings.cuda_env.world_size} - local_micro_batch_size: ${settings.training.local_train_micro_batch_size} - sequence_length: ${settings.training.sequence_length} - + experiment_id: ${settings.experiment_id} -# resolving class types via different enums sucks... 
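# The training_target block above derives num_target_steps from a global token budget via the
# num_steps_from_num_tokens conversion. A minimal sketch of that relationship, assuming the
# conversion is plain integer division over the tokens consumed globally per optimizer step;
# this mirrors the config wiring and is not the NumberConversion implementation itself.
def global_tokens_per_step(
    num_ranks: int, local_micro_batch_size: int, sequence_length: int, gradient_accumulation_steps: int
) -> int:
    # tokens contributed by all ranks for one optimizer step
    return num_ranks * local_micro_batch_size * sequence_length * gradient_accumulation_steps


def num_steps_from_num_tokens(
    global_num_tokens: int,
    num_ranks: int,
    local_micro_batch_size: int,
    sequence_length: int,
    gradient_accumulation_steps: int,
) -> int:
    # partial steps are dropped, in line with the drop_last semantics of the dataloader above
    return global_num_tokens // global_tokens_per_step(
        num_ranks, local_micro_batch_size, sequence_length, gradient_accumulation_steps
    )


# with the step profile of this config (2 ranks, micro batch size 1, sequence length 256,
# no gradient accumulation) a hypothetical budget of 8192 tokens corresponds to 16 optimizer steps
assert num_steps_from_num_tokens(8192, 2, 1, 256, 1) == 16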
loss_fn: component_key: loss variant_key: clm_cross_entropy_loss config: - target_key: target_ids - prediction_key: logits + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} wrapped_model: component_key: model @@ -144,7 +158,7 @@ model_raw: config: sample_key: ${settings.referencing_keys.sample_key} poe_type: NOPE - sequence_length: ${settings.training.sequence_length} + sequence_length: ${settings.step_profile.sequence_length} prediction_key: ${loss_fn.config.prediction_key} vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency n_layer: 2 @@ -187,13 +201,20 @@ model_raw: scheduler: component_key: scheduler - variant_key: dummy_lr + variant_key: onecycle_lr config: optimizer: instance_key: optimizer pass_type: BY_REFERENCE - -optimizer: + max_lr: 6e-4 + div_factor: 10 + final_div_factor: 1 + total_steps: ${settings.training_target.num_target_steps} + pct_start: 0.01 + anneal_strategy: cos + last_epoch: ${settings.training_progress.last_step} + +optimizer: component_key: optimizer variant_key: adam_w config: @@ -201,7 +222,7 @@ optimizer: betas: [0.9, 0.95] eps: 1e-8 weight_decay: 1e-1 - weight_decay_groups_excluded: ["embedding", "layernorm"] + weight_decay_groups_excluded: [embedding, layernorm] wrapped_model: instance_key: wrapped_model pass_type: BY_REFERENCE @@ -221,21 +242,13 @@ batch_progress_subscriber: variant_key: rich config: global_rank: ${settings.cuda_env.global_rank} - global_num_seen_steps: - component_key: number_conversion - variant_key: num_steps_from_num_tokens - config: - num_ranks: ${settings.cuda_env.world_size} - local_micro_batch_size: ${settings.training.local_train_micro_batch_size} - global_num_tokens: ${settings.training.global_num_seen_tokens} - sequence_length: ${settings.training.sequence_length} - gradient_acc_steps: ${settings.training.gradient_acc_steps} - train_dataloader: - instance_key: train_dataloader + num_seen_steps: ${settings.training_progress.num_seen_steps} + num_target_steps: ${settings.training_target.num_target_steps} + train_dataloader_tag: ${train_dataloader.config.dataloader_tag} + eval_dataloaders: + instance_key: eval_dataloaders pass_type: BY_REFERENCE - eval_dataloaders: [] - evaluation_subscriber: component_key: results_subscriber variant_key: save_all diff --git a/tests/end2end_tests/gpt2_warm_start_from_step_4.yaml b/tests/end2end_tests/gpt2_warm_start_from_step_4.yaml index 269eec54..e634abbd 100644 --- a/tests/end2end_tests/gpt2_warm_start_from_step_4.yaml +++ b/tests/end2end_tests/gpt2_warm_start_from_step_4.yaml @@ -1,23 +1,69 @@ settings: experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} referencing_keys: sample_key: input_ids target_key: target_ids - training: - training_log_interval_in_steps: 1 - checkpointing_interval_in_steps: 4 - evaluation_interval_in_steps: 1 - global_num_seen_tokens: 2048 # 4 steps * 256 tokens per step * 2 ranks - do_apply_activation_checkpointing: false - gradient_acc_steps: 1 - local_train_micro_batch_size: 1 - sequence_length: 256 + prediction_key: logits cuda_env: local_rank: ${cuda_env:LOCAL_RANK} global_rank: ${cuda_env:RANK} world_size: ${cuda_env:WORLD_SIZE} paths: - checkpointing_path: tmp/checkpoints + checkpoint_saving_path: tmp/checkpoints + train_dataset_path: tests/end2end_tests/lorem_ipsum.pbin + intervals: + training_log_interval_in_steps: 1 + checkpointing_interval_in_steps: 4 + evaluation_interval_in_steps: 1 + 
consistency_enforcement: + enforce_tokens_per_step_conistency: true + enforce_last_step_evaluated: false + enforce_last_step_checkpointed: false + step_profile: + gradient_accumulation_steps: 1 + local_train_micro_batch_size: 1 + sequence_length: 256 + training_target: + num_target_tokens: + component_key: number_conversion + variant_key: global_num_target_tokens_from_checkpoint_path + config: + checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path} + num_target_steps: # for the batch progress subscriber + component_key: number_conversion + variant_key: num_target_steps_from_checkpoint_path + config: + checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path} + training_progress: + global_num_seen_tokens: # used below + component_key: number_conversion + variant_key: global_num_seen_tokens_from_checkpoint_path + config: + checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path} + num_seen_steps: # for the batch progress subscriber + component_key: number_conversion + variant_key: num_seen_steps_from_checkpoint_path + config: + checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path} + local_num_seen_batches: # for the dataloader + component_key: number_conversion + variant_key: local_num_batches_from_num_tokens + config: + num_ranks: ${settings.cuda_env.world_size} + global_num_tokens: ${settings.training_progress.global_num_seen_tokens} + sequence_length: ${settings.step_profile.sequence_length} + local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size} + last_step: # for the scheduler + component_key: number_conversion + variant_key: last_step_from_checkpoint_path + config: + checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path} + warmstart_checkpoint_paths: + # we pass in the checkpoint paths as filenames such that the num_target_tokens and num_target_steps can be calculated and correctly passed to the training loop + # Within the test is replaced with the actual path to the checkpoint. 
+ model_checkpoint_path: eid_0-model-seen_steps_4-seen_tokens_2048-target_steps_15-target_tokens_7680.bin + optimizer_checkpoint_path: eid_0-optimizer-seen_steps_4-seen_tokens_2048-target_steps_15-target_tokens_7680.bin collate_fn: component_key: collate_fn @@ -30,8 +76,8 @@ train_dataset: component_key: dataset variant_key: packed_mem_map_dataset_continuous config: - raw_data_path: lorem_ipsum.pbin - sequence_length: ${settings.training.sequence_length} + raw_data_path: ${settings.paths.train_dataset_path} + sequence_length: ${settings.step_profile.sequence_length} sample_key: ${settings.referencing_keys.sample_key} train_dataloader: @@ -40,16 +86,8 @@ train_dataloader: config: num_workers: 2 pin_memory: true - shuffle: false - dataloader_tag: "train" - skip_num_batches: - component_key: number_conversion - variant_key: local_num_batches_from_num_tokens - config: - num_ranks: ${settings.cuda_env.world_size} - global_num_tokens: ${settings.training.global_num_seen_tokens} - sequence_length: ${settings.training.sequence_length} - local_micro_batch_size: ${settings.training.local_train_micro_batch_size} + dataloader_tag: train + skip_num_batches: ${settings.training_progress.local_num_seen_batches} dataset: instance_key: train_dataset pass_type: BY_REFERENCE @@ -57,7 +95,7 @@ train_dataloader: component_key: batch_sampler variant_key: default config: - batch_size: ${settings.training.local_train_micro_batch_size} + batch_size: ${settings.step_profile.local_train_micro_batch_size} drop_last: true sampler: component_key: sampler @@ -66,6 +104,8 @@ train_dataloader: rank: ${settings.cuda_env.global_rank} num_replicas: ${settings.cuda_env.world_size} shuffle: true + drop_last: true + seed: 42 dataset: instance_key: train_dataset pass_type: BY_REFERENCE @@ -97,24 +137,16 @@ checkpoint_saving: component_key: checkpoint_saving_execution variant_key: fsdp config: - checkpoint_path: ${settings.paths.checkpointing_path} # TODO + checkpoint_path: ${settings.paths.checkpoint_saving_path} global_rank: ${settings.cuda_env.global_rank} - experiment_id: ${settings.experiment_id} - get_num_tokens_from_num_steps_callable: - component_key: number_conversion - variant_key: num_tokens_from_num_steps_callable - config: - num_ranks: ${settings.cuda_env.world_size} - local_micro_batch_size: ${settings.training.local_train_micro_batch_size} - sequence_length: ${settings.training.sequence_length} + experiment_id: ${settings.experiment_id} -# resolving class types via different enums sucks... 
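# The warm-start settings above recover the training progress directly from the checkpoint file
# name via the *_from_checkpoint_path number_conversion variants. A minimal sketch of that idea,
# assuming the file name layout shown above; the real parsing is done by modalities'
# NumberConversion, not by this helper.
import re
from pathlib import Path

_PROGRESS_PATTERN = re.compile(
    r"seen_steps_(?P<seen_steps>\d+)-seen_tokens_(?P<seen_tokens>\d+)"
    r"-target_steps_(?P<target_steps>\d+)-target_tokens_(?P<target_tokens>\d+)"
)


def parse_progress_from_checkpoint_path(checkpoint_path: Path) -> dict[str, int]:
    # extract the four progress counters that the checkpoint saving routine baked into the name
    match = _PROGRESS_PATTERN.search(checkpoint_path.name)
    if match is None:
        raise ValueError(f"Cannot parse training progress from {checkpoint_path.name}")
    return {key: int(value) for key, value in match.groupdict().items()}


# example with the model checkpoint referenced above
progress = parse_progress_from_checkpoint_path(
    Path("eid_0-model-seen_steps_4-seen_tokens_2048-target_steps_15-target_tokens_7680.bin")
)
print(progress)  # {'seen_steps': 4, 'seen_tokens': 2048, 'target_steps': 15, 'target_tokens': 7680}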
loss_fn: component_key: loss variant_key: clm_cross_entropy_loss config: - target_key: target_ids - prediction_key: logits + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} wrapped_model: component_key: model @@ -126,7 +158,7 @@ wrapped_model: checkpoint_loading: instance_key: checkpoint_loading pass_type: BY_REFERENCE - checkpoint_path: must_be_set_in_test_impl + checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path} model: component_key: model @@ -151,7 +183,7 @@ model_raw: config: sample_key: ${settings.referencing_keys.sample_key} poe_type: NOPE - sequence_length: ${settings.training.sequence_length} + sequence_length: ${settings.step_profile.sequence_length} prediction_key: ${loss_fn.config.prediction_key} vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency n_layer: 2 @@ -194,11 +226,18 @@ scheduler: component_key: scheduler - variant_key: dummy_lr + variant_key: onecycle_lr config: optimizer: instance_key: optimizer pass_type: BY_REFERENCE + max_lr: 6e-4 + div_factor: 10 + final_div_factor: 1 + total_steps: ${settings.training_target.num_target_steps} + pct_start: 0.01 + anneal_strategy: cos + last_epoch: ${settings.training_progress.last_step} optimizer: component_key: optimizer @@ -213,7 +252,7 @@ optimizer: checkpoint_loading: instance_key: checkpoint_loading pass_type: BY_REFERENCE - checkpoint_path: must_be_set_in_test_impl + checkpoint_path: ${settings.warmstart_checkpoint_paths.optimizer_checkpoint_path} optimizer_original: component_key: optimizer @@ -223,7 +262,7 @@ betas: [0.9, 0.95] eps: 1e-8 weight_decay: 1e-1 - weight_decay_groups_excluded: ["embedding", "layernorm"] + weight_decay_groups_excluded: [embedding, layernorm] wrapped_model: instance_key: wrapped_model pass_type: BY_REFERENCE @@ -243,21 +282,13 @@ batch_progress_subscriber: variant_key: rich config: global_rank: ${settings.cuda_env.global_rank} - global_num_seen_steps: - component_key: number_conversion - variant_key: num_steps_from_num_tokens - config: - num_ranks: ${settings.cuda_env.world_size} - local_micro_batch_size: ${settings.training.local_train_micro_batch_size} - global_num_tokens: ${settings.training.global_num_seen_tokens} - sequence_length: ${settings.training.sequence_length} - gradient_acc_steps: ${settings.training.gradient_acc_steps} - train_dataloader: - instance_key: train_dataloader + num_seen_steps: ${settings.training_progress.num_seen_steps} + num_target_steps: ${settings.training_target.num_target_steps} + train_dataloader_tag: ${train_dataloader.config.dataloader_tag} + eval_dataloaders: + instance_key: eval_dataloaders pass_type: BY_REFERENCE - eval_dataloaders: [] - evaluation_subscriber: component_key: results_subscriber variant_key: save_all diff --git a/tests/end2end_tests/test_fsdp_warmstart.py b/tests/end2end_tests/test_fsdp_warmstart.py index 4085d827..b2bc0cce 100644 --- a/tests/end2end_tests/test_fsdp_warmstart.py +++ b/tests/end2end_tests/test_fsdp_warmstart.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Any, Dict, List import pytest import torch import torch.distributed as dist @@ -18,6 +19,16 @@ from modalities.logging_broker.subscriber import MessageSubscriberIF from modalities.running_env.cuda_env import CudaEnv
# NOTE: We need to run the tests in a torch distributed environment with at least two GPUs. # CUDA_VISIBLE_DEVICES=0,1 torchrun --rdzv-endpoint localhost:29502 --nnodes 1 --nproc_per_node 2 \ # $(which pytest) path/to/test_fsdp_to_disc_checkpointing.py @@ -84,9 +95,6 @@ def test_warm_start(self): gpt2_8_steps_config_dict["settings"]["experiment_id"] = experiment_id_0 loss_values_experiment_0_path = checkpoint_path + "/experiment_0_loss_scores.txt" - # adopt dataset path - gpt2_8_steps_config_dict["train_dataset"]["config"]["raw_data_path"] = working_dir / "lorem_ipsum.pbin" - # config for one step model gpt2_warm_start_after_4_steps_config_file_path = working_dir / "gpt2_warm_start_from_step_4.yaml" gpt2_warm_start_after_4_steps_dict = load_app_config_dict(gpt2_warm_start_after_4_steps_config_file_path) @@ -94,10 +102,11 @@ # adopt the checkpoint path experiment_id_1 = "1" gpt2_warm_start_after_4_steps_dict["wrapped_model"]["config"]["checkpoint_path"] = ( - checkpoint_path + "/0/eid_0-model-num_steps_4-num_tokens_2048.bin" + checkpoint_path + "/0/eid_0-model-seen_steps_4-seen_tokens_2048-target_steps_15-target_tokens_7680.bin" ) gpt2_warm_start_after_4_steps_dict["optimizer"]["config"]["checkpoint_path"] = ( - checkpoint_path + "/0/eid_0-optimizer-num_steps_4-num_tokens_2048.bin" + checkpoint_path + + "/0/eid_0-optimizer-seen_steps_4-seen_tokens_2048-target_steps_15-target_tokens_7680.bin" ) gpt2_warm_start_after_4_steps_dict["checkpoint_saving"]["config"]["checkpoint_saving_execution"]["config"][ "checkpoint_path" ] = checkpoint_path @@ -109,13 +118,14 @@ gpt2_warm_start_after_4_steps_dict["settings"]["experiment_id"] = experiment_id_1 loss_values_experiment_1_path = checkpoint_path + "/experiment_1_loss_scores.txt" - # adopt dataset path - gpt2_warm_start_after_4_steps_dict["train_dataset"]["config"]["raw_data_path"] = ( - working_dir / "lorem_ipsum.pbin" - ) main_obj_0 = Main(gpt2_8_steps_config_file_path) main_obj_0.config_dict = gpt2_8_steps_config_dict + with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl): main_obj_0.add_custom_component( component_key="results_subscriber", variant_key="save_all", custom_component=SaveAllResultSubscriber, custom_config=SaveAllResultSubscriberConfig, ) components_0 = main_obj_0.build_components(components_model_type=TrainingComponentsInstantiationModel) main_obj_0.run(components_0) @@ -143,6 +156,12 @@ custom_config=SaveAllResultSubscriberConfig, ) components_1 = main_obj_1.build_components(components_model_type=TrainingComponentsInstantiationModel) + + assert ( + components_0.scheduler.base_lrs == components_1.scheduler.base_lrs + ) # make sure that the initial learning rates are the same + assert components_1.scheduler.last_epoch == 4 # we start from step 4 + main_obj_1.run(components_1) # we collect the loss values from rank 0 for the warmstart model @@ -166,18 +185,18 @@ # and the warm start model have the same loss values
assert loaded_loss_values_0[4:] == pytest.approx(loaded_loss_values_1, abs=1e-16) + # assert that the scheduler state is the same for both models + assert components_1.scheduler.last_epoch == components_0.scheduler.last_epoch + assert components_0.scheduler.get_last_lr() == components_1.scheduler.get_last_lr() + def test_warmstart_dataloader(self): # non-skipped config gpt2_two_steps_config_file_path = working_dir / "gpt2_train_num_steps_8.yaml" gpt2_two_steps_config_dict = load_app_config_dict(gpt2_two_steps_config_file_path) - # adopt dataset path - gpt2_two_steps_config_dict["train_dataset"]["config"]["raw_data_path"] = working_dir / "lorem_ipsum.pbin" # skipped config gpt2_warm_start_from_step_1_config_file_path = working_dir / "gpt2_warm_start_from_step_4.yaml" gpt2_warm_start_from_step_1_dict = load_app_config_dict(gpt2_warm_start_from_step_1_config_file_path) - # adopt dataset path - gpt2_warm_start_from_step_1_dict["train_dataset"]["config"]["raw_data_path"] = working_dir / "lorem_ipsum.pbin" main_obj_1 = Main(gpt2_two_steps_config_file_path) main_obj_1.config_dict = gpt2_two_steps_config_dict
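# The warm-start dataloader test above relies on skip_num_batches, which the config derives from
# the number of globally seen tokens via local_num_batches_from_num_tokens. A hedged sketch of
# that arithmetic, assuming plain integer division; the authoritative conversion lives in
# modalities.utils.number_conversion.
def local_num_batches_from_num_tokens(
    global_num_tokens: int, num_ranks: int, sequence_length: int, local_micro_batch_size: int
) -> int:
    # number of local (per-rank) batches that were already consumed to produce global_num_tokens
    return global_num_tokens // (num_ranks * sequence_length * local_micro_batch_size)


# For the warm start from step 4 in this diff: 2048 seen tokens, 2 ranks, sequence length 256,
# local micro batch size 1, so each rank skips the first 4 batches of the training dataloader.
assert local_num_batches_from_num_tokens(2048, num_ranks=2, sequence_length=256, local_micro_batch_size=1) == 4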