fix: fixed two failing multi-gpu tests
le1nux committed Sep 12, 2024
1 parent e8e5d76 commit 3d9c0a1
Showing 10 changed files with 218 additions and 141 deletions.
6 changes: 1 addition & 5 deletions src/modalities/checkpointing/fsdp/fsdp_checkpoint_saving.py
@@ -1,6 +1,6 @@
from enum import Enum
from pathlib import Path
from typing import Callable, List
from typing import List

import torch
import torch.distributed as dist
@@ -41,7 +41,6 @@ def __init__(
checkpoint_path: Path,
experiment_id: str,
global_rank: int,
get_num_tokens_from_num_steps_callable: Callable[[int], int],
):
"""
Initializes the FSDPCheckpointSaving class.
@@ -50,16 +49,13 @@ def __init__(
checkpoint_path (Path): folder path to the checkpoint
experiment_id (str): ID of the experiment
global_rank (int): global rank within the current process group
get_num_tokens_from_num_steps_callable (Callable[[int], int]): callable to get the number
of tokens for a given number of train steps
Returns:
None
"""
self.checkpoint_path = checkpoint_path
self.global_rank = global_rank
self.experiment_id = experiment_id
self.get_num_tokens_from_num_steps_callable = get_num_tokens_from_num_steps_callable

def _get_checkpointing_path(
self,
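For context, the argument removed here was a steps-to-tokens conversion; the checkpointing path now takes its step and token counts from a TrainingProgress object instead, as the test change further down shows. Below is a minimal sketch of what such a callable computes, assuming tokens scale linearly with ranks, local micro-batch size, sequence length, and gradient-accumulation steps; the factory name and signature are illustrative, not the library's API.

from typing import Callable

def make_num_tokens_from_num_steps(
    num_ranks: int,
    local_micro_batch_size: int,
    sequence_length: int,
    gradient_accumulation_steps: int = 1,
) -> Callable[[int], int]:
    # Each optimizer step consumes one micro-batch per rank (times gradient accumulation),
    # and every sample contributes sequence_length tokens.
    def num_tokens_from_num_steps(num_steps: int) -> int:
        return (
            num_steps
            * num_ranks
            * local_micro_batch_size
            * sequence_length
            * gradient_accumulation_steps
        )

    return num_tokens_from_num_steps

# Usage with the values from the test below (sequence length assumed to be 256):
num_tokens_fn = make_num_tokens_from_num_steps(num_ranks=2, local_micro_batch_size=4, sequence_length=256)
print(num_tokens_fn(8))  # 2 * 4 * 256 * 8 = 16384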
3 changes: 1 addition & 2 deletions src/modalities/config/config.py
@@ -1,7 +1,7 @@
import os
from functools import partial
from pathlib import Path
from typing import Annotated, Callable, Dict, List, Literal, Optional, Tuple
from typing import Annotated, Dict, List, Literal, Optional, Tuple

import torch
from omegaconf import OmegaConf
@@ -112,7 +112,6 @@ class FSDPCheckpointSavingConfig(BaseModel):
checkpoint_path: Path
global_rank: Annotated[int, Field(strict=True, ge=0)]
experiment_id: str
get_num_tokens_from_num_steps_callable: Callable[[int], int]


class CheckpointSavingConfig(BaseModel):
2 changes: 1 addition & 1 deletion src/modalities/dataloader/create_packed_data.py
@@ -337,7 +337,7 @@ def __init__(self, data_path: Path):
self._data_path = data_path
if not self._data_path.is_file():
raise FileNotFoundError(
f"Packed Data was not found at {self._data_path}."
f"Packed Data was not found at {self._data_path.absolute()}."
f"Create on in advance by using `modalities data pack_encoded_data`."
)

1 change: 0 additions & 1 deletion src/modalities/utils/number_conversion.py
@@ -302,7 +302,6 @@ def get_num_tokens_from_packed_mem_map_dataset_continuous(
Returns:
int: Number of tokens that will be effectively used during training.
"""

dataset = DatasetFactory.get_packed_mem_map_dataset_continuous(
raw_data_path=dataset_path, sequence_length=sequence_length, sample_key="text"
)
38 changes: 29 additions & 9 deletions tests/checkpointing/test_fsdp_to_disc_checkpointing.py
@@ -26,7 +26,7 @@
from modalities.registry.registry import Registry
from modalities.running_env.cuda_env import CudaEnv
from modalities.running_env.env_utils import MixedPrecisionSettings
from modalities.utils.number_conversion import NumberConversion
from modalities.training.training_progress import TrainingProgress

# NOTE: We need to run the tests in a torch distributed environment with at least two GPUs.
# CUDA_VISIBLE_DEVICES=0,1 torchrun --rdzv-endpoint localhost:29502 --nnodes 1 --nproc_per_node 2 \
@@ -199,16 +199,15 @@ def test_save_checkpoint_after_backward_pass(
):
experiment_id = "0"
num_train_steps_done = 1

num_ranks = 2
local_micro_batch_size = 4
gradient_accumulation_steps = 1
sequence_length = gpt2_model_config_dict["model_raw"]["config"]["sequence_length"]
get_num_tokens_from_num_steps_callable = NumberConversion.get_num_tokens_from_num_steps_callable(
num_ranks=2, local_micro_batch_size=4, sequence_length=sequence_length
)

checkpoint_saving = FSDPCheckpointSaving(
checkpoint_path=temporary_checkpoint_folder_path,
experiment_id=experiment_id,
global_rank=dist.get_rank(),
get_num_tokens_from_num_steps_callable=get_num_tokens_from_num_steps_callable,
)

checkpoint_loading = FSDPCheckpointLoading(
@@ -234,15 +233,33 @@
updated_optimizer_state_dict = deepcopy(optimizer.state_dict())

# save model and optimizer before backward pass
training_progress = TrainingProgress(
num_seen_steps_current_run=num_train_steps_done,
num_seen_tokens_current_run=num_train_steps_done
* local_micro_batch_size
* sequence_length
* num_ranks
* gradient_accumulation_steps,
num_target_steps=num_train_steps_done * 2,
num_target_tokens=num_train_steps_done
* local_micro_batch_size
* sequence_length
* num_ranks
* gradient_accumulation_steps
* 2,
)
checkpoint_saving._save_checkpoint(
model=fsdp_wrapped_model, optimizer=optimizer, num_train_steps_done=num_train_steps_done
model=fsdp_wrapped_model, optimizer=optimizer, training_progress=training_progress
)

# load the model checkpoint
model_checkpointing_path = checkpoint_saving._get_checkpointing_path(
experiment_id=experiment_id,
num_seen_steps=num_train_steps_done,
entity_type=CheckpointingEntityType.MODEL,
num_seen_steps=training_progress.num_seen_steps_total,
num_seen_tokens=training_progress.num_seen_tokens_total,
num_target_steps=training_progress.num_target_steps,
num_target_tokens=training_progress.num_target_tokens,
)
fsdp_wrapped_model_2 = checkpoint_loading.load_model_checkpoint(
model=gpt2_model_2, file_path=model_checkpointing_path
@@ -252,8 +269,11 @@

optimizer_checkpointing_path = checkpoint_saving._get_checkpointing_path(
experiment_id=experiment_id,
num_seen_steps=num_train_steps_done,
entity_type=CheckpointingEntityType.OPTIMIZER,
num_seen_steps=training_progress.num_seen_steps_total,
num_seen_tokens=training_progress.num_seen_tokens_total,
num_target_steps=training_progress.num_target_steps,
num_target_tokens=training_progress.num_target_tokens,
)
checkpoint_loading.load_optimizer_checkpoint(
optimizer=optimizer_2, model=fsdp_wrapped_model_2, file_path=optimizer_checkpointing_path
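To make the token bookkeeping above concrete: with the test's two ranks, a local micro-batch size of 4, one gradient-accumulation step, and one completed training step, and assuming a sequence length of 256 (illustrative; the test reads the actual value from the GPT-2 model config), the TrainingProgress fields work out as follows.

# Worked example of the arithmetic fed into TrainingProgress in the test above.
num_train_steps_done = 1
num_ranks = 2
local_micro_batch_size = 4
gradient_accumulation_steps = 1
sequence_length = 256  # assumed for illustration

tokens_per_step = num_ranks * local_micro_batch_size * sequence_length * gradient_accumulation_steps  # 2048
num_seen_tokens_current_run = num_train_steps_done * tokens_per_step  # 2048
num_target_steps = num_train_steps_done * 2  # 2
num_target_tokens = num_seen_tokens_current_run * 2  # 4096, consistent with num_target_steps
print(num_seen_tokens_current_run, num_target_steps, num_target_tokens)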
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# NOTE, settings is not type checked in the instantiation model, as the settings are not used in the pydantic model.
# NOTE, settings is not type checked in the instantiation model (specified within the test), as the settings are not used in the pydantic model.
# Therefore, we can place arbitrary values in the settings field.
# Only train_dataloader and fixed_num_batches are type checked in the instantiation model.

2 changes: 1 addition & 1 deletion tests/dataloader/yaml_configs/skipped_dataloader.yaml
@@ -1,4 +1,4 @@
# NOTE, settings is not type checked in the instantiation model, as the settings are not used in the pydantic model.
# NOTE, settings is not type checked in the instantiation model (specified within the test), as the settings are not used in the pydantic model.
# Therefore, we can place arbitrary values in the settings field.

settings:
123 changes: 68 additions & 55 deletions tests/end2end_tests/gpt2_train_num_steps_8.yaml
@@ -1,23 +1,53 @@
settings:
experiment_id: ${modalities_env:experiment_id}
config_file_path: ${modalities_env:config_file_path}
referencing_keys:
sample_key: input_ids
target_key: target_ids
training:
training_log_interval_in_steps: 1
checkpointing_interval_in_steps: 4
evaluation_interval_in_steps: 1
global_num_seen_tokens: 0
do_apply_activation_checkpointing: false
gradient_acc_steps: 1
local_train_micro_batch_size: 1
sequence_length: 256
prediction_key: logits
cuda_env:
local_rank: ${cuda_env:LOCAL_RANK}
global_rank: ${cuda_env:RANK}
world_size: ${cuda_env:WORLD_SIZE}
paths:
checkpointing_path: tmp/checkpoints
checkpoint_saving_path: tmp/checkpoints
train_dataset_path: tests/end2end_tests/lorem_ipsum.pbin
intervals:
training_log_interval_in_steps: 1
checkpointing_interval_in_steps: 4
evaluation_interval_in_steps: 1
consistency_enforcement:
enforce_tokens_per_step_conistency: true
enforce_last_step_evaluated: false
enforce_last_step_checkpointed: false
step_profile:
gradient_accumulation_steps: 1
local_train_micro_batch_size: 1
sequence_length: 256
training_target:
num_target_tokens:
component_key: number_conversion
variant_key: num_tokens_from_packed_mem_map_dataset_continuous
config:
dataset_path: ${settings.paths.train_dataset_path}
sequence_length: ${settings.step_profile.sequence_length}
num_ranks: ${settings.cuda_env.world_size}
local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
num_target_steps: # for the batch progress subscriber
component_key: number_conversion
variant_key: num_steps_from_num_tokens
config:
num_ranks: ${settings.cuda_env.world_size}
local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
global_num_tokens: ${settings.training_target.num_target_tokens}
sequence_length: ${settings.step_profile.sequence_length}
gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
training_progress:
global_num_seen_tokens: 0
num_seen_steps: 0
local_num_seen_batches: 0
last_step: -1

collate_fn:
component_key: collate_fn
@@ -30,8 +60,8 @@ train_dataset:
component_key: dataset
variant_key: packed_mem_map_dataset_continuous
config:
raw_data_path: lorem_ipsum.pbin
sequence_length: ${settings.training.sequence_length}
raw_data_path: ${settings.paths.train_dataset_path}
sequence_length: ${settings.step_profile.sequence_length}
sample_key: ${settings.referencing_keys.sample_key}

train_dataloader:
@@ -40,25 +70,16 @@ train_dataloader:
config:
num_workers: 2
pin_memory: true
shuffle: false
dataloader_tag: "train"
skip_num_batches:
component_key: number_conversion
variant_key: local_num_batches_from_num_tokens
config:
num_ranks: ${settings.cuda_env.world_size}
global_num_tokens: ${settings.training.global_num_seen_tokens}
sequence_length: ${settings.training.sequence_length}
local_micro_batch_size: ${settings.training.local_train_micro_batch_size}

dataloader_tag: train
skip_num_batches: ${settings.training_progress.local_num_seen_batches}
dataset:
instance_key: train_dataset
pass_type: BY_REFERENCE
batch_sampler:
component_key: batch_sampler
variant_key: default
config:
batch_size: ${settings.training.local_train_micro_batch_size}
batch_size: ${settings.step_profile.local_train_micro_batch_size}
drop_last: true
sampler:
component_key: sampler
@@ -67,6 +88,8 @@ train_dataloader:
rank: ${settings.cuda_env.global_rank}
num_replicas: ${settings.cuda_env.world_size}
shuffle: true
drop_last: true
seed: 42
dataset:
instance_key: train_dataset
pass_type: BY_REFERENCE
@@ -89,25 +112,16 @@ checkpoint_saving:
component_key: checkpoint_saving_execution
variant_key: fsdp
config:
checkpoint_path: ${settings.paths.checkpointing_path} # TODO <replaced_in_test>
checkpoint_path: ${settings.paths.checkpoint_saving_path}
global_rank: ${settings.cuda_env.global_rank}
experiment_id: ${settings.experiment_id}
get_num_tokens_from_num_steps_callable:
component_key: number_conversion
variant_key: num_tokens_from_num_steps_callable
config:
num_ranks: ${settings.cuda_env.world_size}
local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
sequence_length: ${settings.training.sequence_length}

experiment_id: ${settings.experiment_id}

# resolving class types via different enums sucks...
loss_fn:
component_key: loss
variant_key: clm_cross_entropy_loss
config:
target_key: target_ids
prediction_key: logits
target_key: ${settings.referencing_keys.target_key}
prediction_key: ${settings.referencing_keys.prediction_key}

wrapped_model:
component_key: model
@@ -144,7 +158,7 @@ model_raw:
config:
sample_key: ${settings.referencing_keys.sample_key}
poe_type: NOPE
sequence_length: ${settings.training.sequence_length}
sequence_length: ${settings.step_profile.sequence_length}
prediction_key: ${loss_fn.config.prediction_key}
vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: 2
@@ -187,21 +201,28 @@ model_raw:

scheduler:
component_key: scheduler
variant_key: dummy_lr
variant_key: onecycle_lr
config:
optimizer:
instance_key: optimizer
pass_type: BY_REFERENCE

optimizer:
max_lr: 6e-4
div_factor: 10
final_div_factor: 1
total_steps: ${settings.training_target.num_target_steps}
pct_start: 0.01
anneal_strategy: cos
last_epoch: ${settings.training_progress.last_step}

optimizer:
component_key: optimizer
variant_key: adam_w
config:
lr: 0.0001
betas: [0.9, 0.95]
eps: 1e-8
weight_decay: 1e-1
weight_decay_groups_excluded: ["embedding", "layernorm"]
weight_decay_groups_excluded: [embedding, layernorm]
wrapped_model:
instance_key: wrapped_model
pass_type: BY_REFERENCE
@@ -221,21 +242,13 @@ batch_progress_subscriber:
variant_key: rich
config:
global_rank: ${settings.cuda_env.global_rank}
global_num_seen_steps:
component_key: number_conversion
variant_key: num_steps_from_num_tokens
config:
num_ranks: ${settings.cuda_env.world_size}
local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
global_num_tokens: ${settings.training.global_num_seen_tokens}
sequence_length: ${settings.training.sequence_length}
gradient_acc_steps: ${settings.training.gradient_acc_steps}
train_dataloader:
instance_key: train_dataloader
num_seen_steps: ${settings.training_progress.num_seen_steps}
num_target_steps: ${settings.training_target.num_target_steps}
train_dataloader_tag: ${train_dataloader.config.dataloader_tag}
eval_dataloaders:
instance_key: eval_dataloaders
pass_type: BY_REFERENCE
eval_dataloaders: []


evaluation_subscriber:
component_key: results_subscriber
variant_key: save_all
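In the reworked settings layout above, num_target_tokens is read from the packed dataset and num_target_steps is derived from it through the num_steps_from_num_tokens conversion. The following is a sketch of the relation the config relies on, under the assumption of plain integer division; the actual implementation in modalities.utils.number_conversion may round or validate differently.

def num_steps_from_num_tokens(
    global_num_tokens: int,
    num_ranks: int,
    local_micro_batch_size: int,
    sequence_length: int,
    gradient_accumulation_steps: int,
) -> int:
    # Tokens consumed per optimizer step across all ranks.
    global_tokens_per_step = (
        num_ranks * local_micro_batch_size * sequence_length * gradient_accumulation_steps
    )
    return global_num_tokens // global_tokens_per_step

The derived step count also feeds the onecycle_lr scheduler's total_steps, which is why the scheduler change in this config depends on the training_target wiring.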