Skip to content

Commit

Permalink
Merge pull request #191 from Modalities/fix_failing_gpu_tests
Browse files: browse the repository at this point in the history
Fix failing gpu tests
  • Loading branch information
mali-git authored Aug 6, 2024
2 parents 0780916 + 120596d commit ec3319b
Show file tree
Hide file tree
Showing 10 changed files with 41 additions and 32 deletions.
14 changes: 8 additions & 6 deletions examples/library_usage/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@

from modalities.__main__ import Main
from modalities.batch import DatasetBatch
from modalities.config.config import load_app_config_dict
from modalities.config.config import ProcessGroupBackendType
from modalities.config.instantiation_models import TrainingComponentsInstantiationModel
from modalities.models.gpt2.collator import CollateFnIF
from modalities.running_env.cuda_env import CudaEnv


class CustomGPT2LLMCollateFnConfig(BaseModel):
Expand Down Expand Up @@ -35,10 +37,8 @@ def __call__(self, batch: List[Dict[str, torch.Tensor]]) -> DatasetBatch:
def main():
# load and parse the config file
config_file_path = Path("config_lorem_ipsum.yaml")
config_dict = load_app_config_dict(config_file_path)

# instantiate the Main entrypoint of modalities by passing in the config
modalities_main = Main(config_dict=config_dict, config_path=config_file_path)
# instantiate the Main entrypoint of modalities by passing in the config path
modalities_main = Main(config_path=config_file_path)

# add the custom component to modalities
modalities_main.add_custom_component(
Expand All @@ -48,7 +48,9 @@ def main():
custom_config=CustomGPT2LLMCollateFnConfig,
)
# run the experiment
modalities_main.run()
with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl):
components = modalities_main.build_components(components_model_type=TrainingComponentsInstantiationModel)
modalities_main.run(components)


if __name__ == "__main__":
Expand Down
1 change: 1 addition & 0 deletions src/modalities/logging_broker/subscriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@ class MessageSubscriberIF(ABC, Generic[T]):
def consume_message(self, message: Message[T]):
raise NotImplementedError

@abstractmethod
def consume_dict(self, mesasge_dict: Dict[str, Any]):
raise NotImplementedError
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, Tuple
from typing import Any, Dict, Tuple

from rich.console import Group
from rich.live import Live
Expand All @@ -14,7 +14,7 @@ class DummyProgressSubscriber(MessageSubscriberIF[BatchProgressUpdate]):
def consume_message(self, message: Message[BatchProgressUpdate]):
pass

def consume_dict(self, key: str, value: str):
def consume_dict(self, mesasge_dict: Dict[str, Any]):
pass


Expand Down Expand Up @@ -95,3 +95,6 @@ def consume_message(self, message: Message[BatchProgressUpdate]):
task_id=task_id,
completed=batch_progress.num_steps_done,
)

def consume_dict(self, mesasge_dict: Dict[str, Any]):
raise NotImplementedError
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ class RegexFilter(BaseModel):
WeightInitTypes.SCALED: RegexFilter(
weights=[
r"transformer\.h\.\d+\.attn\.c_proj\.weight",
r"transformer\.h\.\w+\.mlp\.W_2.weight" r"transformer\.h\.\w+\.mlp\.c_proj\.weight", # SwiGLU # gelu
r"transformer\.h\.\w+\.mlp\.W_2.weight", # SwiGLU
r"transformer\.h\.\w+\.mlp\.c_proj\.weight", # gelu
]
),
WeightInitTypes.SCALED_EMBED: RegexFilter(
Expand Down
14 changes: 5 additions & 9 deletions tests/dataloader/distributed/test_distributed_dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pydantic import BaseModel

from modalities.__main__ import Main
from modalities.config.config import ProcessGroupBackendType, PydanticLLMDataLoaderIFType, load_app_config_dict
from modalities.config.config import ProcessGroupBackendType, PydanticLLMDataLoaderIFType
from modalities.running_env.cuda_env import CudaEnv
from tests.dataloader.dummy_sequential_dataset import TestDataset, TestDatasetConfig

Expand All @@ -29,9 +29,8 @@ def test_resumable_dataloader_without_shuffling():
# Given a sequence of [0, 1, 2, 3, 4, 5, 6, 7, 8] we want each of the two processes
# to receive [[0, 2], [4, 6]] and [[1, 3], [5, 7]], respectively.
config_file_path = working_dir / "dist_dataloader_config_without_shuffling.yaml"
config_dict = load_app_config_dict(config_file_path)

main = Main(config_dict, config_file_path)
main = Main(config_file_path)
with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl):
main.add_custom_component(
component_key="dataset",
Expand Down Expand Up @@ -75,9 +74,8 @@ def test_resumable_dataloader_with_shuffling_without_skipping():
# to receive two batches of size two without overlap, e.g., [[2, 0], [5, 6]] and [[7, 3], [4, 1]], respectively.

config_file_path = working_dir / "dist_dataloader_config_with_shuffling.yaml"
config_dict = load_app_config_dict(config_file_path)

main = Main(config_dict, config_file_path)
main = Main(config_file_path)
with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl):
main.add_custom_component(
component_key="dataset",
Expand Down Expand Up @@ -122,15 +120,13 @@ def test_resumable_dataloader_with_shuffling_and_skipped_batches():
# to receive one batch of size two without overlap, e.g., [[5, 6]] and [[4, 1]], respectively.

config_shuffled_file_path = working_dir / "dist_dataloader_config_with_shuffling.yaml"
config_shuffled_dict = load_app_config_dict(config_shuffled_file_path)
main_shuffled = Main(config_shuffled_dict, config_shuffled_file_path)
main_shuffled = Main(config_shuffled_file_path)

config_shuffled_and_skipped_file_path = (
working_dir / "dist_dataloader_config_with_shuffling_and_skipped_batches.yaml"
)

config_shuffled_and_skipped_dict = load_app_config_dict(config_shuffled_and_skipped_file_path)
main_shuffled_and_skipped = Main(config_shuffled_and_skipped_dict, config_shuffled_and_skipped_file_path)
main_shuffled_and_skipped = Main(config_shuffled_and_skipped_file_path)

with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl):
main_shuffled.add_custom_component(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pydantic import BaseModel

from modalities.__main__ import Main
from modalities.config.config import ProcessGroupBackendType, PydanticLLMDataLoaderIFType, load_app_config_dict
from modalities.config.config import ProcessGroupBackendType, PydanticLLMDataLoaderIFType
from modalities.running_env.cuda_env import CudaEnv
from tests.dataloader.dummy_sequential_dataset import TestDataset, TestDatasetConfig

Expand All @@ -35,9 +35,7 @@ def test_resumable_dataloader_without_shuffling():

config_file_path = working_dir / "dist_repeating_dataloader_config_without_shuffling_but_skipped_batch.yaml"

config_dict = load_app_config_dict(config_file_path)

main = Main(config_dict, config_file_path)
main = Main(config_file_path)
with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl):
main.add_custom_component(
component_key="dataset",
Expand Down
2 changes: 1 addition & 1 deletion tests/end2end_tests/gpt2_train_num_steps_8.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ batch_progress_subscriber:
component_key: progress_subscriber
variant_key: rich
config:
local_rank: ${settings.cuda_env.local_rank}
global_rank: ${settings.cuda_env.global_rank}
global_num_seen_steps:
component_key: number_conversion
variant_key: num_steps_from_num_tokens
Expand Down
2 changes: 1 addition & 1 deletion tests/end2end_tests/gpt2_warm_start_from_step_4.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ batch_progress_subscriber:
component_key: progress_subscriber
variant_key: rich
config:
local_rank: ${settings.cuda_env.local_rank}
global_rank: ${settings.cuda_env.global_rank}
global_num_seen_steps:
component_key: number_conversion
variant_key: num_steps_from_num_tokens
Expand Down
22 changes: 15 additions & 7 deletions tests/end2end_tests/test_fsdp_warmstart.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import tempfile
from pathlib import Path
from typing import List
from typing import Any, Dict, List

import pytest
import torch
Expand Down Expand Up @@ -37,6 +37,9 @@ def consume_message(self, message: Message[EvaluationResultBatch]):
"""Consumes a message from a message broker."""
self.message_list.append(message)

def consume_dict(self, mesasge_dict: Dict[str, Any]):
pass


class SaveAllResultSubscriberConfig(BaseModel):
pass
Expand Down Expand Up @@ -111,7 +114,8 @@ def test_warm_start(self):
working_dir / "lorem_ipsum.pbin"
)

main_obj_0 = Main(gpt2_8_steps_config_dict, gpt2_8_steps_config_file_path)
main_obj_0 = Main(gpt2_8_steps_config_file_path)
main_obj_0.config_dict = gpt2_8_steps_config_dict
with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl):
main_obj_0.add_custom_component(
component_key="results_subscriber",
Expand All @@ -125,11 +129,12 @@ def test_warm_start(self):
# we collect the loss values from rank 0 and store them in the temporary experiment folder
if dist.get_rank() == 0:
messages_0: List[Message[EvaluationResultBatch]] = components_0.evaluation_subscriber.message_list
loss_scores_0 = TestWarmstart.get_loss_scores(messages_0, "CLMCrossEntropyLoss average")
loss_scores_0 = TestWarmstart.get_loss_scores(messages_0, "train loss avg")
with open(loss_values_experiment_0_path, "w") as f:
json.dump(loss_scores_0, f)

main_obj_1 = Main(gpt2_warm_start_after_4_steps_dict, gpt2_warm_start_after_4_steps_config_file_path)
main_obj_1 = Main(gpt2_warm_start_after_4_steps_config_file_path)
main_obj_1.config_dict = gpt2_warm_start_after_4_steps_dict

main_obj_1.add_custom_component(
component_key="results_subscriber",
Expand All @@ -144,7 +149,7 @@ def test_warm_start(self):
# and store them in the temporary experiment folder
if dist.get_rank() == 0:
messages_1: List[Message[EvaluationResultBatch]] = components_1.evaluation_subscriber.message_list
loss_scores_1 = TestWarmstart.get_loss_scores(messages_1, "CLMCrossEntropyLoss average")
loss_scores_1 = TestWarmstart.get_loss_scores(messages_1, "train loss avg")
with open(loss_values_experiment_1_path, "w") as f:
json.dump(loss_scores_1, f)

Expand Down Expand Up @@ -174,8 +179,11 @@ def test_warmstart_dataloader(self):
# adapt dataset path
gpt2_warm_start_from_step_1_dict["train_dataset"]["config"]["raw_data_path"] = working_dir / "lorem_ipsum.pbin"

main_obj_1 = Main(gpt2_two_steps_config_dict, gpt2_two_steps_config_file_path)
main_obj_2 = Main(gpt2_warm_start_from_step_1_dict, gpt2_warm_start_from_step_1_config_file_path)
main_obj_1 = Main(gpt2_two_steps_config_file_path)
main_obj_1.config_dict = gpt2_two_steps_config_dict

main_obj_2 = Main(gpt2_warm_start_from_step_1_config_file_path)
main_obj_2.config_dict = gpt2_warm_start_from_step_1_dict

with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl):
main_obj_1.add_custom_component(
Expand Down
2 changes: 1 addition & 1 deletion tests/test_initialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def get_group_params(model: FSDP, model_name: str) -> Dict[str, Optional[torch.T
GPT2_VOCAB_SIZE = 50304
GPT2_SEQUENCE_LENGTH = 2048
GPT2_HIDDEN_DIM = 768
GPT2_ALL = 106374912
GPT2_ALL = 106375680
GPT2_EMBEDDING = GPT2_HIDDEN_DIM * (
GPT2_VOCAB_SIZE + GPT2_SEQUENCE_LENGTH
) # parameters for token embeddings and positional embeddings
Expand Down

0 comments on commit ec3319b

Please sign in to comment.