This repository has been archived by the owner on Nov 1, 2024. It is now read-only.

Add test_sequence_parallel #605

Draft
wants to merge 49 commits into main
49 commits
48e1956
Add test_sequence_parallel
bashnick Jan 16, 2023
4a3f111
try different version of apex
bashnick Jan 16, 2023
c90bbd2
try without sequence_parallel
bashnick Jan 16, 2023
b8a6afc
try sequence_parallel separately
bashnick Jan 17, 2023
2978c77
try update torch to 1.13.1+cu117
bashnick Jan 17, 2023
67f765f
try with torch=1.12.1+cu116
bashnick Jan 17, 2023
65d886b
try update environment
bashnick Jan 18, 2023
0bbebb9
upd the config file
bashnick Jan 18, 2023
08e9d88
debug config file env setup
bashnick Jan 18, 2023
0fcf0a5
add "source $BASH_ENV" before "source activate"
bashnick Jan 18, 2023
cf59b73
skip warning about the cuda versions mismatch
bashnick Jan 18, 2023
ccc838c
try different version of apex
bashnick Jan 18, 2023
8f5723a
try revert back the ubuntu image and cuda 11.1
bashnick Jan 18, 2023
95ab3f1
checkout main branch in apex
bashnick Jan 18, 2023
8e28d57
add back the checkout for apex
bashnick Jan 18, 2023
8e42490
try cuda 11.6 image
bashnick Jan 18, 2023
c9cd83c
revert back to cuda 11.4
bashnick Jan 18, 2023
75c7d4c
try update cuda to 11.6
bashnick Jan 18, 2023
f69786d
fix format
bashnick Jan 18, 2023
4e81303
try update cuda install
bashnick Jan 18, 2023
3334922
update the keys for cuda repo
bashnick Jan 18, 2023
b83768e
remove cuda-toolkit
bashnick Jan 18, 2023
31257da
try update the saving location
bashnick Jan 18, 2023
72177a0
try install cuda using the ubuntu package manager
bashnick Jan 18, 2023
c446855
try most recent cuda version
bashnick Jan 18, 2023
a7cfcdd
try ubuntu 11.6
bashnick Jan 18, 2023
d1da22b
clean previous cuda versions first
bashnick Jan 18, 2023
ec4903a
try change image without cuda
bashnick Jan 18, 2023
dee4d68
remove nvcc --version
bashnick Jan 18, 2023
df56434
remove conda-pack
bashnick Jan 18, 2023
7affc3c
try reload CUDA
bashnick Jan 18, 2023
b5cfc41
try unload the nvidia kernel module first
bashnick Jan 19, 2023
3d616f7
try stop other nvidia processes before restarting
bashnick Jan 19, 2023
9dff621
revert back to cu11.1 and try with PT1.13+cu11.1
bashnick Jan 19, 2023
25778c1
try with PT1.13+cu11.1 from source
bashnick Jan 19, 2023
ed559e8
small fix
bashnick Jan 19, 2023
8236c9c
small fix with PATH
bashnick Jan 19, 2023
0b8050c
revert to PT 1.10.2+cu111
bashnick Jan 19, 2023
50d9645
take PT from torch_stable
bashnick Jan 19, 2023
a0f2807
small changes
bashnick Jan 19, 2023
3b9ba76
try original config
bashnick Jan 19, 2023
89b1ba7
try change to PT10.2
bashnick Jan 19, 2023
f4cc998
change checkouts for apex and fairscale
bashnick Jan 19, 2023
125800f
remove checkout for apex
bashnick Jan 19, 2023
e65d479
try python3.9
bashnick Jan 19, 2023
428adb7
revert to initial setup
bashnick Jan 20, 2023
198f3b7
try upgrade pytorch after apex installation
bashnick Jan 20, 2023
e23fd62
try with PT1.10.2+cu111
bashnick Jan 20, 2023
dd2b6f5
try using a dockerhub image
bashnick Jan 23, 2023
21 changes: 5 additions & 16 deletions .circleci/config.yml
@@ -6,9 +6,9 @@ version: 2.1
# -------------------------------------------------------------------------------------
gpu: &gpu
environment:
CUDA_VERSION: "11.1"
CUDA_VERSION: "11.6"
machine:
- image: ubuntu-1604-cuda-11.1:202012-01
+ image: nvidia/cuda:11.6.1-base-ubuntu20.04
resource_class: gpu.nvidia.medium.multi


@@ -30,7 +30,6 @@ install_dep_common: &install_dep_common
# Need to install ninja build system
sudo apt-get update
sudo apt-get install ninja-build

install_dep_fused_ops: &install_dep_fused_ops
- run:
name: Install Megatron/Apex Dependencies
@@ -41,7 +40,7 @@ install_dep_fused_ops: &install_dep_fused_ops
if ! python -c 'import apex'; then
git clone https://github.com/NVIDIA/apex
cd apex
- git checkout e2083df5eb96643c61613b9df48dd4eea6b07690
+ git checkout 265b451de8ba9bfcb67edc7360f3d8772d0a8bea
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./
cd ~/
fi
@@ -52,7 +51,6 @@ install_dep_fused_ops: &install_dep_fused_ops
pip install -e .
cd ~/
fi

# Remove this when we get a new fairscale release
install_fairscale: &install_fairscale
- run:
@@ -67,17 +65,14 @@ install_fairscale: &install_fairscale
pip install .
cd ~/
fi

install_dep_pt19: &install_dep_pt19
- run:
name: Install Pytorch Dependencies
command: |
source activate metaseq
pip install --upgrade setuptools
- pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
+ pip install torch==1.10.2+cu111 torchvision==0.11.3+cu111 torchaudio==0.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html
python -c 'import torch; print("Torch version:", torch.__version__)'


install_pytorch_dep: &install_pytorch_dep
- parameters:
version_str:
@@ -91,24 +86,20 @@ install_pytorch_dep: &install_pytorch_dep
echo "<<parameters.version_str>>"
pip install <<parameters.version_str>> -f https://download.pytorch.org/whl/torch_stable.html
python -c 'import torch; print("Torch version:", torch.__version__)'

install_repo: &install_repo
- run:
name: Install Repository
command: |
source activate metaseq
pip install -e .[dev,few_shot,gpu]
python setup.py build_ext --inplace


check_nvidia_driver: &check_nvidia_driver
- run:
name: Check NVIDIA Driver
working_directory: ~/
command: |
pyenv versions
nvidia-smi

create_conda_env: &create_conda_env
run:
name: Install and Create Conda Environment
@@ -126,7 +117,6 @@ create_conda_env: &create_conda_env
source activate metaseq
python --version
pip install --upgrade pip

download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies
- run:
name: Download and configure a 125m checkpoint with HF dependencies
@@ -137,7 +127,6 @@ download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_w
tar -xvzf ./125m_with_hf_dependencies.tar.gz -C .
python -m metaseq.scripts.convert_to_singleton ./125m
python -m transformers.models.opt.convert_opt_original_pytorch_checkpoint_to_pytorch --pytorch_dump_folder_path ./125m/ --hf_config ./125m/config.json --fairseq_path ./125m/restored.pt

commands:

gpu_pre: &gpu_pre
@@ -191,4 +180,4 @@ workflows:
version: 2
build:
jobs:
- gpu_tests_pt19
- gpu_tests_pt19
1 change: 1 addition & 0 deletions fairscale
Submodule fairscale added at 1bc96f
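For reference, pinning a submodule at a specific commit like this is normally done with standard git commands. A minimal sketch, assuming the upstream repository is facebookresearch/fairscale (an assumption; the remote URL and the full commit hash are not shown here, only the truncated 1bc96f):

    # hypothetical commands; the exact remote URL and full commit hash are not part of this listing
    git submodule add https://github.com/facebookresearch/fairscale fairscale
    cd fairscale && git checkout 1bc96f   # truncated hash as displayed above
    cd .. && git add .gitmodules fairscale && git commit -m "Pin fairscale submodule"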
8 changes: 4 additions & 4 deletions gpu_tests/test_model_parallel_mp1_mp2.py
@@ -23,9 +23,9 @@
)
class TestModelParallelMP1(unittest.TestCase):
"""
- The test will verify that the model can be trained with
- model_parallel = 1
- The test checks hat the number of trianing steps performed is correct
+ The tests will verify that the model can be trained with both
+ model_parallel = 1 and model_parallel = 2
+ The tests check that the number of training steps performed is correct
and that the required loss is achieved on the last iteration
"""

@@ -142,7 +142,7 @@ def run_training(max_update, events, argv_injection, size_patch_dict):

def local_run_mock(args, env, train_cmd, dry_run, max_update, events):
"""
- The function introduces several pathces for the argumets of the
+ The function introduces several patches for the argumets of the
model training. These patches are needed to pass gpu tests on
circleci GPUs (empirical knowledge)
"""
190 changes: 190 additions & 0 deletions gpu_tests/test_sequence_parallel.py
@@ -0,0 +1,190 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import subprocess
import json
import multiprocessing
from functools import partial, partialmethod
import unittest
from unittest.mock import patch
import torch
from metaseq.dataclass.configs import DistributedTrainingConfig
from metaseq.launcher.opt_baselines import cli_main as sweep_cli_main
from metaseq.cli.train import cli_main as train_cli_main
from metaseq.launcher.opt_job_constants import Size, M


@unittest.skipIf(not torch.cuda.is_available(), "test requires 4 GPUs, none found")
@unittest.skipIf(
DistributedTrainingConfig.distributed_world_size != 4,
"test requires 4 GPUs",
)
class TestSequenceParallel(unittest.TestCase):
"""
The tests check rough equivalence between going through the
sequence-parallel code-path with MP 2 vs the current non
sequence-parallel run for the 8M model.
"""

def test_sequence_parallel(self):
# parameters to train an mp2 model with sequence_parallel flag
argv_injection = (
"python3 metaseq/launcher/opt_baselines.py "
"--prefix train.8m --model-size 8m --checkpoints-dir ./test-checkpoint "
"--tensorboard-logdir ./test-checkpoint --num-trials 1 --azure "
"--num-gpus 4 --num-nodes 1 --seed 1 "
"--local --disable-validation --max-epoch 5 --max-update 5 --benchmark "
)
max_update_first_run = 20
size_patch_dict = {"8m": Size(4, 128, 2, 64, int(0.03125 * M), 1.0e-3, 2)}

# train model with sequence_parallel flag
# training_log_events_seq = self._test_model_parallel(
# max_update_first_run=max_update_first_run,
# argv_injection=argv_injection,
# size_patch_dict=size_patch_dict,
# is_sequence_parallel=True,
# )
# train model without sequence_parallel flag
training_log_events = self._test_model_parallel(
max_update_first_run=max_update_first_run,
argv_injection=argv_injection,
size_patch_dict=size_patch_dict,
is_sequence_parallel=True,
)

# check that training ran correctly
# check that the number of updates was correct
# self.assertNotEqual(training_log_events_seq, [])
self.assertNotEqual(training_log_events, [])
# self.assertIsNotNone(training_log_events_seq[-1]["num_updates"])
self.assertIsNotNone(training_log_events[-1]["num_updates"])
self.assertEqual(
int(training_log_events[-1]["num_updates"]), max_update_first_run
)
# self.assertEqual(
# int(training_log_events_seq[-1]["num_updates"]), max_update_first_run
# )
# check the achieved loss is similar between seq and non-seq
# loss_val_seq = float(training_log_events_seq[-1]["loss"])
loss_val = float(training_log_events[-1]["loss"])

# print("loss_val_seq: {} | loss_val: {}".format(loss_val_seq, loss_val))
# self.assertAlmostEqual(
# loss_val, loss_val_seq, 1
# ) # 1 digit precision; 14.702 - seq; 14.735 - non seq

def _test_model_parallel(
self,
max_update_first_run,
argv_injection,
size_patch_dict,
is_sequence_parallel,
):
"""
Helper function to run the test
"""
# start the process for the model run
multiprocessing.set_start_method("spawn", force=True)
with torch.multiprocessing.Manager() as manager:
events = manager.list()
p = multiprocessing.Process(
target=run_training,
args=(
max_update_first_run,
events,
argv_injection,
size_patch_dict,
is_sequence_parallel,
),
)
p.start()
p.join()
events_first_run = list(events)

# cleanup of the checkpoints files
cleanup_checkpoints = subprocess.Popen(
"rm -r ./test-checkpoint".split(),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
)
_, _ = cleanup_checkpoints.communicate()

# parse the log events from the log_to_events()
training_log_events = [
json.loads(event["message"])
for event in events_first_run
if event["type"] == "log" and event["message"].startswith('{"epoch"')
]

return training_log_events


def run_training(
max_update, events, argv_injection, size_patch_dict, is_sequence_parallel
):
# clean any unused cache to reduce CUDA OOM
torch.cuda.empty_cache()
# main arguments to run the training script
# both patches are needed to run the job on the circleci GPUs
with patch("sys.argv", argv_injection.split()[1:]), patch(
"metaseq.launcher.slurm.local_run",
partial(
local_run_mock,
max_update=max_update,
events=events,
is_sequence_parallel=is_sequence_parallel,
),
), patch.dict(
"metaseq.launcher.opt_job_constants.MODEL_SIZES",
# reduce the batch size for CUDA memory optimization
size_patch_dict,
):
sweep_cli_main()


def local_run_mock(
args, env, train_cmd, dry_run, max_update, events, is_sequence_parallel
):
"""
The function introduces several patches for the arguments of the
model training. These patches are needed to pass gpu tests on
circleci GPUs and enable sequence_parallel parameter
"""
# update the parameters of the model training
train_cmd[train_cmd.index("--max-update") + 1] = str(max_update)
train_cmd[train_cmd.index("--num-workers") + 1] = "1"
train_cmd[train_cmd.index("--dropout") + 1] = "0.0"
train_cmd.remove("--checkpoint-activations")
train_cmd.remove("--distribute-checkpointed-activations")
# add sequence_parallel argument to the model arguments
if is_sequence_parallel:
train_cmd.append("--sequence-parallel")

with patch("logging.Logger._log", partialmethod(log_to_events, events=events)):
with patch.dict("os.environ", env, clear=True):
with patch("sys.argv", train_cmd[1:]):
train_cli_main()


def log_to_events(self, info, message, args, events, **kwargs):
"""
The function is used to collect logging info from the subprocesses
and store it in the 'events' variable, which is then passed over
to the main process for asserting that the model ran correctly
"""
print(self, message)
if isinstance(message, str):
events.append(
{
"type": "log",
"message": message,
}
)


if __name__ == "__main__":
unittest.main()
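Since the new module ends with unittest.main(), the test can presumably also be run directly for local debugging on a 4-GPU machine, assuming the metaseq conda environment from the CircleCI config is active:

    source activate metaseq
    python gpu_tests/test_sequence_parallel.py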