From 48e19563fd1378b5a4ccc2b1a1b3f2e2beb8daba Mon Sep 17 00:00:00 2001 From: bashnick Date: Mon, 16 Jan 2023 14:40:24 +0000 Subject: [PATCH 01/49] Add test_sequence_parallel --- gpu_tests/test_model_parallel_mp1_mp2.py | 8 +- gpu_tests/test_sequence_parallel.py | 190 +++++++++++++++++++++++ 2 files changed, 194 insertions(+), 4 deletions(-) create mode 100644 gpu_tests/test_sequence_parallel.py diff --git a/gpu_tests/test_model_parallel_mp1_mp2.py b/gpu_tests/test_model_parallel_mp1_mp2.py index 939f448ca..7ea5bbebd 100644 --- a/gpu_tests/test_model_parallel_mp1_mp2.py +++ b/gpu_tests/test_model_parallel_mp1_mp2.py @@ -23,9 +23,9 @@ ) class TestModelParallelMP1(unittest.TestCase): """ - The test will verify that the model can be trained with - model_parallel = 1 - The test checks hat the number of trianing steps performed is correct + The tests will verify that the model can be trained with both + model_parallel = 1 and model_parallel = 2 + The tests check that the number of training steps performed is correct and that the required loss is achieved on the last iteration """ @@ -142,7 +142,7 @@ def run_training(max_update, events, argv_injection, size_patch_dict): def local_run_mock(args, env, train_cmd, dry_run, max_update, events): """ - The function introduces several pathces for the argumets of the + The function introduces several patches for the argumets of the model training. These patches are needed to pass gpu tests on circleci GPUs (empirical knowledge) """ diff --git a/gpu_tests/test_sequence_parallel.py b/gpu_tests/test_sequence_parallel.py new file mode 100644 index 000000000..b8a21f653 --- /dev/null +++ b/gpu_tests/test_sequence_parallel.py @@ -0,0 +1,190 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import subprocess +import json +import multiprocessing +from functools import partial, partialmethod +import unittest +from unittest.mock import patch +import torch +from metaseq.dataclass.configs import DistributedTrainingConfig +from metaseq.launcher.opt_baselines import cli_main as sweep_cli_main +from metaseq.cli.train import cli_main as train_cli_main +from metaseq.launcher.opt_job_constants import Size, M + + +@unittest.skipIf(not torch.cuda.is_available(), "test requires 4 GPUs, none found") +@unittest.skipIf( + DistributedTrainingConfig.distributed_world_size != 4, + "test requires 4 GPUs", +) +class TestSequenceParallel(unittest.TestCase): + """ + The tests check rough equivalence between going through the + sequence-parallel code-path with MP 2 vs the current non + sequence-parallel run for the 8M model. + """ + + def test_sequence_parallel(self): + # parameters to train an mp2 model with sequence_parallel flag + argv_injection = ( + "python3 metaseq/launcher/opt_baselines.py " + "--prefix train.8m --model-size 8m --checkpoints-dir ./test-checkpoint " + "--tensorboard-logdir ./test-checkpoint --num-trials 1 --azure " + "--num-gpus 4 --num-nodes 1 --seed 1 " + "--local --disable-validation --max-epoch 5 --max-update 5 --benchmark " + ) + max_update_first_run = 20 + size_patch_dict = {"8m": Size(4, 128, 2, 64, int(0.03125 * M), 1.0e-3, 2)} + + # train model with sequence_parallel flag + training_log_events_seq = self._test_model_parallel( + max_update_first_run=max_update_first_run, + argv_injection=argv_injection, + size_patch_dict=size_patch_dict, + is_sequence_parallel=True, + ) + # train model without sequence_parallel flag + training_log_events = self._test_model_parallel( + max_update_first_run=max_update_first_run, + argv_injection=argv_injection, + size_patch_dict=size_patch_dict, + is_sequence_parallel=False, + ) + + # check that training ran correctly + # check that the number of updates was correct + self.assertNotEqual(training_log_events_seq, []) + self.assertNotEqual(training_log_events, []) + self.assertIsNotNone(training_log_events_seq[-1]["num_updates"]) + self.assertIsNotNone(training_log_events[-1]["num_updates"]) + self.assertEqual( + int(training_log_events[-1]["num_updates"]), max_update_first_run + ) + self.assertEqual( + int(training_log_events_seq[-1]["num_updates"]), max_update_first_run + ) + # check the achieved loss is similar between seq and non-seq + loss_val_seq = float(training_log_events_seq[-1]["loss"]) + loss_val = float(training_log_events[-1]["loss"]) + + print("loss_val_seq: {} | loss_val: {}".format(loss_val_seq, loss_val)) + self.assertAlmostEqual( + loss_val, loss_val_seq, 1 + ) # 1 digit precision; 14.702 - seq; 14.735 - non seq + + def _test_model_parallel( + self, + max_update_first_run, + argv_injection, + size_patch_dict, + is_sequence_parallel, + ): + """ + Helper function to run the test + """ + # start the process for the model run + multiprocessing.set_start_method("spawn", force=True) + with torch.multiprocessing.Manager() as manager: + events = manager.list() + p = multiprocessing.Process( + target=run_training, + args=( + max_update_first_run, + events, + argv_injection, + size_patch_dict, + is_sequence_parallel, + ), + ) + p.start() + p.join() + events_first_run = list(events) + + # cleanup of the checkpoints files + cleanup_checkpoints = subprocess.Popen( + "rm -r ./test-checkpoint".split(), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ) + _, _ = cleanup_checkpoints.communicate() + + # parse the log events from the log_to_events() + training_log_events = [ + json.loads(event["message"]) + for event in events_first_run + if event["type"] == "log" and event["message"].startswith('{"epoch"') + ] + + return training_log_events + + +def run_training( + max_update, events, argv_injection, size_patch_dict, is_sequence_parallel +): + # clean any unused cach to reduce CUDA OOM + torch.cuda.empty_cache() + # main arguments to run the training script + # both patches are aneeded to run the job of the circleci GPUs + with patch("sys.argv", argv_injection.split()[1:]), patch( + "metaseq.launcher.slurm.local_run", + partial( + local_run_mock, + max_update=max_update, + events=events, + is_sequence_parallel=is_sequence_parallel, + ), + ), patch.dict( + "metaseq.launcher.opt_job_constants.MODEL_SIZES", + # reduce the batch size for CUDA memory optimization + size_patch_dict, + ): + sweep_cli_main() + + +def local_run_mock( + args, env, train_cmd, dry_run, max_update, events, is_sequence_parallel +): + """ + The function introduces several patches for the argumets of the + model training. These patches are needed to pass gpu tests on + circleci GPUs and enable sequence_parallel parameter + """ + # update the parameters of the model training + train_cmd[train_cmd.index("--max-update") + 1] = str(max_update) + train_cmd[train_cmd.index("--num-workers") + 1] = "1" + train_cmd[train_cmd.index("--dropout") + 1] = "0.0" + train_cmd.remove("--checkpoint-activations") + train_cmd.remove("--distribute-checkpointed-activations") + # add sequence_parallel argument to the model arguments + if is_sequence_parallel: + train_cmd.append("--sequence-parallel") + + with patch("logging.Logger._log", partialmethod(log_to_events, events=events)): + with patch.dict("os.environ", env, clear=True): + with patch("sys.argv", train_cmd[1:]): + train_cli_main() + + +def log_to_events(self, info, message, args, events, **kwargs): + """ + The function is used to collect logging info from the subprocesses + and store it in the 'events' variable, which is then passed over + to the main process for asserting that the model ran correctly + """ + print(self, message) + if isinstance(message, str): + events.append( + { + "type": "log", + "message": message, + } + ) + + +if __name__ == "__main__": + unittest.main() From 4a3f111c68848213295b302c096a1dd61c8fa217 Mon Sep 17 00:00:00 2001 From: bashnick Date: Mon, 16 Jan 2023 16:33:39 +0000 Subject: [PATCH 02/49] try different version of apex --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 877ed569f..891bb594f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -41,7 +41,7 @@ install_dep_fused_ops: &install_dep_fused_ops if ! python -c 'import apex'; then git clone https://github.com/NVIDIA/apex cd apex - git checkout e2083df5eb96643c61613b9df48dd4eea6b07690 + git checkout 265b451de8ba9bfcb67edc7360f3d8772d0a8bea pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./ cd ~/ fi From c90bbd2f68cdf6f84a239bd9041625b3826d2d79 Mon Sep 17 00:00:00 2001 From: bashnick Date: Mon, 16 Jan 2023 18:30:45 +0000 Subject: [PATCH 03/49] try without sequence_parallel --- gpu_tests/test_sequence_parallel.py | 34 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/gpu_tests/test_sequence_parallel.py b/gpu_tests/test_sequence_parallel.py index b8a21f653..582670eab 100644 --- a/gpu_tests/test_sequence_parallel.py +++ b/gpu_tests/test_sequence_parallel.py @@ -29,7 +29,7 @@ class TestSequenceParallel(unittest.TestCase): """ def test_sequence_parallel(self): - # parameters to train an mp2 model with sequence_parallel flag + # parameters to train an mp2 model with sequence_parallel flag argv_injection = ( "python3 metaseq/launcher/opt_baselines.py " "--prefix train.8m --model-size 8m --checkpoints-dir ./test-checkpoint " @@ -41,12 +41,12 @@ def test_sequence_parallel(self): size_patch_dict = {"8m": Size(4, 128, 2, 64, int(0.03125 * M), 1.0e-3, 2)} # train model with sequence_parallel flag - training_log_events_seq = self._test_model_parallel( - max_update_first_run=max_update_first_run, - argv_injection=argv_injection, - size_patch_dict=size_patch_dict, - is_sequence_parallel=True, - ) + # training_log_events_seq = self._test_model_parallel( + # max_update_first_run=max_update_first_run, + # argv_injection=argv_injection, + # size_patch_dict=size_patch_dict, + # is_sequence_parallel=True, + # ) # train model without sequence_parallel flag training_log_events = self._test_model_parallel( max_update_first_run=max_update_first_run, @@ -57,24 +57,24 @@ def test_sequence_parallel(self): # check that training ran correctly # check that the number of updates was correct - self.assertNotEqual(training_log_events_seq, []) + # self.assertNotEqual(training_log_events_seq, []) self.assertNotEqual(training_log_events, []) - self.assertIsNotNone(training_log_events_seq[-1]["num_updates"]) + # self.assertIsNotNone(training_log_events_seq[-1]["num_updates"]) self.assertIsNotNone(training_log_events[-1]["num_updates"]) self.assertEqual( int(training_log_events[-1]["num_updates"]), max_update_first_run ) - self.assertEqual( - int(training_log_events_seq[-1]["num_updates"]), max_update_first_run - ) + # self.assertEqual( + # int(training_log_events_seq[-1]["num_updates"]), max_update_first_run + # ) # check the achieved loss is similar between seq and non-seq - loss_val_seq = float(training_log_events_seq[-1]["loss"]) + # loss_val_seq = float(training_log_events_seq[-1]["loss"]) loss_val = float(training_log_events[-1]["loss"]) - print("loss_val_seq: {} | loss_val: {}".format(loss_val_seq, loss_val)) - self.assertAlmostEqual( - loss_val, loss_val_seq, 1 - ) # 1 digit precision; 14.702 - seq; 14.735 - non seq + # print("loss_val_seq: {} | loss_val: {}".format(loss_val_seq, loss_val)) + # self.assertAlmostEqual( + # loss_val, loss_val_seq, 1 + # ) # 1 digit precision; 14.702 - seq; 14.735 - non seq def _test_model_parallel( self, From b8a6afc0e8c586e45389f1a9a8454f60ddc718cd Mon Sep 17 00:00:00 2001 From: bashnick Date: Tue, 17 Jan 2023 10:22:14 +0000 Subject: [PATCH 04/49] try sequence_parallel separately --- gpu_tests/test_sequence_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu_tests/test_sequence_parallel.py b/gpu_tests/test_sequence_parallel.py index 582670eab..282d4819d 100644 --- a/gpu_tests/test_sequence_parallel.py +++ b/gpu_tests/test_sequence_parallel.py @@ -52,7 +52,7 @@ def test_sequence_parallel(self): max_update_first_run=max_update_first_run, argv_injection=argv_injection, size_patch_dict=size_patch_dict, - is_sequence_parallel=False, + is_sequence_parallel=True, ) # check that training ran correctly From 2978c7793575f51529473c05256eeab534dbd19c Mon Sep 17 00:00:00 2001 From: bashnick Date: Tue, 17 Jan 2023 11:27:24 +0000 Subject: [PATCH 05/49] try update torch to 1.13.1+cu117 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 891bb594f..5d13cc11b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -74,7 +74,7 @@ install_dep_pt19: &install_dep_pt19 command: | source activate metaseq pip install --upgrade setuptools - pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html python -c 'import torch; print("Torch version:", torch.__version__)' From 67f765fa3dccf32b8b22173aebce41f7b26951ab Mon Sep 17 00:00:00 2001 From: bashnick Date: Tue, 17 Jan 2023 12:10:07 +0000 Subject: [PATCH 06/49] try with torch=1.12.1+cu116 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5d13cc11b..0ce571017 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -74,7 +74,7 @@ install_dep_pt19: &install_dep_pt19 command: | source activate metaseq pip install --upgrade setuptools - pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.12.1+cu116 torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html python -c 'import torch; print("Torch version:", torch.__version__)' From 65d886be59cf3a9819641dfeca377956b4f40dd7 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 11:11:47 +0000 Subject: [PATCH 07/49] try update environment --- .circleci/config.yml | 81 ++++++++++++----------------- gpu_tests/test_sequence_parallel.py | 2 +- 2 files changed, 33 insertions(+), 50 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0ce571017..9ade222c8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -21,85 +21,68 @@ install_dep_common: &install_dep_common - run: name: Install Common Dependencies command: | - source activate metaseq - # Fixed version to work around https://github.com/pytorch/pytorch/pull/69904 - pip install setuptools==59.5.0 - pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda111 -U - python -c 'import torch; print("Torch version:", torch.__version__)' - python -m torch.utils.collect_env + source activate fairseq-20221101 + pip install "flake8==3.9.2" "black==22.3.0" "transformers" "pyarrow" "boto3" "pandas" "protobuf==3.20.2" "aim>=3.9.4" "azure-storage-blob" "click==8.0.4" "cython" "dataclasses" "editdistance" "fire" "flask==2.1.1" "hydra-core==1.1.0" "ipdb" "ipython" "Jinja2==3.1.1" "markupsafe" "more_itertools" "mypy" "ninja" "numpy" "omegaconf==2.1.1" "portalocker>=2.5" "pre-commit" "pytest" "pytest-regressions" "regex" "scikit-learn" "sacrebleu" "tensorboard==2.8.0" "timeout-decorator" "tokenizers" "tqdm" "typing_extensions" "bitarray" "sacremoses" "sentencepiece" "pybind11" "pyre-extensions==0.0.23" "typing-inspect==0.8.0" "iopath" + + # install cudatoolkit to enable sequence_parallel + conda install cudatoolkit + # Need to install ninja build system sudo apt-get update sudo apt-get install ninja-build - install_dep_fused_ops: &install_dep_fused_ops - run: name: Install Megatron/Apex Dependencies working_directory: ~/ - # because of https://github.com/NVIDIA/apex/issues/1252 we need to pin to a specific apex commit command: | - source activate metaseq + source activate fairseq-20221101 if ! python -c 'import apex'; then - git clone https://github.com/NVIDIA/apex + git clone --recursive https://github.com/NVIDIA/apex.git cd apex - git checkout 265b451de8ba9bfcb67edc7360f3d8772d0a8bea - pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./ + python -m pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./ cd ~/ fi if ! python -c 'import megatron_lm'; then git clone --depth=1 --branch fairseq_v3 https://github.com/ngoyal2707/Megatron-LM.git cd Megatron-LM - pip install -r requirements.txt pip install -e . cd ~/ fi - -# Remove this when we get a new fairscale release +Remove this when we get a new fairscale release install_fairscale: &install_fairscale - run: name: Install Fairscale from Source working_directory: ~/ command: | - source activate metaseq + source activate fairseq-20221101 if ! python -c 'import fairscale'; then git clone https://github.com/facebookresearch/fairscale.git cd fairscale - git checkout 1bc96fa8c69def6d990e42bfbd75f86146ce29bd - pip install . + git checkout ngoyal_bf16_changes + pip install --no-build-isolation -e . cd ~/ fi - install_dep_pt19: &install_dep_pt19 - run: name: Install Pytorch Dependencies + working_directory: ~/ command: | - source activate metaseq - pip install --upgrade setuptools - pip install torch==1.12.1+cu116 torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html - python -c 'import torch; print("Torch version:", torch.__version__)' - - -install_pytorch_dep: &install_pytorch_dep - - parameters: - version_str: - type: string - default: "/dev/non_exist" # Default to error out - - run: - name: Install Pytorch Dependencies - command: | - source activate metaseq + source activate fairseq-20221101 pip install --upgrade setuptools - echo "<>" - pip install <> -f https://download.pytorch.org/whl/torch_stable.html + pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116 python -c 'import torch; print("Torch version:", torch.__version__)' - install_repo: &install_repo - run: name: Install Repository + working_directory: ~/ command: | - source activate metaseq - pip install -e .[dev,few_shot,gpu] - python setup.py build_ext --inplace - + source activate fairseq-20221101 + if ! python -c 'import fairscale'; then + git clone git@github.com:facebookresearch/metaseq.git + cd metaseq + pip install --no-build-isolation -e . + cd ~/ + fi check_nvidia_driver: &check_nvidia_driver - run: @@ -108,7 +91,6 @@ check_nvidia_driver: &check_nvidia_driver command: | pyenv versions nvidia-smi - create_conda_env: &create_conda_env run: name: Install and Create Conda Environment @@ -119,20 +101,21 @@ create_conda_env: &create_conda_env rm ~/miniconda.sh echo 'export PATH=$HOME/miniconda/bin:$PATH' >> $BASH_ENV source $BASH_ENV - if [ ! -d ~/miniconda/envs/metaseq ] + if [ ! -d ~/miniconda/envs/fairseq-20221101 ] then - conda create -y -n metaseq python=3.8 + conda create --name fairseq-20221101 python=3.9 -y fi - source activate metaseq + source activate fairseq-20221101 python --version pip install --upgrade pip + conda install -y conda-pack download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies - run: name: Download and configure a 125m checkpoint with HF dependencies working_directory: ~/metaseq/gpu_tests command: | - source activate metaseq + source activate fairseq-20221101 wget https://dl.fbaipublicfiles.com/opt/test_artifacts/125m_with_hf_dependencies.tar.gz tar -xvzf ./125m_with_hf_dependencies.tar.gz -C . python -m metaseq.scripts.convert_to_singleton ./125m @@ -160,12 +143,12 @@ commands: - <<: *download_and_configure_125m_with_hf_dependencies - save_cache: paths: - - ~/miniconda/envs/metaseq/lib/python3.8/site-packages + - ~/miniconda/envs/fairseq-20221101/lib/python3.9/site-packages key: *cache_key - run: name: Run Unit Tests command: | - source activate metaseq + source activate fairseq-20221101 python -m pytest --junitxml=test-results/junit.xml gpu_tests - store_test_results: path: test-results @@ -191,4 +174,4 @@ workflows: version: 2 build: jobs: - - gpu_tests_pt19 + - gpu_tests_pt19 \ No newline at end of file diff --git a/gpu_tests/test_sequence_parallel.py b/gpu_tests/test_sequence_parallel.py index 282d4819d..e386cc695 100644 --- a/gpu_tests/test_sequence_parallel.py +++ b/gpu_tests/test_sequence_parallel.py @@ -29,7 +29,7 @@ class TestSequenceParallel(unittest.TestCase): """ def test_sequence_parallel(self): - # parameters to train an mp2 model with sequence_parallel flag + # parameters to train an mp2 model with sequence_parallel flag argv_injection = ( "python3 metaseq/launcher/opt_baselines.py " "--prefix train.8m --model-size 8m --checkpoints-dir ./test-checkpoint " From 0bbebb9d678492e092127075364e0cfb5c70a0cf Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 11:15:43 +0000 Subject: [PATCH 08/49] upd the config file --- .circleci/config.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9ade222c8..8f07c95c9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,7 +8,7 @@ gpu: &gpu environment: CUDA_VERSION: "11.1" machine: - image: ubuntu-1604-cuda-11.1:202012-01 + image: ubuntu-2004-cuda-11.4:202110-01 resource_class: gpu.nvidia.medium.multi @@ -30,6 +30,7 @@ install_dep_common: &install_dep_common # Need to install ninja build system sudo apt-get update sudo apt-get install ninja-build + install_dep_fused_ops: &install_dep_fused_ops - run: name: Install Megatron/Apex Dependencies @@ -48,7 +49,8 @@ install_dep_fused_ops: &install_dep_fused_ops pip install -e . cd ~/ fi -Remove this when we get a new fairscale release + +#Remove this when we get a new fairscale release install_fairscale: &install_fairscale - run: name: Install Fairscale from Source @@ -62,6 +64,7 @@ install_fairscale: &install_fairscale pip install --no-build-isolation -e . cd ~/ fi + install_dep_pt19: &install_dep_pt19 - run: name: Install Pytorch Dependencies @@ -71,6 +74,7 @@ install_dep_pt19: &install_dep_pt19 pip install --upgrade setuptools pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116 python -c 'import torch; print("Torch version:", torch.__version__)' + install_repo: &install_repo - run: name: Install Repository @@ -91,6 +95,7 @@ check_nvidia_driver: &check_nvidia_driver command: | pyenv versions nvidia-smi + create_conda_env: &create_conda_env run: name: Install and Create Conda Environment From 08e9d8809bf80f12ac3e07121c1d1ba28575c67e Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 11:54:04 +0000 Subject: [PATCH 09/49] debug config file env setup --- .circleci/config.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8f07c95c9..31f8ac31a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -70,6 +70,8 @@ install_dep_pt19: &install_dep_pt19 name: Install Pytorch Dependencies working_directory: ~/ command: | + source $BASH_ENV + cat $BASH_ENV source activate fairseq-20221101 pip install --upgrade setuptools pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116 @@ -114,6 +116,7 @@ create_conda_env: &create_conda_env python --version pip install --upgrade pip conda install -y conda-pack + cat $BASH_ENV download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies - run: From 0fcf0a5456c3b2618742b3872e04d3f1d135918e Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 12:05:56 +0000 Subject: [PATCH 10/49] add "source $BASH_ENV" before "source activate" --- .circleci/config.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 31f8ac31a..731b030c6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -21,6 +21,7 @@ install_dep_common: &install_dep_common - run: name: Install Common Dependencies command: | + source $BASH_ENV source activate fairseq-20221101 pip install "flake8==3.9.2" "black==22.3.0" "transformers" "pyarrow" "boto3" "pandas" "protobuf==3.20.2" "aim>=3.9.4" "azure-storage-blob" "click==8.0.4" "cython" "dataclasses" "editdistance" "fire" "flask==2.1.1" "hydra-core==1.1.0" "ipdb" "ipython" "Jinja2==3.1.1" "markupsafe" "more_itertools" "mypy" "ninja" "numpy" "omegaconf==2.1.1" "portalocker>=2.5" "pre-commit" "pytest" "pytest-regressions" "regex" "scikit-learn" "sacrebleu" "tensorboard==2.8.0" "timeout-decorator" "tokenizers" "tqdm" "typing_extensions" "bitarray" "sacremoses" "sentencepiece" "pybind11" "pyre-extensions==0.0.23" "typing-inspect==0.8.0" "iopath" @@ -36,6 +37,7 @@ install_dep_fused_ops: &install_dep_fused_ops name: Install Megatron/Apex Dependencies working_directory: ~/ command: | + source $BASH_ENV source activate fairseq-20221101 if ! python -c 'import apex'; then git clone --recursive https://github.com/NVIDIA/apex.git @@ -56,6 +58,7 @@ install_fairscale: &install_fairscale name: Install Fairscale from Source working_directory: ~/ command: | + source $BASH_ENV source activate fairseq-20221101 if ! python -c 'import fairscale'; then git clone https://github.com/facebookresearch/fairscale.git @@ -71,7 +74,6 @@ install_dep_pt19: &install_dep_pt19 working_directory: ~/ command: | source $BASH_ENV - cat $BASH_ENV source activate fairseq-20221101 pip install --upgrade setuptools pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116 @@ -82,6 +84,7 @@ install_repo: &install_repo name: Install Repository working_directory: ~/ command: | + source $BASH_ENV source activate fairseq-20221101 if ! python -c 'import fairscale'; then git clone git@github.com:facebookresearch/metaseq.git @@ -116,13 +119,13 @@ create_conda_env: &create_conda_env python --version pip install --upgrade pip conda install -y conda-pack - cat $BASH_ENV download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies - run: name: Download and configure a 125m checkpoint with HF dependencies working_directory: ~/metaseq/gpu_tests command: | + source $BASH_ENV source activate fairseq-20221101 wget https://dl.fbaipublicfiles.com/opt/test_artifacts/125m_with_hf_dependencies.tar.gz tar -xvzf ./125m_with_hf_dependencies.tar.gz -C . @@ -156,6 +159,7 @@ commands: - run: name: Run Unit Tests command: | + source $BASH_ENV source activate fairseq-20221101 python -m pytest --junitxml=test-results/junit.xml gpu_tests - store_test_results: From cf59b739c399e5699cf2c020a7946b73a6f41aff Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 12:19:53 +0000 Subject: [PATCH 11/49] skip warning about the cuda versions mismatch --- .circleci/config.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 731b030c6..dd33ee926 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,7 +6,7 @@ version: 2.1 # ------------------------------------------------------------------------------------- gpu: &gpu environment: - CUDA_VERSION: "11.1" + CUDA_VERSION: "11.4" machine: image: ubuntu-2004-cuda-11.4:202110-01 resource_class: gpu.nvidia.medium.multi @@ -42,7 +42,10 @@ install_dep_fused_ops: &install_dep_fused_ops if ! python -c 'import apex'; then git clone --recursive https://github.com/NVIDIA/apex.git cd apex - python -m pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./ + # skip the part of the setup.py code with the warning about + # cuda versions mismatch + sed -i '32 i \ \ \ \ return' setup.py + pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./ cd ~/ fi if ! python -c 'import megatron_lm'; then From ccc838c87c0a1e328b8d86971edf97fd16e88776 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 13:34:48 +0000 Subject: [PATCH 12/49] try different version of apex --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index dd33ee926..26a05c896 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -42,6 +42,7 @@ install_dep_fused_ops: &install_dep_fused_ops if ! python -c 'import apex'; then git clone --recursive https://github.com/NVIDIA/apex.git cd apex + git checkout 265b451de8ba9bfcb67edc7360f3d8772d0a8bea # skip the part of the setup.py code with the warning about # cuda versions mismatch sed -i '32 i \ \ \ \ return' setup.py From 8f5723a98025cb17e9638758b797f4dcc7015432 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 14:03:37 +0000 Subject: [PATCH 13/49] try revert back the ubuntu image and cuda 11.1 --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 26a05c896..61ffea2dc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,9 +6,9 @@ version: 2.1 # ------------------------------------------------------------------------------------- gpu: &gpu environment: - CUDA_VERSION: "11.4" + CUDA_VERSION: "11.1" machine: - image: ubuntu-2004-cuda-11.4:202110-01 + image: ubuntu-1604-cuda-11.1:202012-01 resource_class: gpu.nvidia.medium.multi From 95ab3f1a4947700007bcc1cb074f43bae0b57c82 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 14:04:23 +0000 Subject: [PATCH 14/49] checkout main branch in apex --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 61ffea2dc..9cc56f7b3 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -42,7 +42,6 @@ install_dep_fused_ops: &install_dep_fused_ops if ! python -c 'import apex'; then git clone --recursive https://github.com/NVIDIA/apex.git cd apex - git checkout 265b451de8ba9bfcb67edc7360f3d8772d0a8bea # skip the part of the setup.py code with the warning about # cuda versions mismatch sed -i '32 i \ \ \ \ return' setup.py From 8e28d572c57e1b86cc572de0b7371fc51cfe6bcf Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 14:15:23 +0000 Subject: [PATCH 15/49] add back the checkout for apex --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9cc56f7b3..61ffea2dc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -42,6 +42,7 @@ install_dep_fused_ops: &install_dep_fused_ops if ! python -c 'import apex'; then git clone --recursive https://github.com/NVIDIA/apex.git cd apex + git checkout 265b451de8ba9bfcb67edc7360f3d8772d0a8bea # skip the part of the setup.py code with the warning about # cuda versions mismatch sed -i '32 i \ \ \ \ return' setup.py From 8e42490eeb84eeac05a3b9f453143643112e5389 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 15:02:55 +0000 Subject: [PATCH 16/49] try cuda 11.6 image --- .circleci/config.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 61ffea2dc..c60002022 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,9 +6,9 @@ version: 2.1 # ------------------------------------------------------------------------------------- gpu: &gpu environment: - CUDA_VERSION: "11.1" + CUDA_VERSION: "11.6" machine: - image: ubuntu-1604-cuda-11.1:202012-01 + image: ubuntu-2204-cuda-11.6:current resource_class: gpu.nvidia.medium.multi @@ -42,7 +42,6 @@ install_dep_fused_ops: &install_dep_fused_ops if ! python -c 'import apex'; then git clone --recursive https://github.com/NVIDIA/apex.git cd apex - git checkout 265b451de8ba9bfcb67edc7360f3d8772d0a8bea # skip the part of the setup.py code with the warning about # cuda versions mismatch sed -i '32 i \ \ \ \ return' setup.py From c9cd83cb0f901e6772d4c8405bca2166e6b57a51 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 15:05:54 +0000 Subject: [PATCH 17/49] revert back to cuda 11.4 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c60002022..87f8899fe 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,7 +8,7 @@ gpu: &gpu environment: CUDA_VERSION: "11.6" machine: - image: ubuntu-2204-cuda-11.6:current + image: ubuntu-2004-cuda-11.4:202110-01 resource_class: gpu.nvidia.medium.multi From 75c7d4c11913411430f731e9f85fc3d3c0476e57 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 16:48:20 +0000 Subject: [PATCH 18/49] try update cuda to 11.6 --- .circleci/config.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 87f8899fe..bc893dcc4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,7 +6,7 @@ version: 2.1 # ------------------------------------------------------------------------------------- gpu: &gpu environment: - CUDA_VERSION: "11.6" + CUDA_VERSION: "11.4" machine: image: ubuntu-2004-cuda-11.4:202110-01 resource_class: gpu.nvidia.medium.multi @@ -103,6 +103,7 @@ check_nvidia_driver: &check_nvidia_driver command: | pyenv versions nvidia-smi + uname -m create_conda_env: &create_conda_env run: @@ -123,6 +124,18 @@ create_conda_env: &create_conda_env pip install --upgrade pip conda install -y conda-pack +update_cuda: &update_cuda + run: + name: Update the cuda version + working_directory: ~/ + command: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-11-6_11.6.2-1_amd64.deb + sudo dpkg -i cuda-11-6_11.6.2-1_amd64.deb + sudo apt-get update + sudo apt-get --yes --force-yes install cuda + nvidia-smi + + download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies - run: name: Download and configure a 125m checkpoint with HF dependencies @@ -144,6 +157,7 @@ commands: command: sudo mount -t tmpfs tmpfs ~/ - checkout - <<: *check_nvidia_driver + - <<: *update_cuda - <<: *create_conda_env - restore_cache: key: *cache_key From f69786d0364c7ce9beff5f99b0595ffe7f7d891b Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 16:49:27 +0000 Subject: [PATCH 19/49] fix format --- .circleci/config.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bc893dcc4..b73f9b347 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -129,11 +129,11 @@ update_cuda: &update_cuda name: Update the cuda version working_directory: ~/ command: | - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-11-6_11.6.2-1_amd64.deb - sudo dpkg -i cuda-11-6_11.6.2-1_amd64.deb - sudo apt-get update - sudo apt-get --yes --force-yes install cuda - nvidia-smi + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-11-6_11.6.2-1_amd64.deb + sudo dpkg -i cuda-11-6_11.6.2-1_amd64.deb + sudo apt-get update + sudo apt-get --yes --force-yes install cuda + nvidia-smi download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies From 4e81303da7687ddb0159b036ed7baf5398525b0f Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 17:27:07 +0000 Subject: [PATCH 20/49] try update cuda install --- .circleci/config.yml | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b73f9b347..778efb444 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -128,12 +128,24 @@ update_cuda: &update_cuda run: name: Update the cuda version working_directory: ~/ - command: | - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-11-6_11.6.2-1_amd64.deb - sudo dpkg -i cuda-11-6_11.6.2-1_amd64.deb - sudo apt-get update - sudo apt-get --yes --force-yes install cuda + command: | + # download the .pin file to setup cuda + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin + sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub + sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" + + # install cuda + sudo apt update + sudo apt install cuda-toolkit-11.6 + sudo apt install cuda + + # set path to point to CUDA binaries + echo 'export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}' >> ~/.bashrc + + # check th version + nvcc --version nvidia-smi + cat ~/.bashrc download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies From 33349227aa13c74f6679359287e7dbeaa1bd5ca3 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 17:39:38 +0000 Subject: [PATCH 21/49] update the keys for cuda repo --- .circleci/config.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 778efb444..79b6b90d5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -128,7 +128,11 @@ update_cuda: &update_cuda run: name: Update the cuda version working_directory: ~/ - command: | + command: | + # download and update the keys for cuda repo + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb + sudo dpkg -i cuda-keyring_1.0-1_all.deb + # download the .pin file to setup cuda wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub From b83768e470f22f427731882fb336cabb7a0ed528 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 17:46:11 +0000 Subject: [PATCH 22/49] remove cuda-toolkit --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 79b6b90d5..7b11511fa 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -140,7 +140,6 @@ update_cuda: &update_cuda # install cuda sudo apt update - sudo apt install cuda-toolkit-11.6 sudo apt install cuda # set path to point to CUDA binaries From 31257da47fbcc5c50a1186dc09303201858cc72b Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 17:55:27 +0000 Subject: [PATCH 23/49] try update the saving location --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7b11511fa..cc89d25f0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -134,7 +134,7 @@ update_cuda: &update_cuda sudo dpkg -i cuda-keyring_1.0-1_all.deb # download the .pin file to setup cuda - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin + sudo wget -O /etc/apt/preferences.d/cuda-repository-pin-600 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" From 72177a0d0c5f61dbc219c6f803f7ba22bcc667b0 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 18:03:00 +0000 Subject: [PATCH 24/49] try install cuda using the ubuntu package manager --- .circleci/config.yml | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index cc89d25f0..011a735ff 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -129,23 +129,10 @@ update_cuda: &update_cuda name: Update the cuda version working_directory: ~/ command: | - # download and update the keys for cuda repo - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb - sudo dpkg -i cuda-keyring_1.0-1_all.deb - - # download the .pin file to setup cuda - sudo wget -O /etc/apt/preferences.d/cuda-repository-pin-600 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin - sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub - sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" - - # install cuda sudo apt update - sudo apt install cuda - - # set path to point to CUDA binaries - echo 'export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}' >> ~/.bashrc + sudo apt install nvidia-cuda-toolkit - # check th version + # check the version nvcc --version nvidia-smi cat ~/.bashrc From c44685528843a13ae927fb41e00c877102979c8d Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 18:16:16 +0000 Subject: [PATCH 25/49] try most recent cuda version --- .circleci/config.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 011a735ff..a8aef8ca6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -126,16 +126,21 @@ create_conda_env: &create_conda_env update_cuda: &update_cuda run: - name: Update the cuda version + name: Update cuda version working_directory: ~/ command: | - sudo apt update - sudo apt install nvidia-cuda-toolkit + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin + sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 + wget https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda-repo-ubuntu2004-12-0-local_12.0.0-525.60.13-1_amd64.deb + + sudo dpkg -i cuda-repo-ubuntu2004-12-0-local_12.0.0-525.60.13-1_amd64.deb + sudo cp /var/cuda-repo-ubuntu2004-12-0-local/cuda-*-keyring.gpg /usr/share/keyrings/ + sudo apt-get update + sudo apt-get -y install cuda # check the version nvcc --version nvidia-smi - cat ~/.bashrc download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies From a7cfcddd1cc6172073180e87d07393170bdde6ff Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 18:17:50 +0000 Subject: [PATCH 26/49] try ubuntu 11.6 --- .circleci/config.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a8aef8ca6..7369aabc1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -131,10 +131,9 @@ update_cuda: &update_cuda command: | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 - wget https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda-repo-ubuntu2004-12-0-local_12.0.0-525.60.13-1_amd64.deb - - sudo dpkg -i cuda-repo-ubuntu2004-12-0-local_12.0.0-525.60.13-1_amd64.deb - sudo cp /var/cuda-repo-ubuntu2004-12-0-local/cuda-*-keyring.gpg /usr/share/keyrings/ + wget https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-ubuntu2004-11-6-local_11.6.2-510.47.03-1_amd64.deb + sudo dpkg -i cuda-repo-ubuntu2004-11-6-local_11.6.2-510.47.03-1_amd64.deb + sudo apt-key add /var/cuda-repo-ubuntu2004-11-6-local/7fa2af80.pub sudo apt-get update sudo apt-get -y install cuda From d1da22b7410c96f5e2310eedbcddf243cb317247 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 18:26:36 +0000 Subject: [PATCH 27/49] clean previous cuda versions first --- .circleci/config.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7369aabc1..95d28bcbe 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -128,7 +128,11 @@ update_cuda: &update_cuda run: name: Update cuda version working_directory: ~/ - command: | + command: | + # clean previous CUDA versions + sudo apt clean; sudo apt update; sudo apt purge cuda; sudo apt-get remove nvidia-cuda-*; sudo apt autoremove; + + # install new CUDA wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 wget https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-ubuntu2004-11-6-local_11.6.2-510.47.03-1_amd64.deb From ec4903a148e1129235e61c9dae6e2f3a75bb0791 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 18:29:50 +0000 Subject: [PATCH 28/49] try change image without cuda --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 95d28bcbe..87727f41f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,9 +6,9 @@ version: 2.1 # ------------------------------------------------------------------------------------- gpu: &gpu environment: - CUDA_VERSION: "11.4" + CUDA_VERSION: "11.6" machine: - image: ubuntu-2004-cuda-11.4:202110-01 + image: ubuntu-2204:2022.10.2 # ubuntu-2004-cuda-11.4:202110-01 resource_class: gpu.nvidia.medium.multi From dee4d68e0e6f7f0e420cf8c394335d555189a1a2 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 18:42:51 +0000 Subject: [PATCH 29/49] remove nvcc --version --- .circleci/config.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 87727f41f..9eb2cafba 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,7 +8,7 @@ gpu: &gpu environment: CUDA_VERSION: "11.6" machine: - image: ubuntu-2204:2022.10.2 # ubuntu-2004-cuda-11.4:202110-01 + image: ubuntu-2004-cuda-11.4:202110-01 resource_class: gpu.nvidia.medium.multi @@ -124,6 +124,9 @@ create_conda_env: &create_conda_env pip install --upgrade pip conda install -y conda-pack + # check CUDA version + nvidia-smi + update_cuda: &update_cuda run: name: Update cuda version @@ -140,10 +143,6 @@ update_cuda: &update_cuda sudo apt-key add /var/cuda-repo-ubuntu2004-11-6-local/7fa2af80.pub sudo apt-get update sudo apt-get -y install cuda - - # check the version - nvcc --version - nvidia-smi download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies From df56434bd047e34cae061faec024b12217840d63 Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 19:59:09 +0000 Subject: [PATCH 30/49] remove conda-pack --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9eb2cafba..c6fb70ee6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -122,7 +122,6 @@ create_conda_env: &create_conda_env source activate fairseq-20221101 python --version pip install --upgrade pip - conda install -y conda-pack # check CUDA version nvidia-smi From 7affc3cd3ef8bae38053c35087c57f385924030b Mon Sep 17 00:00:00 2001 From: bashnick Date: Wed, 18 Jan 2023 20:43:03 +0000 Subject: [PATCH 31/49] try reload CUDA --- .circleci/config.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index c6fb70ee6..88e2f9a2c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -143,6 +143,14 @@ update_cuda: &update_cuda sudo apt-get update sudo apt-get -y install cuda + # reload CUDA + # remove the existing Nvidia kernel module + modprobe -r nvidia nvidia_uvm + # reload the systemd units + systemctl daemon-reload + # build and load the new kernel module + systemctl restart cuda-driver + download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies - run: From b5cfc412879b2d62c6134efaceaacccbadac7923 Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 10:09:30 +0000 Subject: [PATCH 32/49] try unload the nvidia kernel module first --- .circleci/config.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 88e2f9a2c..4bf78f99b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -144,6 +144,8 @@ update_cuda: &update_cuda sudo apt-get -y install cuda # reload CUDA + # unload nvidia kernel + sudo rmmod nvidia # remove the existing Nvidia kernel module modprobe -r nvidia nvidia_uvm # reload the systemd units From 3d616f71df50a1f0a73d50fce1d7dc8f30359b4e Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 10:30:44 +0000 Subject: [PATCH 33/49] try stop other nvidia processes before restarting --- .circleci/config.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4bf78f99b..ef22ca0c1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -144,15 +144,21 @@ update_cuda: &update_cuda sudo apt-get -y install cuda # reload CUDA - # unload nvidia kernel - sudo rmmod nvidia + # stop other processes that might use CUDA + sudo service lightdm stop + sudo service gdm stop # remove the existing Nvidia kernel module modprobe -r nvidia nvidia_uvm # reload the systemd units systemctl daemon-reload + # unload nvidia kernel + sudo rmmod nvidia # build and load the new kernel module systemctl restart cuda-driver + # check CUDA version + nvidia-smi + download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies - run: From 9dff62103a92c6c343aa53ed3c2d0d8d50a87f66 Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 11:32:31 +0000 Subject: [PATCH 34/49] revert back to cu11.1 and try with PT1.13+cu11.1 --- .circleci/config.yml | 41 ++++------------------------------------- 1 file changed, 4 insertions(+), 37 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ef22ca0c1..f30a9ee5d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,9 +6,9 @@ version: 2.1 # ------------------------------------------------------------------------------------- gpu: &gpu environment: - CUDA_VERSION: "11.6" + CUDA_VERSION: "11.1" machine: - image: ubuntu-2004-cuda-11.4:202110-01 + image: ubuntu-1604-cuda-11.1:202012-01 resource_class: gpu.nvidia.medium.multi @@ -79,7 +79,8 @@ install_dep_pt19: &install_dep_pt19 source $BASH_ENV source activate fairseq-20221101 pip install --upgrade setuptools - pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116 + # the only difference to prod fairseq-20221101 env is the cu111 VS cu116 + pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu111 python -c 'import torch; print("Torch version:", torch.__version__)' install_repo: &install_repo @@ -126,39 +127,6 @@ create_conda_env: &create_conda_env # check CUDA version nvidia-smi -update_cuda: &update_cuda - run: - name: Update cuda version - working_directory: ~/ - command: | - # clean previous CUDA versions - sudo apt clean; sudo apt update; sudo apt purge cuda; sudo apt-get remove nvidia-cuda-*; sudo apt autoremove; - - # install new CUDA - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin - sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 - wget https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-ubuntu2004-11-6-local_11.6.2-510.47.03-1_amd64.deb - sudo dpkg -i cuda-repo-ubuntu2004-11-6-local_11.6.2-510.47.03-1_amd64.deb - sudo apt-key add /var/cuda-repo-ubuntu2004-11-6-local/7fa2af80.pub - sudo apt-get update - sudo apt-get -y install cuda - - # reload CUDA - # stop other processes that might use CUDA - sudo service lightdm stop - sudo service gdm stop - # remove the existing Nvidia kernel module - modprobe -r nvidia nvidia_uvm - # reload the systemd units - systemctl daemon-reload - # unload nvidia kernel - sudo rmmod nvidia - # build and load the new kernel module - systemctl restart cuda-driver - - # check CUDA version - nvidia-smi - download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies - run: @@ -181,7 +149,6 @@ commands: command: sudo mount -t tmpfs tmpfs ~/ - checkout - <<: *check_nvidia_driver - - <<: *update_cuda - <<: *create_conda_env - restore_cache: key: *cache_key From 25778c113f6832ecea9ea63e0618f6d97f48f335 Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 12:39:31 +0000 Subject: [PATCH 35/49] try with PT1.13+cu11.1 from source --- .circleci/config.yml | 125 +++++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 58 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f30a9ee5d..c3d8a8658 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -16,6 +16,57 @@ gpu: &gpu # Re-usable commands # ------------------------------------------------------------------------------------- cache_key: &cache_key cache-key-{{ checksum ".circleci/config.yml" }}-{{ checksum "setup.py"}} + +check_nvidia_driver: &check_nvidia_driver + - run: + name: Check NVIDIA Driver + working_directory: ~/ + command: | + pyenv versions + nvidia-smi + uname -m + +create_conda_env: &create_conda_env + run: + name: Install and Create Conda Environment + command: | + curl -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + chmod +x ~/miniconda.sh + bash ~/miniconda.sh -b -p $HOME/miniconda + rm ~/miniconda.sh + echo 'export PATH=$HOME/miniconda/bin:$PATH' >> $BASH_ENV + source $BASH_ENV + if [ ! -d ~/miniconda/envs/fairseq-20221101 ] + then + conda create --name fairseq-20221101 python=3.9 -y + fi + source activate fairseq-20221101 + python --version + pip install --upgrade pip + + # check CUDA version + nvidia-smi + +install_dep_pt19: &install_dep_pt19 + - run: + name: Install Pytorch Dependencies + working_directory: ~/ + command: | + # the only difference to prod fairseq-20221101 env is the cu111 VS cu116 + source $BASH_ENV + source activate fairseq-20221101 + pip install --upgrade setuptools + + # install PyTorch from source + conda install -y astunparse numpy ninja pyyaml setuptools cmake cffi typing_extensions six requests dataclasses + conda install -y mkl mkl-include + conda install -c -y pytorch magma-cuda111 + + git clone --recursive https://github.com/pytorch/pytorch + cd pytorch + export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + python setup.py develop + python -c 'import torch; print("Torch version:", torch.__version__)' install_dep_common: &install_dep_common - run: @@ -32,6 +83,22 @@ install_dep_common: &install_dep_common sudo apt-get update sudo apt-get install ninja-build +#Remove this when we get a new fairscale release +install_fairscale: &install_fairscale + - run: + name: Install Fairscale from Source + working_directory: ~/ + command: | + source $BASH_ENV + source activate fairseq-20221101 + if ! python -c 'import fairscale'; then + git clone https://github.com/facebookresearch/fairscale.git + cd fairscale + git checkout ngoyal_bf16_changes + pip install --no-build-isolation -e . + cd ~/ + fi + install_dep_fused_ops: &install_dep_fused_ops - run: name: Install Megatron/Apex Dependencies @@ -55,34 +122,6 @@ install_dep_fused_ops: &install_dep_fused_ops cd ~/ fi -#Remove this when we get a new fairscale release -install_fairscale: &install_fairscale - - run: - name: Install Fairscale from Source - working_directory: ~/ - command: | - source $BASH_ENV - source activate fairseq-20221101 - if ! python -c 'import fairscale'; then - git clone https://github.com/facebookresearch/fairscale.git - cd fairscale - git checkout ngoyal_bf16_changes - pip install --no-build-isolation -e . - cd ~/ - fi - -install_dep_pt19: &install_dep_pt19 - - run: - name: Install Pytorch Dependencies - working_directory: ~/ - command: | - source $BASH_ENV - source activate fairseq-20221101 - pip install --upgrade setuptools - # the only difference to prod fairseq-20221101 env is the cu111 VS cu116 - pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu111 - python -c 'import torch; print("Torch version:", torch.__version__)' - install_repo: &install_repo - run: name: Install Repository @@ -97,36 +136,6 @@ install_repo: &install_repo cd ~/ fi -check_nvidia_driver: &check_nvidia_driver - - run: - name: Check NVIDIA Driver - working_directory: ~/ - command: | - pyenv versions - nvidia-smi - uname -m - -create_conda_env: &create_conda_env - run: - name: Install and Create Conda Environment - command: | - curl -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - chmod +x ~/miniconda.sh - bash ~/miniconda.sh -b -p $HOME/miniconda - rm ~/miniconda.sh - echo 'export PATH=$HOME/miniconda/bin:$PATH' >> $BASH_ENV - source $BASH_ENV - if [ ! -d ~/miniconda/envs/fairseq-20221101 ] - then - conda create --name fairseq-20221101 python=3.9 -y - fi - source activate fairseq-20221101 - python --version - pip install --upgrade pip - - # check CUDA version - nvidia-smi - download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies - run: From ed559e8da655d0bd41efed49923e157f7a819484 Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 13:27:00 +0000 Subject: [PATCH 36/49] small fix --- .circleci/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c3d8a8658..9c5c0ada7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -58,9 +58,9 @@ install_dep_pt19: &install_dep_pt19 pip install --upgrade setuptools # install PyTorch from source - conda install -y astunparse numpy ninja pyyaml setuptools cmake cffi typing_extensions six requests dataclasses - conda install -y mkl mkl-include - conda install -c -y pytorch magma-cuda111 + conda install astunparse numpy ninja pyyaml setuptools cmake cffi typing_extensions six requests dataclasses -y + conda install mkl mkl-include -y + conda install -c pytorch magma-cuda111 -y git clone --recursive https://github.com/pytorch/pytorch cd pytorch From 8236c9c24813534e01ac97605a3e560af023a941 Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 14:26:02 +0000 Subject: [PATCH 37/49] small fix with PATH --- .circleci/config.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9c5c0ada7..c60e4dc69 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -57,6 +57,10 @@ install_dep_pt19: &install_dep_pt19 source activate fairseq-20221101 pip install --upgrade setuptools + # add CUDA path to the PATH environment variable + export PATH=$PATH:/usr/local/cuda/bin + export CUDACXX=/usr/local/cuda-11.1/bin/nvcc + # install PyTorch from source conda install astunparse numpy ninja pyyaml setuptools cmake cffi typing_extensions six requests dataclasses -y conda install mkl mkl-include -y From 0b8050cc555e4d4c11819270d4d040b689968f20 Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 15:09:49 +0000 Subject: [PATCH 38/49] revert to PT 1.10.2+cu111 --- .circleci/config.yml | 173 ++++++++++++++++++------------------------- 1 file changed, 73 insertions(+), 100 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c60e4dc69..edd56845e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -16,143 +16,117 @@ gpu: &gpu # Re-usable commands # ------------------------------------------------------------------------------------- cache_key: &cache_key cache-key-{{ checksum ".circleci/config.yml" }}-{{ checksum "setup.py"}} - -check_nvidia_driver: &check_nvidia_driver - - run: - name: Check NVIDIA Driver - working_directory: ~/ - command: | - pyenv versions - nvidia-smi - uname -m - -create_conda_env: &create_conda_env - run: - name: Install and Create Conda Environment - command: | - curl -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - chmod +x ~/miniconda.sh - bash ~/miniconda.sh -b -p $HOME/miniconda - rm ~/miniconda.sh - echo 'export PATH=$HOME/miniconda/bin:$PATH' >> $BASH_ENV - source $BASH_ENV - if [ ! -d ~/miniconda/envs/fairseq-20221101 ] - then - conda create --name fairseq-20221101 python=3.9 -y - fi - source activate fairseq-20221101 - python --version - pip install --upgrade pip - - # check CUDA version - nvidia-smi - -install_dep_pt19: &install_dep_pt19 - - run: - name: Install Pytorch Dependencies - working_directory: ~/ - command: | - # the only difference to prod fairseq-20221101 env is the cu111 VS cu116 - source $BASH_ENV - source activate fairseq-20221101 - pip install --upgrade setuptools - - # add CUDA path to the PATH environment variable - export PATH=$PATH:/usr/local/cuda/bin - export CUDACXX=/usr/local/cuda-11.1/bin/nvcc - - # install PyTorch from source - conda install astunparse numpy ninja pyyaml setuptools cmake cffi typing_extensions six requests dataclasses -y - conda install mkl mkl-include -y - conda install -c pytorch magma-cuda111 -y - - git clone --recursive https://github.com/pytorch/pytorch - cd pytorch - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - python setup.py develop - python -c 'import torch; print("Torch version:", torch.__version__)' install_dep_common: &install_dep_common - run: name: Install Common Dependencies command: | - source $BASH_ENV - source activate fairseq-20221101 - pip install "flake8==3.9.2" "black==22.3.0" "transformers" "pyarrow" "boto3" "pandas" "protobuf==3.20.2" "aim>=3.9.4" "azure-storage-blob" "click==8.0.4" "cython" "dataclasses" "editdistance" "fire" "flask==2.1.1" "hydra-core==1.1.0" "ipdb" "ipython" "Jinja2==3.1.1" "markupsafe" "more_itertools" "mypy" "ninja" "numpy" "omegaconf==2.1.1" "portalocker>=2.5" "pre-commit" "pytest" "pytest-regressions" "regex" "scikit-learn" "sacrebleu" "tensorboard==2.8.0" "timeout-decorator" "tokenizers" "tqdm" "typing_extensions" "bitarray" "sacremoses" "sentencepiece" "pybind11" "pyre-extensions==0.0.23" "typing-inspect==0.8.0" "iopath" - - # install cudatoolkit to enable sequence_parallel - conda install cudatoolkit - + source activate metaseq + # Fixed version to work around https://github.com/pytorch/pytorch/pull/69904 + pip install setuptools==59.5.0 + pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda111 -U + python -c 'import torch; print("Torch version:", torch.__version__)' + python -m torch.utils.collect_env # Need to install ninja build system sudo apt-get update sudo apt-get install ninja-build - -#Remove this when we get a new fairscale release -install_fairscale: &install_fairscale - - run: - name: Install Fairscale from Source - working_directory: ~/ - command: | - source $BASH_ENV - source activate fairseq-20221101 - if ! python -c 'import fairscale'; then - git clone https://github.com/facebookresearch/fairscale.git - cd fairscale - git checkout ngoyal_bf16_changes - pip install --no-build-isolation -e . - cd ~/ - fi - install_dep_fused_ops: &install_dep_fused_ops - run: name: Install Megatron/Apex Dependencies working_directory: ~/ + # because of https://github.com/NVIDIA/apex/issues/1252 we need to pin to a specific apex commit command: | - source $BASH_ENV - source activate fairseq-20221101 + source activate metaseq if ! python -c 'import apex'; then - git clone --recursive https://github.com/NVIDIA/apex.git + git clone https://github.com/NVIDIA/apex cd apex - # skip the part of the setup.py code with the warning about - # cuda versions mismatch - sed -i '32 i \ \ \ \ return' setup.py + git checkout e2083df5eb96643c61613b9df48dd4eea6b07690 pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./ cd ~/ fi if ! python -c 'import megatron_lm'; then git clone --depth=1 --branch fairseq_v3 https://github.com/ngoyal2707/Megatron-LM.git cd Megatron-LM + pip install -r requirements.txt pip install -e . cd ~/ fi - +# Remove this when we get a new fairscale release +install_fairscale: &install_fairscale + - run: + name: Install Fairscale from Source + working_directory: ~/ + command: | + source activate metaseq + if ! python -c 'import fairscale'; then + git clone https://github.com/facebookresearch/fairscale.git + cd fairscale + git checkout 1bc96fa8c69def6d990e42bfbd75f86146ce29bd + pip install . + cd ~/ + fi +install_dep_pt19: &install_dep_pt19 + - run: + name: Install Pytorch Dependencies + command: | + source activate metaseq + pip install --upgrade setuptools + pip install torch==1.10.2+cu111 torchvision torchaudio -f https://download.pytorch.org/whl/cu111 + python -c 'import torch; print("Torch version:", torch.__version__)' +install_pytorch_dep: &install_pytorch_dep + - parameters: + version_str: + type: string + default: "/dev/non_exist" # Default to error out + - run: + name: Install Pytorch Dependencies + command: | + source activate metaseq + pip install --upgrade setuptools + echo "<>" + pip install <> -f https://download.pytorch.org/whl/torch_stable.html + python -c 'import torch; print("Torch version:", torch.__version__)' install_repo: &install_repo - run: name: Install Repository + command: | + source activate metaseq + pip install -e .[dev,few_shot,gpu] + python setup.py build_ext --inplace +check_nvidia_driver: &check_nvidia_driver + - run: + name: Check NVIDIA Driver working_directory: ~/ command: | + pyenv versions + nvidia-smi +create_conda_env: &create_conda_env + run: + name: Install and Create Conda Environment + command: | + curl -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + chmod +x ~/miniconda.sh + bash ~/miniconda.sh -b -p $HOME/miniconda + rm ~/miniconda.sh + echo 'export PATH=$HOME/miniconda/bin:$PATH' >> $BASH_ENV source $BASH_ENV - source activate fairseq-20221101 - if ! python -c 'import fairscale'; then - git clone git@github.com:facebookresearch/metaseq.git - cd metaseq - pip install --no-build-isolation -e . - cd ~/ + if [ ! -d ~/miniconda/envs/metaseq ] + then + conda create -y -n metaseq python=3.8 fi - - + source activate metaseq + python --version + pip install --upgrade pip download_and_configure_125m_with_hf_dependencies: &download_and_configure_125m_with_hf_dependencies - run: name: Download and configure a 125m checkpoint with HF dependencies working_directory: ~/metaseq/gpu_tests command: | - source $BASH_ENV - source activate fairseq-20221101 + source activate metaseq wget https://dl.fbaipublicfiles.com/opt/test_artifacts/125m_with_hf_dependencies.tar.gz tar -xvzf ./125m_with_hf_dependencies.tar.gz -C . python -m metaseq.scripts.convert_to_singleton ./125m python -m transformers.models.opt.convert_opt_original_pytorch_checkpoint_to_pytorch --pytorch_dump_folder_path ./125m/ --hf_config ./125m/config.json --fairseq_path ./125m/restored.pt - commands: gpu_pre: &gpu_pre @@ -175,13 +149,12 @@ commands: - <<: *download_and_configure_125m_with_hf_dependencies - save_cache: paths: - - ~/miniconda/envs/fairseq-20221101/lib/python3.9/site-packages + - ~/miniconda/envs/metaseq/lib/python3.8/site-packages key: *cache_key - run: name: Run Unit Tests command: | - source $BASH_ENV - source activate fairseq-20221101 + source activate metaseq python -m pytest --junitxml=test-results/junit.xml gpu_tests - store_test_results: path: test-results From 50d96458ae20b05c0a5b101bfb19f10b1b6c113e Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 15:28:19 +0000 Subject: [PATCH 39/49] take PT from torch_stable --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index edd56845e..763268840 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -23,7 +23,7 @@ install_dep_common: &install_dep_common command: | source activate metaseq # Fixed version to work around https://github.com/pytorch/pytorch/pull/69904 - pip install setuptools==59.5.0 + pip install --upgrade setuptools pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda111 -U python -c 'import torch; print("Torch version:", torch.__version__)' python -m torch.utils.collect_env @@ -71,7 +71,7 @@ install_dep_pt19: &install_dep_pt19 command: | source activate metaseq pip install --upgrade setuptools - pip install torch==1.10.2+cu111 torchvision torchaudio -f https://download.pytorch.org/whl/cu111 + pip install torch==1.10.2+cu111 torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html python -c 'import torch; print("Torch version:", torch.__version__)' install_pytorch_dep: &install_pytorch_dep - parameters: From a0f28072757d0314e2a78b8e675bcd88ad326465 Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 16:01:43 +0000 Subject: [PATCH 40/49] small changes --- .circleci/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 763268840..bb7ad5eb2 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -71,7 +71,7 @@ install_dep_pt19: &install_dep_pt19 command: | source activate metaseq pip install --upgrade setuptools - pip install torch==1.10.2+cu111 torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.10.2+cu111 torchvision==0.11.3+cu111 torchaudio==0.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html python -c 'import torch; print("Torch version:", torch.__version__)' install_pytorch_dep: &install_pytorch_dep - parameters: @@ -112,7 +112,7 @@ create_conda_env: &create_conda_env source $BASH_ENV if [ ! -d ~/miniconda/envs/metaseq ] then - conda create -y -n metaseq python=3.8 + conda create -y -n metaseq python=3.9 fi source activate metaseq python --version @@ -149,7 +149,7 @@ commands: - <<: *download_and_configure_125m_with_hf_dependencies - save_cache: paths: - - ~/miniconda/envs/metaseq/lib/python3.8/site-packages + - ~/miniconda/envs/metaseq/lib/python3.9/site-packages key: *cache_key - run: name: Run Unit Tests From 3b9ba76b1c8fdbd9735037fe243656d530288b65 Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 16:40:51 +0000 Subject: [PATCH 41/49] try original config --- .circleci/config.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bb7ad5eb2..e1e66f08e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -23,7 +23,7 @@ install_dep_common: &install_dep_common command: | source activate metaseq # Fixed version to work around https://github.com/pytorch/pytorch/pull/69904 - pip install --upgrade setuptools + pip install setuptools==59.5.0 pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda111 -U python -c 'import torch; print("Torch version:", torch.__version__)' python -m torch.utils.collect_env @@ -71,7 +71,7 @@ install_dep_pt19: &install_dep_pt19 command: | source activate metaseq pip install --upgrade setuptools - pip install torch==1.10.2+cu111 torchvision==0.11.3+cu111 torchaudio==0.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html python -c 'import torch; print("Torch version:", torch.__version__)' install_pytorch_dep: &install_pytorch_dep - parameters: @@ -112,7 +112,7 @@ create_conda_env: &create_conda_env source $BASH_ENV if [ ! -d ~/miniconda/envs/metaseq ] then - conda create -y -n metaseq python=3.9 + conda create -y -n metaseq python=3.8 fi source activate metaseq python --version @@ -149,7 +149,7 @@ commands: - <<: *download_and_configure_125m_with_hf_dependencies - save_cache: paths: - - ~/miniconda/envs/metaseq/lib/python3.9/site-packages + - ~/miniconda/envs/metaseq/lib/python3.8/site-packages key: *cache_key - run: name: Run Unit Tests From 89b1ba72a32adde419aa8a0e7ba3dfe275285d8f Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 16:50:58 +0000 Subject: [PATCH 42/49] try change to PT10.2 --- .circleci/config.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index e1e66f08e..bb2d68b13 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -23,7 +23,7 @@ install_dep_common: &install_dep_common command: | source activate metaseq # Fixed version to work around https://github.com/pytorch/pytorch/pull/69904 - pip install setuptools==59.5.0 + pip install --upgrade setuptools pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda111 -U python -c 'import torch; print("Torch version:", torch.__version__)' python -m torch.utils.collect_env @@ -40,7 +40,6 @@ install_dep_fused_ops: &install_dep_fused_ops if ! python -c 'import apex'; then git clone https://github.com/NVIDIA/apex cd apex - git checkout e2083df5eb96643c61613b9df48dd4eea6b07690 pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./ cd ~/ fi @@ -71,7 +70,7 @@ install_dep_pt19: &install_dep_pt19 command: | source activate metaseq pip install --upgrade setuptools - pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.10.2+cu111 torchvision==0.11.3+cu111 torchaudio==0.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html python -c 'import torch; print("Torch version:", torch.__version__)' install_pytorch_dep: &install_pytorch_dep - parameters: From f4cc998c0752038ba37e58428649fceeae9c2ea9 Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 16:58:10 +0000 Subject: [PATCH 43/49] change checkouts for apex and fairscale --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bb2d68b13..eca3e897d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -60,7 +60,7 @@ install_fairscale: &install_fairscale if ! python -c 'import fairscale'; then git clone https://github.com/facebookresearch/fairscale.git cd fairscale - git checkout 1bc96fa8c69def6d990e42bfbd75f86146ce29bd + git checkout ngoyal_bf16_changes pip install . cd ~/ fi From 125800fc4dad4be4aa95d7e97bbf54176083af26 Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 17:15:20 +0000 Subject: [PATCH 44/49] remove checkout for apex --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index eca3e897d..203995f4d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -60,7 +60,7 @@ install_fairscale: &install_fairscale if ! python -c 'import fairscale'; then git clone https://github.com/facebookresearch/fairscale.git cd fairscale - git checkout ngoyal_bf16_changes + git checkout 1bc96fa8c69def6d990e42bfbd75f86146ce29bd pip install . cd ~/ fi @@ -70,7 +70,7 @@ install_dep_pt19: &install_dep_pt19 command: | source activate metaseq pip install --upgrade setuptools - pip install torch==1.10.2+cu111 torchvision==0.11.3+cu111 torchaudio==0.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html python -c 'import torch; print("Torch version:", torch.__version__)' install_pytorch_dep: &install_pytorch_dep - parameters: From e65d479e5cd70beaa1b5cb79b8f61b87809ecdd2 Mon Sep 17 00:00:00 2001 From: bashnick Date: Thu, 19 Jan 2023 18:20:13 +0000 Subject: [PATCH 45/49] try python3.9 --- .circleci/config.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 203995f4d..e9a77b47a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -37,6 +37,7 @@ install_dep_fused_ops: &install_dep_fused_ops # because of https://github.com/NVIDIA/apex/issues/1252 we need to pin to a specific apex commit command: | source activate metaseq + pip install packaging if ! python -c 'import apex'; then git clone https://github.com/NVIDIA/apex cd apex @@ -70,7 +71,7 @@ install_dep_pt19: &install_dep_pt19 command: | source activate metaseq pip install --upgrade setuptools - pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.10.2+cu111 torchvision==0.11.3+cu111 torchaudio==0.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html python -c 'import torch; print("Torch version:", torch.__version__)' install_pytorch_dep: &install_pytorch_dep - parameters: @@ -111,7 +112,7 @@ create_conda_env: &create_conda_env source $BASH_ENV if [ ! -d ~/miniconda/envs/metaseq ] then - conda create -y -n metaseq python=3.8 + conda create -y -n metaseq python=3.9 fi source activate metaseq python --version @@ -148,7 +149,7 @@ commands: - <<: *download_and_configure_125m_with_hf_dependencies - save_cache: paths: - - ~/miniconda/envs/metaseq/lib/python3.8/site-packages + - ~/miniconda/envs/metaseq/lib/python3.9/site-packages key: *cache_key - run: name: Run Unit Tests From 428adb7f2c6147e4d693b77ea90ae36cc4c9e9f2 Mon Sep 17 00:00:00 2001 From: bashnick Date: Fri, 20 Jan 2023 12:13:59 +0000 Subject: [PATCH 46/49] revert to initial setup --- .circleci/config.yml | 9 ++++----- fairscale | 1 + gpu_tests/test_sequence_parallel.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) create mode 160000 fairscale diff --git a/.circleci/config.yml b/.circleci/config.yml index e9a77b47a..d4a97e561 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -23,7 +23,6 @@ install_dep_common: &install_dep_common command: | source activate metaseq # Fixed version to work around https://github.com/pytorch/pytorch/pull/69904 - pip install --upgrade setuptools pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda111 -U python -c 'import torch; print("Torch version:", torch.__version__)' python -m torch.utils.collect_env @@ -37,10 +36,10 @@ install_dep_fused_ops: &install_dep_fused_ops # because of https://github.com/NVIDIA/apex/issues/1252 we need to pin to a specific apex commit command: | source activate metaseq - pip install packaging if ! python -c 'import apex'; then git clone https://github.com/NVIDIA/apex cd apex + git checkout 265b451de8ba9bfcb67edc7360f3d8772d0a8bea pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./ cd ~/ fi @@ -71,7 +70,7 @@ install_dep_pt19: &install_dep_pt19 command: | source activate metaseq pip install --upgrade setuptools - pip install torch==1.10.2+cu111 torchvision==0.11.3+cu111 torchaudio==0.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html python -c 'import torch; print("Torch version:", torch.__version__)' install_pytorch_dep: &install_pytorch_dep - parameters: @@ -112,7 +111,7 @@ create_conda_env: &create_conda_env source $BASH_ENV if [ ! -d ~/miniconda/envs/metaseq ] then - conda create -y -n metaseq python=3.9 + conda create -y -n metaseq python=3.8 fi source activate metaseq python --version @@ -149,7 +148,7 @@ commands: - <<: *download_and_configure_125m_with_hf_dependencies - save_cache: paths: - - ~/miniconda/envs/metaseq/lib/python3.9/site-packages + - ~/miniconda/envs/metaseq/lib/python3.8/site-packages key: *cache_key - run: name: Run Unit Tests diff --git a/fairscale b/fairscale new file mode 160000 index 000000000..1bc96fa8c --- /dev/null +++ b/fairscale @@ -0,0 +1 @@ +Subproject commit 1bc96fa8c69def6d990e42bfbd75f86146ce29bd diff --git a/gpu_tests/test_sequence_parallel.py b/gpu_tests/test_sequence_parallel.py index e386cc695..020743e42 100644 --- a/gpu_tests/test_sequence_parallel.py +++ b/gpu_tests/test_sequence_parallel.py @@ -29,7 +29,7 @@ class TestSequenceParallel(unittest.TestCase): """ def test_sequence_parallel(self): - # parameters to train an mp2 model with sequence_parallel flag + # parameters to train an mp2 model with sequence_parallel flag argv_injection = ( "python3 metaseq/launcher/opt_baselines.py " "--prefix train.8m --model-size 8m --checkpoints-dir ./test-checkpoint " @@ -187,4 +187,4 @@ def log_to_events(self, info, message, args, events, **kwargs): if __name__ == "__main__": - unittest.main() + unittest.main() \ No newline at end of file From 198f3b71d9bd2df8a2f642716e2764c54fdb4127 Mon Sep 17 00:00:00 2001 From: bashnick Date: Fri, 20 Jan 2023 15:03:26 +0000 Subject: [PATCH 47/49] try upgrade pytorch after apex installation --- .circleci/config.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index d4a97e561..4a58d37ad 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -23,6 +23,8 @@ install_dep_common: &install_dep_common command: | source activate metaseq # Fixed version to work around https://github.com/pytorch/pytorch/pull/69904 + # need this version of setuptools to enable distutils + pip install setuptools==59.5.0 pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda111 -U python -c 'import torch; print("Torch version:", torch.__version__)' python -m torch.utils.collect_env @@ -92,6 +94,18 @@ install_repo: &install_repo source activate metaseq pip install -e .[dev,few_shot,gpu] python setup.py build_ext --inplace + +upgrade_pytorch: &upgrade_pytorch + - run: + name: Upgrade Pytorch + working_directory: ~/ + command: | + source activate metaseq + pip uninstall torch -y + pip install torch==1.10.2+cu111 torchvision==0.11.3+cu111 torchaudio==0.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html + pip install setuptools==59.5.0 + python -c 'import torch; print("Torch version:", torch.__version__)' + check_nvidia_driver: &check_nvidia_driver - run: name: Check NVIDIA Driver @@ -145,6 +159,7 @@ commands: - <<: *install_fairscale - <<: *install_dep_fused_ops - <<: *install_repo + - <<: *upgrade_pytorch - <<: *download_and_configure_125m_with_hf_dependencies - save_cache: paths: From e23fd629d73543c0c2dbc09e2d1a3c86d6e16850 Mon Sep 17 00:00:00 2001 From: bashnick Date: Fri, 20 Jan 2023 16:49:45 +0000 Subject: [PATCH 48/49] try with PT1.10.2+cu111 --- .circleci/config.yml | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4a58d37ad..1f93d8026 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -23,7 +23,6 @@ install_dep_common: &install_dep_common command: | source activate metaseq # Fixed version to work around https://github.com/pytorch/pytorch/pull/69904 - # need this version of setuptools to enable distutils pip install setuptools==59.5.0 pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda111 -U python -c 'import torch; print("Torch version:", torch.__version__)' @@ -72,7 +71,7 @@ install_dep_pt19: &install_dep_pt19 command: | source activate metaseq pip install --upgrade setuptools - pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.10.2+cu111 torchvision==0.11.3+cu111 torchaudio==0.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html python -c 'import torch; print("Torch version:", torch.__version__)' install_pytorch_dep: &install_pytorch_dep - parameters: @@ -94,18 +93,6 @@ install_repo: &install_repo source activate metaseq pip install -e .[dev,few_shot,gpu] python setup.py build_ext --inplace - -upgrade_pytorch: &upgrade_pytorch - - run: - name: Upgrade Pytorch - working_directory: ~/ - command: | - source activate metaseq - pip uninstall torch -y - pip install torch==1.10.2+cu111 torchvision==0.11.3+cu111 torchaudio==0.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html - pip install setuptools==59.5.0 - python -c 'import torch; print("Torch version:", torch.__version__)' - check_nvidia_driver: &check_nvidia_driver - run: name: Check NVIDIA Driver @@ -159,7 +146,6 @@ commands: - <<: *install_fairscale - <<: *install_dep_fused_ops - <<: *install_repo - - <<: *upgrade_pytorch - <<: *download_and_configure_125m_with_hf_dependencies - save_cache: paths: From dd2b6f5206a6255c36f65628ea6bcfcdc16d2cc1 Mon Sep 17 00:00:00 2001 From: bashnick Date: Mon, 23 Jan 2023 11:42:56 +0000 Subject: [PATCH 49/49] try using a dockerhub image --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 1f93d8026..6ad0239c7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,9 +6,9 @@ version: 2.1 # ------------------------------------------------------------------------------------- gpu: &gpu environment: - CUDA_VERSION: "11.1" + CUDA_VERSION: "11.6" machine: - image: ubuntu-1604-cuda-11.1:202012-01 + image: nvidia/cuda:11.6.1-base-ubuntu20.04 resource_class: gpu.nvidia.medium.multi