Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
vlad-karpuhin committed Jul 1, 2024
2 parents e5a0093 + e33c8f7 commit 79088de
Show file tree
Hide file tree
Showing 380 changed files with 10,281 additions and 3,238 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ build
slurm*
logs
.vscode
local/
77 changes: 55 additions & 22 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
workflow:
rules:
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests and nightly/
variables:
JET_CUSTOM_FILTER: "type == 'build' or 'merge-request' in spec.scope or 'nightly' in spec.scope"
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/
variables:
JET_CUSTOM_FILTER: "type == 'build' or 'merge-request' in spec.scope"
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Build only/
variables:
JET_CUSTOM_FILTER: "type == 'build'"
# always run MR pipelines
- if: $CI_PIPELINE_SOURCE == "merge_request_event"
# always run web pipelines
Expand All @@ -14,15 +23,14 @@ stages:
- test
- jet

variables: &VARS
variables:
SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron"
DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data"
PYTORCH_IMAGE: /lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/nvcr_pytorch_23.04.sqsh # Image that every node on Selene runs for tests
PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate
TESTS_TO_RUN_AFTER_MERGING: "MR_TESTS NIGHTLY_TESTS" # Can specify levels
TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests
TEST_REGEX_ON_THIS_COMMIT: NONE # Regex in RE2 syntax (https://github.com/google/re2/wiki/Syntax), e.g. /.*gpt3.*/
JET_CUSTOM_FILTER: ""
JET_CUSTOM_FILTER:
description: |
Selects what functional tests to run. For merge-request tests: "type == 'build' or 'merge-request' in spec.scope". For nightly tests: "type == 'build' or 'nightly' in spec.scope"
value: ""
DISPLAY_OUTPUT: "True" # Set to "True" for new tests so their logs are copied for creating the golden-truth file
TIME_LIMIT: "10:00" # Default time limit for all jobs
MOE_GROUPED_GEMM: 0 # Set to 1 to enable grouped gemm for MoE
Expand All @@ -37,7 +45,7 @@ include:
- jet-tests.yml

unit_tests:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3
tags:
- 8xL40S
stage: test
Expand All @@ -53,7 +61,7 @@ unit_tests:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH

unit_tests-data:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3
tags:
- 8xL40S
stage: test
Expand All @@ -65,9 +73,10 @@ unit_tests-data:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
when: never
- when: always
interruptible: true

unit_tests-dist-checkpointing:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3
tags:
- 8xL40S
stage: test
Expand All @@ -79,9 +88,10 @@ unit_tests-dist-checkpointing:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
when: never
- when: always

interruptible: true

unit_tests-fusions:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3
tags:
- 8xL40S
stage: test
Expand All @@ -93,9 +103,25 @@ unit_tests-fusions:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
when: never
- when: always

interruptible: true

# Runs the unit tests under tests/unit_tests/inference with pytest,
# launched through torchrun with --nproc_per_node=8.
unit_tests-inference:
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3
  tags:
    - 8xL40S
  stage: test
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/inference
  rules:
    # Skip in merge-request pipelines whose labels match /Run tests/.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
      when: never
    # Skip on the default branch.
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      when: never
    # Run in every other pipeline.
    - when: always
  # Lets GitLab cancel this job when a newer pipeline makes it redundant.
  interruptible: true

unit_tests-models:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3
tags:
- 8xL40S
stage: test
Expand All @@ -107,9 +133,10 @@ unit_tests-models:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
when: never
- when: always

interruptible: true

unit_tests-pipeline-parallel:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3
tags:
- 8xL40S
stage: test
Expand All @@ -121,9 +148,10 @@ unit_tests-pipeline-parallel:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
when: never
- when: always

interruptible: true

unit_tests-tensor-parallel:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3
tags:
- 8xL40S
stage: test
Expand All @@ -135,9 +163,10 @@ unit_tests-tensor-parallel:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
when: never
- when: always

interruptible: true

unit_tests-transformer:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3
tags:
- 8xL40S
stage: test
Expand All @@ -149,9 +178,10 @@ unit_tests-transformer:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
when: never
- when: always

interruptible: true

unit_tests-top-py:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:24.01v3
tags:
- 8xL40S
stage: test
Expand All @@ -163,7 +193,8 @@ unit_tests-top-py:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
when: never
- when: always

interruptible: true

docs_build_test:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1
stage: test
Expand All @@ -178,6 +209,7 @@ docs_build_test:
allow_failure: true
except:
- main
interruptible: true

formatting:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1
Expand All @@ -189,3 +221,4 @@ formatting:
- isort megatron/core --check
rules:
- when: always
interruptible: true
4 changes: 2 additions & 2 deletions CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[MCORE][3]
megatron/core/ @shanmugamr @maanug @jcasper @eharper
megatron/core/ @shanmugamr @jcasper @eharper @terryk

[TESTS]
tests/ @shanmugamr @maanug
tests/ @shanmugamr @terryk

25 changes: 11 additions & 14 deletions Dockerfile.test
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:23.04-py3
FROM ${FROM_IMAGE_NAME}

RUN pip install --no-cache-dir \
"pytest-cov" \
"pytest_mock" \
"nltk" \
"wrapt" \
"zarr" \
"tensorstore==0.1.45" \
"git+https://github.com/fanshiqing/[email protected]" \
"black==19.10b0" \
"isort" \
"click==8.0.2"
# syntax=docker/dockerfile:experimental

FROM nvcr.io/nvidia/pytorch:24.01-py3
ENV DEBIAN_FRONTEND=noninteractive

RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \
/etc/apt/apt.conf.d/docker-clean

RUN apt-get update && apt-get install -y --no-install-recommends

RUN pip3 install --no-cache-dir einops flask-restful nltk pytest pytest-cov pytest_mock sentencepiece wrapt git+https://github.com/fanshiqing/[email protected]
9 changes: 5 additions & 4 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,13 @@ The following applies to all files unless otherwise noted:

This repository also contains code from Hugging Face Inc., Google Research,
Facebook (from their Fairseq, Dino, and ParlAI projects), Microsoft (from their
Swin-Transformer project) and Philip Popien. Files from these
organizations have notices at the top of each file. Below are
licenses used in those files, as indicated.
Swin-Transformer project), Philip Popien, and the Mamba project (Tri Dao and
Albert Gu). Files from these organizations have notices at the top of each file.
Below are licenses used in those files, as indicated.


------------- LICENSE FOR Facebook, huggingface and Google Research code --------------
--------------------------------------------------------------------------------
-- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, and Mamba code --


Apache License
Expand Down
Loading

0 comments on commit 79088de

Please sign in to comment.