From 0092945ea9c9f87a3f5bfdcd8b7baaa5bb3501fd Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Thu, 16 Nov 2023 22:45:00 +0100 Subject: [PATCH] Fix experiment exit code (#85) --- ...k_quality.yaml => check_code_quality.yaml} | 14 ++++---- .github/workflows/test_cli_misc.yaml | 33 +++++++++++++++++ .../workflows/test_cpu_neural_compressor.yaml | 8 ++--- .github/workflows/test_cpu_onnxruntime.yaml | 8 ++--- .github/workflows/test_cpu_openvino.yaml | 8 ++--- .github/workflows/test_cpu_pytorch.yaml | 8 ++--- .../test_cuda_onnxruntime_inference.yaml | 6 ++-- .../test_cuda_onnxruntime_training.yaml | 6 ++-- .github/workflows/test_cuda_pytorch.yaml | 26 ++++++++++---- .github/workflows/test_rocm_pytorch.yaml | 35 ++++++------------ .../test_tensorrt_onnxruntime_inference.yaml | 7 ++-- .gitignore | 4 +-- docker/cuda.dockerfile | 6 +++- docker/rocm.dockerfile | 9 +++-- optimum_benchmark/experiment.py | 8 ++++- tests/configs/{_mp_.yaml => _pp_.yaml} | 2 +- ...ml => cuda_pytorch_inference_gpt2_pp.yaml} | 2 +- .../cuda_pytorch_training_bert_dp.yaml | 2 +- tests/test_cli.py | 36 ++++++++++++++++--- 19 files changed, 152 insertions(+), 76 deletions(-) rename .github/workflows/{check_quality.yaml => check_code_quality.yaml} (67%) create mode 100644 .github/workflows/test_cli_misc.yaml rename tests/configs/{_mp_.yaml => _pp_.yaml} (83%) rename tests/configs/{cuda_pytorch_inference_gpt2_mp.yaml => cuda_pytorch_inference_gpt2_pp.yaml} (83%) diff --git a/.github/workflows/check_quality.yaml b/.github/workflows/check_code_quality.yaml similarity index 67% rename from .github/workflows/check_quality.yaml rename to .github/workflows/check_code_quality.yaml index bfee800c..47f66305 100644 --- a/.github/workflows/check_quality.yaml +++ b/.github/workflows/check_code_quality.yaml @@ -1,24 +1,25 @@ -name: Quality checks +name: Quality Code Checks on: + workflow_dispatch: push: branches: [main] pull_request: - branches: [main] + types: [opened, reopened, synchronize] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true jobs: - run_quality_checks: + run_code_quality_checks: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: python-version: 3.8 @@ -29,4 +30,5 @@ jobs: - name: Check style run: | - make style_check + black --check . + ruff check . diff --git a/.github/workflows/test_cli_misc.yaml b/.github/workflows/test_cli_misc.yaml new file mode 100644 index 00000000..4fae0bda --- /dev/null +++ b/.github/workflows/test_cli_misc.yaml @@ -0,0 +1,33 @@ +name: CLI Misc Tests + +on: + workflow_dispatch: + push: + branches: [main] + pull_request: + types: [opened, reopened, synchronize] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + run_misc_tests: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up Python 3.8 + uses: actions/setup-python@v3 + with: + python-version: 3.8 + + - name: Install requirements + run: | + pip install --upgrade pip + pip install -e .[test] + + - name: Run tests + run: | + pytest -k "not (cpu or cuda or rocm or tensorrt)" diff --git a/.github/workflows/test_cpu_neural_compressor.yaml b/.github/workflows/test_cpu_neural_compressor.yaml index 4722857f..41616a41 100644 --- a/.github/workflows/test_cpu_neural_compressor.yaml +++ b/.github/workflows/test_cpu_neural_compressor.yaml @@ -1,11 +1,11 @@ -name: Intel Neural Compressor CPU Tests +name: CPU Intel Neural Compressor Tests on: workflow_dispatch: push: branches: [main] pull_request: - branches: [main] + types: [opened, reopened, synchronize] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -16,10 +16,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: python-version: 3.8 diff --git a/.github/workflows/test_cpu_onnxruntime.yaml b/.github/workflows/test_cpu_onnxruntime.yaml index 37d1c384..f578138f 100644 --- a/.github/workflows/test_cpu_onnxruntime.yaml +++ b/.github/workflows/test_cpu_onnxruntime.yaml @@ -1,11 +1,11 @@ -name: OnnxRuntime CPU Tests +name: CPU OnnxRuntime Tests on: workflow_dispatch: push: branches: [main] pull_request: - branches: [main] + types: [opened, reopened, synchronize] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -16,10 +16,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: python-version: 3.8 diff --git a/.github/workflows/test_cpu_openvino.yaml b/.github/workflows/test_cpu_openvino.yaml index ab197648..ee824927 100644 --- a/.github/workflows/test_cpu_openvino.yaml +++ b/.github/workflows/test_cpu_openvino.yaml @@ -1,11 +1,11 @@ -name: OpenVINO CPU Tests +name: CPU OpenVINO Tests on: workflow_dispatch: push: branches: [main] pull_request: - branches: [main] + types: [opened, reopened, synchronize] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -16,10 +16,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: python-version: 3.8 diff --git a/.github/workflows/test_cpu_pytorch.yaml b/.github/workflows/test_cpu_pytorch.yaml index 17816974..93bc9b8a 100644 --- a/.github/workflows/test_cpu_pytorch.yaml +++ b/.github/workflows/test_cpu_pytorch.yaml @@ -1,11 +1,11 @@ -name: Pytorch CPU tests +name: CPU Pytorch tests on: workflow_dispatch: push: branches: [main] pull_request: - branches: [main] + types: [opened, reopened, synchronize] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -16,10 +16,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: python-version: 3.8 diff --git a/.github/workflows/test_cuda_onnxruntime_inference.yaml b/.github/workflows/test_cuda_onnxruntime_inference.yaml index eeab6d3f..fca17e5f 100644 --- a/.github/workflows/test_cuda_onnxruntime_inference.yaml +++ b/.github/workflows/test_cuda_onnxruntime_inference.yaml @@ -1,7 +1,9 @@ -name: OnnxRuntime CUDA Inference Tests +name: CUDA OnnxRuntime Inference Tests on: workflow_dispatch: + push: + branches: [main] pull_request: types: [opened, reopened, synchronize] @@ -14,7 +16,7 @@ jobs: runs-on: hf-dgx-01 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Build image run: docker build diff --git a/.github/workflows/test_cuda_onnxruntime_training.yaml b/.github/workflows/test_cuda_onnxruntime_training.yaml index 47c5056c..fdb7f932 100644 --- a/.github/workflows/test_cuda_onnxruntime_training.yaml +++ b/.github/workflows/test_cuda_onnxruntime_training.yaml @@ -1,7 +1,9 @@ -name: OnnxRuntime CUDA Training Tests +name: CUDA OnnxRuntime Training Tests on: workflow_dispatch: + push: + branches: [main] pull_request: types: [opened, reopened, synchronize] @@ -14,7 +16,7 @@ jobs: runs-on: hf-dgx-01 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Build image run: docker build diff --git a/.github/workflows/test_cuda_pytorch.yaml b/.github/workflows/test_cuda_pytorch.yaml index b5a73a64..92a38b82 100644 --- a/.github/workflows/test_cuda_pytorch.yaml +++ b/.github/workflows/test_cuda_pytorch.yaml @@ -1,7 +1,9 @@ -name: Pytorch CUDA Tests +name: CUDA Pytorch Tests on: workflow_dispatch: + push: + branches: [main] pull_request: types: [opened, reopened, synchronize] @@ -11,19 +13,28 @@ concurrency: jobs: build_image_and_run_gpu_tests: + strategy: + fail-fast: false + matrix: + image: + [ + { torch_cuda: cu121, cuda_version: 12.1.1 }, + { torch_cuda: cu118, cuda_version: 11.8.0 }, + ] + runs-on: hf-dgx-01 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Build image run: docker build --file docker/cuda.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) - --build-arg CUDA_VERSION=12.1.1 - --build-arg TORCH_CUDA=cu121 - --tag opt-bench-cuda:12.1.1 + --build-arg TORCH_CUDA=${{ matrix.image.torch_cuda }} + --build-arg CUDA_VERSION=${{ matrix.image.cuda_version }} + --tag opt-bench-cuda:${{ matrix.image.cuda_version }} . - name: Run tests @@ -33,9 +44,10 @@ jobs: --pid host --shm-size 64G --env USE_CUDA="1" - --entrypoint /bin/bash + --volume $HOME/.cache/huggingface:/home/user/.cache/huggingface --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark --gpus '"device=0,1"' - opt-bench-cuda:12.1.1 + --entrypoint /bin/bash + opt-bench-cuda:${{ matrix.image.cuda_version }} -c "pip install -e .[test,peft,diffusers] && pytest -k 'cuda and pytorch' -x" diff --git a/.github/workflows/test_rocm_pytorch.yaml b/.github/workflows/test_rocm_pytorch.yaml index 20ee74c3..952131e3 100644 --- a/.github/workflows/test_rocm_pytorch.yaml +++ b/.github/workflows/test_rocm_pytorch.yaml @@ -1,7 +1,9 @@ -name: Pytorch ROCm Tests +name: ROCm Pytorch Tests on: workflow_dispatch: + push: + branches: [main] pull_request: types: [opened, reopened, synchronize] @@ -16,20 +18,11 @@ jobs: matrix: image: [ - { - rocm_version: 5.6.1, - torch_rocm_version: 5.6, - torch_pre_release: 0, - }, - { - rocm_version: 5.7, - torch_rocm_version: 5.7, - torch_pre_release: 1, - }, + { torch_rocm: rocm5.6, torch_pre_release: 0, rocm_version: 5.6.1 }, + { torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7 }, ] - runner: [hf-amd-mi210-dev] - runs-on: ${{ matrix.runner }} + runs-on: hf-amd-mi210-dev steps: - name: Checkout code uses: actions/checkout@v3 @@ -39,15 +32,11 @@ jobs: --file docker/rocm.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) - --build-arg ROCM_VERSION=$ROCM_VERSION - --build-arg TORCH_PRE_RELEASE=$TORCH_PRE_RELEASE - --build-arg TORCH_ROCM_VERSION=$TORCH_ROCM_VERSION - --tag opt-bench-rocm:$TORCH_ROCM_VERSION + --build-arg TORCH_ROCM=${{ matrix.image.torch_rocm }} + --build-arg TORCH_PRE_RELEASE=${{ matrix.image.torch_pre_release }} + --build-arg ROCM_VERSION=${{ matrix.image.rocm_version }} + --tag opt-bench-rocm:${{ matrix.image.rocm_version }} . - env: - ROCM_VERSION: ${{ matrix.image.rocm_version }} - TORCH_ROCM_VERSION: ${{ matrix.image.torch_rocm_version }} - TORCH_PRE_RELEASE: ${{ matrix.image.torch_pre_release }} - name: Run tests run: docker run @@ -63,7 +52,5 @@ jobs: --device /dev/dri/renderD128 --device /dev/dri/renderD129 --entrypoint /bin/bash - opt-bench-rocm:$TORCH_ROCM_VERSION + opt-bench-rocm:${{ matrix.image.rocm_version }} -c "pip install -e .[test,peft,diffusers] && pytest -k 'cuda and pytorch' -x" - env: - TORCH_ROCM_VERSION: ${{ matrix.image.torch_rocm_version }} diff --git a/.github/workflows/test_tensorrt_onnxruntime_inference.yaml b/.github/workflows/test_tensorrt_onnxruntime_inference.yaml index ea0103c2..f6dc3f16 100644 --- a/.github/workflows/test_tensorrt_onnxruntime_inference.yaml +++ b/.github/workflows/test_tensorrt_onnxruntime_inference.yaml @@ -1,6 +1,9 @@ -name: OnnxRuntime TensorRT Inference Tests +name: TensorRT OnnxRuntime Inference Tests on: + workflow_dispatch: + push: + branches: [main] pull_request: types: [opened, reopened, synchronize] @@ -13,7 +16,7 @@ jobs: runs-on: hf-dgx-01 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Build image run: docker build diff --git a/.gitignore b/.gitignore index c440896f..d1b3725c 100644 --- a/.gitignore +++ b/.gitignore @@ -167,6 +167,6 @@ sweeps/ data/ version.txt -# Experiments +actions-runner/ experiments/ -examples \ No newline at end of file +examples/ \ No newline at end of file diff --git a/docker/cuda.dockerfile b/docker/cuda.dockerfile index 3b409e87..a2270ffa 100644 --- a/docker/cuda.dockerfile +++ b/docker/cuda.dockerfile @@ -19,6 +19,7 @@ ARG UBUNTU_VERSION=22.04 FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION} ARG TORCH_CUDA=cu121 +ARG TORCH_PRE_RELEASE=0 # Ignore interactive questions during `docker build` ENV DEBIAN_FRONTEND noninteractive @@ -55,4 +56,7 @@ WORKDIR /home/user RUN pip install --upgrade pip # Install PyTorch -RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_CUDA} +RUN if [ "${TORCH_PRE_RELEASE}" = "1" ]; \ + then pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/${TORCH_CUDA} ; \ + else pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_CUDA} ; \ + fi diff --git a/docker/rocm.dockerfile b/docker/rocm.dockerfile index c198ee7d..2532bfd8 100644 --- a/docker/rocm.dockerfile +++ b/docker/rocm.dockerfile @@ -17,8 +17,8 @@ ARG UBUNTU_VERSION=22.04 FROM rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION} +ARG TORCH_ROCM=rocm5.6 ARG TORCH_PRE_RELEASE=0 -ARG TORCH_ROCM_VERSION=5.6 # Ignore interactive questions during `docker build` ENV DEBIAN_FRONTEND noninteractive @@ -64,8 +64,7 @@ WORKDIR /home/user RUN pip install --upgrade pip # Install PyTorch (nightly if ROCM_VERSION=5.7 or TORCH_PRE_RELEASE=1) -RUN if [ "${TORCH_PRE_RELEASE}" = "1" ]; then \ - pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm${TORCH_ROCM_VERSION} ; \ - else \ - pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm${TORCH_ROCM_VERSION} ; \ +RUN if [ "${TORCH_PRE_RELEASE}" = "1" ]; \ + then pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/${TORCH_ROCM} ; \ + else pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM} ; \ fi diff --git a/optimum_benchmark/experiment.py b/optimum_benchmark/experiment.py index f89a0e75..6b99dca7 100644 --- a/optimum_benchmark/experiment.py +++ b/optimum_benchmark/experiment.py @@ -137,6 +137,7 @@ def run(experiment: DictConfig) -> None: device=experiment.device, hub_kwargs=experiment.hub_kwargs, ) + try: # Configure the backend backend.configure(experiment.backend) @@ -148,7 +149,9 @@ def run(experiment: DictConfig) -> None: # Allocate requested benchmark benchmark_factory: Type["Benchmark"] = get_class(experiment.benchmark._target_) benchmark: "Benchmark" = benchmark_factory() + try: + # Configure the benchmark benchmark.configure(experiment.benchmark) except Exception as e: LOGGER.error("Error during benchmark configuration: %s", e) @@ -173,11 +176,14 @@ def run_isolated(experiment: DictConfig, start_method: str = "spawn") -> None: if multiprocessing.get_start_method(allow_none=True) != start_method: multiprocessing.set_start_method(start_method) - # Spawn a new process + # Execute the experiment in a child process p = multiprocessing.Process(target=run, args=(experiment,)) p.start() p.join() + # Exit with the same exit code as the child process + exit(p.exitcode) + @hydra.main(version_base=None) def main(experiment: DictConfig) -> None: diff --git a/tests/configs/_mp_.yaml b/tests/configs/_pp_.yaml similarity index 83% rename from tests/configs/_mp_.yaml rename to tests/configs/_pp_.yaml index 6c12f1c2..4bb900c3 100644 --- a/tests/configs/_mp_.yaml +++ b/tests/configs/_pp_.yaml @@ -1,4 +1,4 @@ -# Model Parallel (MP) inference +# Pipeline Parallelism (PP) experiment_name: ${device}_${backend.name}_${benchmark.name}_${task}_mp backend: diff --git a/tests/configs/cuda_pytorch_inference_gpt2_mp.yaml b/tests/configs/cuda_pytorch_inference_gpt2_pp.yaml similarity index 83% rename from tests/configs/cuda_pytorch_inference_gpt2_mp.yaml rename to tests/configs/cuda_pytorch_inference_gpt2_pp.yaml index 5b156bed..4e2170d2 100644 --- a/tests/configs/cuda_pytorch_inference_gpt2_mp.yaml +++ b/tests/configs/cuda_pytorch_inference_gpt2_pp.yaml @@ -2,7 +2,7 @@ defaults: - benchmark: inference - backend: pytorch - _base_ # inherits from base config - - _mp_ # inherits from mp config + - _pp_ # inherits from pipeline parallelism config - _self_ # for hydra 1.1 compatibility # we use gpt2 because tiny-gpt2 fails probably because it's just too small to distribute diff --git a/tests/configs/cuda_pytorch_training_bert_dp.yaml b/tests/configs/cuda_pytorch_training_bert_dp.yaml index 77b115a9..ee154f2a 100644 --- a/tests/configs/cuda_pytorch_training_bert_dp.yaml +++ b/tests/configs/cuda_pytorch_training_bert_dp.yaml @@ -1,6 +1,6 @@ defaults: - benchmark: training - - backend: onnxruntime + - backend: pytorch - _base_ # inherits from base config - _dp_ # inherits from dp config - _self_ # for hydra 1.1 compatibility diff --git a/tests/test_cli.py b/tests/test_cli.py index c0a288ad..4bce083e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3,12 +3,24 @@ import pytest -SINGLE_DEVICE_RUNS = [ - config for config in os.listdir("tests/configs") if config.endswith(".yaml") and config != "base_config.yaml" +SINGLERUNS = [ + config + for config in os.listdir("tests/configs") + if config.endswith(".yaml") + and not config.startswith("multirun") + and not (config.startswith("_") or config.endswith("_")) +] + +MULTIRUNS = [ + config + for config in os.listdir("tests/configs") + if config.endswith(".yaml") + and config.startswith("multirun") + and not (config.startswith("_") or config.endswith("_")) ] -@pytest.mark.parametrize("config_file", SINGLE_DEVICE_RUNS) +@pytest.mark.parametrize("config_file", SINGLERUNS) def test_configs(config_file): config_name = config_file.split(".")[0] @@ -19,10 +31,24 @@ def test_configs(config_file): "tests/configs", "--config-name", config_name, - # "--multirun", - # TODO: might be worth removing names from yaml configs and have a list of test models here ], capture_output=True, ) assert result.returncode == 0, result.stderr.decode("utf-8") + + +def test_exit_code(): + result = subprocess.run( + [ + "optimum-benchmark", + "--config-dir", + "tests/configs", + "--config-name", + "cpu_pytorch_inference_bert", + "model=inexistent_model", + ], + capture_output=True, + ) + + assert result.returncode == 1, result.stderr.decode("utf-8")