From 2b41637f1e3382478bf12d640bf9b2e56ef2abcc Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Thu, 9 Nov 2023 20:05:48 +0000
Subject: [PATCH] added rocm5.7 support

---
 .github/workflows/test_rocm_pytorch.yaml      | 46 ++++++++++++++----
 docker/rocm.dockerfile                        | 11 +++--
 optimum_benchmark/backends/isolation_utils.py | 48 ++++++++++++-------
 optimum_benchmark/import_utils.py             |  5 ++
 setup.py                                      | 14 +++---
 5 files changed, 88 insertions(+), 36 deletions(-)

diff --git a/.github/workflows/test_rocm_pytorch.yaml b/.github/workflows/test_rocm_pytorch.yaml
index bb337f2a..20ee74c3 100644
--- a/.github/workflows/test_rocm_pytorch.yaml
+++ b/.github/workflows/test_rocm_pytorch.yaml
@@ -11,20 +11,43 @@ concurrency:
 
 jobs:
   build_image_and_run_gpu_tests:
-    runs-on: hf-amd-mi210-dev
+    strategy:
+      fail-fast: false
+      matrix:
+        image:
+          [
+            {
+              rocm_version: 5.6.1,
+              torch_rocm_version: 5.6,
+              torch_pre_release: 0,
+            },
+            {
+              rocm_version: 5.7,
+              torch_rocm_version: 5.7,
+              torch_pre_release: 1,
+            },
+          ]
+        runner: [hf-amd-mi210-dev]
+
+    runs-on: ${{ matrix.runner }}
     steps:
-      - name: Checkout
-        uses: actions/checkout@v2
+      - name: Checkout code
+        uses: actions/checkout@v3
 
       - name: Build image
         run: docker build
           --file docker/rocm.dockerfile
           --build-arg USER_ID=$(id -u)
           --build-arg GROUP_ID=$(id -g)
-          --build-arg ROCM_VERSION=5.6.1
-          --build-arg TORCH_ROCM=rocm5.6
-          --tag opt-bench-rocm:5.6.1
+          --build-arg ROCM_VERSION=$ROCM_VERSION
+          --build-arg TORCH_PRE_RELEASE=$TORCH_PRE_RELEASE
+          --build-arg TORCH_ROCM_VERSION=$TORCH_ROCM_VERSION
+          --tag opt-bench-rocm:$TORCH_ROCM_VERSION
           .
+        env:
+          ROCM_VERSION: ${{ matrix.image.rocm_version }}
+          TORCH_ROCM_VERSION: ${{ matrix.image.torch_rocm_version }}
+          TORCH_PRE_RELEASE: ${{ matrix.image.torch_pre_release }}
 
       - name: Run tests
         run: docker run
@@ -33,11 +56,14 @@ jobs:
           --pid host
           --shm-size 64G
           --env USE_ROCM="1"
-          --entrypoint /bin/bash
+          --volume $HOME/.cache/huggingface:/home/user/.cache/huggingface
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
           --device /dev/kfd
-          --device /dev/dri/card0 --device /dev/dri/renderD128
-          --device /dev/dri/card1 --device /dev/dri/renderD129
-          opt-bench-rocm:5.6.1
+          --device /dev/dri/renderD128
+          --device /dev/dri/renderD129
+          --entrypoint /bin/bash
+          opt-bench-rocm:$TORCH_ROCM_VERSION
           -c "pip install -e .[test,peft,diffusers] && pytest -k 'cuda and pytorch' -x"
+        env:
+          TORCH_ROCM_VERSION: ${{ matrix.image.torch_rocm_version }}

diff --git a/docker/rocm.dockerfile b/docker/rocm.dockerfile
index a091d029..c198ee7d 100644
--- a/docker/rocm.dockerfile
+++ b/docker/rocm.dockerfile
@@ -17,7 +17,8 @@ ARG UBUNTU_VERSION=22.04
 
 FROM rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}
 
-ARG TORCH_ROCM=rocm5.6
+ARG TORCH_PRE_RELEASE=0
+ARG TORCH_ROCM_VERSION=5.6
 
 # Ignore interactive questions during `docker build`
 ENV DEBIAN_FRONTEND noninteractive
@@ -62,5 +63,9 @@ WORKDIR /home/user
 # Update pip
 RUN pip install --upgrade pip
 
-# Install PyTorch
-RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM}
+# Install PyTorch (pre-release/nightly wheels when TORCH_PRE_RELEASE=1)
+RUN if [ "${TORCH_PRE_RELEASE}" = "1" ]; then \
+        pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm${TORCH_ROCM_VERSION} ; \
+    else \
+        pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm${TORCH_ROCM_VERSION} ; \
+    fi
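For reference, the ROCm 5.7 entry of the matrix above expands to roughly the following local build command (a sketch assuming the repository root as build context; the arg values mirror the workflow's env mapping):

    docker build \
        --file docker/rocm.dockerfile \
        --build-arg USER_ID=$(id -u) \
        --build-arg GROUP_ID=$(id -g) \
        --build-arg ROCM_VERSION=5.7 \
        --build-arg TORCH_PRE_RELEASE=1 \
        --build-arg TORCH_ROCM_VERSION=5.7 \
        --tag opt-bench-rocm:5.7 \
        .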
diff --git a/optimum_benchmark/backends/isolation_utils.py b/optimum_benchmark/backends/isolation_utils.py
index 1464956a..63a7a2f3 100644
--- a/optimum_benchmark/backends/isolation_utils.py
+++ b/optimum_benchmark/backends/isolation_utils.py
@@ -4,7 +4,7 @@
 from typing import Dict, List
 
 from ..env_utils import is_nvidia_system, is_rocm_system
-from ..import_utils import is_py3nvml_available, is_pyrsmi_available
+from ..import_utils import is_amdsmi_available, is_py3nvml_available, torch_version
 
 
 def only_this_process_is_running_on_cuda_devices(cuda_devices: List[int], benchmark_pid: int) -> None:
@@ -29,24 +29,40 @@ def only_this_process_is_running_on_cuda_devices(cuda_devices: List[int], benchm
             pids[device_id] = set(nvml.nvmlDeviceGetComputeRunningProcesses(device_handle))
         nvml.nvmlShutdown()
     elif is_rocm_system():
-        if not is_pyrsmi_available():
+        rocm_version = torch_version().split("rocm")[-1]
+
+        if not is_amdsmi_available():
             raise ValueError(
-                "check_no_process_is_running_on_cuda_device requires pyrsmi. "
+                "check_no_process_is_running_on_cuda_device requires amdsmi. "
                 "Please follow the instructions at https://github.com/RadeonOpenCompute/amdsmi/tree/master"
             )
-        import amdsmi as rocml
-
-        rocml.amdsmi_init()
-        devices_handles = rocml.amdsmi_get_device_handles()
-        for device_id in cuda_devices:
-            device_handle = devices_handles[device_id]
-            processes_handles = rocml.amdsmi_get_process_list(device_handle)
-            for process_handle in processes_handles:
-                info = rocml.amdsmi_get_process_info(device_handle, process_handle)
-                if info["memory_usage"]["vram_mem"] == 4096:
-                    continue
-                pids[device_id].add(info["pid"])
-        rocml.amdsmi_shut_down()
+        import amdsmi as smi
+
+        smi.amdsmi_init()
+
+        if rocm_version >= "5.7":
+            # starting from ROCm 5.7, the amdsmi API uses renamed entry points
+            devices_handles = smi.amdsmi_get_processor_handles()
+            for device_id in cuda_devices:
+                device_handle = devices_handles[device_id]
+                processes_handles = smi.amdsmi_get_gpu_process_list(device_handle)
+                for process_handle in processes_handles:
+                    info = smi.amdsmi_get_gpu_process_info(device_handle, process_handle)
+                    if info["memory_usage"]["vram_mem"] == 4096:
+                        continue
+                    pids[device_id].add(info["pid"])
+        else:
+            devices_handles = smi.amdsmi_get_device_handles()
+            for device_id in cuda_devices:
+                device_handle = devices_handles[device_id]
+                processes_handles = smi.amdsmi_get_process_list(device_handle)
+                for process_handle in processes_handles:
+                    info = smi.amdsmi_get_process_info(device_handle, process_handle)
+                    if info["memory_usage"]["vram_mem"] == 4096:
+                        continue
+                    pids[device_id].add(info["pid"])
+
+        smi.amdsmi_shut_down()
     else:
         raise ValueError("check_no_process_is_running_on_cuda_device is only supported on NVIDIA and AMD GPUs.")
 
diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py
index 11610627..969bc3f0 100644
--- a/optimum_benchmark/import_utils.py
+++ b/optimum_benchmark/import_utils.py
@@ -14,6 +14,7 @@
 _neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None
 _pyrsmi_available = importlib.util.find_spec("pyrsmi") is not None
 _codecarbon_available = importlib.util.find_spec("codecarbon") is not None
+_amdsmi_available = importlib.util.find_spec("amdsmi") is not None
 
 
 def is_onnx_available():
@@ -36,6 +37,10 @@ def is_pyrsmi_available():
     return _pyrsmi_available
 
 
+def is_amdsmi_available():
+    return _amdsmi_available
+
+
 def is_torch_available():
     return _torch_available
 
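The version gate above keys off the ROCm tag that PyTorch embeds in its version string (e.g. "2.2.0.dev20231109+rocm5.7"), which torch_version() exposes; note that rocm_version >= "5.7" is a plain string comparison, which holds for the 5.6/5.7 pair this patch targets. A quick sanity check from inside either image (a sketch, assuming the containers built by the workflow above):

    # Which ROCm build of torch is installed, hence which amdsmi branch is taken:
    python -c "import torch; print(torch.__version__.split('rocm')[-1])"
    # The renamed entry point should only resolve on the ROCm >= 5.7 amdsmi:
    python -c "import amdsmi; amdsmi.amdsmi_init(); print(amdsmi.amdsmi_get_processor_handles())"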
diff --git a/setup.py b/setup.py
index 4d72046c..9ad63753 100644
--- a/setup.py
+++ b/setup.py
@@ -3,19 +3,19 @@
 from setuptools import find_packages, setup
 
-OPTIMUM_VERSION = "1.13.0"
+OPTIMUM_VERSION = "1.14.0"
 
 INSTALL_REQUIRES = [
     # Mandatory HF dependencies
     f"optimum>={OPTIMUM_VERSION}",  # backends, tasks and input generation
     "accelerate",  # distributed inference and no weights init
     # Hydra
-    "omegaconf>=2.3.0",
-    "hydra-core>=1.3.2",
-    "hydra_colorlog>=1.2.0",
+    "omegaconf",
+    "hydra-core",
+    "hydra_colorlog",
     # Other
-    "psutil>=5.9.0",
-    "pandas>=2.0.0",
+    "psutil",
+    "pandas",
 ]
 
 # We may allow to install CUDA or RoCm dependencies even when building in a non-CUDA or non-RoCm environment.
@@ -54,7 +54,7 @@
     "onnxruntime-gpu": [f"optimum[onnxruntime-gpu]>={OPTIMUM_VERSION}"],
     "onnxruntime-training": ["torch-ort", "onnxruntime-training"],
     # server-like backends
-    "text-generation-inference": ["docker>=6.1.3"],
+    "text-generation-inference": ["docker>=6.0.0"],
     # specific settings
     "diffusers": ["diffusers"],
     "peft": ["peft"],
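With the pins relaxed, the editable install in the test step resolves against whatever torch the image already ships, including the ROCm 5.7 nightly. Inside the container, the test entrypoint boils down to (taken verbatim from the workflow's run command):

    pip install -e .[test,peft,diffusers]
    pytest -k 'cuda and pytorch' -x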