Add rocm5.7 support #84

Merged: 1 commit, Nov 9, 2023

Changes from all commits
46 changes: 36 additions & 10 deletions .github/workflows/test_rocm_pytorch.yaml
@@ -11,20 +11,43 @@ concurrency:

 jobs:
   build_image_and_run_gpu_tests:
-    runs-on: hf-amd-mi210-dev
+    strategy:
+      fail-fast: false
+      matrix:
+        image:
+          [
+            {
+              rocm_version: 5.6.1,
+              torch_rocm_version: 5.6,
+              torch_pre_release: 0,
+            },
+            {
+              rocm_version: 5.7,
+              torch_rocm_version: 5.7,
+              torch_pre_release: 1,
+            },
+          ]
+        runner: [hf-amd-mi210-dev]
+
+    runs-on: ${{ matrix.runner }}
     steps:
-      - name: Checkout
-        uses: actions/checkout@v2
+      - name: Checkout code
+        uses: actions/checkout@v3

       - name: Build image
         run: docker build
           --file docker/rocm.dockerfile
           --build-arg USER_ID=$(id -u)
           --build-arg GROUP_ID=$(id -g)
-          --build-arg ROCM_VERSION=5.6.1
-          --build-arg TORCH_ROCM=rocm5.6
-          --tag opt-bench-rocm:5.6.1
+          --build-arg ROCM_VERSION=$ROCM_VERSION
+          --build-arg TORCH_PRE_RELEASE=$TORCH_PRE_RELEASE
+          --build-arg TORCH_ROCM_VERSION=$TORCH_ROCM_VERSION
+          --tag opt-bench-rocm:$TORCH_ROCM_VERSION
           .
+        env:
+          ROCM_VERSION: ${{ matrix.image.rocm_version }}
+          TORCH_ROCM_VERSION: ${{ matrix.image.torch_rocm_version }}
+          TORCH_PRE_RELEASE: ${{ matrix.image.torch_pre_release }}

       - name: Run tests
         run: docker run
@@ -33,11 +56,14 @@ jobs:
           --pid host
           --shm-size 64G
           --env USE_ROCM="1"
-          --entrypoint /bin/bash
           --volume $HOME/.cache/huggingface:/home/user/.cache/huggingface
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
           --device /dev/kfd
-          --device /dev/dri/card0 --device /dev/dri/renderD128
-          --device /dev/dri/card1 --device /dev/dri/renderD129
-          opt-bench-rocm:5.6.1
+          --device /dev/dri/renderD128
+          --device /dev/dri/renderD129
+          --entrypoint /bin/bash
+          opt-bench-rocm:$TORCH_ROCM_VERSION
           -c "pip install -e .[test,peft,diffusers] && pytest -k 'cuda and pytorch' -x"
+        env:
+          TORCH_ROCM_VERSION: ${{ matrix.image.torch_rocm_version }}
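
Note on the matrix: GitHub Actions expands strategy.matrix to the cartesian product of its axes, so the two image entries and the single runner entry above yield two independent jobs. A minimal Python sketch of that expansion (the dictionaries just mirror the matrix values; this is illustrative, not part of the workflow):

    from itertools import product

    images = [
        {"rocm_version": "5.6.1", "torch_rocm_version": "5.6", "torch_pre_release": 0},
        {"rocm_version": "5.7", "torch_rocm_version": "5.7", "torch_pre_release": 1},
    ]
    runners = ["hf-amd-mi210-dev"]

    # one CI job per (image, runner) combination
    for image, runner in product(images, runners):
        print(f"{runner}: docker build ... --tag opt-bench-rocm:{image['torch_rocm_version']}")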
11 changes: 8 additions & 3 deletions docker/rocm.dockerfile
@@ -17,7 +17,8 @@ ARG UBUNTU_VERSION=22.04

 FROM rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}

-ARG TORCH_ROCM=rocm5.6
+ARG TORCH_PRE_RELEASE=0
+ARG TORCH_ROCM_VERSION=5.6

 # Ignore interactive questions during `docker build`
 ENV DEBIAN_FRONTEND noninteractive
@@ -62,5 +63,9 @@ WORKDIR /home/user
 # Update pip
 RUN pip install --upgrade pip

-# Install PyTorch
-RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM}
+# Install PyTorch (pre-release/nightly wheels when TORCH_PRE_RELEASE=1, e.g. for rocm5.7)
+RUN if [ "${TORCH_PRE_RELEASE}" = "1" ]; then \
+        pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm${TORCH_ROCM_VERSION} ; \
+    else \
+        pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm${TORCH_ROCM_VERSION} ; \
+    fi
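
Note on the conditional install: a quick sanity check inside the built image confirms which wheel was picked up. A hedged sketch (the sample output is an assumption; torch.version.hip is only populated on ROCm builds):

    import torch

    print(torch.__version__)          # e.g. "2.2.0.dev20231105+rocm5.7" for the nightly wheel
    print(torch.version.hip)          # HIP/ROCm version the wheel targets; None on non-ROCm builds
    print(torch.cuda.is_available())  # True on a working ROCm setup (HIP is exposed via the CUDA API)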
48 changes: 32 additions & 16 deletions optimum_benchmark/backends/isolation_utils.py
@@ -4,7 +4,7 @@
 from typing import Dict, List

 from ..env_utils import is_nvidia_system, is_rocm_system
-from ..import_utils import is_py3nvml_available, is_pyrsmi_available
+from ..import_utils import is_amdsmi_available, is_py3nvml_available, torch_version


 def only_this_process_is_running_on_cuda_devices(cuda_devices: List[int], benchmark_pid: int) -> None:
@@ -29,24 +29,40 @@ def only_this_process_is_running_on_cuda_devices(cuda_devices: List[int], benchmark_pid: int) -> None:
             pids[device_id] = set(nvml.nvmlDeviceGetComputeRunningProcesses(device_handle))
         nvml.nvmlShutdown()
     elif is_rocm_system():
-        if not is_pyrsmi_available():
+        rocm_version = torch_version().split("rocm")[-1]
+
+        if not is_amdsmi_available():
             raise ValueError(
-                "check_no_process_is_running_on_cuda_device requires pyrsmi. "
+                "check_no_process_is_running_on_cuda_device requires amdsmi. "
                 "Please follow the instructions at https://github.com/RadeonOpenCompute/amdsmi/tree/master"
             )
-        import amdsmi as rocml
-
-        rocml.amdsmi_init()
-        devices_handles = rocml.amdsmi_get_device_handles()
-        for device_id in cuda_devices:
-            device_handle = devices_handles[device_id]
-            processes_handles = rocml.amdsmi_get_process_list(device_handle)
-            for process_handle in processes_handles:
-                info = rocml.amdsmi_get_process_info(device_handle, process_handle)
-                if info["memory_usage"]["vram_mem"] == 4096:
-                    continue
-                pids[device_id].add(info["pid"])
-        rocml.amdsmi_shut_down()
+        import amdsmi as smi
+
+        smi.amdsmi_init()
+
+        if rocm_version >= "5.7":
+            # starting from rocm 5.7, the api seems to have changed names
+            devices_handles = smi.amdsmi_get_processor_handles()
+            for device_id in cuda_devices:
+                device_handle = devices_handles[device_id]
+                processes_handles = smi.amdsmi_get_gpu_process_list(device_handle)
+                for process_handle in processes_handles:
+                    info = smi.amdsmi_get_gpu_process_info(device_handle, process_handle)
+                    if info["memory_usage"]["vram_mem"] == 4096:
+                        continue
+                    pids[device_id].add(info["pid"])
+        else:
+            devices_handles = smi.amdsmi_get_device_handles()
+            for device_id in cuda_devices:
+                device_handle = devices_handles[device_id]
+                processes_handles = smi.amdsmi_get_process_list(device_handle)
+                for process_handle in processes_handles:
+                    info = smi.amdsmi_get_process_info(device_handle, process_handle)
+                    if info["memory_usage"]["vram_mem"] == 4096:
+                        continue
+                    pids[device_id].add(info["pid"])
+
+        smi.amdsmi_shut_down()
     else:
         raise ValueError("check_no_process_is_running_on_cuda_device is only supported on NVIDIA and AMD GPUs.")
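
Note on the version gate: torch_version().split("rocm")[-1] recovers the ROCm suffix from the torch build string, and the >= "5.7" check compares strings lexicographically. A minimal sketch, assuming the usual "+rocm<x.y>" build tag (the sample string is illustrative):

    from packaging.version import Version

    # a ROCm wheel reports something like "2.2.0.dev20231105+rocm5.7"
    rocm_version = "2.2.0.dev20231105+rocm5.7".split("rocm")[-1]  # -> "5.7"

    assert rocm_version >= "5.7"                    # what the code above does (string comparison)
    assert Version(rocm_version) >= Version("5.7")  # would also order a hypothetical "5.10" correctly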
5 changes: 5 additions & 0 deletions optimum_benchmark/import_utils.py
@@ -14,6 +14,7 @@
 _neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None
 _pyrsmi_available = importlib.util.find_spec("pyrsmi") is not None
 _codecarbon_available = importlib.util.find_spec("codecarbon") is not None
+_amdsmi_available = importlib.util.find_spec("amdsmi") is not None


 def is_onnx_available():
@@ -36,6 +37,10 @@ def is_pyrsmi_available():
     return _pyrsmi_available


+def is_amdsmi_available():
+    return _amdsmi_available
+
+
 def is_torch_available():
     return _torch_available
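
Note on the pattern: importlib.util.find_spec probes for an installed package without importing it, so loading import_utils never initializes amdsmi's native library. A hedged sketch of the intended caller-side usage, reusing the API names from the diff above:

    from optimum_benchmark.import_utils import is_amdsmi_available

    if is_amdsmi_available():
        import amdsmi as smi  # deferred: only imported when actually present

        smi.amdsmi_init()
        try:
            handles = smi.amdsmi_get_processor_handles()  # rocm >= 5.7 naming
        finally:
            smi.amdsmi_shut_down()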
14 changes: 7 additions & 7 deletions setup.py
@@ -3,19 +3,19 @@

 from setuptools import find_packages, setup

-OPTIMUM_VERSION = "1.13.0"
+OPTIMUM_VERSION = "1.14.0"

 INSTALL_REQUIRES = [
     # Mandatory HF dependencies
     f"optimum>={OPTIMUM_VERSION}", # backends, tasks and input generation
     "accelerate", # distributed inference and no weights init
     # Hydra
-    "omegaconf>=2.3.0",
-    "hydra-core>=1.3.2",
-    "hydra_colorlog>=1.2.0",
+    "omegaconf",
+    "hydra-core",
+    "hydra_colorlog",
     # Other
-    "psutil>=5.9.0",
-    "pandas>=2.0.0",
+    "psutil",
+    "pandas",
 ]

 # We may allow to install CUDA or RoCm dependencies even when building in a non-CUDA or non-RoCm environment.
@@ -54,7 +54,7 @@
     "onnxruntime-gpu": [f"optimum[onnxruntime-gpu]>={OPTIMUM_VERSION}"],
     "onnxruntime-training": ["torch-ort", "onnxruntime-training"],
     # server-like backends
-    "text-generation-inference": ["docker>=6.1.3"],
+    "text-generation-inference": ["docker>=6.0.0"],
     # specific settings
     "diffusers": ["diffusers"],
     "peft": ["peft"],