Add support for amd-smi to perform device isolation (#82)
IlyasMoutawwakil authored Nov 2, 2023
1 parent 4bd0464 commit 301e484
Showing 14 changed files with 173 additions and 186 deletions.
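
At its core, the commit replaces the NVIDIA-only isolation check with helpers that also understand AMD GPUs through the amdsmi Python bindings, installed in docker/rocm.dockerfile and used in the new optimum_benchmark/backends/isolation_utils.py below. A minimal sketch of the amd-smi enumeration pattern those helpers rely on, assuming the amdsmi package has been installed from /opt/rocm/share/amd_smi as in the updated Dockerfile:

import amdsmi

amdsmi.amdsmi_init()
try:
    # one handle per visible AMD GPU
    for device_handle in amdsmi.amdsmi_get_device_handles():
        # every process currently holding the device
        for process_handle in amdsmi.amdsmi_get_process_list(device_handle):
            info = amdsmi.amdsmi_get_process_info(device_handle, process_handle)
            print(info["pid"], info["memory_usage"]["vram_mem"])
finally:
    amdsmi.amdsmi_shut_down()

The same loop, restricted to the benchmark's devices and with the benchmark PID filtered out, is what the new isolation check runs.
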
7 changes: 5 additions & 2 deletions .github/workflows/test_cuda_onnxruntime_inference.yaml
@@ -29,10 +29,13 @@ jobs:
- name: Run tests
run: docker run
--rm
--gpus '"device=0,1"'
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--entrypoint /bin/bash
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--env USE_CUDA="1"
--gpus '"device=0,1"'
opt-bench-cuda:11.8.0
-c "pip install -e .[test,onnxruntime-gpu,diffusers] && pytest -k 'cuda and onnxruntime and inference' -x"
8 changes: 5 additions & 3 deletions .github/workflows/test_cuda_onnxruntime_training.yaml
@@ -29,11 +29,13 @@ jobs:
- name: Run tests
run: docker run
--rm
--gpus '"device=0,1"'
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--entrypoint /bin/bash
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--env TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX"
--env USE_CUDA="1"
--gpus '"device=0,1"'
opt-bench-cuda:11.8.0
-c "pip install -e .[test,onnxruntime-training,peft] && python -m torch_ort.configure && pytest -k 'cuda and onnxruntime and training' -x"
7 changes: 5 additions & 2 deletions .github/workflows/test_cuda_pytorch.yaml
@@ -29,10 +29,13 @@ jobs:
- name: Run tests
run: docker run
--rm
--entrypoint /bin/bash
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--gpus '"device=0,1"'
--entrypoint /bin/bash
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
opt-bench-cuda:12.1.1
-c "pip install -e .[test,peft,diffusers] && pytest -k 'cuda and pytorch' -x"
11 changes: 7 additions & 4 deletions .github/workflows/test_rocm_pytorch.yaml
@@ -29,12 +29,15 @@ jobs:
- name: Run tests
run: docker run
--rm
--device /dev/kfd
--device /dev/dri
--net host
--pid host
--shm-size 64G
--env USE_ROCM="1"
--entrypoint /bin/bash
--shm-size=64G
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--env USE_ROCM="1"
--device /dev/kfd
--device /dev/dri/card0 --device /dev/dri/renderD128
--device /dev/dri/card1 --device /dev/dri/renderD129
opt-bench-rocm:5.6.1
-c "pip install -e .[test,peft,diffusers] && pytest -k 'cuda and pytorch' -x"
7 changes: 5 additions & 2 deletions .github/workflows/test_tensorrt_onnxruntime_inference.yaml
@@ -28,10 +28,13 @@ jobs:
- name: Run tests
run: docker run
--rm
--gpus '"device=0,1"'
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--entrypoint /bin/bash
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--env USE_CUDA="1"
--gpus '"device=0,1"'
opt-bench-tensorrt:22.12
-c "pip install -e .[test,onnxruntime-gpu,diffusers] && pytest -k 'tensorrt and onnxruntime and inference' -x"
1 change: 1 addition & 0 deletions docker/cuda.dockerfile
@@ -40,6 +40,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
rm -rf /var/lib/apt/lists/* && \
update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1

# Add local bin to PATH
ENV PATH="/home/user/.local/bin:${PATH}"

# Add user to sudoers
10 changes: 8 additions & 2 deletions docker/rocm.dockerfile
@@ -30,7 +30,7 @@ RUN addgroup --gid $GROUP_ID user
RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user

# Install python
RUN apt-get update && apt-get install -y --no-install-recommends \
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
python3.10 \
python3.10-dev \
python3-pip \
@@ -39,13 +39,19 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
rm -rf /var/lib/apt/lists/* && \
update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1

# amd-smi must be installed before switching to user
RUN apt-get update && apt-get upgrade -y && apt-get -y --no-install-recommends install amd-smi-lib
RUN pip install --upgrade pip setuptools wheel && cd /opt/rocm/share/amd_smi && pip install .
ENV PATH="/opt/rocm/bin:${PATH}"

# Add local bin to PATH
ENV PATH="/home/user/.local/bin:${PATH}"

# Add user to sudoers
RUN adduser user sudo
RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >>/etc/sudoers

# Fix permissions
# Fix AMD permissions
RUN usermod -g video user
RUN usermod -a -G render user

1 change: 1 addition & 0 deletions docker/tensorrt.dockerfile
@@ -28,6 +28,7 @@ ARG GROUP_ID
RUN addgroup --gid $GROUP_ID user
RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user

# Add local bin to PATH
ENV PATH="/home/user/.local/bin:${PATH}"

# Add user to sudoers
7 changes: 2 additions & 5 deletions examples/pytorch_bert.yaml
@@ -14,11 +14,8 @@ hydra:
job:
chdir: true
env_set:
CUDA_VISIBLE_DEVICES: 0,1

backend:
initial_isolation_check: false
continous_isolation_check: false
CUDA_VISIBLE_DEVICES: 0
CUDA_DEVICE_ORDER: PCI_BUS_ID

experiment_name: pytorch_bert
model: bert-base-uncased
3 changes: 2 additions & 1 deletion examples/text_generation_inference_llama.yaml
@@ -15,7 +15,8 @@ hydra:
chdir: true
env_set:
CUDA_VISIBLE_DEVICES: 0,1

CUDA_DEVICE_ORDER: PCI_BUS_ID

experiment_name: text_generation_inference
model: NousResearch/Llama-2-7b-hf
device: cuda
73 changes: 35 additions & 38 deletions optimum_benchmark/backends/base.py
@@ -37,15 +37,25 @@

from ..task_utils import DIFFUSION_TASKS, TEXT_GENERATION_TASKS
from .config import BackendConfigT
from .isolation_utils import (
only_this_process_is_running_on_cuda_devices,
only_this_process_will_run_on_cuda_devices,
)
from .utils import (
check_no_process_is_running_on_cuda_device,
check_only_this_process_is_running_on_cuda_device,
extract_shapes_from_diffusion_pipeline,
extract_shapes_from_model_artifacts,
)

LOGGER = getLogger("backend")

CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if CUDA_VISIBLE_DEVICES is not None:
CUDA_DEVICES = list(map(int, CUDA_VISIBLE_DEVICES.split(",")))
elif torch.cuda.is_available():
CUDA_DEVICES = list(range(torch.cuda.device_count()))
else:
CUDA_DEVICES = []


class Backend(Generic[BackendConfigT], ABC):
NAME: ClassVar[str]
@@ -77,7 +87,8 @@ def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any
self.model_type = self.pretrained_config.model_type

try:
# the processor sometimes contains information about the model's input shapes that's not available in the config
# the processor sometimes contains information about the model's
# input shapes that's not available in the config
self.pretrained_processor = AutoProcessor.from_pretrained(
pretrained_model_name_or_path=self.model, **self.hub_kwargs
)
@@ -96,52 +107,39 @@ def is_text_generation_model(self) -> bool:
def is_diffusion_pipeline(self) -> bool:
return self.task in DIFFUSION_TASKS

def check_initial_isolation(self) -> None:
if self.device.type == "cuda":
# at this point we are sure that CUDA_VISIBLE_DEVICES is set if there are multiple GPUs available on the machine
CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if CUDA_VISIBLE_DEVICES is None:
device_ids = [self.device.index if self.device.index is not None else 0]
else:
device_ids = list(map(int, CUDA_VISIBLE_DEVICES.split(",")))

LOGGER.info(f"\t+ Checking initial device(s) isolation of CUDA device(s): {device_ids}")
check_no_process_is_running_on_cuda_device(device_ids)

def check_continuous_isolation(self) -> None:
if self.device.type == "cuda":
# at this point we are sure that CUDA_VISIBLE_DEVICES is set if there are multiple GPUs available on the machine
CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if CUDA_VISIBLE_DEVICES is None:
device_ids = [self.device.index if self.device.index is not None else 0]
else:
device_ids = list(map(int, CUDA_VISIBLE_DEVICES.split(",")))

LOGGER.info(f"\t+ Checking continuous device(s) isolation of CUDA device(s): {device_ids}")
self.isolation_thread = Process(
target=check_only_this_process_is_running_on_cuda_device,
args=(device_ids, os.getpid()),
daemon=True,
)
self.isolation_thread.start()

def configure(self, config: BackendConfigT) -> None:
LOGGER.info(f"Configuring {self.NAME} backend")
self.config = config

# seeding backend
self.seed()

# isolation options
if self.config.initial_isolation_check:
self.check_initial_isolation()
if self.config.continous_isolation_check:
self.check_continuous_isolation()

# seeding backend
LOGGER.info(f"\t+ Seeding backend with seed {self.config.seed}")
self.seed()

# clean up options
if self.config.delete_cache:
LOGGER.info("\t+ Model cache will be deleted after benchmark")

def check_initial_isolation(self) -> None:
if self.device.type == "cuda":
LOGGER.info(f"\t+ Checking initial device(s) isolation of CUDA device(s): {CUDA_DEVICES}")
only_this_process_is_running_on_cuda_devices(cuda_devices=CUDA_DEVICES, benchmark_pid=os.getpid())

def check_continuous_isolation(self) -> None:
if self.device.type == "cuda":
LOGGER.info(f"\t+ Checking continuous device(s) isolation of CUDA device(s): {CUDA_DEVICES}")
self.isolation_thread = Process(
target=only_this_process_will_run_on_cuda_devices,
kwargs={"cuda_devices": CUDA_DEVICES, "benchmark_pid": os.getpid()},
daemon=True,
)
self.isolation_thread.start()

def seed(self) -> None:
# https://pytorch.org/docs/stable/notes/randomness.html
random.seed(self.config.seed)
@@ -150,16 +148,15 @@ def seed(self) -> None:

def prepare_input(self, input: Dict[str, Any]) -> Dict[str, Any]:
if self.is_diffusion_pipeline():
# diffusion pipelines expect a list of strings as input
# diffusion pipelines takes a list of strings
return input
else:
# models expect tensors on the target device as input
# models expect tensors on the target device
for key, value in input.items():
input[key] = value.to(self.device)

return input

# compiling in openvino requires input shapes, trt ep requires max tokens, etc.
def prepare_for_inference(self, **kwargs) -> None:
pass

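
The prepare_for_inference hook above is deliberately a no-op in the base class; as its comment notes, backends that compile ahead of time (static input shapes for OpenVINO, maximum token counts for the TensorRT execution provider) override it. A hypothetical sketch, reusing the imports already present in base.py; the class name and attribute below are illustrative only and not part of this commit:

class StaticShapeBackend(Backend):
    NAME = "static-shape-example"  # hypothetical backend, for illustration only

    def prepare_for_inference(self, **kwargs) -> None:
        # an ahead-of-time compiling backend consumes shape hints here,
        # e.g. static input shapes or maximum generated token counts
        self.static_shapes = kwargs.get("input_shapes", {})
        LOGGER.info(f"\t+ Preparing for inference with static shapes: {self.static_shapes}")
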
95 changes: 95 additions & 0 deletions optimum_benchmark/backends/isolation_utils.py
@@ -0,0 +1,95 @@
import os
import signal
import time
from typing import Dict, List

from ..env_utils import is_nvidia_system, is_rocm_system
from ..import_utils import is_py3nvml_available, is_pyrsmi_available


def only_this_process_is_running_on_cuda_devices(cuda_devices: List[int], benchmark_pid: int) -> None:
"""
Raises a RuntimeError if any process other than the benchmark process is running on the specified CUDA devices.
"""
pids: Dict[int, set] = {}
for device_id in cuda_devices:
pids[device_id] = set()

if is_nvidia_system():
if not is_py3nvml_available():
raise ValueError(
"check_no_process_is_running_on_cuda_device requires py3nvml. "
"Please install it with `pip install py3nvml`."
)
import py3nvml.py3nvml as nvml

nvml.nvmlInit()
for device_id in cuda_devices:
device_handle = nvml.nvmlDeviceGetHandleByIndex(device_id)
pids[device_id] = set(nvml.nvmlDeviceGetComputeRunningProcesses(device_handle))
nvml.nvmlShutdown()
elif is_rocm_system():
if not is_pyrsmi_available():
raise ValueError(
"check_no_process_is_running_on_cuda_device requires pyrsmi. "
"Please follow the instructions at https://github.com/RadeonOpenCompute/amdsmi/tree/master"
)
import amdsmi as rocml

rocml.amdsmi_init()
devices_handles = rocml.amdsmi_get_device_handles()
for device_id in cuda_devices:
device_handle = devices_handles[device_id]
processes_handles = rocml.amdsmi_get_process_list(device_handle)
for process_handle in processes_handles:
info = rocml.amdsmi_get_process_info(device_handle, process_handle)
if info["memory_usage"]["vram_mem"] == 4096:
continue
pids[device_id].add(info["pid"])
rocml.amdsmi_shut_down()
else:
raise ValueError("check_no_process_is_running_on_cuda_device is only supported on NVIDIA and AMD GPUs.")

all_pids = set()
for device_id in cuda_devices:
all_pids |= pids[device_id]
other_pids = all_pids - {benchmark_pid}

if len(other_pids) > 0:
error_message = f"Expected only process {benchmark_pid} on device(s) {cuda_devices}, but found {other_pids}."

# for pid in other_pids:
# error_message += f"\nProcess {pid} info: {get_pid_info(pid)}"

raise RuntimeError(error_message)


def only_this_process_will_run_on_cuda_devices(cuda_devices: List[int], benchmark_pid: int) -> None:
"""
Kills the benchmark process if any other process is running on the specified CUDA devices.
"""
while True:
try:
only_this_process_is_running_on_cuda_devices(cuda_devices, benchmark_pid)
time.sleep(0.1)
except RuntimeError as exception:
os.kill(benchmark_pid, signal.SIGTERM)
raise exception


## we can report more information about the process to explain the source of the error
## but that might be dangerous in a CI context

# import psutil

# def get_pid_info(pid: int) -> Dict[str, str]:
# """Returns a dictionary containing the process' information."""

# process = psutil.Process(pid)

# return {
# "pid": pid,
# "name": process.name(),
# "username": process.username(),
# "cmdline": " ".join(process.cmdline()),
# }
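
Together with the configure() changes in backends/base.py above, the new helpers are used roughly as follows; a standalone usage sketch, with the device list chosen for illustration:

import os
from multiprocessing import Process

from optimum_benchmark.backends.isolation_utils import (
    only_this_process_is_running_on_cuda_devices,
    only_this_process_will_run_on_cuda_devices,
)

cuda_devices = [0, 1]  # e.g. parsed from CUDA_VISIBLE_DEVICES, as in base.py
benchmark_pid = os.getpid()

# one-shot check: raises RuntimeError if any other process is on the devices
only_this_process_is_running_on_cuda_devices(cuda_devices, benchmark_pid)

# continuous check: a daemon process that SIGTERMs the benchmark if another
# process ever shows up on the devices
watchdog = Process(
    target=only_this_process_will_run_on_cuda_devices,
    kwargs={"cuda_devices": cuda_devices, "benchmark_pid": benchmark_pid},
    daemon=True,
)
watchdog.start()

Running the check in a separate daemon process, as base.py does, keeps the watchdog alive independently of what the benchmark process itself is doing.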