Add support for amd-smi to perform device isolation (#82)
IlyasMoutawwakil authored Nov 2, 2023
1 parent 4bd0464 commit 301e484
Showing 14 changed files with 173 additions and 186 deletions.
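
At its core, the commit replaces the NVIDIA-only isolation check with helpers that also understand AMD GPUs through the amdsmi Python bindings, installed in docker/rocm.dockerfile and used in the new optimum_benchmark/backends/isolation_utils.py below. A minimal sketch of the amd-smi enumeration pattern those helpers rely on, assuming the amdsmi package has been installed from /opt/rocm/share/amd_smi as in the updated Dockerfile:

import amdsmi

amdsmi.amdsmi_init()
try:
    # one handle per visible AMD GPU
    for device_handle in amdsmi.amdsmi_get_device_handles():
        # every process currently holding the device
        for process_handle in amdsmi.amdsmi_get_process_list(device_handle):
            info = amdsmi.amdsmi_get_process_info(device_handle, process_handle)
            print(info["pid"], info["memory_usage"]["vram_mem"])
finally:
    amdsmi.amdsmi_shut_down()

The same loop, restricted to the benchmark's devices and with the benchmark PID filtered out, is what the new isolation check runs.
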
7 changes: 5 additions & 2 deletions .github/workflows/test_cuda_onnxruntime_inference.yaml
@@ -29,10 +29,13 @@ jobs:
- name: Run tests
run: docker run
--rm
--gpus '"device=0,1"'
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--entrypoint /bin/bash
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--env USE_CUDA="1"
--gpus '"device=0,1"'
opt-bench-cuda:11.8.0
-c "pip install -e .[test,onnxruntime-gpu,diffusers] && pytest -k 'cuda and onnxruntime and inference' -x"
8 changes: 5 additions & 3 deletions .github/workflows/test_cuda_onnxruntime_training.yaml
@@ -29,11 +29,13 @@ jobs:
- name: Run tests
run: docker run
--rm
--gpus '"device=0,1"'
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--entrypoint /bin/bash
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--env TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX"
--env USE_CUDA="1"
--gpus '"device=0,1"'
opt-bench-cuda:11.8.0
-c "pip install -e .[test,onnxruntime-training,peft] && python -m torch_ort.configure && pytest -k 'cuda and onnxruntime and training' -x"
7 changes: 5 additions & 2 deletions .github/workflows/test_cuda_pytorch.yaml
@@ -29,10 +29,13 @@ jobs:
- name: Run tests
run: docker run
--rm
--entrypoint /bin/bash
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--gpus '"device=0,1"'
--entrypoint /bin/bash
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
opt-bench-cuda:12.1.1
-c "pip install -e .[test,peft,diffusers] && pytest -k 'cuda and pytorch' -x"
11 changes: 7 additions & 4 deletions .github/workflows/test_rocm_pytorch.yaml
@@ -29,12 +29,15 @@ jobs:
- name: Run tests
run: docker run
--rm
--device /dev/kfd
--device /dev/dri
--net host
--pid host
--shm-size 64G
--env USE_ROCM="1"
--entrypoint /bin/bash
--shm-size=64G
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--env USE_ROCM="1"
--device /dev/kfd
--device /dev/dri/card0 --device /dev/dri/renderD128
--device /dev/dri/card1 --device /dev/dri/renderD129
opt-bench-rocm:5.6.1
-c "pip install -e .[test,peft,diffusers] && pytest -k 'cuda and pytorch' -x"
7 changes: 5 additions & 2 deletions .github/workflows/test_tensorrt_onnxruntime_inference.yaml
@@ -28,10 +28,13 @@ jobs:
- name: Run tests
run: docker run
--rm
--gpus '"device=0,1"'
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--entrypoint /bin/bash
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--env USE_CUDA="1"
--gpus '"device=0,1"'
opt-bench-tensorrt:22.12
-c "pip install -e .[test,onnxruntime-gpu,diffusers] && pytest -k 'tensorrt and onnxruntime and inference' -x"
1 change: 1 addition & 0 deletions docker/cuda.dockerfile
@@ -40,6 +40,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
rm -rf /var/lib/apt/lists/* && \
update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1

# Add local bin to PATH
ENV PATH="/home/user/.local/bin:${PATH}"

# Add user to sudoers
10 changes: 8 additions & 2 deletions docker/rocm.dockerfile
@@ -30,7 +30,7 @@ RUN addgroup --gid $GROUP_ID user
RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user

# Install python
RUN apt-get update && apt-get install -y --no-install-recommends \
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
python3.10 \
python3.10-dev \
python3-pip \
@@ -39,13 +39,19 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
rm -rf /var/lib/apt/lists/* && \
update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1

# amd-smi must be installed before switching to user
RUN apt-get update && apt-get upgrade -y && apt-get -y --no-install-recommends install amd-smi-lib
RUN pip install --upgrade pip setuptools wheel && cd /opt/rocm/share/amd_smi && pip install .
ENV PATH="/opt/rocm/bin:${PATH}"

# Add local bin to PATH
ENV PATH="/home/user/.local/bin:${PATH}"

# Add user to sudoers
RUN adduser user sudo
RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >>/etc/sudoers

# Fix permissions
# Fix AMD permissions
RUN usermod -g video user
RUN usermod -a -G render user

1 change: 1 addition & 0 deletions docker/tensorrt.dockerfile
@@ -28,6 +28,7 @@ ARG GROUP_ID
RUN addgroup --gid $GROUP_ID user
RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user

# Add local bin to PATH
ENV PATH="/home/user/.local/bin:${PATH}"

# Add user to sudoers
7 changes: 2 additions & 5 deletions examples/pytorch_bert.yaml
@@ -14,11 +14,8 @@ hydra:
job:
chdir: true
env_set:
CUDA_VISIBLE_DEVICES: 0,1

backend:
initial_isolation_check: false
continous_isolation_check: false
CUDA_VISIBLE_DEVICES: 0
CUDA_DEVICE_ORDER: PCI_BUS_ID

experiment_name: pytorch_bert
model: bert-base-uncased
3 changes: 2 additions & 1 deletion examples/text_generation_inference_llama.yaml
@@ -15,7 +15,8 @@ hydra:
chdir: true
env_set:
CUDA_VISIBLE_DEVICES: 0,1

CUDA_DEVICE_ORDER: PCI_BUS_ID

experiment_name: text_generation_inference
model: NousResearch/Llama-2-7b-hf
device: cuda
73 changes: 35 additions & 38 deletions optimum_benchmark/backends/base.py
@@ -37,15 +37,25 @@

from ..task_utils import DIFFUSION_TASKS, TEXT_GENERATION_TASKS
from .config import BackendConfigT
from .isolation_utils import (
only_this_process_is_running_on_cuda_devices,
only_this_process_will_run_on_cuda_devices,
)
from .utils import (
check_no_process_is_running_on_cuda_device,
check_only_this_process_is_running_on_cuda_device,
extract_shapes_from_diffusion_pipeline,
extract_shapes_from_model_artifacts,
)

LOGGER = getLogger("backend")

CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if CUDA_VISIBLE_DEVICES is not None:
CUDA_DEVICES = list(map(int, CUDA_VISIBLE_DEVICES.split(",")))
elif torch.cuda.is_available():
CUDA_DEVICES = list(range(torch.cuda.device_count()))
else:
CUDA_DEVICES = []


class Backend(Generic[BackendConfigT], ABC):
NAME: ClassVar[str]
@@ -77,7 +87,8 @@ def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any
self.model_type = self.pretrained_config.model_type

try:
# the processor sometimes contains information about the model's input shapes that's not available in the config
# the processor sometimes contains information about the model's
# input shapes that's not available in the config
self.pretrained_processor = AutoProcessor.from_pretrained(
pretrained_model_name_or_path=self.model, **self.hub_kwargs
)
@@ -96,52 +107,39 @@ def is_text_generation_model(self) -> bool:
def is_diffusion_pipeline(self) -> bool:
return self.task in DIFFUSION_TASKS

def check_initial_isolation(self) -> None:
if self.device.type == "cuda":
# at this point we are sure that CUDA_VISIBLE_DEVICES is set if there are multiple GPUs available on the machine
CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if CUDA_VISIBLE_DEVICES is None:
device_ids = [self.device.index if self.device.index is not None else 0]
else:
device_ids = list(map(int, CUDA_VISIBLE_DEVICES.split(",")))

LOGGER.info(f"\t+ Checking initial device(s) isolation of CUDA device(s): {device_ids}")
check_no_process_is_running_on_cuda_device(device_ids)

def check_continuous_isolation(self) -> None:
if self.device.type == "cuda":
# at this point we are sure that CUDA_VISIBLE_DEVICES is set if there are multiple GPUs available on the machine
CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if CUDA_VISIBLE_DEVICES is None:
device_ids = [self.device.index if self.device.index is not None else 0]
else:
device_ids = list(map(int, CUDA_VISIBLE_DEVICES.split(",")))

LOGGER.info(f"\t+ Checking continuous device(s) isolation of CUDA device(s): {device_ids}")
self.isolation_thread = Process(
target=check_only_this_process_is_running_on_cuda_device,
args=(device_ids, os.getpid()),
daemon=True,
)
self.isolation_thread.start()

def configure(self, config: BackendConfigT) -> None:
LOGGER.info(f"Configuring {self.NAME} backend")
self.config = config

# seeding backend
self.seed()

# isolation options
if self.config.initial_isolation_check:
self.check_initial_isolation()
if self.config.continous_isolation_check:
self.check_continuous_isolation()

# seeding backend
LOGGER.info(f"\t+ Seeding backend with seed {self.config.seed}")
self.seed()

# clean up options
if self.config.delete_cache:
LOGGER.info("\t+ Model cache will be deleted after benchmark")

def check_initial_isolation(self) -> None:
if self.device.type == "cuda":
LOGGER.info(f"\t+ Checking initial device(s) isolation of CUDA device(s): {CUDA_DEVICES}")
only_this_process_is_running_on_cuda_devices(cuda_devices=CUDA_DEVICES, benchmark_pid=os.getpid())

def check_continuous_isolation(self) -> None:
if self.device.type == "cuda":
LOGGER.info(f"\t+ Checking continuous device(s) isolation of CUDA device(s): {CUDA_DEVICES}")
self.isolation_thread = Process(
target=only_this_process_will_run_on_cuda_devices,
kwargs={"cuda_devices": CUDA_DEVICES, "benchmark_pid": os.getpid()},
daemon=True,
)
self.isolation_thread.start()

def seed(self) -> None:
# https://pytorch.org/docs/stable/notes/randomness.html
random.seed(self.config.seed)
@@ -150,16 +148,15 @@ def seed(self) -> None:

def prepare_input(self, input: Dict[str, Any]) -> Dict[str, Any]:
if self.is_diffusion_pipeline():
# diffusion pipelines expect a list of strings as input
# diffusion pipelines takes a list of strings
return input
else:
# models expect tensors on the target device as input
# models expect tensors on the target device
for key, value in input.items():
input[key] = value.to(self.device)

return input

# compiling in openvino requires input shapes, trt ep requires max tokens, etc.
def prepare_for_inference(self, **kwargs) -> None:
pass

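
The prepare_for_inference hook above is deliberately a no-op in the base class; as its comment notes, backends that compile ahead of time (static input shapes for OpenVINO, maximum token counts for the TensorRT execution provider) override it. A hypothetical sketch, reusing the imports already present in base.py; the class name and attribute below are illustrative only and not part of this commit:

class StaticShapeBackend(Backend):
    NAME = "static-shape-example"  # hypothetical backend, for illustration only

    def prepare_for_inference(self, **kwargs) -> None:
        # an ahead-of-time compiling backend consumes shape hints here,
        # e.g. static input shapes or maximum generated token counts
        self.static_shapes = kwargs.get("input_shapes", {})
        LOGGER.info(f"\t+ Preparing for inference with static shapes: {self.static_shapes}")
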
95 changes: 95 additions & 0 deletions optimum_benchmark/backends/isolation_utils.py
@@ -0,0 +1,95 @@
import os
import signal
import time
from typing import Dict, List

from ..env_utils import is_nvidia_system, is_rocm_system
from ..import_utils import is_py3nvml_available, is_pyrsmi_available


def only_this_process_is_running_on_cuda_devices(cuda_devices: List[int], benchmark_pid: int) -> None:
"""
Raises a RuntimeError if any process other than the benchmark process is running on the specified CUDA devices.
"""
pids: Dict[int, set] = {}
for device_id in cuda_devices:
pids[device_id] = set()

if is_nvidia_system():
if not is_py3nvml_available():
raise ValueError(
"check_no_process_is_running_on_cuda_device requires py3nvml. "
"Please install it with `pip install py3nvml`."
)
import py3nvml.py3nvml as nvml

nvml.nvmlInit()
for device_id in cuda_devices:
device_handle = nvml.nvmlDeviceGetHandleByIndex(device_id)
pids[device_id] = set(nvml.nvmlDeviceGetComputeRunningProcesses(device_handle))
nvml.nvmlShutdown()
elif is_rocm_system():
if not is_pyrsmi_available():
raise ValueError(
"check_no_process_is_running_on_cuda_device requires pyrsmi. "
"Please follow the instructions at https://github.com/RadeonOpenCompute/amdsmi/tree/master"
)
import amdsmi as rocml

rocml.amdsmi_init()
devices_handles = rocml.amdsmi_get_device_handles()
for device_id in cuda_devices:
device_handle = devices_handles[device_id]
processes_handles = rocml.amdsmi_get_process_list(device_handle)
for process_handle in processes_handles:
info = rocml.amdsmi_get_process_info(device_handle, process_handle)
if info["memory_usage"]["vram_mem"] == 4096:
continue
pids[device_id].add(info["pid"])
rocml.amdsmi_shut_down()
else:
raise ValueError("check_no_process_is_running_on_cuda_device is only supported on NVIDIA and AMD GPUs.")

all_pids = set()
for device_id in cuda_devices:
all_pids |= pids[device_id]
other_pids = all_pids - {benchmark_pid}

if len(other_pids) > 0:
error_message = f"Expected only process {benchmark_pid} on device(s) {cuda_devices}, but found {other_pids}."

# for pid in other_pids:
# error_message += f"\nProcess {pid} info: {get_pid_info(pid)}"

raise RuntimeError(error_message)


def only_this_process_will_run_on_cuda_devices(cuda_devices: List[int], benchmark_pid: int) -> None:
"""
Kills the benchmark process if any other process is running on the specified CUDA devices.
"""
while True:
try:
only_this_process_is_running_on_cuda_devices(cuda_devices, benchmark_pid)
time.sleep(0.1)
except RuntimeError as exception:
os.kill(benchmark_pid, signal.SIGTERM)
raise exception


## we can report more information about the process to explain the source of the error
## but that might be dangerous in a CI context

# import psutil

# def get_pid_info(pid: int) -> Dict[str, str]:
# """Returns a dictionary containing the process' information."""

# process = psutil.Process(pid)

# return {
# "pid": pid,
# "name": process.name(),
# "username": process.username(),
# "cmdline": " ".join(process.cmdline()),
# }
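
Together with the configure() changes in backends/base.py above, the new helpers are used roughly as follows; a standalone usage sketch, with the device list chosen for illustration:

import os
from multiprocessing import Process

from optimum_benchmark.backends.isolation_utils import (
    only_this_process_is_running_on_cuda_devices,
    only_this_process_will_run_on_cuda_devices,
)

cuda_devices = [0, 1]  # e.g. parsed from CUDA_VISIBLE_DEVICES, as in base.py
benchmark_pid = os.getpid()

# one-shot check: raises RuntimeError if any other process is on the devices
only_this_process_is_running_on_cuda_devices(cuda_devices, benchmark_pid)

# continuous check: a daemon process that SIGTERMs the benchmark if another
# process ever shows up on the devices
watchdog = Process(
    target=only_this_process_will_run_on_cuda_devices,
    kwargs={"cuda_devices": cuda_devices, "benchmark_pid": benchmark_pid},
    daemon=True,
)
watchdog.start()

Running the check in a separate daemon process, as base.py does, keeps the watchdog alive independently of what the benchmark process itself is doing.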