Skip to content

Commit

Permalink
Add rocm5.7 support (#84)
Browse files Browse the repository at this point in the history
  • Loading branch information
IlyasMoutawwakil authored Nov 9, 2023
1 parent 33cc6e8 commit 4767fd8
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 36 deletions.
46 changes: 36 additions & 10 deletions .github/workflows/test_rocm_pytorch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,43 @@ concurrency:

jobs:
build_image_and_run_gpu_tests:
runs-on: hf-amd-mi210-dev
strategy:
fail-fast: false
matrix:
image:
[
{
rocm_version: 5.6.1,
torch_rocm_version: 5.6,
torch_pre_release: 0,
},
{
rocm_version: 5.7,
torch_rocm_version: 5.7,
torch_pre_release: 1,
},
]
runner: [hf-amd-mi210-dev]

runs-on: ${{ matrix.runner }}
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Checkout code
uses: actions/checkout@v3

- name: Build image
run: docker build
--file docker/rocm.dockerfile
--build-arg USER_ID=$(id -u)
--build-arg GROUP_ID=$(id -g)
--build-arg ROCM_VERSION=5.6.1
--build-arg TORCH_ROCM=rocm5.6
--tag opt-bench-rocm:5.6.1
--build-arg ROCM_VERSION=$ROCM_VERSION
--build-arg TORCH_PRE_RELEASE=$TORCH_PRE_RELEASE
--build-arg TORCH_ROCM_VERSION=$TORCH_ROCM_VERSION
--tag opt-bench-rocm:$TORCH_ROCM_VERSION
.
env:
ROCM_VERSION: ${{ matrix.image.rocm_version }}
TORCH_ROCM_VERSION: ${{ matrix.image.torch_rocm_version }}
TORCH_PRE_RELEASE: ${{ matrix.image.torch_pre_release }}

- name: Run tests
run: docker run
Expand All @@ -33,11 +56,14 @@ jobs:
--pid host
--shm-size 64G
--env USE_ROCM="1"
--entrypoint /bin/bash
--volume $HOME/.cache/huggingface:/home/user/.cache/huggingface
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--device /dev/kfd
--device /dev/dri/card0 --device /dev/dri/renderD128
--device /dev/dri/card1 --device /dev/dri/renderD129
opt-bench-rocm:5.6.1
--device /dev/dri/renderD128
--device /dev/dri/renderD129
--entrypoint /bin/bash
opt-bench-rocm:$TORCH_ROCM_VERSION
-c "pip install -e .[test,peft,diffusers] && pytest -k 'cuda and pytorch' -x"
env:
TORCH_ROCM_VERSION: ${{ matrix.image.torch_rocm_version }}
11 changes: 8 additions & 3 deletions docker/rocm.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ ARG UBUNTU_VERSION=22.04

FROM rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}

ARG TORCH_ROCM=rocm5.6
ARG TORCH_PRE_RELEASE=0
ARG TORCH_ROCM_VERSION=5.6

# Ignore interactive questions during `docker build`
ENV DEBIAN_FRONTEND noninteractive
Expand Down Expand Up @@ -62,5 +63,9 @@ WORKDIR /home/user
# Update pip
RUN pip install --upgrade pip

# Install PyTorch
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM}
# Install PyTorch (nightly if ROCM_VERSION=5.7 or TORCH_PRE_RELEASE=1)
RUN if [ "${TORCH_PRE_RELEASE}" = "1" ]; then \
pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm${TORCH_ROCM_VERSION} ; \
else \
pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm${TORCH_ROCM_VERSION} ; \
fi
48 changes: 32 additions & 16 deletions optimum_benchmark/backends/isolation_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Dict, List

from ..env_utils import is_nvidia_system, is_rocm_system
from ..import_utils import is_py3nvml_available, is_pyrsmi_available
from ..import_utils import is_amdsmi_available, is_py3nvml_available, torch_version


def only_this_process_is_running_on_cuda_devices(cuda_devices: List[int], benchmark_pid: int) -> None:
Expand All @@ -29,24 +29,40 @@ def only_this_process_is_running_on_cuda_devices(cuda_devices: List[int], benchm
pids[device_id] = set(nvml.nvmlDeviceGetComputeRunningProcesses(device_handle))
nvml.nvmlShutdown()
elif is_rocm_system():
if not is_pyrsmi_available():
rocm_version = torch_version().split("rocm")[-1]

if not is_amdsmi_available():
raise ValueError(
"check_no_process_is_running_on_cuda_device requires pyrsmi. "
"check_no_process_is_running_on_cuda_device requires amdsmi. "
"Please follow the instructions at https://github.com/RadeonOpenCompute/amdsmi/tree/master"
)
import amdsmi as rocml

rocml.amdsmi_init()
devices_handles = rocml.amdsmi_get_device_handles()
for device_id in cuda_devices:
device_handle = devices_handles[device_id]
processes_handles = rocml.amdsmi_get_process_list(device_handle)
for process_handle in processes_handles:
info = rocml.amdsmi_get_process_info(device_handle, process_handle)
if info["memory_usage"]["vram_mem"] == 4096:
continue
pids[device_id].add(info["pid"])
rocml.amdsmi_shut_down()
import amdsmi as smi

smi.amdsmi_init()

if rocm_version >= "5.7":
# starting from rocm 5.7, the api seems to have changed names
devices_handles = smi.amdsmi_get_processor_handles()
for device_id in cuda_devices:
device_handle = devices_handles[device_id]
processes_handles = smi.amdsmi_get_gpu_process_list(device_handle)
for process_handle in processes_handles:
info = smi.amdsmi_get_gpu_process_info(device_handle, process_handle)
if info["memory_usage"]["vram_mem"] == 4096:
continue
pids[device_id].add(info["pid"])
else:
devices_handles = smi.amdsmi_get_device_handles()
for device_id in cuda_devices:
device_handle = devices_handles[device_id]
processes_handles = smi.amdsmi_get_process_list(device_handle)
for process_handle in processes_handles:
info = smi.amdsmi_get_process_info(device_handle, process_handle)
if info["memory_usage"]["vram_mem"] == 4096:
continue
pids[device_id].add(info["pid"])

smi.amdsmi_shut_down()
else:
raise ValueError("check_no_process_is_running_on_cuda_device is only supported on NVIDIA and AMD GPUs.")

Expand Down
5 changes: 5 additions & 0 deletions optimum_benchmark/import_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
_neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None
_pyrsmi_available = importlib.util.find_spec("pyrsmi") is not None
_codecarbon_available = importlib.util.find_spec("codecarbon") is not None
_amdsmi_available = importlib.util.find_spec("amdsmi") is not None


def is_onnx_available():
Expand All @@ -36,6 +37,10 @@ def is_pyrsmi_available():
return _pyrsmi_available


def is_amdsmi_available():
    """Return True if the ``amdsmi`` package is importable.

    The flag is computed once at import time via
    ``importlib.util.find_spec("amdsmi")``, so this reflects availability
    at module load, not the current instant.
    """
    return _amdsmi_available


def is_torch_available():
return _torch_available

Expand Down
14 changes: 7 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@

from setuptools import find_packages, setup

OPTIMUM_VERSION = "1.13.0"
OPTIMUM_VERSION = "1.14.0"

INSTALL_REQUIRES = [
# Mandatory HF dependencies
f"optimum>={OPTIMUM_VERSION}", # backends, tasks and input generation
"accelerate", # distributed inference and no weights init
# Hydra
"omegaconf>=2.3.0",
"hydra-core>=1.3.2",
"hydra_colorlog>=1.2.0",
"omegaconf",
"hydra-core",
"hydra_colorlog",
# Other
"psutil>=5.9.0",
"pandas>=2.0.0",
"psutil",
"pandas",
]

# We may allow to install CUDA or RoCm dependencies even when building in a non-CUDA or non-RoCm environment.
Expand Down Expand Up @@ -54,7 +54,7 @@
"onnxruntime-gpu": [f"optimum[onnxruntime-gpu]>={OPTIMUM_VERSION}"],
"onnxruntime-training": ["torch-ort", "onnxruntime-training"],
# server-like backends
"text-generation-inference": ["docker>=6.1.3"],
"text-generation-inference": ["docker>=6.0.0"],
# specific settings
"diffusers": ["diffusers"],
"peft": ["peft"],
Expand Down

0 comments on commit 4767fd8

Please sign in to comment.