Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rocm5.7 support #84

Merged
merged 1 commit into from
Nov 9, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
added rocm5.7 support
IlyasMoutawwakil committed Nov 9, 2023
commit 2b41637f1e3382478bf12d640bf9b2e56ef2abcc
46 changes: 36 additions & 10 deletions .github/workflows/test_rocm_pytorch.yaml
Original file line number Diff line number Diff line change
@@ -11,20 +11,43 @@ concurrency:

jobs:
build_image_and_run_gpu_tests:
runs-on: hf-amd-mi210-dev
strategy:
fail-fast: false
matrix:
image:
[
{
rocm_version: 5.6.1,
torch_rocm_version: 5.6,
torch_pre_release: 0,
},
{
rocm_version: 5.7,
torch_rocm_version: 5.7,
torch_pre_release: 1,
},
]
runner: [hf-amd-mi210-dev]

runs-on: ${{ matrix.runner }}
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Checkout code
uses: actions/checkout@v3

- name: Build image
run: docker build
--file docker/rocm.dockerfile
--build-arg USER_ID=$(id -u)
--build-arg GROUP_ID=$(id -g)
--build-arg ROCM_VERSION=5.6.1
--build-arg TORCH_ROCM=rocm5.6
--tag opt-bench-rocm:5.6.1
--build-arg ROCM_VERSION=$ROCM_VERSION
--build-arg TORCH_PRE_RELEASE=$TORCH_PRE_RELEASE
--build-arg TORCH_ROCM_VERSION=$TORCH_ROCM_VERSION
--tag opt-bench-rocm:$TORCH_ROCM_VERSION
.
env:
ROCM_VERSION: ${{ matrix.image.rocm_version }}
TORCH_ROCM_VERSION: ${{ matrix.image.torch_rocm_version }}
TORCH_PRE_RELEASE: ${{ matrix.image.torch_pre_release }}

- name: Run tests
run: docker run
@@ -33,11 +56,14 @@ jobs:
--pid host
--shm-size 64G
--env USE_ROCM="1"
--entrypoint /bin/bash
--volume $HOME/.cache/huggingface:/home/user/.cache/huggingface
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--device /dev/kfd
--device /dev/dri/card0 --device /dev/dri/renderD128
--device /dev/dri/card1 --device /dev/dri/renderD129
opt-bench-rocm:5.6.1
--device /dev/dri/renderD128
--device /dev/dri/renderD129
--entrypoint /bin/bash
opt-bench-rocm:$TORCH_ROCM_VERSION
-c "pip install -e .[test,peft,diffusers] && pytest -k 'cuda and pytorch' -x"
env:
TORCH_ROCM_VERSION: ${{ matrix.image.torch_rocm_version }}
11 changes: 8 additions & 3 deletions docker/rocm.dockerfile
Original file line number Diff line number Diff line change
@@ -17,7 +17,8 @@ ARG UBUNTU_VERSION=22.04

FROM rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}

ARG TORCH_ROCM=rocm5.6
ARG TORCH_PRE_RELEASE=0
ARG TORCH_ROCM_VERSION=5.6

# Ignore interactive questions during `docker build`
ENV DEBIAN_FRONTEND noninteractive
@@ -62,5 +63,9 @@ WORKDIR /home/user
# Update pip
RUN pip install --upgrade pip

# Install PyTorch
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM}
# Install PyTorch (nightly pre-release build if TORCH_PRE_RELEASE=1, stable release otherwise)
RUN if [ "${TORCH_PRE_RELEASE}" = "1" ]; then \
pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm${TORCH_ROCM_VERSION} ; \
else \
pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm${TORCH_ROCM_VERSION} ; \
fi
48 changes: 32 additions & 16 deletions optimum_benchmark/backends/isolation_utils.py
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
from typing import Dict, List

from ..env_utils import is_nvidia_system, is_rocm_system
from ..import_utils import is_py3nvml_available, is_pyrsmi_available
from ..import_utils import is_amdsmi_available, is_py3nvml_available, torch_version


def only_this_process_is_running_on_cuda_devices(cuda_devices: List[int], benchmark_pid: int) -> None:
@@ -29,24 +29,40 @@ def only_this_process_is_running_on_cuda_devices(cuda_devices: List[int], benchm
pids[device_id] = set(nvml.nvmlDeviceGetComputeRunningProcesses(device_handle))
nvml.nvmlShutdown()
elif is_rocm_system():
if not is_pyrsmi_available():
rocm_version = torch_version().split("rocm")[-1]

if not is_amdsmi_available():
raise ValueError(
"check_no_process_is_running_on_cuda_device requires pyrsmi. "
"check_no_process_is_running_on_cuda_device requires amdsmi. "
"Please follow the instructions at https://github.com/RadeonOpenCompute/amdsmi/tree/master"
)
import amdsmi as rocml

rocml.amdsmi_init()
devices_handles = rocml.amdsmi_get_device_handles()
for device_id in cuda_devices:
device_handle = devices_handles[device_id]
processes_handles = rocml.amdsmi_get_process_list(device_handle)
for process_handle in processes_handles:
info = rocml.amdsmi_get_process_info(device_handle, process_handle)
if info["memory_usage"]["vram_mem"] == 4096:
continue
pids[device_id].add(info["pid"])
rocml.amdsmi_shut_down()
import amdsmi as smi

smi.amdsmi_init()

if rocm_version >= "5.7":
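# starting from rocm 5.7, the amdsmi API renamed its functions (device -> processor, process -> gpu_process)
# NOTE(review): rocm_version is a string, so the version check above is lexicographic ("5.10" >= "5.7" is False) - confirm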
devices_handles = smi.amdsmi_get_processor_handles()
for device_id in cuda_devices:
device_handle = devices_handles[device_id]
processes_handles = smi.amdsmi_get_gpu_process_list(device_handle)
for process_handle in processes_handles:
info = smi.amdsmi_get_gpu_process_info(device_handle, process_handle)
if info["memory_usage"]["vram_mem"] == 4096:
continue
pids[device_id].add(info["pid"])
else:
devices_handles = smi.amdsmi_get_device_handles()
for device_id in cuda_devices:
device_handle = devices_handles[device_id]
processes_handles = smi.amdsmi_get_process_list(device_handle)
for process_handle in processes_handles:
info = smi.amdsmi_get_process_info(device_handle, process_handle)
if info["memory_usage"]["vram_mem"] == 4096:
continue
pids[device_id].add(info["pid"])

smi.amdsmi_shut_down()
else:
raise ValueError("check_no_process_is_running_on_cuda_device is only supported on NVIDIA and AMD GPUs.")

5 changes: 5 additions & 0 deletions optimum_benchmark/import_utils.py
Original file line number Diff line number Diff line change
@@ -14,6 +14,7 @@
_neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None
_pyrsmi_available = importlib.util.find_spec("pyrsmi") is not None
_codecarbon_available = importlib.util.find_spec("codecarbon") is not None
_amdsmi_available = importlib.util.find_spec("amdsmi") is not None


def is_onnx_available():
@@ -36,6 +37,10 @@ def is_pyrsmi_available():
return _pyrsmi_available


def is_amdsmi_available():
    """Return the module-level `_amdsmi_available` flag.

    The flag is computed once at import time as
    `importlib.util.find_spec("amdsmi") is not None`.
    """
    return _amdsmi_available


def is_torch_available():
    """Return the module-level `_torch_available` flag.

    NOTE(review): the flag's definition is not visible here; presumably it is
    set at import time like the other `_*_available` flags - confirm.
    """
    return _torch_available

14 changes: 7 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
@@ -3,19 +3,19 @@

from setuptools import find_packages, setup

OPTIMUM_VERSION = "1.13.0"
OPTIMUM_VERSION = "1.14.0"

INSTALL_REQUIRES = [
# Mandatory HF dependencies
f"optimum>={OPTIMUM_VERSION}", # backends, tasks and input generation
"accelerate", # distributed inference and no weights init
# Hydra
"omegaconf>=2.3.0",
"hydra-core>=1.3.2",
"hydra_colorlog>=1.2.0",
"omegaconf",
"hydra-core",
"hydra_colorlog",
# Other
"psutil>=5.9.0",
"pandas>=2.0.0",
"psutil",
"pandas",
]

# We may allow installing CUDA or ROCm dependencies even when building in a non-CUDA or non-ROCm environment.
@@ -54,7 +54,7 @@
"onnxruntime-gpu": [f"optimum[onnxruntime-gpu]>={OPTIMUM_VERSION}"],
"onnxruntime-training": ["torch-ort", "onnxruntime-training"],
# server-like backends
"text-generation-inference": ["docker>=6.1.3"],
"text-generation-inference": ["docker>=6.0.0"],
# specific settings
"diffusers": ["diffusers"],
"peft": ["peft"],