Add rocm5.7 support #84

Merged: 1 commit, Nov 9, 2023

Changes from all commits
46 changes: 36 additions & 10 deletions .github/workflows/test_rocm_pytorch.yaml
@@ -11,20 +11,43 @@ concurrency:

 jobs:
   build_image_and_run_gpu_tests:
-    runs-on: hf-amd-mi210-dev
+    strategy:
+      fail-fast: false
+      matrix:
+        image:
+          [
+            {
+              rocm_version: 5.6.1,
+              torch_rocm_version: 5.6,
+              torch_pre_release: 0,
+            },
+            {
+              rocm_version: 5.7,
+              torch_rocm_version: 5.7,
+              torch_pre_release: 1,
+            },
+          ]
+        runner: [hf-amd-mi210-dev]
+
+    runs-on: ${{ matrix.runner }}
     steps:
-      - name: Checkout
-        uses: actions/checkout@v2
+      - name: Checkout code
+        uses: actions/checkout@v3

       - name: Build image
         run: docker build
           --file docker/rocm.dockerfile
           --build-arg USER_ID=$(id -u)
           --build-arg GROUP_ID=$(id -g)
-          --build-arg ROCM_VERSION=5.6.1
-          --build-arg TORCH_ROCM=rocm5.6
-          --tag opt-bench-rocm:5.6.1
+          --build-arg ROCM_VERSION=$ROCM_VERSION
+          --build-arg TORCH_PRE_RELEASE=$TORCH_PRE_RELEASE
+          --build-arg TORCH_ROCM_VERSION=$TORCH_ROCM_VERSION
+          --tag opt-bench-rocm:$TORCH_ROCM_VERSION
           .
+        env:
+          ROCM_VERSION: ${{ matrix.image.rocm_version }}
+          TORCH_ROCM_VERSION: ${{ matrix.image.torch_rocm_version }}
+          TORCH_PRE_RELEASE: ${{ matrix.image.torch_pre_release }}

       - name: Run tests
         run: docker run
@@ -33,11 +56,14 @@ jobs:
           --pid host
           --shm-size 64G
           --env USE_ROCM="1"
-          --entrypoint /bin/bash
           --volume $HOME/.cache/huggingface:/home/user/.cache/huggingface
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
           --device /dev/kfd
-          --device /dev/dri/card0 --device /dev/dri/renderD128
-          --device /dev/dri/card1 --device /dev/dri/renderD129
-          opt-bench-rocm:5.6.1
+          --device /dev/dri/renderD128
+          --device /dev/dri/renderD129
+          --entrypoint /bin/bash
+          opt-bench-rocm:$TORCH_ROCM_VERSION
           -c "pip install -e .[test,peft,diffusers] && pytest -k 'cuda and pytorch' -x"
+        env:
+          TORCH_ROCM_VERSION: ${{ matrix.image.torch_rocm_version }}
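
Note on the matrix: GitHub Actions expands strategy.matrix to the cartesian product of its axes, so the two image entries and the single runner entry above yield two independent jobs. A minimal Python sketch of that expansion (the dictionaries just mirror the matrix values; this is illustrative, not part of the workflow):

    from itertools import product

    images = [
        {"rocm_version": "5.6.1", "torch_rocm_version": "5.6", "torch_pre_release": 0},
        {"rocm_version": "5.7", "torch_rocm_version": "5.7", "torch_pre_release": 1},
    ]
    runners = ["hf-amd-mi210-dev"]

    # one CI job per (image, runner) combination
    for image, runner in product(images, runners):
        print(f"{runner}: docker build ... --tag opt-bench-rocm:{image['torch_rocm_version']}")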
11 changes: 8 additions & 3 deletions docker/rocm.dockerfile
@@ -17,7 +17,8 @@ ARG UBUNTU_VERSION=22.04

 FROM rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}

-ARG TORCH_ROCM=rocm5.6
+ARG TORCH_PRE_RELEASE=0
+ARG TORCH_ROCM_VERSION=5.6

 # Ignore interactive questions during `docker build`
 ENV DEBIAN_FRONTEND noninteractive
@@ -62,5 +63,9 @@ WORKDIR /home/user
 # Update pip
 RUN pip install --upgrade pip

-# Install PyTorch
-RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM}
+# Install PyTorch (pre-release/nightly wheels when TORCH_PRE_RELEASE=1, e.g. for rocm5.7)
+RUN if [ "${TORCH_PRE_RELEASE}" = "1" ]; then \
+        pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm${TORCH_ROCM_VERSION} ; \
+    else \
+        pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm${TORCH_ROCM_VERSION} ; \
+    fi
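
Note on the conditional install: a quick sanity check inside the built image confirms which wheel was picked up. A hedged sketch (the sample output is an assumption; torch.version.hip is only populated on ROCm builds):

    import torch

    print(torch.__version__)          # e.g. "2.2.0.dev20231105+rocm5.7" for the nightly wheel
    print(torch.version.hip)          # HIP/ROCm version the wheel targets; None on non-ROCm builds
    print(torch.cuda.is_available())  # True on a working ROCm setup (HIP is exposed via the CUDA API)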
48 changes: 32 additions & 16 deletions optimum_benchmark/backends/isolation_utils.py
@@ -4,7 +4,7 @@
 from typing import Dict, List

 from ..env_utils import is_nvidia_system, is_rocm_system
-from ..import_utils import is_py3nvml_available, is_pyrsmi_available
+from ..import_utils import is_amdsmi_available, is_py3nvml_available, torch_version


 def only_this_process_is_running_on_cuda_devices(cuda_devices: List[int], benchmark_pid: int) -> None:
@@ -29,24 +29,40 @@ def only_this_process_is_running_on_cuda_devices(cuda_devices: List[int], benchmark_pid: int) -> None:
             pids[device_id] = set(nvml.nvmlDeviceGetComputeRunningProcesses(device_handle))
         nvml.nvmlShutdown()
     elif is_rocm_system():
-        if not is_pyrsmi_available():
+        rocm_version = torch_version().split("rocm")[-1]
+
+        if not is_amdsmi_available():
             raise ValueError(
-                "check_no_process_is_running_on_cuda_device requires pyrsmi. "
+                "check_no_process_is_running_on_cuda_device requires amdsmi. "
                 "Please follow the instructions at https://github.com/RadeonOpenCompute/amdsmi/tree/master"
             )
-        import amdsmi as rocml
-
-        rocml.amdsmi_init()
-        devices_handles = rocml.amdsmi_get_device_handles()
-        for device_id in cuda_devices:
-            device_handle = devices_handles[device_id]
-            processes_handles = rocml.amdsmi_get_process_list(device_handle)
-            for process_handle in processes_handles:
-                info = rocml.amdsmi_get_process_info(device_handle, process_handle)
-                if info["memory_usage"]["vram_mem"] == 4096:
-                    continue
-                pids[device_id].add(info["pid"])
-        rocml.amdsmi_shut_down()
+        import amdsmi as smi
+
+        smi.amdsmi_init()
+
+        if rocm_version >= "5.7":
+            # starting from rocm 5.7, the api seems to have changed names
+            devices_handles = smi.amdsmi_get_processor_handles()
+            for device_id in cuda_devices:
+                device_handle = devices_handles[device_id]
+                processes_handles = smi.amdsmi_get_gpu_process_list(device_handle)
+                for process_handle in processes_handles:
+                    info = smi.amdsmi_get_gpu_process_info(device_handle, process_handle)
+                    if info["memory_usage"]["vram_mem"] == 4096:
+                        continue
+                    pids[device_id].add(info["pid"])
+        else:
+            devices_handles = smi.amdsmi_get_device_handles()
+            for device_id in cuda_devices:
+                device_handle = devices_handles[device_id]
+                processes_handles = smi.amdsmi_get_process_list(device_handle)
+                for process_handle in processes_handles:
+                    info = smi.amdsmi_get_process_info(device_handle, process_handle)
+                    if info["memory_usage"]["vram_mem"] == 4096:
+                        continue
+                    pids[device_id].add(info["pid"])
+
+        smi.amdsmi_shut_down()
     else:
         raise ValueError("check_no_process_is_running_on_cuda_device is only supported on NVIDIA and AMD GPUs.")
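
Note on the version gate: torch_version().split("rocm")[-1] recovers the ROCm suffix from the torch build string, and the >= "5.7" check compares strings lexicographically. A minimal sketch, assuming the usual "+rocm<x.y>" build tag (the sample string is illustrative):

    from packaging.version import Version

    # a ROCm wheel reports something like "2.2.0.dev20231105+rocm5.7"
    rocm_version = "2.2.0.dev20231105+rocm5.7".split("rocm")[-1]  # -> "5.7"

    assert rocm_version >= "5.7"                    # what the code above does (string comparison)
    assert Version(rocm_version) >= Version("5.7")  # would also order a hypothetical "5.10" correctly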
5 changes: 5 additions & 0 deletions optimum_benchmark/import_utils.py
@@ -14,6 +14,7 @@
 _neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None
 _pyrsmi_available = importlib.util.find_spec("pyrsmi") is not None
 _codecarbon_available = importlib.util.find_spec("codecarbon") is not None
+_amdsmi_available = importlib.util.find_spec("amdsmi") is not None


 def is_onnx_available():
@@ -36,6 +37,10 @@ def is_pyrsmi_available():
     return _pyrsmi_available


+def is_amdsmi_available():
+    return _amdsmi_available
+
+
 def is_torch_available():
     return _torch_available
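
Note on the pattern: importlib.util.find_spec probes for an installed package without importing it, so loading import_utils never initializes amdsmi's native library. A hedged sketch of the intended caller-side usage, reusing the API names from the diff above:

    from optimum_benchmark.import_utils import is_amdsmi_available

    if is_amdsmi_available():
        import amdsmi as smi  # deferred: only imported when actually present

        smi.amdsmi_init()
        try:
            handles = smi.amdsmi_get_processor_handles()  # rocm >= 5.7 naming
        finally:
            smi.amdsmi_shut_down()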
14 changes: 7 additions & 7 deletions setup.py
@@ -3,19 +3,19 @@

 from setuptools import find_packages, setup

-OPTIMUM_VERSION = "1.13.0"
+OPTIMUM_VERSION = "1.14.0"

 INSTALL_REQUIRES = [
     # Mandatory HF dependencies
     f"optimum>={OPTIMUM_VERSION}", # backends, tasks and input generation
     "accelerate", # distributed inference and no weights init
     # Hydra
-    "omegaconf>=2.3.0",
-    "hydra-core>=1.3.2",
-    "hydra_colorlog>=1.2.0",
+    "omegaconf",
+    "hydra-core",
+    "hydra_colorlog",
     # Other
-    "psutil>=5.9.0",
-    "pandas>=2.0.0",
+    "psutil",
+    "pandas",
 ]

 # We may allow to install CUDA or RoCm dependencies even when building in a non-CUDA or non-RoCm environment.
@@ -54,7 +54,7 @@
     "onnxruntime-gpu": [f"optimum[onnxruntime-gpu]>={OPTIMUM_VERSION}"],
     "onnxruntime-training": ["torch-ort", "onnxruntime-training"],
     # server-like backends
-    "text-generation-inference": ["docker>=6.1.3"],
+    "text-generation-inference": ["docker>=6.0.0"],
     # specific settings
     "diffusers": ["diffusers"],
     "peft": ["peft"],