diff --git a/Dockerfile.rocm.ubi b/Dockerfile.rocm.ubi
index 850c1b9f67024..467ecf3aa9235 100644
--- a/Dockerfile.rocm.ubi
+++ b/Dockerfile.rocm.ubi
@@ -3,111 +3,80 @@
 ARG BASE_UBI_IMAGE_TAG=9.4
 ARG PYTHON_VERSION=3.11
 
 # Default ROCm ARCHes to build vLLM for.
 ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
+ARG MAX_JOBS=12
 
-## Base Layer ##################################################################
-FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as rocm-base
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
 
-# Max jobs for parallel build
-ARG MAX_JOBS=12
+ARG PYTHON_VERSION
+
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
-ENV BUILD_TARGET='rocm'
+RUN --mount=type=cache,target=/root/.cache/pip \
+    microdnf install -y --setopt=install_weak_deps=0 --nodocs \
+        python${PYTHON_VERSION}-devel \
+        python${PYTHON_VERSION}-pip \
+        python${PYTHON_VERSION}-wheel && \
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
+    pip install -U pip wheel setuptools uv
 
-USER root
+FROM base AS rocm_base
 
 ENV ROCM_VERSION=6.1.2
 
-# Set up ROCm repository and install necessary packages
-
-RUN echo "[amdgpu]" > /etc/yum.repos.d/amdgpu.repo && \
-echo "name=amdgpu" >> /etc/yum.repos.d/amdgpu.repo && \
-echo "baseurl=https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.4/main/x86_64/" >> /etc/yum.repos.d/amdgpu.repo && \
-echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo && \
-echo "priority=50" >> /etc/yum.repos.d/amdgpu.repo && \
-echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo && \
-echo "gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo && \
-echo "[ROCm-${ROCM_VERSION}]" >> /etc/yum.repos.d/amdgpu.repo && \
-echo "name=ROCm${ROCM_VERSION}" >> /etc/yum.repos.d/amdgpu.repo && \
-echo "baseurl=https://repo.radeon.com/rocm/rhel9/${ROCM_VERSION}/main" >> /etc/yum.repos.d/amdgpu.repo && \
-echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo && \
-echo "priority=50" >> /etc/yum.repos.d/amdgpu.repo && \
-echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo && \
-echo "gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo
+RUN printf "[amdgpu]\n\
+name=amdgpu\n\
+baseurl=https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.4/main/x86_64/\n\
+enabled=1\n\
+priority=50\n\
+gpgcheck=1\n\
+gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key\n\
+[ROCm-${ROCM_VERSION}]\n\
+name=ROCm${ROCM_VERSION}\n\
+baseurl=https://repo.radeon.com/rocm/rhel9/${ROCM_VERSION}/main\n\
+enabled=1\n\
+priority=50\n\
+gpgcheck=1\n\
+gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" > /etc/yum.repos.d/amdgpu.repo
+
 RUN microdnf -y update && \
-    microdnf -y install rocm hipcc git which && \
+    microdnf -y install \
+        rocm-hip-libraries rocm-hip-runtime \
+        miopen-hip && \
     microdnf clean all
 
-WORKDIR /workspace
-
-##################################################################################################
-
-FROM rocm-base as python-install
-ARG PYTHON_VERSION
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install -v --index-url "https://download.pytorch.org/whl/nightly/rocm6.1" \
+        torch==2.5.0.dev20240726+rocm6.1 \
+        torchvision==0.20.0.dev20240726+rocm6.1
 
-ENV VIRTUAL_ENV=/opt/vllm
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
-RUN microdnf install -y --setopt=install_weak_deps=0 --nodocs \
-    python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \
-    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV --system-site-packages && \
-    $VIRTUAL_ENV/bin/pip install --no-cache -U pip wheel && \
-    microdnf clean all
 
-##################################################################################################
+FROM rocm_base AS rocm_devel
 
-FROM python-install as python-rocm-base
+ENV CCACHE_DIR=/root/.cache/ccache
 
-# install common dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt,readonly \
-    --mount=type=bind,source=requirements-rocm.txt,target=requirements-rocm.txt,readonly \
-    pip install -r requirements-rocm.txt
+RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
+    rpm -ql epel-release && \
+    microdnf -y update && \
+    microdnf -y install \
+        ccache \
+        git \
+        rocm \
+        hipcc \
+        wget \
+        which && \
+    microdnf clean all
 
-##################################################################################################
+WORKDIR /workspace
 
-FROM python-rocm-base as base
-
-# Set the application mount point
-ARG APP_MOUNT=/vllm-workspace
-WORKDIR ${APP_MOUNT}
-
-# Upgrade pip and remove unnecessary packages
-RUN python3 -m pip install --upgrade --no-cache-dir pip && \
-    microdnf -y remove sccache || true && \
-    python3 -m pip uninstall -y sccache || true && \
-    rm -f "$(which sccache)" && \
-    microdnf clean all && \
-    rm -rf /var/cache/yum /var/cache/dnf
-
-# Install torch == 2.5.0 on ROCm
-RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-    *"rocm-6.1"*) \
-        python3 -m pip uninstall -y torch torchvision \
-        && python3 -m pip install --no-cache-dir --pre \
-            torch==2.5.0.dev20240726 \
-            torchvision==0.20.0.dev20240726 \
-            --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
-    *) ;; esac
-
-# Set environment variables
 ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
 ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/libtorch/lib
 ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include:/opt/rocm/include
-ENV PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
-ENV CCACHE_DIR=/root/.cache/ccache
 
-##################################################################################################
-
-FROM base as build_base
-
-RUN python3 -m pip install --upgrade --no-cache-dir ninja cmake>=3.26
-
-##################################################################################################
-
-##################################################################################################
-### AMD-SMI build stage
-FROM build_base AS build_amdsmi
+FROM rocm_devel AS build_amdsmi
 
 # Build AMD SMI wheel
 RUN cd /opt/rocm/share/amd_smi && \
@@ -115,21 +84,25 @@ RUN cd /opt/rocm/share/amd_smi && \
 
 ##################################################################################################
 
-### Flash-Attention wheel build stage
-FROM build_base AS build_fa
+FROM rocm_devel AS build_flashattention
 
 # Whether to install CK-based flash-attention
 ARG BUILD_FA="1"
 ARG TRY_FA_WHEEL="1"
+# Note: the ROCm fork publishes a prebuilt wheel, but only for flash-attention 2.5.9 on Python 3.9, so it is incompatible with this image's Python 3.11 build
 ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
+# Only required when not using the Triton backend
 ARG FA_GFX_ARCHS="gfx90a;gfx942"
-ARG FA_BRANCH="23a2b1c2"
-
-# Ensure necessary tools are installed
-RUN microdnf install -y wget git && microdnf clean all
+ARG FLASH_ATTENTION_USE_TRITON_ROCM="TRUE"
+# FA_BRANCH is the flash-attention main_perf branch as of Sep 4, 2024, which includes Triton backend support; see https://github.com/Dao-AILab/flash-attention/pull/1203
+ARG FA_BRANCH="75b5360"
+ARG MAX_JOBS
+ENV MAX_JOBS=${MAX_JOBS}
+ENV FLASH_ATTENTION_USE_TRITON_ROCM=${FLASH_ATTENTION_USE_TRITON_ROCM}
 
 # Build ROCm flash-attention wheel if `BUILD_FA` is set to 1
-RUN --mount=type=cache,target=${CCACHE_DIR} \
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=cache,target=/workspace/build \
     if [ "$BUILD_FA" = "1" ]; then \
         if [ "$TRY_FA_WHEEL" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
             # If a suitable wheel exists, download it instead of building FA
@@ -139,132 +112,97 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
             cd /libs && \
             git clone https://github.com/ROCm/flash-attention.git && \
             cd flash-attention && \
-            git checkout "${FA_BRANCH}" && \
+            git checkout ${FA_BRANCH} && \
             git submodule update --init && \
-            GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
+            uv pip install cmake ninja packaging && \
+            env \
+                GPU_ARCHS="${FA_GFX_ARCHS}" \
+                BUILD_TARGET="rocm" \
+                python3 setup.py bdist_wheel --dist-dir=/install; \
         fi; \
     else \
-        # Create an empty directory otherwise as later build stages expect one
         mkdir -p /install; \
     fi
 
 ##################################################################################################
 
-### Triton wheel build stage
-FROM build_base AS build_triton
+FROM rocm_devel AS build_triton
 
 # Whether to build triton on rocm
-ARG BUILD_TRITON="1"
+ARG BUILD_TRITON="0"
 ARG TRITON_BRANCH="e0fc12c"
+# SHA of the prebuilt nightly pytorch_triton_rocm wheel fetched below when BUILD_TRITON=0
+ARG TRITON_WHEEL_SHA=21eae954ef
 
 # Build triton wheel if `BUILD_TRITON` is set to 1
-RUN --mount=type=cache,target=${CCACHE_DIR} \
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     if [ "$BUILD_TRITON" = "1" ]; then \
         mkdir -p /libs && cd /libs && \
         git clone https://github.com/OpenAI/triton.git && \
         cd triton && \
         git checkout "${TRITON_BRANCH}" && \
        cd python && \
+        uv pip install -v cmake ninja && \
         python3 setup.py bdist_wheel --dist-dir=/install; \
     else \
-        # Create an empty directory otherwise as later build stages expect one
-        mkdir -p /install; \
+        mkdir -p /install && \
+        wget -P /install "https://download.pytorch.org/whl/nightly/pytorch_triton_rocm-3.0.0%2B${TRITON_WHEEL_SHA}-cp311-cp311-linux_x86_64.whl"; \
     fi
 
 ##################################################################################################
 
-### Final vLLM build stage
-FROM build_base AS final
-
-# Import the vLLM development directory from the build context
-COPY . .
+FROM rocm_devel AS build_vllm
+ARG PYTORCH_ROCM_ARCH
+ARG MAX_JOBS
+ENV MAX_JOBS=${MAX_JOBS}
+ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
 
-# Install wget only if it is needed
-RUN microdnf -y install wget && microdnf clean all
 
-# Package upgrades to avoid dependency issues and add functionality
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade numba scipy huggingface-hub[cli] && \
-    microdnf clean all
+COPY . .
-ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
 
-# Set environment variables for runtime
-ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
-# Silences the HF Tokenizers warning
-ENV TOKENIZERS_PARALLELISM=false
+ENV VLLM_TARGET_DEVICE="rocm"
+# Make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
 
-# Install dependencies from requirements file and apply ROCm specific patches
-RUN --mount=type=cache,target=${CCACHE_DIR} \
+RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -Ur requirements-rocm.txt && \
-    ROCM_VERSION=$(ls /opt | grep -Po 'rocm-[0-9]+\.[0-9]+') && \
-    case "$ROCM_VERSION" in \
-        "rocm-6.1") \
-            # Apply patch for ROCm 6.1
-            wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib && \
-            # Remove potentially conflicting HIP runtime from torch
-            rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true ;; \
-        *) \
-            echo "ROCm version $ROCM_VERSION is not supported for patching." ;; \
-    esac && \
-    python3 setup.py clean --all && \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install -v -U \
+        ninja "cmake>=3.26" packaging && \
     python3 setup.py bdist_wheel --dist-dir=dist
 
 ##################################################################################################
 
-FROM base AS vllm-openai
+FROM rocm_base AS vllm-openai
+ARG MAX_JOBS
+ARG PYTHON_VERSION
 
 WORKDIR /workspace
 
-# Set up the virtual environment and update PATH
 ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH=$VIRTUAL_ENV/bin:$PATH
 
-# Install necessary build tools
+# Required for Triton
 RUN microdnf install -y --setopt=install_weak_deps=0 --nodocs gcc && \
     microdnf clean all
 
-# Copy amdsmi wheel into final image
-RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
-    mkdir -p libs \
-    && cp /install/*.whl libs \
-    # Preemptively uninstall to avoid same-version no-installs
-    && python3 -m pip uninstall -y amdsmi;
-
-# Copy triton wheel(s) into final image if they were built
-RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
-    mkdir -p libs \
-    && if ls /install/*.whl; then \
-        cp /install/*.whl libs \
-        # Preemptively uninstall to avoid same-version no-installs
-        && python3 -m pip uninstall -y triton; fi
-
-# Copy flash-attn wheel(s) into final image if they were built
-RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
-    mkdir -p libs \
-    && if ls /install/*.whl; then \
-        cp /install/*.whl libs \
-        # Preemptively uninstall to avoid same-version no-installs
-        && python3 -m pip uninstall -y flash-attn; fi
-
-# Copy vLLM wheel(s) into the final image
-RUN --mount=type=bind,from=final,src=/vllm-workspace/dist,target=/dist \
+RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install/amdsmi/ \
+    --mount=type=bind,from=build_triton,src=/install,target=/install/triton \
+    --mount=type=bind,from=build_flashattention,src=/install,target=/install/flashattention \
+    --mount=type=bind,from=build_vllm,src=/workspace/dist,target=/install/vllm/ \
     --mount=type=cache,target=/root/.cache/pip \
-    cp /dist/*.whl libs \
-    # Preemptively uninstall to avoid same-version no-installs
-    && python3 -m pip uninstall -y vllm
-
-# Install wheels that were built to the final image
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if ls libs/*.whl; then \
-    python3 -m pip install libs/*.whl; fi
-
-# Environment variables for runtime configuration
-ENV HF_HUB_OFFLINE=1 \
-    PORT=8000 \
-    HOME=/home/vllm \
-    VLLM_USAGE_SOURCE=production-docker-image
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install -v \
+        --index-strategy=unsafe-best-match \
+        --extra-index-url "https://download.pytorch.org/whl/nightly/rocm6.1" \
+        /install/amdsmi/*.whl \
+        /install/triton/*.whl \
+        /install/flashattention/*.whl \
+        /install/vllm/*.whl
 
 # Set up a non-root user for OpenShift
 RUN umask 002 && \
@@ -275,6 +213,7 @@ RUN umask 002 && \
 
 COPY LICENSE /licenses/vllm.md
 
 ENV HF_HUB_OFFLINE=1 \
+    PORT=8000 \
     HOME=/home/vllm \
     # Allow requested max length to exceed what is extracted from the
     # config.json
@@ -282,7 +221,16 @@ ENV HF_HUB_OFFLINE=1 \
     VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
     VLLM_USAGE_SOURCE=production-docker-image \
     VLLM_WORKER_MULTIPROC_METHOD=fork \
-    VLLM_NO_USAGE_STATS=1
+    VLLM_NO_USAGE_STATS=1 \
+    # Silences the HF Tokenizers warning
+    TOKENIZERS_PARALLELISM=false \
+    RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \
+    FLASH_ATTENTION_USE_TRITON_ROCM="TRUE" \
+    OUTLINES_CACHE_DIR=/tmp/outlines \
+    NUMBA_CACHE_DIR=/tmp/numba \
+    TRITON_CACHE_DIR=/tmp/triton
+
+# FIXME: check if TRITON_CACHE_DIR (the last env var above) is actually required
 
 # Switch to the non-root user
 USER 2000
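
A minimal build sketch for the stages above, assuming BuildKit is enabled (the `--mount=type=cache` and `--mount=type=bind` instructions require it). The image tag and the narrowed `PYTORCH_ROCM_ARCH` value are illustrative; the `--build-arg` names correspond to the `ARG`s declared in the Dockerfile, and the shown values are their defaults except for the arch list.

# Illustrative invocation; tag and arch list are placeholders, not project conventions.
DOCKER_BUILDKIT=1 docker build \
    -f Dockerfile.rocm.ubi \
    --target vllm-openai \
    --build-arg MAX_JOBS=12 \
    --build-arg PYTORCH_ROCM_ARCH="gfx90a;gfx942" \
    --build-arg BUILD_FA=1 \
    --build-arg BUILD_TRITON=0 \
    -t vllm-rocm-ubi:dev .

With BUILD_TRITON=0 (the new default) the build skips compiling Triton and fetches the pinned nightly pytorch_triton_rocm wheel instead, which is noticeably faster; BUILD_FA=1 still builds or downloads the flash-attention wheel as before.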