⚗️ try 3.12 on official vllm
Signed-off-by: Joe Runde <[email protected]>
joerunde committed Sep 3, 2024
1 parent 095df75 commit de7c076
Showing 2 changed files with 22 additions and 313 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build.yml
@@ -6,6 +6,7 @@ on:
push:
branches:
- release
- 312-test
paths-ignore:
- "**.md"
- "proto/**"
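Adding 312-test to the push branches means a push to that branch now triggers the image build workflow on its own, without merging to release; per the file summary above, this one added line is the only change to the workflow. A minimal way to exercise the new trigger (the branch name comes straight from the diff, the rest is routine git):

git push origin HEAD:312-test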
334 changes: 21 additions & 313 deletions Dockerfile.ubi
@@ -1,318 +1,26 @@
# Please update any changes made here to
# docs/source/dev/dockerfile-ubi/dockerfile-ubi.rst

## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.4-1134
ARG PYTHON_VERSION=3.11

# NOTE: This setting only has an effect when not using prebuilt-wheel kernels
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"


## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base

WORKDIR /workspace

ENV LANG=C.UTF-8 \
LC_ALL=C.UTF-8

# Some utils for dev purposes - tar required for kubectl cp
RUN microdnf install -y \
which procps findutils tar vim \
&& microdnf clean all


## Python Installer ############################################################
FROM base as python-install

ARG PYTHON_VERSION
ARG MINIFORGE_VERSION=24.3.0-0

RUN curl -fsSL -o ~/miniforge3.sh "https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/Miniforge3-$(uname)-$(uname -m).sh" && \
chmod +x ~/miniforge3.sh && \
bash ~/miniforge3.sh -b -p /opt/conda && \
source "/opt/conda/etc/profile.d/conda.sh" && \
conda create -y -p /opt/vllm python=${PYTHON_VERSION} && \
conda activate /opt/vllm && \
rm ~/miniforge3.sh
# use of the /opt/vllm env requires:
# ENV PATH=/opt/vllm/bin/:$PATH

## CUDA Base ###################################################################
FROM base as cuda-base

# The Nvidia operator won't allow deploying on CUDA 12.0 hosts if
# this env var is set to 12.2.0, even though it's compatible
#ENV CUDA_VERSION=12.2.0 \
ENV CUDA_VERSION=12.0.0 \
NV_CUDA_LIB_VERSION=12.2.0-1 \
NVIDIA_VISIBLE_DEVICES=all \
NVIDIA_DRIVER_CAPABILITIES=compute,utility \
NV_CUDA_CUDART_VERSION=12.2.53-1 \
NV_CUDA_COMPAT_VERSION=535.104.12

RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo

RUN microdnf install -y \
cuda-cudart-12-2-${NV_CUDA_CUDART_VERSION} \
cuda-compat-12-2-${NV_CUDA_COMPAT_VERSION} \
&& microdnf clean all

ENV CUDA_HOME="/usr/local/cuda" \
PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"


## CUDA Runtime ################################################################
FROM cuda-base as cuda-runtime

ENV NV_NVTX_VERSION=12.2.53-1 \
NV_LIBNPP_VERSION=12.1.1.14-1 \
NV_LIBCUBLAS_VERSION=12.2.1.16-1 \
NV_LIBNCCL_PACKAGE_VERSION=2.18.5-1+cuda12.2

RUN microdnf install -y \
cuda-libraries-12-2-${NV_CUDA_LIB_VERSION} \
cuda-nvtx-12-2-${NV_NVTX_VERSION} \
libnpp-12-2-${NV_LIBNPP_VERSION} \
libcublas-12-2-${NV_LIBCUBLAS_VERSION} \
libnccl-${NV_LIBNCCL_PACKAGE_VERSION} \
&& microdnf clean all


## CUDA Development ############################################################
FROM cuda-base as cuda-devel

ENV NV_CUDA_CUDART_DEV_VERSION=12.2.53-1 \
NV_NVML_DEV_VERSION=12.2.81-1 \
NV_LIBCUBLAS_DEV_VERSION=12.2.1.16-1 \
NV_LIBNPP_DEV_VERSION=12.1.1.14-1 \
NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.5-1+cuda12.2

RUN microdnf install -y \
cuda-command-line-tools-12-2-${NV_CUDA_LIB_VERSION} \
cuda-libraries-devel-12-2-${NV_CUDA_LIB_VERSION} \
cuda-minimal-build-12-2-${NV_CUDA_LIB_VERSION} \
cuda-cudart-devel-12-2-${NV_CUDA_CUDART_DEV_VERSION} \
cuda-nvml-devel-12-2-${NV_NVML_DEV_VERSION} \
libcublas-devel-12-2-${NV_LIBCUBLAS_DEV_VERSION} \
libnpp-devel-12-2-${NV_LIBNPP_DEV_VERSION} \
libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
&& microdnf clean all

ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.2/compat/

## Python cuda base #################################################################
FROM cuda-devel as python-cuda-base

COPY --from=python-install --link /opt/vllm /opt/vllm
ENV PATH=/opt/vllm/bin/:$PATH

# install cuda and common dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
pip3 install \
-r requirements-cuda.txt
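The --mount=type=cache and --mount=type=bind flags used throughout this file are BuildKit features, so the image has to be built with BuildKit enabled. A rough sketch of building just this stage (the tag is only an example):

DOCKER_BUILDKIT=1 docker build -f Dockerfile.ubi --target python-cuda-base -t vllm-ubi:python-cuda-base .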

## Development #################################################################
FROM python-cuda-base AS dev

# install build and runtime dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
--mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
--mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
--mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
pip3 install \
-r requirements-cuda.txt \
-r requirements-dev.txt

## Proto Compilation ###########################################################
FROM python-install AS gen-protos

ENV PATH=/opt/vllm/bin/:$PATH

RUN microdnf install -y \
make \
findutils \
&& microdnf clean all

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=Makefile,target=Makefile \
--mount=type=bind,source=proto,target=proto \
make gen-protos
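The Makefile itself is not part of this diff, so the exact recipe is not shown; for a gRPC service the gen-protos target would typically run something along these lines (the include path and output package are assumptions based on the COPY of the generated pb files further down):

python -m pip install grpcio-tools
python -m grpc_tools.protoc \
  -I proto \
  --python_out=vllm/entrypoints/grpc/pb \
  --grpc_python_out=vllm/entrypoints/grpc/pb \
  proto/*.proto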

## Extension Cache #############################################################
# Instead of compiling the extensions on every build, copy them from a pre-built wheel.
# This might not work if the PyTorch and CUDA versions don't match!
FROM base as prebuilt-wheel

RUN microdnf install -y \
unzip \
&& microdnf clean all

ARG PYTHON_VERSION
# NOTE: the wheel must match this image's CUDA and PyTorch versions;
# 0.4.3 is built for CUDA 12.1 and PyTorch 2.3.0
ARG VLLM_WHEEL_VERSION=0.4.3

RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
&& unzip vllm.whl \
&& rm vllm.whl
# compiled extensions located at /workspace/vllm/*.so
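With the defaults above (PYTHON_VERSION=3.11, VLLM_WHEEL_VERSION=0.4.3), the ${PYTHON_VERSION//.} substitutions drop the dot, so the curl resolves to roughly:

https://github.com/vllm-project/vllm/releases/download/v0.4.3/vllm-0.4.3-cp311-cp311-manylinux1_x86_64.whl

and unzip leaves the compiled extensions under /workspace/vllm/ for the build stage to copy.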

## Builder #####################################################################
FROM dev AS build

# install build dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
pip install -r requirements-build.txt

# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=2
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
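MAX_JOBS and NVCC_THREADS trade compile parallelism against memory use, so they are exposed as build args rather than hard-coded. A sketch of raising them on a larger build machine (the values and tag are illustrative only):

DOCKER_BUILDKIT=1 docker build -f Dockerfile.ubi --target build \
  --build-arg max_jobs=8 --build-arg nvcc_threads=4 \
  -t vllm-ubi:build .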

# Set up CUDA paths for the extension build. Ref: https://github.com/vllm-project/vllm/blob/main/.github/workflows/scripts/build.sh#L6-L8
ENV PATH=/usr/local/cuda/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Copy the entire directory before building wheel
COPY --link vllm vllm

# Comment out this block if building the *.so files from scratch
##################################################
# Copy the prebuilt *.so files
COPY --from=prebuilt-wheel --link /workspace/vllm/*.so /workspace/vllm/
ENV VLLM_USE_PRECOMPILED=1
##################################################
# Uncomment this block if building the *.so files from scratch
#RUN microdnf install -y git \
# && microdnf clean all
##################################################

# Copy over the generated *.pb2 files
COPY --from=gen-protos --link /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
CMAKE_BUILD_TYPE=Release python3 setup.py bdist_wheel --dist-dir=dist

#################### libsodium Build IMAGE ####################
FROM base as libsodium-builder

RUN microdnf install -y gcc gzip \
&& microdnf clean all

WORKDIR /usr/src/libsodium

ARG LIBSODIUM_VERSION=1.0.19
RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
&& tar -xzvf libsodium*.tar.gz \
&& rm -f libsodium*.tar.gz \
&& mv libsodium*/* ./

RUN ./configure && make && make check

## Release #####################################################################
# Note from the non-UBI Dockerfile:
# We used base cuda image because pytorch installs its own cuda libraries.
# However pynccl depends on cuda libraries so we had to switch to the runtime image
# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
FROM cuda-runtime AS vllm-openai

WORKDIR /workspace

# Create release python environment
COPY --from=python-cuda-base --link /opt/vllm /opt/vllm
ENV PATH=/opt/vllm/bin/:$PATH

# Install the vLLM wheel first so that torch and the other pinned dependencies come with it
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
pip install $(echo dist/*.whl)'[tensorizer]' --verbose

# Install the vllm_nccl package which is a bit quirky
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
# The "install" happens in `setup.py` so it happens when built...
# Remove the already installed package and the cached wheel
pip uninstall -y vllm-nccl-cu12 \
&& pip cache remove vllm_nccl* \
# install the version depended on by vllm requirements
&& pip install vllm-nccl-cu12 -r requirements-cuda.txt \
# The lib is downloaded to root's home directory... move it
&& mv ~/.config/vllm/nccl/cu12/libnccl.so.2* /usr/local/lib/libnccl.so.2
ENV VLLM_NCCL_SO_PATH=/usr/local/lib/libnccl.so.2

RUN --mount=type=cache,target=/root/.cache/pip \
pip3 install \
# additional dependencies for the TGIS gRPC server
grpcio-tools==1.63.0 \
# additional dependencies for openai api_server
accelerate==0.30.0 \
# hf_transfer for faster HF hub downloads
hf_transfer==0.1.6

# Triton needs a CC compiler
RUN microdnf install -y gcc \
&& microdnf clean all

# patch triton (fix for #720)
COPY triton_patch/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/triton/runtime/custom_cache_manager.py

# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
cd /usr/src/libsodium \
&& make install
ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH

ENV HF_HUB_OFFLINE=1 \
PORT=8000 \
GRPC_PORT=8033 \
HOME=/home/vllm \
VLLM_USAGE_SOURCE=production-docker-image \
VLLM_WORKER_MULTIPROC_METHOD=fork \
TRITON_CACHE_MANAGER="triton.runtime.custom_cache_manager:CustomCacheManager"

# setup non-root user for OpenShift
RUN microdnf install -y shadow-utils \
&& umask 002 \
&& useradd --uid 2000 --gid 0 vllm \
&& microdnf remove -y shadow-utils \
&& microdnf clean all \
&& chmod g+rwx $HOME /usr/src /workspace

COPY LICENSE /licenses/vllm.md

USER 2000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
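That ENTRYPOINT closes out the original multi-stage UBI build, which this commit removes. Based on the PORT/GRPC_PORT defaults and HF_HUB_OFFLINE=1 set above, serving a model from that image looked roughly like this (image tag, host paths, and model name are placeholders; offline mode means the model must come from a local path or a pre-populated cache):

docker run --gpus all \
  -p 8000:8000 -p 8033:8033 \
  -v /path/to/models:/models \
  vllm-ubi:latest --model /models/my-model

What follows is the replacement: start from the official vllm/vllm-openai image and swap its interpreter for Python 3.12.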
FROM vllm/vllm-openai:v0.5.5 AS vllm-openai


# Install Python 3.12 in place of the image's default interpreter

ARG PYTHON_VERSION=3.12
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
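A quick smoke test of the swap is to build this stage and ask the interpreter for its version (the tag is only an example):

docker build -f Dockerfile.ubi -t vllm-openai-py312 .
docker run --rm --entrypoint python3 vllm-openai-py312 --version

Note that the vLLM packages baked into the upstream image were installed for its original Python, so they will likely need to be reinstalled against 3.12 in follow-up changes.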
