Commit
Signed-off-by: Joe Runde <[email protected]>
Showing 2 changed files with 22 additions and 313 deletions.
The first changed file is a GitHub Actions workflow; pushes to the 312-test branch now trigger it as well:

@@ -6,6 +6,7 @@ on:
   push:
     branches:
       - release
+      - 312-test
     paths-ignore:
       - "**.md"
       - "proto/**"
The second changed file is the UBI Dockerfile; its old multi-stage build, shown below, was removed almost entirely:

@@ -1,318 +1,26 @@
# Please update any changes made here to
# docs/source/dev/dockerfile-ubi/dockerfile-ubi.rst

## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.4-1134
ARG PYTHON_VERSION=3.11

# NOTE: This setting only has an effect when not using prebuilt-wheel kernels
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base

WORKDIR /workspace

ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Some utils for dev purposes - tar required for kubectl cp
RUN microdnf install -y \
    which procps findutils tar vim \
    && microdnf clean all

## Python Installer ############################################################
FROM base as python-install

ARG PYTHON_VERSION
ARG MINIFORGE_VERSION=24.3.0-0

RUN curl -fsSL -o ~/miniforge3.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/Miniforge3-$(uname)-$(uname -m).sh" && \
    chmod +x ~/miniforge3.sh && \
    bash ~/miniforge3.sh -b -p /opt/conda && \
    source "/opt/conda/etc/profile.d/conda.sh" && \
    conda create -y -p /opt/vllm python=${PYTHON_VERSION} && \
    conda activate /opt/vllm && \
    rm ~/miniforge3.sh
# use of the /opt/vllm env requires:
# ENV PATH=/opt/vllm/bin/:$PATH
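Later stages reuse this environment by copying the /opt/vllm prefix wholesale instead of re-running the installer, which is what the comment above is pointing at. A minimal sketch of the pattern (the consumer stage name is illustrative; the base tag is the one pinned above):

```dockerfile
# Sketch: consuming the conda-created env in another stage, as
# python-cuda-base and the release stage do further down.
FROM registry.access.redhat.com/ubi9/ubi-minimal:9.4-1134 AS consumer
COPY --from=python-install --link /opt/vllm /opt/vllm
# No `conda activate` is needed at runtime; PATH is enough:
ENV PATH=/opt/vllm/bin/:$PATH
RUN python3 --version && pip3 --version
```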

## CUDA Base ###################################################################
FROM base as cuda-base

# The Nvidia operator won't allow deploying on CUDA 12.0 hosts if
# this env var is set to 12.2.0, even though it's compatible
#ENV CUDA_VERSION=12.2.0 \
ENV CUDA_VERSION=12.0.0 \
    NV_CUDA_LIB_VERSION=12.2.0-1 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    NV_CUDA_CUDART_VERSION=12.2.53-1 \
    NV_CUDA_COMPAT_VERSION=535.104.12

RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
    https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo

RUN microdnf install -y \
    cuda-cudart-12-2-${NV_CUDA_CUDART_VERSION} \
    cuda-compat-12-2-${NV_CUDA_COMPAT_VERSION} \
    && microdnf clean all

ENV CUDA_HOME="/usr/local/cuda" \
    PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"

## CUDA Runtime ################################################################
FROM cuda-base as cuda-runtime

ENV NV_NVTX_VERSION=12.2.53-1 \
    NV_LIBNPP_VERSION=12.1.1.14-1 \
    NV_LIBCUBLAS_VERSION=12.2.1.16-1 \
    NV_LIBNCCL_PACKAGE_VERSION=2.18.5-1+cuda12.2

RUN microdnf install -y \
    cuda-libraries-12-2-${NV_CUDA_LIB_VERSION} \
    cuda-nvtx-12-2-${NV_NVTX_VERSION} \
    libnpp-12-2-${NV_LIBNPP_VERSION} \
    libcublas-12-2-${NV_LIBCUBLAS_VERSION} \
    libnccl-${NV_LIBNCCL_PACKAGE_VERSION} \
    && microdnf clean all

## CUDA Development ############################################################
FROM cuda-base as cuda-devel

ENV NV_CUDA_CUDART_DEV_VERSION=12.2.53-1 \
    NV_NVML_DEV_VERSION=12.2.81-1 \
    NV_LIBCUBLAS_DEV_VERSION=12.2.1.16-1 \
    NV_LIBNPP_DEV_VERSION=12.1.1.14-1 \
    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.5-1+cuda12.2

RUN microdnf install -y \
    cuda-command-line-tools-12-2-${NV_CUDA_LIB_VERSION} \
    cuda-libraries-devel-12-2-${NV_CUDA_LIB_VERSION} \
    cuda-minimal-build-12-2-${NV_CUDA_LIB_VERSION} \
    cuda-cudart-devel-12-2-${NV_CUDA_CUDART_DEV_VERSION} \
    cuda-nvml-devel-12-2-${NV_NVML_DEV_VERSION} \
    libcublas-devel-12-2-${NV_LIBCUBLAS_DEV_VERSION} \
    libnpp-devel-12-2-${NV_LIBNPP_DEV_VERSION} \
    libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
    && microdnf clean all

ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.2/compat/
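The ldconfig call registers NVIDIA's forward-compatibility libraries in the dynamic linker cache so that triton and torch can locate libcuda.so at runtime. A quick way to confirm the effect, as a hedged sketch (the grep/echo check is illustrative, not part of the original file):

```dockerfile
# Assumption: placed right after the ldconfig workaround; lists the
# linker cache and reports whether a libcuda entry is now present.
RUN ldconfig -p | grep -i libcuda || echo "libcuda not registered"
```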

## Python cuda base ############################################################
FROM cuda-devel as python-cuda-base

COPY --from=python-install --link /opt/vllm /opt/vllm
ENV PATH=/opt/vllm/bin/:$PATH

# install cuda and common dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
    pip3 install \
    -r requirements-cuda.txt
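The --mount flags are BuildKit features: the cache mount keeps pip's download cache across builds, and the bind mounts expose the requirements files to this single RUN step without copying them into an image layer. A self-contained sketch of the same pattern under assumed names (python:3.11-slim base, a local requirements.txt):

```dockerfile
# syntax=docker/dockerfile:1
# Requires BuildKit (DOCKER_BUILDKIT=1). Re-running the build reuses
# /root/.cache/pip from the cache mount, so unchanged wheels are not
# downloaded again; the bind-mounted file never becomes a layer.
FROM python:3.11-slim
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=requirements.txt,target=requirements.txt \
    pip install -r requirements.txt
```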

## Development #################################################################
FROM python-cuda-base AS dev

# install build and runtime dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
    pip3 install \
    -r requirements-cuda.txt \
    -r requirements-dev.txt

## Proto Compilation ###########################################################
FROM python-install AS gen-protos

ENV PATH=/opt/vllm/bin/:$PATH

RUN microdnf install -y \
    make \
    findutils \
    && microdnf clean all

RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=Makefile,target=Makefile \
    --mount=type=bind,source=proto,target=proto \
    make gen-protos

## Extension Cache #############################################################
# Instead of compiling artifacts every build just copy from pre-built wheel
# This might not work if the PyTorch and CUDA versions don't match!
FROM base as prebuilt-wheel

RUN microdnf install -y \
    unzip \
    && microdnf clean all

ARG PYTHON_VERSION
# 0.4.2 is built for CUDA 12.1 and PyTorch 2.3.0
ARG VLLM_WHEEL_VERSION=0.4.3

RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
    && unzip vllm.whl \
    && rm vllm.whl
# compiled extensions located at /workspace/vllm/*.so
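Two details make this step work: a wheel is a plain zip archive, so unzip can extract the compiled vllm/*.so kernels without installing anything, and ${PYTHON_VERSION//.} is a bash substitution that deletes the dot to produce the cp311 wheel tag (usable here because /bin/sh on UBI images is bash). The expansion, illustrated as a standalone sketch:

```dockerfile
# Assumption: demonstrates only the tag computation used above.
#   PYTHON_VERSION=3.11  ->  ${PYTHON_VERSION//.} = 311  ->  cp311-cp311
RUN PYTHON_VERSION=3.11; echo "cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}"
```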

## Builder #####################################################################
FROM dev AS build

# install build dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
    pip install -r requirements-build.txt

# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=2
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

# Set up CUDA paths. Ref: https://github.com/vllm-project/vllm/blob/main/.github/workflows/scripts/build.sh#L6-L8
ENV PATH=/usr/local/cuda/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
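MAX_JOBS, NVCC_THREADS, and TORCH_CUDA_ARCH_LIST are all plain build args, so compile parallelism and target GPU architectures can be tuned without editing the file (the arch list only matters when not using the prebuilt kernels, per the note at the top). A hedged invocation, with illustrative values and tag name:

```dockerfile
# Example build command, shown as a comment:
#   DOCKER_BUILDKIT=1 docker build -f Dockerfile.ubi \
#     --build-arg max_jobs=8 \
#     --build-arg nvcc_threads=4 \
#     --build-arg TORCH_CUDA_ARCH_LIST="8.0 9.0+PTX" \
#     -t vllm-ubi:build .
```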

# Copy the entire directory before building wheel
COPY --link vllm vllm

# Comment out if building *.so files from scratch
##################################################
# Copy the prebuilt *.so files
COPY --from=prebuilt-wheel --link /workspace/vllm/*.so /workspace/vllm/
ENV VLLM_USE_PRECOMPILED=1
##################################################
# Uncomment if building *.so files from scratch
#RUN microdnf install -y git \
#    && microdnf clean all
##################################################

# Copy over the generated *.pb2 files
COPY --from=gen-protos --link /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    CMAKE_BUILD_TYPE=Release python3 setup.py bdist_wheel --dist-dir=dist

#################### libsodium Build IMAGE ####################
FROM base as libsodium-builder

RUN microdnf install -y gcc gzip \
    && microdnf clean all

WORKDIR /usr/src/libsodium

ARG LIBSODIUM_VERSION=1.0.19
RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
    && tar -xzvf libsodium*.tar.gz \
    && rm -f libsodium*.tar.gz \
    && mv libsodium*/* ./

RUN ./configure && make && make check

## Release #####################################################################
# Note from the non-UBI Dockerfile:
# We used the base cuda image because pytorch installs its own cuda libraries.
# However, pynccl depends on cuda libraries, so we had to switch to the runtime image.
# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda.
FROM cuda-runtime AS vllm-openai

WORKDIR /workspace

# Create release python environment
COPY --from=python-cuda-base --link /opt/vllm /opt/vllm
ENV PATH=/opt/vllm/bin/:$PATH

# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
    --mount=type=cache,target=/root/.cache/pip \
    pip install $(echo dist/*.whl)'[tensorizer]' --verbose
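The quoted suffix installs the wheel together with its optional tensorizer extra, and $(echo dist/*.whl) simply glob-expands the single wheel path produced by the build stage. The general shape of the syntax, with illustrative names:

```dockerfile
# pip extras apply to local wheel paths the same way as to package names:
#   pip install 'somepkg[extra1,extra2]'
#   pip install dist/somepkg-1.0-py3-none-any.whl'[extra1]'
```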

# Install the vllm_nccl package, which is a bit quirky
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
    # The "install" happens in `setup.py`, so it happens at build time...
    # Remove the already installed package and the cached wheel
    pip uninstall -y vllm-nccl-cu12 \
    && pip cache remove vllm_nccl* \
    # install the version depended on by vllm requirements
    && pip install vllm-nccl-cu12 -r requirements-cuda.txt \
    # The lib is downloaded to root's home directory... move it
    && mv ~/.config/vllm/nccl/cu12/libnccl.so.2* /usr/local/lib/libnccl.so.2
ENV VLLM_NCCL_SO_PATH=/usr/local/lib/libnccl.so.2

RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install \
    # additional dependencies for the TGIS gRPC server
    grpcio-tools==1.63.0 \
    # additional dependencies for openai api_server
    accelerate==0.30.0 \
    # hf_transfer for faster HF hub downloads
    hf_transfer==0.1.6

# Triton needs a CC compiler
RUN microdnf install -y gcc \
    && microdnf clean all

# patch triton (fix for #720)
COPY triton_patch/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/triton/runtime/custom_cache_manager.py

# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
    cd /usr/src/libsodium \
    && make install
ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
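libsodium was compiled and tested in its builder stage; here only make install runs against the bind-mounted source tree, so sources and build objects never land in a release-image layer (the install itself writes to /usr/local, outside the mount). An equivalent single-command form, as a sketch:

```dockerfile
# Assumption: same bind mount as above; make -C replaces the cd.
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
    make -C /usr/src/libsodium install
```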

ENV HF_HUB_OFFLINE=1 \
    PORT=8000 \
    GRPC_PORT=8033 \
    HOME=/home/vllm \
    VLLM_USAGE_SOURCE=production-docker-image \
    VLLM_WORKER_MULTIPROC_METHOD=fork \
    TRITON_CACHE_MANAGER="triton.runtime.custom_cache_manager:CustomCacheManager"

# setup non-root user for OpenShift
RUN microdnf install -y shadow-utils \
    && umask 002 \
    && useradd --uid 2000 --gid 0 vllm \
    && microdnf remove -y shadow-utils \
    && microdnf clean all \
    && chmod g+rwx $HOME /usr/src /workspace
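OpenShift runs containers under an arbitrary UID that is always a member of group 0, which is why the user is created with --gid 0 and the writable directories get group rwx; shadow-utils is removed again immediately to keep the image small. A local approximation of that behavior, with an illustrative tag:

```dockerfile
# Simulate an arbitrary-UID run, shown as a comment:
#   docker run --rm -u 12345:0 --entrypoint id vllm-ubi:dev
#   # uid=12345 gid=0(root) groups=0(root)
```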

COPY LICENSE /licenses/vllm.md

USER 2000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
Added in its place, building directly on the upstream image:

FROM vllm/vllm-openai:v0.5.5 AS vllm-openai

# install python 3.12 instead

ARG PYTHON_VERSION=3.12
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version
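The trailing version checks make the build fail fast if the interpreter swap did not take effect; the same checks can be repeated against the finished image. A hedged smoke test, with an illustrative tag:

```dockerfile
# Shown as comments; run after building this Dockerfile:
#   docker build -t vllm-openai:py312 .
#   docker run --rm --entrypoint python3 vllm-openai:py312 --version
#   # expected: Python 3.12.x
```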