diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index f5051eb86..c6854a9f6 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -6,6 +6,7 @@ on:
   push:
     branches:
       - release
+      - 312-test
     paths-ignore:
       - "**.md"
       - "proto/**"
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 32157c16e..4c7396e72 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -1,318 +1,26 @@
-# Please update any changes made here to
-# docs/source/dev/dockerfile-ubi/dockerfile-ubi.rst
-
-## Global Args #################################################################
-ARG BASE_UBI_IMAGE_TAG=9.4-1134
-ARG PYTHON_VERSION=3.11
-
-# NOTE: This setting only has an effect when not using prebuilt-wheel kernels
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-
-
-## Base Layer ##################################################################
-FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
-
-WORKDIR /workspace
-
-ENV LANG=C.UTF-8 \
-    LC_ALL=C.UTF-8
-
-# Some utils for dev purposes - tar required for kubectl cp
-RUN microdnf install -y \
-    which procps findutils tar vim \
-    && microdnf clean all
-
-
-## Python Installer ############################################################
-FROM base as python-install
-
-ARG PYTHON_VERSION
-ARG MINIFORGE_VERSION=24.3.0-0
-
-RUN curl -fsSL -o ~/miniforge3.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/Miniforge3-$(uname)-$(uname -m).sh" && \
-    chmod +x ~/miniforge3.sh && \
-    bash ~/miniforge3.sh -b -p /opt/conda && \
-    source "/opt/conda/etc/profile.d/conda.sh" && \
-    conda create -y -p /opt/vllm python=${PYTHON_VERSION} && \
-    conda activate /opt/vllm && \
-    rm ~/miniforge3.sh
-# use of the /opt/vllm env requires:
-# ENV PATH=/opt/vllm/bin/:$PATH
-
-## CUDA Base ###################################################################
-FROM base as cuda-base
-
-# The Nvidia operator won't allow deploying on CUDA 12.0 hosts if
-# this env var is set to 12.2.0, even though it's compatible
-#ENV CUDA_VERSION=12.2.0 \
-ENV CUDA_VERSION=12.0.0 \
-    NV_CUDA_LIB_VERSION=12.2.0-1 \
-    NVIDIA_VISIBLE_DEVICES=all \
-    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
-    NV_CUDA_CUDART_VERSION=12.2.53-1 \
-    NV_CUDA_COMPAT_VERSION=535.104.12
-
-RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
-    https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
-
-RUN microdnf install -y \
-    cuda-cudart-12-2-${NV_CUDA_CUDART_VERSION} \
-    cuda-compat-12-2-${NV_CUDA_COMPAT_VERSION} \
-    && microdnf clean all
-
-ENV CUDA_HOME="/usr/local/cuda" \
-    PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
-    LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
-
-
-## CUDA Runtime ################################################################
-FROM cuda-base as cuda-runtime
-
-ENV NV_NVTX_VERSION=12.2.53-1 \
-    NV_LIBNPP_VERSION=12.1.1.14-1 \
-    NV_LIBCUBLAS_VERSION=12.2.1.16-1 \
-    NV_LIBNCCL_PACKAGE_VERSION=2.18.5-1+cuda12.2
-
-RUN microdnf install -y \
-    cuda-libraries-12-2-${NV_CUDA_LIB_VERSION} \
-    cuda-nvtx-12-2-${NV_NVTX_VERSION} \
-    libnpp-12-2-${NV_LIBNPP_VERSION} \
-    libcublas-12-2-${NV_LIBCUBLAS_VERSION} \
-    libnccl-${NV_LIBNCCL_PACKAGE_VERSION} \
-    && microdnf clean all
-
-
-## CUDA Development ############################################################
-FROM cuda-base as cuda-devel
-
-ENV NV_CUDA_CUDART_DEV_VERSION=12.2.53-1 \
-    NV_NVML_DEV_VERSION=12.2.81-1 \
-    NV_LIBCUBLAS_DEV_VERSION=12.2.1.16-1 \
-    NV_LIBNPP_DEV_VERSION=12.1.1.14-1 \
-    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.5-1+cuda12.2
-
-RUN microdnf install -y \
-    cuda-command-line-tools-12-2-${NV_CUDA_LIB_VERSION} \
-    cuda-libraries-devel-12-2-${NV_CUDA_LIB_VERSION} \
-    cuda-minimal-build-12-2-${NV_CUDA_LIB_VERSION} \
-    cuda-cudart-devel-12-2-${NV_CUDA_CUDART_DEV_VERSION} \
-    cuda-nvml-devel-12-2-${NV_NVML_DEV_VERSION} \
-    libcublas-devel-12-2-${NV_LIBCUBLAS_DEV_VERSION} \
-    libnpp-devel-12-2-${NV_LIBNPP_DEV_VERSION} \
-    libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
-    && microdnf clean all
-
-ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
-
-# Workaround for https://github.com/openai/triton/issues/2507 and
-# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
-# this won't be needed for future versions of this docker image
-# or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.2/compat/
-
-## Python cuda base #################################################################
-FROM cuda-devel as python-cuda-base
-
-COPY --from=python-install --link /opt/vllm /opt/vllm
-ENV PATH=/opt/vllm/bin/:$PATH
-
-# install cuda and common dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
-    pip3 install \
-    -r requirements-cuda.txt
-
-## Development #################################################################
-FROM python-cuda-base AS dev
-
-# install build and runtime dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
-    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
-    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
-    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
-    pip3 install \
-    -r requirements-cuda.txt \
-    -r requirements-dev.txt
-
-## Proto Compilation ###########################################################
-FROM python-install AS gen-protos
-
-ENV PATH=/opt/vllm/bin/:$PATH
-
-RUN microdnf install -y \
-    make \
-    findutils \
-    && microdnf clean all
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=Makefile,target=Makefile \
-    --mount=type=bind,source=proto,target=proto \
-    make gen-protos
-
-## Extension Cache #############################################################
-# Instead of compiling artifacts every build just copy from pre-built wheel
-# This might not work if the PyTorch and CUDA versions don't match!
-FROM base as prebuilt-wheel
-
-RUN microdnf install -y \
-    unzip \
-    && microdnf clean all
-
-ARG PYTHON_VERSION
-# 0.4.2 is built for CUDA 12.1 and PyTorch 2.3.0
-ARG VLLM_WHEEL_VERSION=0.4.3
-
-RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
-    && unzip vllm.whl \
-    && rm vllm.whl
-# compiled extensions located at /workspace/vllm/*.so
-
-## Builder #####################################################################
-FROM dev AS build
-
-# install build dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
-    pip install -r requirements-build.txt
-
-# copy input files
-COPY csrc csrc
-COPY setup.py setup.py
-COPY cmake cmake
-COPY CMakeLists.txt CMakeLists.txt
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY pyproject.toml pyproject.toml
-COPY vllm/__init__.py vllm/__init__.py
-
-ARG TORCH_CUDA_ARCH_LIST
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-
-# max jobs used by Ninja to build extensions
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# number of threads used by nvcc
-ARG nvcc_threads=2
-ENV NVCC_THREADS=$nvcc_threads
-# make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
-
-# Setup path stuff? Ref: https://github.com/vllm-project/vllm/blob/main/.github/workflows/scripts/build.sh#L6-L8
-ENV PATH=/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-
-# Copy the entire directory before building wheel
-COPY --link vllm vllm
-
-# Comment if building *.so files from scratch
-##################################################
-# Copy the prebuilt *.so files
-COPY --from=prebuilt-wheel --link /workspace/vllm/*.so /workspace/vllm/
-ENV VLLM_USE_PRECOMPILED=1
-##################################################
-# Comment if not building .so files from scratch
-#RUN microdnf install -y git \
-#    && microdnf clean all
-##################################################
-
-# Copy over the generated *.pb2 files
-COPY --from=gen-protos --link /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb
-
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/pip \
-    CMAKE_BUILD_TYPE=Release python3 setup.py bdist_wheel --dist-dir=dist
-
-#################### libsodium Build IMAGE ####################
-FROM base as libsodium-builder
-
-RUN microdnf install -y gcc gzip \
-    && microdnf clean all
-
-WORKDIR /usr/src/libsodium
-
-ARG LIBSODIUM_VERSION=1.0.19
-RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
-    && tar -xzvf libsodium*.tar.gz \
-    && rm -f libsodium*.tar.gz \
-    && mv libsodium*/* ./
-
-RUN ./configure && make && make check
-
 ## Release #####################################################################
 # Note from the non-UBI Dockerfile:
 # We used base cuda image because pytorch installs its own cuda libraries.
 # However pynccl depends on cuda libraries so we had to switch to the runtime image
 # In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
-FROM cuda-runtime AS vllm-openai
-
-WORKDIR /workspace
-
-# Create release python environment
-COPY --from=python-cuda-base --link /opt/vllm /opt/vllm
-ENV PATH=/opt/vllm/bin/:$PATH
-
-# install vllm wheel first, so that torch etc will be installed
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
-    --mount=type=cache,target=/root/.cache/pip \
-    pip install $(echo dist/*.whl)'[tensorizer]' --verbose
-
-# Install the vllm_nccl package which is a bit quirky
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
-    # The "install" happens in `setup.py` so it happens when built...
-    # Remove the already installed package and the cached wheel
-    pip uninstall -y vllm-nccl-cu12 \
-    && pip cache remove vllm_nccl* \
-    # install the version depended on by vllm requirements
-    && pip install vllm-nccl-cu12 -r requirements-cuda.txt \
-    # The lib is downloaded to root's home directory... move it
-    && mv ~/.config/vllm/nccl/cu12/libnccl.so.2* /usr/local/lib/libnccl.so.2
-ENV VLLM_NCCL_SO_PATH=/usr/local/lib/libnccl.so.2
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip3 install \
-    # additional dependencies for the TGIS gRPC server
-    grpcio-tools==1.63.0 \
-    # additional dependencies for openai api_server
-    accelerate==0.30.0 \
-    # hf_transfer for faster HF hub downloads
-    hf_transfer==0.1.6
-
-# Triton needs a CC compiler
-RUN microdnf install -y gcc \
-    && microdnf clean all
-
-# patch triton (fix for #720)
-COPY triton_patch/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/triton/runtime/custom_cache_manager.py
-
-# Install libsodium for Tensorizer encryption
-RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
-    cd /usr/src/libsodium \
-    && make install
-ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-
-ENV HF_HUB_OFFLINE=1 \
-    PORT=8000 \
-    GRPC_PORT=8033 \
-    HOME=/home/vllm \
-    VLLM_USAGE_SOURCE=production-docker-image \
-    VLLM_WORKER_MULTIPROC_METHOD=fork \
-    TRITON_CACHE_MANAGER="triton.runtime.custom_cache_manager:CustomCacheManager"
-
-# setup non-root user for OpenShift
-RUN microdnf install -y shadow-utils \
-    && umask 002 \
-    && useradd --uid 2000 --gid 0 vllm \
-    && microdnf remove -y shadow-utils \
-    && microdnf clean all \
-    && chmod g+rwx $HOME /usr/src /workspace
-
-COPY LICENSE /licenses/vllm.md
-
-USER 2000
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+FROM vllm/vllm-openai:v0.5.5 AS vllm-openai
+
+
+# install python 3.12 instead
+
+ARG PYTHON_VERSION=3.12
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install Python and other dependencies
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update -y \
+    && apt-get install -y ccache software-properties-common git curl sudo \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
+    && python3 --version && python3 -m pip --version
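Quick smoke test for the Python 3.12 switch (a sketch, not part of the patch: the image tag is arbitrary, and it assumes the build runs from the repo root with the file still named Dockerfile.ubi and the single stage still named vllm-openai, as in the diff above):

    # build only the vllm-openai stage from Dockerfile.ubi
    docker build -f Dockerfile.ubi --target vllm-openai -t vllm-ubi:312-test .
    # override the inherited api_server entrypoint and check the interpreter the image resolves for python3
    docker run --rm --entrypoint python3 vllm-ubi:312-test --version          # expect Python 3.12.x
    # confirm pip was bootstrapped via get-pip.py against the same interpreter
    docker run --rm --entrypoint python3 vllm-ubi:312-test -m pip --version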