-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Co-authored-by: Nick Hill <[email protected]> Co-authored-by: Travis Johnson <[email protected]> Signed-off-by: Joe Runde <[email protected]>
- Loading branch information
1 parent
05af6da
commit cde932c
Showing
33 changed files
with
2,104 additions
and
139 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,3 @@ | ||
vllm/*.so | ||
.* | ||
docs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,282 @@ | ||
## Global Args ################################################################# | ||
ARG BASE_UBI_IMAGE_TAG=9.3-1552 | ||
ARG PYTHON_VERSION=3.11 | ||
ARG PYTORCH_INDEX="https://download.pytorch.org/whl" | ||
# ARG PYTORCH_INDEX="https://download.pytorch.org/whl/nightly" | ||
ARG PYTORCH_VERSION=2.1.2 | ||
|
||
# NOTE: This setting only has an effect when not using prebuilt-wheel kernels | ||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" | ||
|
||
|
||
## Base Layer ################################################################## | ||
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base | ||
|
||
WORKDIR /workspace | ||
|
||
ENV LANG=C.UTF-8 \ | ||
LC_ALL=C.UTF-8 | ||
|
||
# Some utils for dev purposes - tar required for kubectl cp | ||
RUN microdnf install -y \ | ||
which procps findutils tar vim \ | ||
&& microdnf clean all | ||
|
||
|
||
## Python Installer ############################################################ | ||
FROM base as python-install | ||
|
||
ARG PYTHON_VERSION | ||
ARG MINIFORGE_VERSION=23.11.0-0 | ||
|
||
RUN curl -fsSL -o ~/miniforge3.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/Miniforge3-$(uname)-$(uname -m).sh" && \ | ||
chmod +x ~/miniforge3.sh && \ | ||
bash ~/miniforge3.sh -b -p /opt/conda && \ | ||
source "/opt/conda/etc/profile.d/conda.sh" && \ | ||
conda create -y -p /opt/vllm python=${PYTHON_VERSION} && \ | ||
conda activate /opt/vllm && \ | ||
rm ~/miniforge3.sh | ||
# use of the /opt/vllm env requires: | ||
# ENV PATH=/opt/vllm/bin/:$PATH | ||
|
||
|
||
## Python Base ################################################################# | ||
FROM base as python-base | ||
|
||
COPY --from=python-install --link /opt/vllm /opt/vllm | ||
|
||
ENV PATH=/opt/vllm/bin/:$PATH | ||
|
||
|
||
## Python/Torch Base ########################################################### | ||
FROM python-base as python-torch-base | ||
|
||
ARG PYTORCH_INDEX | ||
ARG PYTORCH_VERSION | ||
|
||
RUN --mount=type=cache,target=/root/.cache/pip \ | ||
pip3 install torch==$PYTORCH_VERSION+cu121 --index-url "${PYTORCH_INDEX}/cu121" | ||
|
||
|
||
## CUDA Base ################################################################### | ||
FROM base as cuda-base | ||
|
||
# The Nvidia operator won't allow deploying on CUDA 12.0 hosts if | ||
# this env var is set to 12.2.0, even though it's compatible | ||
#ENV CUDA_VERSION=12.2.0 \ | ||
ENV CUDA_VERSION=12.0.0 \ | ||
NV_CUDA_LIB_VERSION=12.2.0-1 \ | ||
NVIDIA_VISIBLE_DEVICES=all \ | ||
NVIDIA_DRIVER_CAPABILITIES=compute,utility \ | ||
NV_CUDA_CUDART_VERSION=12.2.53-1 \ | ||
NV_CUDA_COMPAT_VERSION=535.104.12 | ||
|
||
RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \ | ||
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo | ||
|
||
RUN microdnf install -y \ | ||
cuda-cudart-12-2-${NV_CUDA_CUDART_VERSION} \ | ||
cuda-compat-12-2-${NV_CUDA_COMPAT_VERSION} \ | ||
&& microdnf clean all | ||
|
||
ENV CUDA_HOME="/usr/local/cuda" \ | ||
PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \ | ||
LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:${LD_LIBRARY_PATH}" | ||
|
||
|
||
## CUDA Runtime ################################################################ | ||
FROM cuda-base as cuda-runtime | ||
|
||
ENV NV_NVTX_VERSION=12.2.53-1 \ | ||
NV_LIBNPP_VERSION=12.1.1.14-1 \ | ||
NV_LIBCUBLAS_VERSION=12.2.1.16-1 \ | ||
NV_LIBNCCL_PACKAGE_VERSION=2.18.5-1+cuda12.2 | ||
|
||
RUN microdnf install -y \ | ||
cuda-libraries-12-2-${NV_CUDA_LIB_VERSION} \ | ||
cuda-nvtx-12-2-${NV_NVTX_VERSION} \ | ||
libnpp-12-2-${NV_LIBNPP_VERSION} \ | ||
libcublas-12-2-${NV_LIBCUBLAS_VERSION} \ | ||
libnccl-${NV_LIBNCCL_PACKAGE_VERSION} \ | ||
&& microdnf clean all | ||
|
||
|
||
## CUDA Development ############################################################ | ||
FROM cuda-base as cuda-devel | ||
|
||
ENV NV_CUDA_CUDART_DEV_VERSION=12.2.53-1 \ | ||
NV_NVML_DEV_VERSION=12.2.81-1 \ | ||
NV_LIBCUBLAS_DEV_VERSION=12.2.1.16-1 \ | ||
NV_LIBNPP_DEV_VERSION=12.1.1.14-1 \ | ||
NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.5-1+cuda12.2 | ||
|
||
RUN microdnf install -y \ | ||
cuda-command-line-tools-12-2-${NV_CUDA_LIB_VERSION} \ | ||
cuda-libraries-devel-12-2-${NV_CUDA_LIB_VERSION} \ | ||
cuda-minimal-build-12-2-${NV_CUDA_LIB_VERSION} \ | ||
cuda-cudart-devel-12-2-${NV_CUDA_CUDART_DEV_VERSION} \ | ||
cuda-nvml-devel-12-2-${NV_NVML_DEV_VERSION} \ | ||
libcublas-devel-12-2-${NV_LIBCUBLAS_DEV_VERSION} \ | ||
libnpp-devel-12-2-${NV_LIBNPP_DEV_VERSION} \ | ||
libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \ | ||
&& microdnf clean all | ||
|
||
ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs" | ||
|
||
|
||
## Development ################################################################# | ||
FROM cuda-devel AS dev | ||
|
||
COPY --from=python-torch-base --link /opt/vllm /opt/vllm | ||
ENV PATH=/opt/vllm/bin/:$PATH | ||
|
||
# install build and runtime dependencies | ||
RUN --mount=type=cache,target=/root/.cache/pip \ | ||
--mount=type=bind,source=requirements.txt,target=requirements.txt \ | ||
--mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \ | ||
pip3 install \ | ||
-r requirements.txt \ | ||
-r requirements-dev.txt | ||
|
||
|
||
## Builder ##################################################################### | ||
FROM dev AS build | ||
|
||
# install build dependencies | ||
RUN --mount=type=cache,target=/root/.cache/pip \ | ||
--mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \ | ||
pip install -r requirements-build.txt | ||
|
||
# copy input files | ||
COPY csrc csrc | ||
COPY setup.py setup.py | ||
COPY requirements.txt requirements.txt | ||
COPY pyproject.toml pyproject.toml | ||
COPY vllm/__init__.py vllm/__init__.py | ||
|
||
ARG TORCH_CUDA_ARCH_LIST | ||
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | ||
|
||
# max jobs used by Ninja to build extensions | ||
ARG max_jobs=2 | ||
ENV MAX_JOBS=${max_jobs} | ||
# number of threads used by nvcc | ||
ARG nvcc_threads=8 | ||
ENV NVCC_THREADS=$nvcc_threads | ||
# make sure punica kernels are built (for LoRA) | ||
ENV VLLM_INSTALL_PUNICA_KERNELS=1 | ||
|
||
RUN python3 setup.py build_ext --inplace | ||
|
||
|
||
## Extension Cache ############################################################# | ||
# Instead of compiling artifacts every build just copy from pre-built wheel | ||
# This might not work if the PyTorch and CUDA versions don't match! | ||
FROM base as prebuilt-wheel | ||
|
||
RUN microdnf install -y \ | ||
unzip \ | ||
&& microdnf clean all | ||
|
||
ARG PYTHON_VERSION | ||
# 0.3.2 is built for CUDA 12.1 and PyTorch 2.1.2 | ||
ARG VLLM_WHEEL_VERSION=0.3.2 | ||
|
||
RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \ | ||
&& unzip vllm.whl \ | ||
&& rm vllm.whl | ||
# compiled extensions located at /workspace/vllm/*.so | ||
|
||
|
||
## Test ######################################################################## | ||
FROM dev AS test | ||
|
||
WORKDIR /vllm-workspace | ||
# ADD is used to preserve directory structure | ||
# NB: Could leak secrets from local context, the test image should not be pushed | ||
# to a registry | ||
ADD . /vllm-workspace/ | ||
# copy pytorch extensions separately to avoid having to rebuild | ||
# when python code changes | ||
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/ | ||
# ignore build dependencies installation because we are using pre-complied extensions | ||
RUN rm pyproject.toml | ||
RUN --mount=type=cache,target=/root/.cache/pip \ | ||
VLLM_USE_PRECOMPILED=1 pip install . --verbose | ||
|
||
|
||
## Proto Compilation ########################################################### | ||
FROM python-base AS gen-protos | ||
|
||
RUN microdnf install -y \ | ||
make \ | ||
findutils \ | ||
&& microdnf clean all | ||
|
||
RUN --mount=type=cache,target=/root/.cache/pip \ | ||
--mount=type=bind,source=Makefile,target=Makefile \ | ||
--mount=type=bind,source=proto,target=proto \ | ||
make gen-protos | ||
|
||
## vLLM Library Files ########################################################## | ||
# Little extra stage to gather files and manage permissions on them without any | ||
# duplication in the release layer due to permission changes | ||
FROM base AS vllm | ||
|
||
WORKDIR /vllm-staging | ||
# COPY files from various places into a staging directory | ||
COPY --link vllm vllm | ||
COPY --from=prebuilt-wheel --link /workspace/vllm/*.so vllm/ | ||
COPY --from=gen-protos --link /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb | ||
|
||
# custom COPY command to use umask to control permissions and grant permissions | ||
# to the group | ||
RUN umask 002 \ | ||
&& cp --recursive --no-preserve=all /vllm-staging/vllm /workspace/vllm \ | ||
# not strictly needed, but .so files typically have executable bits | ||
&& chmod +x /workspace/vllm/*.so | ||
|
||
## Release ##################################################################### | ||
# Note from the non-UBI Dockerfile: | ||
# We used base cuda image because pytorch installs its own cuda libraries. | ||
# However cupy depends on cuda libraries so we had to switch to the runtime image | ||
# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda | ||
FROM cuda-runtime AS vllm-openai | ||
|
||
WORKDIR /workspace | ||
|
||
# Create release python environment | ||
COPY --from=python-torch-base --link /opt/vllm /opt/vllm | ||
ENV PATH=/opt/vllm/bin/:$PATH | ||
|
||
RUN --mount=type=cache,target=/root/.cache/pip \ | ||
--mount=type=bind,source=requirements.txt,target=requirements.txt \ | ||
pip3 install \ | ||
-r requirements.txt \ | ||
# additional dependencies for the TGIS gRPC server | ||
grpcio-tools==1.62.0 \ | ||
# additional dependencies for openai api_server | ||
accelerate==0.27.2 | ||
|
||
# vLLM will not be installed in site-packages | ||
COPY --from=vllm --link /workspace/ ./ | ||
|
||
# Triton needs a CC compiler | ||
RUN microdnf install -y gcc \ | ||
&& microdnf clean all | ||
|
||
ENV HF_HUB_OFFLINE=1 \ | ||
PORT=8000 \ | ||
GRPC_PORT=8033 \ | ||
HOME=/home/vllm | ||
|
||
# setup non-root user for OpenShift | ||
RUN microdnf install -y shadow-utils \ | ||
&& umask 002 \ | ||
&& useradd --uid 2000 --gid 0 vllm \ | ||
&& microdnf remove -y shadow-utils \ | ||
&& microdnf clean all \ | ||
&& chmod g+rwx $HOME /usr/src /workspace | ||
|
||
USER 2000 | ||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
SHELL := /bin/bash | ||
|
||
server_image_name := tgis-vllm | ||
server_image_target := vllm-openai | ||
|
||
##@ Development Tasks | ||
|
||
has_gawk := $(shell gawk --version 2>/dev/null) | ||
.PHONY: help | ||
help: ## Display this help. | ||
ifdef has_gawk | ||
@gawk -f ./scripts/makefile.help.awk $(MAKEFILE_LIST) | ||
else | ||
@awk 'BEGIN{FS=":.*##"; printf("\nUsage:\n make \033[36m<target>\033[0m\n\n")} /^[-a-zA-Z_0-9\\.]+:.*?##/ {t=$$1; if(!(t in p)){p[t]; printf("\033[36m%-15s\033[0m %s\n", t, $$2)}}' $(MAKEFILE_LIST) | ||
@echo | ||
@echo "NOTE: Help output with headers requires GNU extensions to awk. Please install gawk for the best experience." | ||
endif | ||
|
||
target_path := "vllm/entrypoints/grpc/pb" | ||
gen-protos: | ||
# Compile protos | ||
pip install grpcio-tools==1.62.0 mypy-protobuf==3.5.0 'types-protobuf>=3.20.4' | ||
mkdir -p $(target_path) | ||
python -m grpc_tools.protoc -Iproto --python_out=$(target_path) \ | ||
--grpc_python_out=$(target_path) --mypy_out=$(target_path) proto/generation.proto | ||
find $(target_path)/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \; | ||
touch $(target_path)/__init__.py | ||
|
||
|
||
##@ Container Build Tasks | ||
|
||
.PHONY: build | ||
build: ## | ||
DOCKER_BUILDKIT=1 docker build \ | ||
--file Dockerfile.ubi \ | ||
--target $(server_image_target) \ | ||
--progress plain \ | ||
--tag "$(server_image_name)" . | ||
docker images |
Oops, something went wrong.