Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix the test case for cpu #40

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/build-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ env:
jobs:
cpu:
runs-on: ubuntu-latest
timeout-minutes: 120
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
permissions:
contents: read
Expand Down
67 changes: 31 additions & 36 deletions Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.
# Base image
FROM ubuntu:22.04 AS base
ARG VERSION=0.6.4.post1
FROM ubuntu:22.04 AS cpu-test-1
ARG VERSION

ENV CCACHE_DIR=/root/.cache/ccache

ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache

RUN --mount=type=cache,target=/var/cache/apt \
Expand All @@ -13,46 +10,44 @@ RUN --mount=type=cache,target=/var/cache/apt \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access to commonly used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp

ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/vllm-venv/lib/libiomp5.so"

RUN echo 'ulimit -c 0' >> ~/.bashrc
# Python environment setup
FROM base AS python-env
RUN python3 -m pip install --upgrade pip && \
pip install virtualenv && \
virtualenv /opt/vllm-venv

RUN pip install intel_extension_for_pytorch==2.5.0
# Activate virtual environment in Docker image
ENV PATH="/opt/vllm-venv/bin:$PATH"

# Code preparation
FROM python-env AS code-prep
WORKDIR /workspace
RUN git clone https://github.com/vllm-project/vllm.git && cd vllm && git checkout v${VERSION}
WORKDIR /workspace/vllm

ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
pip install --upgrade pip && \
pip install -r requirements-build.txt

FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
# Code building stage
FROM code-prep AS builder

RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v -r requirements-cpu.txt

# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
WORKDIR /workspace/vllm
RUN sed -i 's/if (AVX512_FOUND AND NOT AVX512_DISABLED)/if (AVX512_FOUND OR AVX2_FOUND)/' cmake/cpu_extension.cmake \
&& cat cmake/cpu_extension.cmake

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN pip install -r requirements-build.txt \
&& pip install -v -r requirements-cpu.txt \
&& pip install intel-openmp \
&& VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel \
&& pip install dist/*.whl

WORKDIR /workspace/
# Final image
FROM base AS final
COPY --from=builder /opt/vllm-venv /opt/vllm-venv

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
# Activate virtual environment in Docker image
ENV PATH="/opt/vllm-venv/bin:$PATH"

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
WORKDIR /workspace
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
12 changes: 8 additions & 4 deletions test-image.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
#!/usr/bin/env bash

set -xe
set -x

IMAGE_TAG="${IMAGE_TAG:-vllm/vllm-openai:latest}"
MODEL_NAME="${MODEL_NAME:-facebook/opt-125m}"

docker run --rm -d --name vllm -p 8000:8000 ${IMAGE_TAG} \
--model ${MODEL_NAME} ${ARGS}
docker run -d --name vllm -p 8000:8000 \
-e VLLM_WORKER_MULTIPROC_METHOD=spawn \
${IMAGE_TAG} \
--model ${MODEL_NAME} \
--disable-frontend-multiprocessing \
${ARGS}

# Wait for up to 120 seconds for the Docker container to be ready
echo "Waiting for the container to be ready..."
Expand All @@ -22,7 +26,7 @@ while ! curl -sf http://localhost:8000/v1/models; do
done
echo "Container is ready."

curl -v http://localhost:8000/v1/completions \
curl -v --fail-with-body --show-error http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "facebook/opt-125m",
Expand Down
Loading