[Inference] Integrate vllm example #262

Merged: 25 commits, Aug 16, 2024
2 changes: 1 addition & 1 deletion .github/workflows/config/gpt2-ci.yaml
@@ -10,7 +10,7 @@ device: cpu
num_replicas: 1
ipex:
enabled: true
precision: bf16
precision: fp32
model_description:
model_id_or_path: gpt2
tokenizer_name_or_path: gpt2
2 changes: 1 addition & 1 deletion .github/workflows/config/opt-125m-ci.yaml
@@ -9,7 +9,7 @@ workers_per_group: 2
device: CPU
ipex:
enabled: false
precision: bf16
precision: fp32
model_description:
model_id_or_path: facebook/opt-125m
tokenizer_name_or_path: facebook/opt-125m
10 changes: 8 additions & 2 deletions .github/workflows/config/update_inference_config.py
@@ -18,11 +18,14 @@
import argparse


def update_inference_config(config_file: str, output_file: str, deepspeed: bool, ipex: bool):
def update_inference_config(
config_file: str, output_file: str, deepspeed: bool, ipex: bool, vllm: bool
):
with open(config_file) as f:
config = yaml.load(f, Loader=yaml.FullLoader)
config["deepspeed"] = deepspeed
config["ipex"]["enabled"] = ipex
config["vllm"]["enabled"] = vllm

with open(output_file, "w") as f:
yaml.dump(config, f, sort_keys=False)
@@ -34,10 +34,13 @@ def get_parser():
parser.add_argument("--output_file", type=str, required=True)
parser.add_argument("--deepspeed", action="store_true")
parser.add_argument("--ipex", action="store_true")
parser.add_argument("--vllm", action="store_true")
return parser


if __name__ == "__main__":
parser = get_parser()
args = parser.parse_args()
update_inference_config(args.config_file, args.output_file, args.deepspeed, args.ipex)
update_inference_config(
args.config_file, args.output_file, args.deepspeed, args.ipex, args.vllm
)
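Note: the updated script can now toggle vLLM in a model config the same way it already toggles DeepSpeed and IPEX. A minimal sketch of an invocation is below; the config path and output name are illustrative, not taken from this PR. The script sets `config["vllm"]["enabled"]` directly, so the target YAML is expected to already contain a `vllm:` section.

```bash
# Sketch only: enable vLLM in a copy of a model config (paths and names are assumptions).
python .github/workflows/config/update_inference_config.py \
  --config_file llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml \
  --output_file llama-2-7b-chat-hf.yaml.vllm \
  --vllm
```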
3 changes: 1 addition & 2 deletions .github/workflows/workflow_finetune.yml
@@ -34,7 +34,7 @@ jobs:
name: finetune
strategy:
matrix:
model: [ EleutherAI/gpt-j-6b, NousResearch/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, NousResearch/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
model: [ EleutherAI/gpt-j-6b, NousResearch/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, NousResearch/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1]
isPR:
- ${{inputs.ci_type == 'pr'}}

@@ -44,7 +44,6 @@ jobs:
- { model: "EleutherAI/gpt-j-6b"}
- { model: "NousResearch/Llama-2-7b-chat-hf"}
- { model: "mistralai/Mistral-7B-v0.1"}
- { model: "google/gemma-2b"}

runs-on: self-hosted

7 changes: 3 additions & 4 deletions .github/workflows/workflow_inference.yml
@@ -34,7 +34,7 @@ jobs:
name: inference
strategy:
matrix:
model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-ipex-llm, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm, gemma-2b, deepseek-coder-33b-instruct]
model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-ipex-llm, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-no-vllm, deepseek-coder-33b-instruct]
isPR:
- ${{inputs.ci_type == 'pr'}}

@@ -45,8 +45,7 @@ jobs:
- { model: "gpt-j-6b"}
- { model: "mistral-7b-v0.1"}
- { model: "mpt-7b-ipex-llm"}
- { model: "llama-2-7b-chat-hf-vllm"}
- { model: "gemma-2b"}
- { model: "llama-2-7b-chat-hf-no-vllm"}

runs-on: self-hosted

@@ -97,7 +96,7 @@ jobs:
run: |
TARGET=${{steps.target.outputs.target}}
source dev/scripts/ci-functions.sh
strat_ray ${TARGET}
start_ray ${TARGET}

- name: Run Inference Test
run: |
3 changes: 2 additions & 1 deletion .github/workflows/workflow_tests.yml
@@ -114,6 +114,7 @@ jobs:
source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl; print(torch_ccl.cwd)")/env/setvars.sh
# Additional libraries required for pytest
pip install -r ./tests/requirements.txt
bash ./dev/scripts/install-vllm-cpu.sh

- name: Start Ray Cluster
run: |
@@ -176,7 +177,7 @@ jobs:
run: |
TARGET=${{steps.target.outputs.target}}
source dev/scripts/ci-functions.sh
strat_ray ${TARGET}
start_ray ${TARGET}

- name: Run Tests
run: |
6 changes: 4 additions & 2 deletions benchmarks/benchmark_serving.py
@@ -424,9 +424,8 @@ def main(args: argparse.Namespace):

random.seed(args.seed)
np.random.seed(args.seed)

route_prefix = all_models[args.model_name].route_prefix
if args.simple:
route_prefix = all_models[args.model_name].route_prefix
api_url = args.model_endpoint_base + route_prefix
else:
api_url = args.model_endpoint_base + "/v1/chat/completions"
@@ -720,17 +719,20 @@ def main(args: argparse.Namespace):
)
parser.add_argument(
"--temperature",
type=float,
default=None,
help="The value used to modulate the next token probabilities.",
)
parser.add_argument(
"--top_p",
type=float,
default=None,
help="If set to float < 1, only the smallest set of most probable tokens \
with probabilities that add up to `Top p` or higher are kept for generation.",
)
parser.add_argument(
"--top_k",
type=float,
default=None,
help="The number of highest probability vocabulary tokens to keep \
for top-k-filtering.",
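Note: the new `--temperature`, `--top_p`, and `--top_k` flags are now typed and default to `None`. A sketch of a benchmark run that sets them; the endpoint, model name, and values are assumptions, not taken from this PR.

```bash
# Sketch only: exercise the new sampling flags against a locally served model.
python benchmarks/benchmark_serving.py \
  --model_name llama-2-7b-chat-hf \
  --model_endpoint_base http://127.0.0.1:8000 \
  --temperature 0.7 \
  --top_p 0.9 \
  --top_k 40
```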
8 changes: 7 additions & 1 deletion dev/docker/Dockerfile.habana
@@ -7,13 +7,18 @@ WORKDIR /root/llm-on-ray
COPY ./pyproject.toml .
COPY ./MANIFEST.in .

# create llm_on_ray package directory to bypass the following 'pip install -e' command
# Create llm_on_ray package directory to bypass the following 'pip install -e' command
RUN mkdir ./llm_on_ray

RUN pip install -e . && \
pip install --upgrade-strategy eager optimum[habana] && \
pip install git+https://github.com/HabanaAI/[email protected]

# Install vllm habana env
RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@cf6952d
# Reinstall ray because vllm downgrades the ray version
RUN pip install "ray>=2.10" "ray[serve,tune]>=2.10"

# Optional. Comment out if you are not using UI
COPY ./dev/scripts/install-ui.sh /tmp

@@ -30,3 +35,4 @@ ENV RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES=1
ENV PT_HPU_LAZY_ACC_PAR_MODE=0

ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

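Note: because the vLLM Habana fork can downgrade Ray, the Dockerfile reinstalls `ray>=2.10` afterwards. A quick, hedged sanity check after building; the image tag is an assumption.

```bash
# Sketch only: confirm the Ray and vLLM versions installed inside the built image.
docker run --rm <your-habana-image-tag> \
  python -c "import ray, vllm; print('ray', ray.__version__, 'vllm', vllm.__version__)"
```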
47 changes: 47 additions & 0 deletions dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed
@@ -0,0 +1,47 @@
# syntax=docker/dockerfile:1
FROM ubuntu:22.04

ENV LANG C.UTF-8

WORKDIR /root/llm-on-ray

RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \
&& apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

ENV CONDA_DIR /opt/conda
RUN wget --quiet https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-Linux-x86_64.sh -O ~/miniforge.sh && \
/bin/bash ~/miniforge.sh -b -p /opt/conda
ENV PATH $CONDA_DIR/bin:$PATH

# setup env
SHELL ["/bin/bash", "--login", "-c"]

RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
unset -f conda && \
export PATH=$CONDA_DIR/bin/:${PATH} && \
mamba config --add channels intel && \
mamba install -y -c conda-forge python==3.9 gxx=12.3 gxx_linux-64=12.3 libxcrypt

COPY ./pyproject.toml .
COPY ./MANIFEST.in .
COPY ./dev/scripts/install-vllm-cpu.sh .

# create llm_on_ray package directory to bypass the following 'pip install -e' command
RUN mkdir ./llm_on_ray

RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu,deepspeed] --extra-index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/

RUN ds_report

# Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s)
ARG CACHEBUST=1
COPY ./dev/scripts/install-oneapi.sh /tmp
RUN /tmp/install-oneapi.sh

# Install vllm-cpu
# Activate base first for loading g++ envs ($CONDA_PREFIX/etc/conda/activate.d/*)
RUN --mount=type=cache,target=/root/.cache/pip \
source /opt/conda/bin/activate base && ./install-vllm-cpu.sh
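Note: `CACHEBUST` exists only to invalidate the layers after it. A hedged example of building this CI image; the tag and build context are assumptions.

```bash
# Sketch only: build the CPU vLLM + DeepSpeed CI image from the repo root,
# forcing the OneAPI and vLLM layers to rebuild.
docker build \
  -f dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed \
  --build-arg CACHEBUST=$(date +%s) \
  -t llm-on-ray-ci:cpu_vllm_and_deepspeed \
  .
```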
43 changes: 43 additions & 0 deletions dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed.pip_non_editable
@@ -0,0 +1,43 @@
# syntax=docker/dockerfile:1
FROM ubuntu:22.04

ENV LANG C.UTF-8

WORKDIR /root/llm-on-ray

RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \
&& apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

ENV CONDA_DIR /opt/conda
RUN wget --quiet https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-Linux-x86_64.sh -O ~/miniforge.sh && \
/bin/bash ~/miniforge.sh -b -p /opt/conda
ENV PATH $CONDA_DIR/bin:$PATH

# setup env
SHELL ["/bin/bash", "--login", "-c"]

RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
unset -f conda && \
export PATH=$CONDA_DIR/bin/:${PATH} && \
mamba config --add channels intel && \
mamba install -y -c conda-forge python==3.9 gxx=12.3 gxx_linux-64=12.3 libxcrypt

# Copy all checked-out files for the later non-editable pip install
COPY . .

RUN --mount=type=cache,target=/root/.cache/pip pip install .[cpu,deepspeed] --extra-index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/

RUN ds_report

# Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s)
ARG CACHEBUST=1
COPY ./dev/scripts/install-oneapi.sh /tmp
RUN /tmp/install-oneapi.sh

# Install vllm-cpu
# Activate base first for loading g++ envs ($CONDA_PREFIX/etc/conda/activate.d/*)
RUN --mount=type=cache,target=/root/.cache/pip \
source /opt/conda/bin/activate base && ./install-vllm-cpu.sh
2 changes: 1 addition & 1 deletion dev/docker/ci/Dockerfile.habana_vllm
@@ -15,7 +15,7 @@ RUN pip install -e . && \
pip install git+https://github.com/HabanaAI/[email protected]

# Install vllm habana env
RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@ae3d6121
RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@cf6952d
# Reinstall ray because vllm downgrades the ray version
RUN pip install "ray>=2.10" "ray[serve,tune]>=2.10"

8 changes: 7 additions & 1 deletion dev/docker/ci/Dockerfile.tests_cpu
@@ -24,10 +24,11 @@ RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
unset -f conda && \
export PATH=$CONDA_DIR/bin/:${PATH} && \
mamba config --add channels intel && \
mamba install python==${python_v}
mamba install -y -c conda-forge python==${python_v} gxx=12.3 gxx_linux-64=12.3 libxcrypt

COPY ./pyproject.toml .
COPY ./MANIFEST.in .
COPY ./dev/scripts/install-vllm-cpu.sh .

# create llm_on_ray package directory to bypass the following 'pip install -e' command
RUN mkdir ./llm_on_ray
@@ -39,3 +40,8 @@ RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu] --extra-ind
ARG CACHEBUST=1
COPY ./dev/scripts/install-oneapi.sh /tmp
RUN /tmp/install-oneapi.sh

# Install vllm-cpu
# Activate base first for loading g++ envs ($CONDA_PREFIX/etc/conda/activate.d/*)
RUN --mount=type=cache,target=/root/.cache/pip \
source /opt/conda/bin/activate base && ./install-vllm-cpu.sh
14 changes: 7 additions & 7 deletions dev/scripts/ci-functions.sh
@@ -75,7 +75,7 @@ install_dependencies(){
docker exec "${TARGET}" bash -c "pip install -r ./tests/requirements.txt"
}

strat_ray(){
start_ray(){
local TARGET=$1

# Start Ray Cluster
@@ -110,8 +110,8 @@ stop_container(){
declare -A DF_SUFFIX_MAPPER
DF_SUFFIX_MAPPER=(
["mpt-7b-ipex-llm"]=".ipex-llm"
["llama-2-7b-chat-hf-vllm"]=".vllm"
["gpt-j-6b"]=".cpu_and_deepspeed.pip_non_editable"
["llama-2-7b-chat-hf-no-vllm"]=".cpu_and_deepspeed"
["gpt-j-6b"]=".cpu_vllm_and_deepspeed.pip_non_editable"
)


@@ -120,14 +120,14 @@ get_DF_SUFFIX() {
if [[ ${DF_SUFFIX_MAPPER[$key]+_} ]]; then
echo "${DF_SUFFIX_MAPPER[$key]}"
else
echo ".cpu_and_deepspeed"
echo ".cpu_vllm_and_deepspeed"
fi
}

declare -A TARGET_SUFFIX_MAPPER
TARGET_SUFFIX_MAPPER=(
["mpt-7b-ipex-llm"]="_ipex-llm"
["llama-2-7b-chat-hf-vllm"]="_vllm"
["llama-2-7b-chat-hf-no-vllm"]="_wo_vllm"
)

get_TARGET_SUFFIX() {
@@ -169,7 +169,7 @@ inference_deepspeed_test(){
local model=$2
if [[ ${model} =~ ^(gemma-2b|gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then
echo ${model} is not supported!
elif [[ ! ${model} == "llama-2-7b-chat-hf-vllm" ]]; then
elif [[ ! ${model} == "llama-2-7b-chat-hf-no-vllm" ]]; then
echo update_inference_config with deepspeed:
docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file llm_on_ray/inference/models/\"${model}\".yaml --output_file \"${model}\".yaml.deepspeed --deepspeed"
echo Start deepspeed simple serve :
@@ -187,7 +187,7 @@ inference_restapi_test(){
if [[ ${model} == "mpt-7b-ipex-llm" ]]; then
echo Start mpt-7b-ipex-llm simple serve :
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml"
elif [[ ! ${model} == "llama-2-7b-chat-hf-vllm" ]]; then
else
echo Start "${TARGET}" serve :
docker exec "${TARGET}" bash -c "llm_on_ray-serve --models ${model}"
echo Http query:
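Note: with the renamed matrix entries, the vLLM-enabled image is now the default and only the explicitly mapped models fall back to other variants. A sketch of how the lookups resolve, based on the mappers in this diff (assumes both helpers take the model name as their first argument):

```bash
# Sketch only: resolve the Dockerfile and container-name suffixes for a few models.
source dev/scripts/ci-functions.sh
get_DF_SUFFIX "llama-2-7b-chat-hf-no-vllm"      # -> .cpu_and_deepspeed
get_DF_SUFFIX "gpt-j-6b"                        # -> .cpu_vllm_and_deepspeed.pip_non_editable
get_DF_SUFFIX "mistral-7b-v0.1"                 # -> .cpu_vllm_and_deepspeed (default)
get_TARGET_SUFFIX "llama-2-7b-chat-hf-no-vllm"  # -> _wo_vllm
```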
18 changes: 12 additions & 6 deletions dev/scripts/install-vllm-cpu.sh
@@ -4,7 +4,6 @@
[[ -n $(which g++) ]] || { echo "GNU C++ Compiler (g++) is not found!"; exit 1; }
[[ -n $(which pip) ]] || { echo "pip command is not found!"; exit 1; }

# g++ version should be >=12.3. You can run the following to install GCC 12.3 and dependencies on conda:
# conda install -y -c conda-forge gxx=12.3 gxx_linux-64=12.3 libxcrypt
version_greater_equal()
{
@@ -14,13 +13,20 @@ gcc_version=$(g++ --version | grep -o -E '[0-9]+\.[0-9]+\.[0-9]+' | head -n1)
echo
echo Current GNU C++ Compiler version: $gcc_version
echo
version_greater_equal "${gcc_version}" 12.3.0 || { echo "GNU C++ Compiler 12.3.0 or above is required!"; exit 1; }

VLLM_VERSION=0.4.1
VLLM_VERSION=0.5.2

echo Installing vLLM v$VLLM_VERSION ...
# Install VLLM from source, refer to https://docs.vllm.ai/en/latest/getting_started/cpu-installation.html for details
# We use this one-liner to install latest vllm-cpu
MAX_JOBS=8 VLLM_TARGET_DEVICE=cpu pip install -v git+https://github.com/vllm-project/vllm.git@v$VLLM_VERSION \
is_avx512_available=$(cat /proc/cpuinfo | grep avx512)
if [ -z "$is_avx512_available" ]; then
echo "AVX512 is not available, vLLM CPU backend using other ISA types."
MAX_JOBS=8 VLLM_TARGET_DEVICE=cpu VLLM_CPU_DISABLE_AVX512="true" pip install -v git+https://github.com/vllm-project/vllm.git@v$VLLM_VERSION \
--extra-index-url https://download.pytorch.org/whl/cpu
else
# g++ version should be >=12.3. You can run the following to install GCC 12.3 and dependencies on conda:
version_greater_equal "${gcc_version}" 12.3.0 || { echo "GNU C++ Compiler 12.3.0 or above is required!"; exit 1; }
echo "Install vllm-cpu with AVX512 ISA support"
MAX_JOBS=8 VLLM_TARGET_DEVICE=cpu pip install -v git+https://github.com/vllm-project/vllm.git@v$VLLM_VERSION \
--extra-index-url https://download.pytorch.org/whl/cpu
fi
echo Done!
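Note: the installer now detects AVX512 and only enforces the g++ >= 12.3 requirement on the AVX512 build path; otherwise it builds with `VLLM_CPU_DISABLE_AVX512=true`. A hedged way to preview which path a machine will take before running it:

```bash
# Sketch only: check for AVX512 the same way the script does, then run the installer.
if grep -q avx512 /proc/cpuinfo; then
  echo "AVX512 found: the script builds vLLM CPU with AVX512 (needs g++ >= 12.3)."
else
  echo "No AVX512: the script sets VLLM_CPU_DISABLE_AVX512=true and builds with other ISAs."
fi
bash ./dev/scripts/install-vllm-cpu.sh
```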