diff --git a/.github/workflows/config/gpt2-ci.yaml b/.github/workflows/config/gpt2-ci.yaml index b3927953b..b8f9fcba5 100644 --- a/.github/workflows/config/gpt2-ci.yaml +++ b/.github/workflows/config/gpt2-ci.yaml @@ -10,7 +10,7 @@ device: cpu num_replicas: 1 ipex: enabled: true - precision: bf16 + precision: fp32 model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 diff --git a/.github/workflows/config/opt-125m-ci.yaml b/.github/workflows/config/opt-125m-ci.yaml index f13ec7e54..6d175be7a 100644 --- a/.github/workflows/config/opt-125m-ci.yaml +++ b/.github/workflows/config/opt-125m-ci.yaml @@ -9,7 +9,7 @@ workers_per_group: 2 device: CPU ipex: enabled: false - precision: bf16 + precision: fp32 model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m diff --git a/.github/workflows/config/update_inference_config.py b/.github/workflows/config/update_inference_config.py index 502bcf591..94c5b22b5 100644 --- a/.github/workflows/config/update_inference_config.py +++ b/.github/workflows/config/update_inference_config.py @@ -18,11 +18,14 @@ import argparse -def update_inference_config(config_file: str, output_file: str, deepspeed: bool, ipex: bool): +def update_inference_config( + config_file: str, output_file: str, deepspeed: bool, ipex: bool, vllm: bool +): with open(config_file) as f: config = yaml.load(f, Loader=yaml.FullLoader) config["deepspeed"] = deepspeed config["ipex"]["enabled"] = ipex + config["vllm"]["enabled"] = vllm with open(output_file, "w") as f: yaml.dump(config, f, sort_keys=False) @@ -34,10 +37,13 @@ def get_parser(): parser.add_argument("--output_file", type=str, required=True) parser.add_argument("--deepspeed", action="store_true") parser.add_argument("--ipex", action="store_true") + parser.add_argument("--vllm", action="store_true") return parser if __name__ == "__main__": parser = get_parser() args = parser.parse_args() - update_inference_config(args.config_file, args.output_file, args.deepspeed, args.ipex) + update_inference_config( + args.config_file, args.output_file, args.deepspeed, args.ipex, args.vllm + ) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index 91950e553..cec4af99e 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -37,7 +37,7 @@ jobs: name: finetune strategy: matrix: - model: [ EleutherAI/gpt-j-6b, NousResearch/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, NousResearch/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b] + model: [ EleutherAI/gpt-j-6b, NousResearch/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, NousResearch/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1] isPR: - ${{inputs.ci_type == 'pr'}} @@ -47,7 +47,6 @@ jobs: - { model: "EleutherAI/gpt-j-6b"} - { model: "NousResearch/Llama-2-7b-chat-hf"} - { model: "mistralai/Mistral-7B-v0.1"} - - { model: "google/gemma-2b"} runs-on: self-hosted diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index e01ef598e..a0cee8827 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -37,7 +37,7 @@ jobs: name: inference strategy: matrix: - model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-ipex-llm, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm, gemma-2b, deepseek-coder-33b-instruct] + model: [ 
gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-ipex-llm, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-no-vllm, deepseek-coder-33b-instruct] isPR: - ${{inputs.ci_type == 'pr'}} @@ -48,8 +48,7 @@ jobs: - { model: "gpt-j-6b"} - { model: "mistral-7b-v0.1"} - { model: "mpt-7b-ipex-llm"} - - { model: "llama-2-7b-chat-hf-vllm"} - - { model: "gemma-2b"} + - { model: "llama-2-7b-chat-hf-no-vllm"} runs-on: self-hosted @@ -100,7 +99,7 @@ jobs: run: | TARGET=${{steps.target.outputs.target}} source dev/scripts/ci-functions.sh - strat_ray ${TARGET} + start_ray ${TARGET} - name: Run Inference Test run: | diff --git a/.github/workflows/workflow_tests.yml b/.github/workflows/workflow_tests.yml index c74536866..912d59b42 100644 --- a/.github/workflows/workflow_tests.yml +++ b/.github/workflows/workflow_tests.yml @@ -117,6 +117,7 @@ jobs: source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl; print(torch_ccl.cwd)")/env/setvars.sh # Additional libraries required for pytest pip install -r ./tests/requirements.txt + bash ./dev/scripts/install-vllm-cpu.sh - name: Start Ray Cluster run: | @@ -179,7 +180,7 @@ jobs: run: | TARGET=${{steps.target.outputs.target}} source dev/scripts/ci-functions.sh - strat_ray ${TARGET} + start_ray ${TARGET} - name: Run Tests run: | diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 2a2a8570a..b78e6c021 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -424,9 +424,8 @@ def main(args: argparse.Namespace): random.seed(args.seed) np.random.seed(args.seed) - - route_prefix = all_models[args.model_name].route_prefix if args.simple: + route_prefix = all_models[args.model_name].route_prefix api_url = args.model_endpoint_base + route_prefix else: api_url = args.model_endpoint_base + "/v1/chat/completions" @@ -720,17 +719,20 @@ def main(args: argparse.Namespace): ) parser.add_argument( "--temperature", + type=float, default=None, help="The value used to modulate the next token probabilities.", ) parser.add_argument( "--top_p", + type=float, default=None, help="If set to float < 1, only the smallest set of most probable tokens \ with probabilities that add up to `Top p` or higher are kept for generation.", ) parser.add_argument( "--top_k", + type=float, default=None, help="The number of highest probability vocabulary tokens to keep \ for top-k-filtering.", diff --git a/dev/docker/Dockerfile.habana b/dev/docker/Dockerfile.habana index efdddf6c2..1972f60a6 100644 --- a/dev/docker/Dockerfile.habana +++ b/dev/docker/Dockerfile.habana @@ -7,13 +7,18 @@ WORKDIR /root/llm-on-ray COPY ./pyproject.toml . COPY ./MANIFEST.in . -# create llm_on_ray package directory to bypass the following 'pip install -e' command +# Create llm_on_ray package directory to bypass the following 'pip install -e' command RUN mkdir ./llm_on_ray RUN pip install -e . && \ pip install --upgrade-strategy eager optimum[habana] && \ pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.1 +# Install vllm habana env +RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@cf6952d +# Reinstall ray because vllm downgrades the ray version +RUN pip install "ray>=2.10" "ray[serve,tune]>=2.10" + # Optinal. 
Comment out if you are not using UI COPY ./dev/scripts/install-ui.sh /tmp @@ -30,3 +35,4 @@ ENV RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES=1 ENV PT_HPU_LAZY_ACC_PAR_MODE=0 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true + diff --git a/dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed b/dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed new file mode 100644 index 000000000..b0d42c2a5 --- /dev/null +++ b/dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed @@ -0,0 +1,47 @@ +# syntax=docker/dockerfile:1 +FROM ubuntu:22.04 + +ENV LANG C.UTF-8 + +WORKDIR /root/llm-on-ray + +RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \ + && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ENV CONDA_DIR /opt/conda +RUN wget --quiet https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-Linux-x86_64.sh -O ~/miniforge.sh && \ + /bin/bash ~/miniforge.sh -b -p /opt/conda +ENV PATH $CONDA_DIR/bin:$PATH + +# setup env +SHELL ["/bin/bash", "--login", "-c"] + +RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \ + unset -f conda && \ + export PATH=$CONDA_DIR/bin/:${PATH} && \ + mamba config --add channels intel && \ + mamba install -y -c conda-forge python==3.9 gxx=12.3 gxx_linux-64=12.3 libxcrypt + +COPY ./pyproject.toml . +COPY ./MANIFEST.in . +COPY ./dev/scripts/install-vllm-cpu.sh . + +# create llm_on_ray package directory to bypass the following 'pip install -e' command +RUN mkdir ./llm_on_ray + +RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu,deepspeed] --extra-index-url https://download.pytorch.org/whl/cpu \ + --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + +RUN ds_report + +# Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s) +ARG CACHEBUST=1 +COPY ./dev/scripts/install-oneapi.sh /tmp +RUN /tmp/install-oneapi.sh + +# Install vllm-cpu +# Activate base first for loading g++ envs ($CONDA_PREFIX/etc/conda/activate.d/*) +RUN --mount=type=cache,target=/root/.cache/pip \ + source /opt/conda/bin/activate base && ./install-vllm-cpu.sh diff --git a/dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed.pip_non_editable b/dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed.pip_non_editable new file mode 100644 index 000000000..7500c301a --- /dev/null +++ b/dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed.pip_non_editable @@ -0,0 +1,43 @@ +# syntax=docker/dockerfile:1 +FROM ubuntu:22.04 + +ENV LANG C.UTF-8 + +WORKDIR /root/llm-on-ray + +RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \ + && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ENV CONDA_DIR /opt/conda +RUN wget --quiet https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-Linux-x86_64.sh -O ~/miniforge.sh && \ + /bin/bash ~/miniforge.sh -b -p /opt/conda +ENV PATH $CONDA_DIR/bin:$PATH + +# setup env +SHELL ["/bin/bash", "--login", "-c"] + +RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \ + unset -f conda && \ + export PATH=$CONDA_DIR/bin/:${PATH} && \ + mamba config --add channels intel && \ + mamba install -y -c conda-forge python==3.9 gxx=12.3 gxx_linux-64=12.3 libxcrypt + +# copy all checkedout file for later non-editable pip +COPY . . 
+ +RUN --mount=type=cache,target=/root/.cache/pip pip install .[cpu,deepspeed] --extra-index-url https://download.pytorch.org/whl/cpu \ + --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + +RUN ds_report + +# Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s) +ARG CACHEBUST=1 +COPY ./dev/scripts/install-oneapi.sh /tmp +RUN /tmp/install-oneapi.sh + +# Install vllm-cpu +# Activate base first for loading g++ envs ($CONDA_PREFIX/etc/conda/activate.d/*) +RUN --mount=type=cache,target=/root/.cache/pip \ + source /opt/conda/bin/activate base && ./install-vllm-cpu.sh diff --git a/dev/docker/ci/Dockerfile.habana_vllm b/dev/docker/ci/Dockerfile.habana_vllm index 0d68b7756..1972f60a6 100644 --- a/dev/docker/ci/Dockerfile.habana_vllm +++ b/dev/docker/ci/Dockerfile.habana_vllm @@ -15,7 +15,7 @@ RUN pip install -e . && \ pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.1 # Install vllm habana env -RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@ae3d6121 +RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@cf6952d # Reinstall ray because vllm downgrades the ray version RUN pip install "ray>=2.10" "ray[serve,tune]>=2.10" diff --git a/dev/docker/ci/Dockerfile.tests_cpu b/dev/docker/ci/Dockerfile.tests_cpu index 1438f5059..3720e9d4e 100644 --- a/dev/docker/ci/Dockerfile.tests_cpu +++ b/dev/docker/ci/Dockerfile.tests_cpu @@ -24,10 +24,11 @@ RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \ unset -f conda && \ export PATH=$CONDA_DIR/bin/:${PATH} && \ mamba config --add channels intel && \ - mamba install python==${python_v} + mamba install -y -c conda-forge python==${python_v} gxx=12.3 gxx_linux-64=12.3 libxcrypt COPY ./pyproject.toml . COPY ./MANIFEST.in . +COPY ./dev/scripts/install-vllm-cpu.sh . 
# create llm_on_ray package directory to bypass the following 'pip install -e' command RUN mkdir ./llm_on_ray @@ -39,3 +40,8 @@ RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu] --extra-ind ARG CACHEBUST=1 COPY ./dev/scripts/install-oneapi.sh /tmp RUN /tmp/install-oneapi.sh + +# Install vllm-cpu +# Activate base first for loading g++ envs ($CONDA_PREFIX/etc/conda/activate.d/*) +RUN --mount=type=cache,target=/root/.cache/pip \ + source /opt/conda/bin/activate base && ./install-vllm-cpu.sh diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh index fde5f0717..3d3a62381 100644 --- a/dev/scripts/ci-functions.sh +++ b/dev/scripts/ci-functions.sh @@ -75,7 +75,7 @@ install_dependencies(){ docker exec "${TARGET}" bash -c "pip install -r ./tests/requirements.txt" } -strat_ray(){ +start_ray(){ local TARGET=$1 # Start Ray Cluster @@ -110,8 +110,8 @@ stop_container(){ declare -A DF_SUFFIX_MAPPER DF_SUFFIX_MAPPER=( ["mpt-7b-ipex-llm"]=".ipex-llm" - ["llama-2-7b-chat-hf-vllm"]=".vllm" - ["gpt-j-6b"]=".cpu_and_deepspeed.pip_non_editable" + ["llama-2-7b-chat-hf-no-vllm"]=".cpu_and_deepspeed" + ["gpt-j-6b"]=".cpu_vllm_and_deepspeed.pip_non_editable" ) @@ -120,14 +120,14 @@ get_DF_SUFFIX() { if [[ ${DF_SUFFIX_MAPPER[$key]+_} ]]; then echo "${DF_SUFFIX_MAPPER[$key]}" else - echo ".cpu_and_deepspeed" + echo ".cpu_vllm_and_deepspeed" fi } declare -A TARGET_SUFFIX_MAPPER TARGET_SUFFIX_MAPPER=( ["mpt-7b-ipex-llm"]="_ipex-llm" - ["llama-2-7b-chat-hf-vllm"]="_vllm" + ["llama-2-7b-chat-hf-no-vllm"]="_wo_vllm" ) get_TARGET_SUFFIX() { @@ -169,7 +169,7 @@ inference_deepspeed_test(){ local model=$2 if [[ ${model} =~ ^(gemma-2b|gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then echo ${model} is not supported! - elif [[ ! ${model} == "llama-2-7b-chat-hf-vllm" ]]; then + elif [[ ! ${model} == "llama-2-7b-chat-hf-no-vllm" ]]; then echo update_inference_config with deepspeed: docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file llm_on_ray/inference/models/\"${model}\".yaml --output_file \"${model}\".yaml.deepspeed --deepspeed" echo Start deepspeed simple serve : @@ -187,7 +187,7 @@ inference_restapi_test(){ if [[ ${model} == "mpt-7b-ipex-llm" ]]; then echo Start mpt-7b-ipex-llm simple serve : docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml" - elif [[ ! ${model} == "llama-2-7b-chat-hf-vllm" ]]; then + else echo Start "${TARGET}" serve : docker exec "${TARGET}" bash -c "llm_on_ray-serve --models ${model}" echo Http query: diff --git a/dev/scripts/install-vllm-cpu.sh b/dev/scripts/install-vllm-cpu.sh index 3e7481538..48822cae0 100755 --- a/dev/scripts/install-vllm-cpu.sh +++ b/dev/scripts/install-vllm-cpu.sh @@ -4,7 +4,6 @@ [[ -n $(which g++) ]] || { echo "GNU C++ Compiler (g++) is not found!"; exit 1; } [[ -n $(which pip) ]] || { echo "pip command is not found!"; exit 1; } -# g++ version should be >=12.3. You can run the following to install GCC 12.3 and dependencies on conda: # conda install -y -c conda-forge gxx=12.3 gxx_linux-64=12.3 libxcrypt version_greater_equal() { @@ -14,13 +13,20 @@ gcc_version=$(g++ --version | grep -o -E '[0-9]+\.[0-9]+\.[0-9]+' | head -n1) echo echo Current GNU C++ Compiler version: $gcc_version echo -version_greater_equal "${gcc_version}" 12.3.0 || { echo "GNU C++ Compiler 12.3.0 or above is required!"; exit 1; } - -VLLM_VERSION=0.4.1 +VLLM_VERSION=0.5.2 echo Installing vLLM v$VLLM_VERSION ... 
# Install VLLM from source, refer to https://docs.vllm.ai/en/latest/getting_started/cpu-installation.html for details -# We use this one-liner to install latest vllm-cpu -MAX_JOBS=8 VLLM_TARGET_DEVICE=cpu pip install -v git+https://github.com/vllm-project/vllm.git@v$VLLM_VERSION \ +is_avx512_available=$(cat /proc/cpuinfo | grep avx512) +if [ -z "$is_avx512_available" ]; then + echo "AVX512 is not available, vLLM CPU backend using other ISA types." + MAX_JOBS=8 VLLM_TARGET_DEVICE=cpu VLLM_CPU_DISABLE_AVX512="true" pip install -v git+https://github.com/vllm-project/vllm.git@v$VLLM_VERSION \ --extra-index-url https://download.pytorch.org/whl/cpu +else + # g++ version should be >=12.3. You can run the following to install GCC 12.3 and dependencies on conda: + version_greater_equal "${gcc_version}" 12.3.0 || { echo "GNU C++ Compiler 12.3.0 or above is required!"; exit 1; } + echo "Install vllm-cpu with AVX512 ISA support" + MAX_JOBS=8 VLLM_TARGET_DEVICE=cpu pip install -v git+https://github.com/vllm-project/vllm.git@v$VLLM_VERSION \ + --extra-index-url https://download.pytorch.org/whl/cpu +fi echo Done! \ No newline at end of file diff --git a/llm_on_ray/inference/api_openai_backend/router_app.py b/llm_on_ray/inference/api_openai_backend/router_app.py index a9f63da7b..35868bb24 100644 --- a/llm_on_ray/inference/api_openai_backend/router_app.py +++ b/llm_on_ray/inference/api_openai_backend/router_app.py @@ -34,13 +34,14 @@ # import os -from typing import AsyncGenerator, List +from typing import AsyncGenerator, List, Dict, Union import uuid import async_timeout from fastapi import FastAPI, status from fastapi import Response as FastAPIResponse from fastapi.middleware.cors import CORSMiddleware -from starlette.responses import Response, StreamingResponse +from starlette.responses import Response, StreamingResponse, JSONResponse +from starlette.requests import Request from llm_on_ray.inference.logger import get_logger from llm_on_ray.inference.api_openai_backend.request_handler import ( OpenAIHTTPException, @@ -68,6 +69,19 @@ logger = get_logger(__name__) +try: + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest as vllm_ChatCompletionRequest, + ChatCompletionResponse as vllm_ChatCompletionResponse, + ) + from llm_on_ray.inference.inference_config import ( + DEVICE_HPU, + DEVICE_CUDA, + ) +except Exception: + logger.warning("VLLM package is not installed") + # timeout in 10 minutes. 
Streaming can take longer than 3 min TIMEOUT = float(os.environ.get("ROUTER_HTTP_TIMEOUT", 1800)) @@ -243,8 +257,25 @@ class Router: def __init__( self, query_client: RouterQueryClient, + model_configs: Dict, + max_num_seqs: int, ) -> None: self.query_client = query_client + self.vllm_openai_serving_chat = {} + for infer_name, infer_conf in model_configs.items(): + if infer_conf.vllm.enabled and infer_conf.device in [DEVICE_HPU, DEVICE_CUDA]: + from llm_on_ray.inference.predictors.vllm_predictor import VllmPredictor + + predictor = VllmPredictor(infer_conf, max_num_seqs) + serving_chat = OpenAIServingChat( + predictor.engine, + infer_conf.name, + infer_conf.vllm.response_role, + infer_conf.model_description.chat_template, + ) + else: + serving_chat = None + self.vllm_openai_serving_chat[infer_name] = serving_chat @router_app.get("/v1/models", response_model=ModelList) async def models(self) -> ModelList: @@ -332,7 +363,8 @@ async def completions( @router_app.post("/v1/chat/completions") async def chat( self, - body: ChatCompletionRequest, + body: Union[ChatCompletionRequest, vllm_ChatCompletionRequest], + raw_request: Request, response: FastAPIResponse, ): """Given a prompt, the model will return one or more predicted completions, @@ -341,57 +373,68 @@ async def chat( Returns: A response object with completions. """ - prompt = Prompt( - prompt=body.messages, - parameters=dict(body), - tools=body.tools, - tool_choice=body.tool_choice, - ) - request_id = f"chatcmpl-{str(uuid.uuid4().hex)}" - if body.stream: - return StreamingResponse( - _chat_completions_wrapper( - request_id, - body, - response, - self.query_client.query(body.model, prompt, request_id, body.stream), - ), - media_type="text/event-stream", - ) + serving_chat = self.vllm_openai_serving_chat[body.model] + if serving_chat: + generator = await serving_chat.create_chat_completion(body, raw_request=raw_request) + if body.stream: + return StreamingResponse(content=generator, media_type="text/event-stream") + else: + assert isinstance(generator, vllm_ChatCompletionResponse) + return JSONResponse(content=generator.model_dump()) else: - async with async_timeout.timeout(TIMEOUT): - results_reponse = self.query_client.query( - body.model, prompt, request_id, body.stream + prompt = Prompt( + prompt=body.messages, + parameters=dict(body), + tools=body.tools, + tool_choice=body.tool_choice, + ) + request_id = f"chatcmpl-{str(uuid.uuid4().hex)}" + if body.stream: + return StreamingResponse( + _chat_completions_wrapper( + request_id, + body, + response, + self.query_client.query(body.model, prompt, request_id, body.stream), + ), + media_type="text/event-stream", ) - async for results in results_reponse: - if results.error: - raise OpenAIHTTPException( - message=results.error.message, - status_code=results.error.code, - type=results.error.type, - ) - - if results.tool_calls is not None: - msg = ChatMessage(role="assistant", tool_calls=results.tool_calls) - # deleting this fields so that they don't appear in the response - del msg.tool_call_id - else: - msg = ChatMessage(role="assistant", content=results.generated_text or "") + else: + async with async_timeout.timeout(TIMEOUT): + results_reponse = self.query_client.query( + body.model, prompt, request_id, body.stream + ) + async for results in results_reponse: + if results.error: + raise OpenAIHTTPException( + message=results.error.message, + status_code=results.error.code, + type=results.error.type, + ) - usage = UsageInfo.from_response(results.dict()) - return ChatCompletionResponse( - 
id=request_id, - object="chat.completion", - model=body.model, - choices=[ - ChatCompletionResponseChoice( - index=0, - message=msg, - finish_reason=results.finish_reason, + if results.tool_calls is not None: + msg = ChatMessage(role="assistant", tool_calls=results.tool_calls) + # deleting this fields so that they don't appear in the response + del msg.tool_call_id + else: + msg = ChatMessage( + role="assistant", content=results.generated_text or "" ) - ], - usage=usage, - ) + + usage = UsageInfo.from_response(results.dict()) + return ChatCompletionResponse( + id=request_id, + object="chat.completion", + model=body.model, + choices=[ + ChatCompletionResponseChoice( + index=0, + message=msg, + finish_reason=results.finish_reason, + ) + ], + usage=usage, + ) @router_app.get("/v1/health_check") async def health_check(self) -> bool: diff --git a/llm_on_ray/inference/api_server_openai.py b/llm_on_ray/inference/api_server_openai.py index 6b5a0e2db..dcc1ee85f 100644 --- a/llm_on_ray/inference/api_server_openai.py +++ b/llm_on_ray/inference/api_server_openai.py @@ -38,7 +38,7 @@ from llm_on_ray.inference.api_openai_backend.router_app import Router, router_app -def router_application(deployments, model_list, max_ongoing_requests): +def router_application(deployments, model_list, max_ongoing_requests, max_num_seqs): """Create a Router Deployment. Router Deployment will point to a Serve Deployment for each specified base model, @@ -68,11 +68,13 @@ def router_application(deployments, model_list, max_ongoing_requests): ), # Maximum backlog for a single replica )(serve.ingress(router_app)(Router)) - return RouterDeployment.bind(merged_client) + return RouterDeployment.bind(merged_client, model_list, max_num_seqs) -def openai_serve_run(deployments, model_list, host, route_prefix, port, max_ongoing_requests): - router_app = router_application(deployments, model_list, max_ongoing_requests) +def openai_serve_run( + deployments, model_list, host, route_prefix, port, max_ongoing_requests, max_num_seqs +): + router_app = router_application(deployments, model_list, max_ongoing_requests, max_num_seqs) serve.start(http_options={"host": host, "port": port}) serve.run( diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index 7d405c7c7..1d1759685 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -44,7 +44,7 @@ class ModelConfig(BaseModel): class Ipex(BaseModel): - enabled: bool = True + enabled: bool = False precision: str = "bf16" @validator("precision") @@ -59,6 +59,12 @@ class Vllm(BaseModel): max_num_seqs: int = 256 precision: str = "bf16" enforce_eager: bool = False + tensor_parallel_size: int = 1 + gpu_memory_utilization: float = 0.90 + block_size: int = 16 + max_seq_len_to_capture: int = 8192 + response_role: str = "assistant" + lora_modules: Union[str, None] = None @validator("precision") def _check_precision(cls, v: str): diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index 37e18acf4..55eb6b35b 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -2,13 +2,10 @@ port: 8000 name: CodeLlama-7b-hf route_prefix: /CodeLlama-7b-hf num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: codellama/CodeLlama-7b-hf 
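Note on the model config changes above and below: switching a model's YAML from `ipex` to `vllm: enabled: true` only changes which predictor backs the deployment; the OpenAI-compatible `/v1/chat/completions` route in `router_app.py` keeps the same client contract whether vLLM's `OpenAIServingChat` or the legacy query path handles the request. A minimal client sketch, assuming a local `llm_on_ray-serve` deployment on port 8000 with the CodeLlama-7b-hf config above (the port and model name are assumptions to adjust for your setup):

```python
# Minimal client sketch against the OpenAI-compatible endpoint served by router_app.py.
# Assumes a llm_on_ray-serve deployment is already listening on localhost:8000 and that
# the CodeLlama-7b-hf model above is deployed; both values are assumptions, not fixed.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "CodeLlama-7b-hf",
        "messages": [{"role": "user", "content": "Write a function that reverses a string."}],
        "stream": False,
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```

Streaming works the same way: set `"stream": true` and consume the server-sent events, which both code paths above return as `text/event-stream`.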
diff --git a/llm_on_ray/inference/models/bloom-560m.yaml b/llm_on_ray/inference/models/bloom-560m.yaml index 12d2b0372..d630ffaaf 100644 --- a/llm_on_ray/inference/models/bloom-560m.yaml +++ b/llm_on_ray/inference/models/bloom-560m.yaml @@ -2,12 +2,9 @@ port: 8000 name: bloom-560m route_prefix: /bloom-560m num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: +cpus_per_worker: 24 +vllm: enabled: true precision: bf16 model_description: diff --git a/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml b/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml index adc1d158c..310134474 100644 --- a/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml +++ b/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml @@ -2,13 +2,10 @@ port: 8000 name: deepseek-coder-33b-instruct route_prefix: /deepseek-coder-33b-instruct num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: deepseek-ai/deepseek-coder-33b-instruct diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index 6e5bde761..dfdb1798c 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -2,13 +2,10 @@ port: 8000 name: deplot route_prefix: /deplot num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: google/deplot diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 119337d70..5c02cf70f 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -2,13 +2,10 @@ port: 8000 name: falcon-7b route_prefix: /falcon-7b num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: tiiuae/falcon-7b diff --git a/llm_on_ray/inference/models/fuyu8b.yaml b/llm_on_ray/inference/models/fuyu8b.yaml index 77d33ff9b..e065b8ade 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -2,13 +2,10 @@ port: 8000 name: fuyu-8b route_prefix: /fuyu-8b num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: adept/fuyu-8b diff --git a/llm_on_ray/inference/models/gemma-2b.yaml b/llm_on_ray/inference/models/gemma-2b.yaml index 5b013b371..7f743a028 100644 --- a/llm_on_ray/inference/models/gemma-2b.yaml +++ b/llm_on_ray/inference/models/gemma-2b.yaml @@ -2,12 +2,9 @@ port: 8000 name: gemma-2b route_prefix: /gemma-2b num_replicas: 1 -cpus_per_worker: 2 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: +cpus_per_worker: 24 +vllm: enabled: true precision: bf16 model_description: diff --git a/llm_on_ray/inference/models/gpt-j-6b.yaml b/llm_on_ray/inference/models/gpt-j-6b.yaml index 9719b2f7e..2dffc062e 100644 --- a/llm_on_ray/inference/models/gpt-j-6b.yaml +++ b/llm_on_ray/inference/models/gpt-j-6b.yaml @@ -2,14 +2,10 @@ port: 8000 name: gpt-j-6b 
route_prefix: /gpt-j-6b num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - # false here for ci coverage - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: EleutherAI/gpt-j-6b diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index 06a4b1b8b..81021a85d 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -2,12 +2,9 @@ port: 8000 name: gpt2 route_prefix: /gpt2 num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: +cpus_per_worker: 24 +vllm: enabled: true precision: bf16 model_description: diff --git a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-autoscaling.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf-autoscaling.yaml similarity index 83% rename from llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-autoscaling.yaml rename to llm_on_ray/inference/models/llama-2-7b-chat-hf-autoscaling.yaml index ba32990a6..b8c50951a 100644 --- a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-autoscaling.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf-autoscaling.yaml @@ -9,18 +9,12 @@ autoscaling_config: target_ongoing_requests: 24 downscale_delay_s: 30 upscale_delay_s: 10 +device: cpu cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false vllm: enabled: true max_num_seqs: 64 precision: bf16 -workers_per_group: 2 -device: cpu -ipex: - enabled: false - precision: bf16 model_description: model_id_or_path: NousResearch/Llama-2-7b-chat-hf tokenizer_name_or_path: NousResearch/Llama-2-7b-chat-hf diff --git a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf-no-vllm.yaml similarity index 59% rename from llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml rename to llm_on_ray/inference/models/llama-2-7b-chat-hf-no-vllm.yaml index 29d562aa9..83acddba2 100644 --- a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf-no-vllm.yaml @@ -1,17 +1,10 @@ port: 8000 -name: llama-2-7b-chat-hf -route_prefix: /llama-2-7b-chat-hf +name: llama-2-7b-chat-hf-no-vllm +route_prefix: /llama-2-7b-chat-hf-no-vllm num_replicas: 1 +device: cpu cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false vllm: - enabled: true - max_num_seqs: 256 - precision: bf16 -workers_per_group: 2 -device: cpu -ipex: enabled: false precision: bf16 model_description: diff --git a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml index 81cb74d98..d4fe78093 100644 --- a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml @@ -2,14 +2,10 @@ port: 8000 name: llama-2-7b-chat-hf route_prefix: /llama-2-7b-chat-hf num_replicas: 1 -dynamic_max_batch_size: 8 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: NousResearch/Llama-2-7b-chat-hf diff --git a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml index ea50f6af7..c10f2e2cd 100644 --- a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml +++ b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml @@ -1,13 +1,10 @@ 
port: 8000 name: mistral-7b-instruct-v0.2 route_prefix: /mistral-7b-instruct-v0.2 -cpus_per_worker: 48 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-Instruct-v0.2 diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index 3654f18f0..5767e8955 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -2,13 +2,10 @@ port: 8000 name: mistral-7b-v0.1 route_prefix: /mistral-7b-v0.1 num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 diff --git a/llm_on_ray/inference/models/mpt-7b.yaml b/llm_on_ray/inference/models/mpt-7b.yaml index 89ce086ed..42b6eefad 100644 --- a/llm_on_ray/inference/models/mpt-7b.yaml +++ b/llm_on_ray/inference/models/mpt-7b.yaml @@ -2,13 +2,10 @@ port: 8000 name: mpt-7b route_prefix: /mpt-7b num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: mosaicml/mpt-7b diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index 8f32c28b7..fca4487e1 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -2,13 +2,10 @@ port: 8000 name: neural-chat-7b-v3-1 route_prefix: /neural-chat-7b-v3-1 num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: Intel/neural-chat-7b-v3-1 diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 81e05fc19..6bf1c728b 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -2,13 +2,10 @@ port: 8000 name: opt-125m route_prefix: /opt-125m num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: facebook/opt-125m diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index daa5256c5..f9eabb8ae 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml +++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -1,13 +1,10 @@ port: 8000 name: sqlcoder-7b-2 route_prefix: /sqlcoder-7b-2 -cpus_per_worker: 22 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: defog/sqlcoder-7b-2 diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index 199926353..1ff137a42 100644 --- a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ -2,14 +2,11 @@ port: 8000 name: starcoder route_prefix: /starcoder num_replicas: 1 +device: cpu cpus_per_worker: 24 -gpus_per_worker: 0 
-deepspeed: false -workers_per_group: 2 -ipex: - enabled: false +vllm: + enabled: true precision: bf16 -device: cpu model_description: model_id_or_path: bigcode/starcoder tokenizer_name_or_path: bigcode/starcoder diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index ed67f5119..73d1e4702 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -28,7 +28,7 @@ from fastapi import HTTPException from llm_on_ray.inference.chat_template_process import ChatTemplatePreprocess -from llm_on_ray.inference.inference_config import InferenceConfig +from llm_on_ray.inference.inference_config import InferenceConfig, DEVICE_HPU, DEVICE_CUDA from llm_on_ray.inference.api_openai_backend.openai_protocol import ( ChatMessage, ErrorResponse, @@ -66,6 +66,7 @@ def __init__( # Used to determine if openai backend is used self.use_openai = False + self.vllm_openai_serving_chat = None if infer_conf.device == "hpu" and not self.use_vllm: from llm_on_ray.inference.predictors.hpu_predictor import HPUPredictor @@ -76,9 +77,12 @@ def __init__( self.predictor = DeepSpeedPredictor(infer_conf) elif self.use_vllm: - from llm_on_ray.inference.predictors.vllm_predictor import VllmPredictor + if infer_conf.device not in [DEVICE_HPU, DEVICE_CUDA]: + from llm_on_ray.inference.predictors.vllm_predictor import VllmPredictor - self.predictor = VllmPredictor(infer_conf, max_num_seqs) + self.predictor = VllmPredictor(infer_conf, max_num_seqs) + else: + self.predictor = None elif self.is_mllm: from llm_on_ray.inference.predictors.mllm_predictor import MllmPredictor diff --git a/llm_on_ray/inference/predictors/vllm_predictor.py b/llm_on_ray/inference/predictors/vllm_predictor.py index d3d09414a..1a163580a 100644 --- a/llm_on_ray/inference/predictors/vllm_predictor.py +++ b/llm_on_ray/inference/predictors/vllm_predictor.py @@ -26,6 +26,8 @@ InferenceConfig, ModelGenerateResult, PRECISION_BF16, + DEVICE_HPU, + DEVICE_CUDA, ) @@ -43,17 +45,26 @@ def __init__(self, infer_conf: InferenceConfig, max_num_seqs): # The default value is 40GB. 
os.environ["VLLM_CPU_KVCACHE_SPACE"] = str(self.VLLM_CPU_KVCACHE_SPACE_DEFAULT) - args = AsyncEngineArgs( + engine_args = AsyncEngineArgs( model=model_desc.model_id_or_path, + tokenizer=model_desc.tokenizer_name_or_path, trust_remote_code=model_config.trust_remote_code, device=infer_conf.device, dtype=dtype, disable_log_requests=True, max_num_seqs=max_num_seqs, + gpu_memory_utilization=infer_conf.vllm.gpu_memory_utilization, + tensor_parallel_size=infer_conf.vllm.tensor_parallel_size, + block_size=infer_conf.vllm.block_size, + max_seq_len_to_capture=infer_conf.vllm.max_seq_len_to_capture, enforce_eager=infer_conf.vllm.enforce_eager, ) - - self.engine = AsyncLLMEngine.from_engine_args(args) + if ( + infer_conf.device in [DEVICE_HPU, DEVICE_CUDA] + and infer_conf.vllm.tensor_parallel_size > 1 + ): + engine_args.worker_use_ray = True + self.engine = AsyncLLMEngine.from_engine_args(engine_args) def update_vllm_config(self, **config): # need to update the keys of config if vllm engine is used diff --git a/llm_on_ray/inference/serve.py b/llm_on_ray/inference/serve.py index ecd3bdee8..60022690f 100644 --- a/llm_on_ray/inference/serve.py +++ b/llm_on_ray/inference/serve.py @@ -24,6 +24,8 @@ ModelDescription, InferenceConfig, all_models, + DEVICE_HPU, + DEVICE_CUDA, ) @@ -55,21 +57,40 @@ def get_deployed_models(args): deployments = {} for model_id, infer_conf in model_list.items(): ray_actor_options = get_deployment_actor_options(infer_conf) - depolyment_config = { + deployment_config = { "ray_actor_options": ray_actor_options, "max_ongoing_requests": infer_conf.max_ongoing_requests if not args.max_ongoing_requests else args.max_ongoing_requests, } if infer_conf.autoscaling_config: - depolyment_config["autoscaling_config"] = infer_conf.autoscaling_config.dict() + deployment_config["autoscaling_config"] = infer_conf.autoscaling_config.dict() elif infer_conf.num_replicas: - depolyment_config["num_replicas"] = infer_conf.num_replicas + deployment_config["num_replicas"] = infer_conf.num_replicas max_num_seqs = infer_conf.vllm.max_num_seqs if not args.max_num_seqs else args.max_num_seqs dynamic_max_batch_size = ( infer_conf.dynamic_max_batch_size if not args.max_batch_size else args.max_batch_size ) - deployments[model_id] = PredictorDeployment.options(**depolyment_config).bind( + device = infer_conf.device + if infer_conf.vllm.enabled and (not args.simple) and device in [DEVICE_HPU, DEVICE_CUDA]: + tp = infer_conf.vllm.tensor_parallel_size + if tp > 1: + deployment_config["ray_actor_options"].pop("resources", None) + pg_resources = [] + pg_resources.append( + {"CPU": 2} + ) # One is for PredictorDeployment replica, and the other is for Router replica + # When device is HPU, the resources of workers will be allocated in vllm engine. + if device == DEVICE_CUDA: + for i in range(tp): + # for the vLLM actors on GPU + pg_resources.append( + {"CPU": infer_conf.cpus_per_worker, "GPU": infer_conf.gpus_per_worker} + ) + deployment_config["placement_group_bundles"] = pg_resources + deployment_config["placement_group_strategy"] = "STRICT_PACK" + + deployments[model_id] = PredictorDeployment.options(**deployment_config).bind( infer_conf, max_num_seqs, dynamic_max_batch_size ) @@ -123,7 +144,7 @@ def main(argv=None): parser.add_argument("--port", default=8000, type=int, help="The port of deployment address.") parser.add_argument( "--max_num_seqs", - default=None, + default=256, type=int, help="The batch size for vLLM. 
Used when vLLM is enabled.", ) @@ -158,7 +179,15 @@ def main(argv=None): host = "127.0.0.1" if args.serve_local_only else "0.0.0.0" print("Service is running with deployments:" + str(deployments)) print("Service is running models:" + str(model_list)) - openai_serve_run(deployments, model_list, host, "/", args.port, args.max_ongoing_requests) + openai_serve_run( + deployments, + model_list, + host, + "/", + args.port, + args.max_ongoing_requests, + args.max_num_seqs, + ) msg = "Service is deployed successfully." if args.keep_serve_terminal: diff --git a/pyproject.toml b/pyproject.toml index 5a8e89306..f17820b12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ dependencies = [ [project.optional-dependencies] cpu = [ "transformers>=4.38.0, <=4.38.1", - "intel_extension_for_pytorch==2.2.0", + "intel_extension_for_pytorch==2.3.100", "torch==2.2.0", "oneccl_bind_pt==2.2.0" ] diff --git a/tests/inference/test_chat_template.py b/tests/inference/test_chat_template.py index 4a987a841..421b6cdb5 100644 --- a/tests/inference/test_chat_template.py +++ b/tests/inference/test_chat_template.py @@ -57,38 +57,6 @@ False, "Hello\nHi there!\nWhat is the capital of\n", ), - ( - "google/gemma-2b", - base_path / "template_gemma.jinja", - True, - "<|endoftext|>\n" - "user\n" - "Hello\n" - "\n" - "model\n" - "Hi there!\n" - "\n" - "user\n" - "What is the capital of\n" - "\n" - "model\n" - "\n", - ), - ( - "google/gemma-2b", - base_path / "template_gemma.jinja", - False, - "<|endoftext|>\n" - "user\n" - "Hello\n" - "\n" - "model\n" - "Hi there!\n" - "\n" - "user\n" - "What is the capital of\n" - "\n", - ), ( "mistralai/Mistral-7B-v0.1", base_path / "template_mistral.jinja", diff --git a/tests/test_getting_started.sh b/tests/test_getting_started.sh index a84bfe334..6288b0543 100755 --- a/tests/test_getting_started.sh +++ b/tests/test_getting_started.sh @@ -15,6 +15,9 @@ pip install .[cpu] --extra-index-url https://download.pytorch.org/whl/cpu --extr # Dynamic link oneCCL and Intel MPI libraries source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl; print(torch_ccl.cwd)")/env/setvars.sh +# Install vllm from source +bash ./dev/scripts/install-vllm-cpu.sh + echo "Step 2: Start ray cluster ..." ray start --head diff --git a/tests/test_setup.sh b/tests/test_setup.sh index 589b5a320..9f37119de 100755 --- a/tests/test_setup.sh +++ b/tests/test_setup.sh @@ -18,6 +18,7 @@ case $(echo $1 | tr 'a-z' 'A-Z') in "CPU") hardware=1 pip install .[cpu] --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + bash ./dev/scripts/install-vllm-cpu.sh ;; "GPU") pip install .[gpu] --extra-index-url https://developer.intel.com/ipex-whl-stable-xpu @@ -31,7 +32,7 @@ case $(echo $1 | tr 'a-z' 'A-Z') in ;; esac -# Check if it neesd deepspeed +# Check if it needs deepspeed if [ $(echo $2 | tr 'A-Z' 'a-z') == "true" ] then deepspeed=true
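
For readers following the `serve.py` changes above: when vLLM is enabled on HPU or CUDA with `tensor_parallel_size > 1`, the deployment drops the per-replica `resources` entry and instead declares an explicit placement group. A standalone sketch of the bundle layout it builds (the concrete numbers are illustrative; the real code reads them from the model's `InferenceConfig`):

```python
# Standalone sketch of the placement-group bundles built in serve.py for a vLLM
# deployment with tensor parallelism. The numbers are illustrative; the real code
# pulls them from the model's InferenceConfig.
device = "cuda"            # serve.py checks for DEVICE_CUDA / DEVICE_HPU
tensor_parallel_size = 2   # infer_conf.vllm.tensor_parallel_size
cpus_per_worker = 8        # infer_conf.cpus_per_worker
gpus_per_worker = 1        # infer_conf.gpus_per_worker

# First bundle hosts the PredictorDeployment replica and the Router replica.
pg_resources = [{"CPU": 2}]

# On CUDA, add one bundle per tensor-parallel vLLM worker; on HPU the vLLM engine
# allocates its own workers, so no extra bundles are added.
if device == "cuda":
    for _ in range(tensor_parallel_size):
        pg_resources.append({"CPU": cpus_per_worker, "GPU": gpus_per_worker})

deployment_config = {
    "placement_group_bundles": pg_resources,
    "placement_group_strategy": "STRICT_PACK",  # co-locate all bundles on one node
}
print(deployment_config)
```

`STRICT_PACK` keeps the replica and its tensor-parallel workers on a single node, which is why the diff also removes the `resources` entry from `ray_actor_options` before binding the deployment.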