From daf7b1cd56f564546b9fbde181cc5c2357a073ed Mon Sep 17 00:00:00 2001
From: Guancheng Fu <110874468+gc-fu@users.noreply.github.com>
Date: Mon, 27 May 2024 16:20:13 +0800
Subject: [PATCH] [Docker] Fix image using two cards error (#11144)

* fix all

* done
---
 docker/llm/inference/xpu/docker/Dockerfile    |  2 +-
 .../doc/LLM/Quickstart/fastchat_quickstart.md | 25 ++++++++++++++++++++-
 .../ipex_llm/serving/fastchat/vllm_worker.py  |  5 ++++-
 3 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/docker/llm/inference/xpu/docker/Dockerfile b/docker/llm/inference/xpu/docker/Dockerfile
index 4a291c70b85..6994e7cee6a 100644
--- a/docker/llm/inference/xpu/docker/Dockerfile
+++ b/docker/llm/inference/xpu/docker/Dockerfile
@@ -44,7 +44,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
     pip install transformers_stream_generator einops tiktoken && \
     # Install opencl-related repos
     apt-get update && \
-    apt-get install -y intel-opencl-icd intel-level-zero-gpu=1.3.26241.33-647~22.04 level-zero level-zero-dev --allow-downgrades && \
+    apt-get install -y intel-opencl-icd intel-level-zero-gpu level-zero && \
     # Install related libary of chat.py
     pip install --upgrade colorama && \
     # Download all-in-one benchmark and examples
diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/fastchat_quickstart.md b/docs/readthedocs/source/doc/LLM/Quickstart/fastchat_quickstart.md
index 08ec474377f..b154026d55b 100644
--- a/docs/readthedocs/source/doc/LLM/Quickstart/fastchat_quickstart.md
+++ b/docs/readthedocs/source/doc/LLM/Quickstart/fastchat_quickstart.md
@@ -114,9 +114,32 @@ python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MO
 source /opt/intel/oneapi/setvars.sh
 export USE_XETLA=OFF
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu
+python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu --load-in-low-bit "sym_int4" --enforce-eager
 ```
 
+#### Launch multiple workers
+
+Sometimes we may want to start multiple workers to get the best performance. When running on CPU, you may want to place the workers on different sockets. Assuming each socket has 48 physical cores, you can start two workers as in the following example:
+
+```bash
+export OMP_NUM_THREADS=48
+numactl -C 0-47 -m 0 python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --low-bit "sym_int4" --trust-remote-code --device "cpu" &
+
+# Every worker except the first one needs a different worker port and a matching worker-address
+numactl -C 48-95 -m 1 python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --low-bit "sym_int4" --trust-remote-code --device "cpu" --port 21003 --worker-address "http://localhost:21003" &
+```
+
+For GPU, we may want to start two workers on different GPUs. To achieve this, use the `ZE_AFFINITY_MASK` environment variable to select a different GPU for each worker. Below is an example:
+
+```bash
+ZE_AFFINITY_MASK=1 python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --low-bit "sym_int4" --trust-remote-code --device "xpu" &
+
+# Every worker except the first one needs a different worker port and a matching worker-address
+ZE_AFFINITY_MASK=2 python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --low-bit "sym_int4" --trust-remote-code --device "xpu" --port 21003 --worker-address "http://localhost:21003" &
+```
+
+If you are not sure about the effect of `ZE_AFFINITY_MASK`, you can set it and check the output of `sycl-ls`.
+
 ### Launch Gradio web server
 
 When you have started the controller and the worker, you can start web server as follows:
diff --git a/python/llm/src/ipex_llm/serving/fastchat/vllm_worker.py b/python/llm/src/ipex_llm/serving/fastchat/vllm_worker.py
index 55a459c2aac..482031f97a3 100644
--- a/python/llm/src/ipex_llm/serving/fastchat/vllm_worker.py
+++ b/python/llm/src/ipex_llm/serving/fastchat/vllm_worker.py
@@ -41,6 +41,9 @@
     worker_id,
 )
 from fastchat.utils import get_context_length, is_partial_stop
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from ipex_llm.vllm.cpu.engine import IPEXLLMAsyncLLMEngine as AsyncLLMEngine
 
 app = FastAPI()
 
@@ -56,7 +59,7 @@ def __init__(
         model_names: List[str],
         limit_worker_concurrency: int,
         no_register: bool,
-        llm_engine: AsyncLLMEngine,
+        llm_engine: 'AsyncLLMEngine',
         conv_template: str,
     ):
         super().__init__(
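
As a side note on the `ZE_AFFINITY_MASK` tip in the quickstart section above, a minimal verification sketch (assuming the oneAPI environment has been sourced so that `sycl-ls` is on the `PATH`) could look like this:

```bash
# List every SYCL device that is visible by default.
sycl-ls

# Expose only Level Zero device 1 and list again; only the
# selected GPU should now appear in the output.
ZE_AFFINITY_MASK=1 sycl-ls
```

Each value of `ZE_AFFINITY_MASK` is a Level Zero device index (comma-separated for multiple devices), so comparing the two listings shows exactly which card each worker will see.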