From 2d057305889b2aa571d8f3de405a029b679cd5dd Mon Sep 17 00:00:00 2001
From: hzjane
Date: Tue, 16 Jul 2024 10:13:18 +0800
Subject: [PATCH] update

---
 .../GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py       | 2 +-
 python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh     | 5 +++--
 python/llm/example/GPU/Pipeline-Parallel-FastAPI/serving.py | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py
index d6b0800e462..ec0ade3be02 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py
+++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py
@@ -34,7 +34,7 @@ local_rank = my_rank
 
 
 async def main():
-    parser = argparse.ArgumentParser(description='Predict Tokens using fastapi by leveraging DeepSpeed-AutoTP')
+    parser = argparse.ArgumentParser(description='Predict Tokens using fastapi by leveraging Pipeline-Parallel')
     parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
                         help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-chat-hf`, `meta-llama/Llama-2-13b-chat-hf` and `meta-llama/Llama-2-70b-chat-hf`) to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
index a4ce242f920..fc3c489ac8e 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
@@ -41,7 +41,8 @@ export MAX_NUM_SEQS="4"
 export MAX_PREFILLED_SEQS=0
 
 if [[ $NUM_GPUS -eq 1 ]]; then
-python serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT
+  export ZE_AFFINITY_MASK=0
+  python serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT
 else
-CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT --max-num-seqs $MAX_NUM_SEQS --max-prefilled-seqs $MAX_PREFILLED_SEQS
+  CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT --max-num-seqs $MAX_NUM_SEQS --max-prefilled-seqs $MAX_PREFILLED_SEQS
 fi
diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/serving.py b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/serving.py
index e2236277361..43f5888b401 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/serving.py
+++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/serving.py
@@ -26,7 +26,7 @@ logger = logging.get_logger(__name__)
 
 
 async def main():
-    parser = argparse.ArgumentParser(description='Predict Tokens using fastapi by leveraging DeepSpeed-AutoTP')
+    parser = argparse.ArgumentParser(description='Predict Tokens using fastapi by leveraging ipex-llm')
     parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
                         help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-chat-hf`, `meta-llama/Llama-2-13b-chat-hf` and `meta-llama/Llama-2-70b-chat-hf`) to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
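
Note: a minimal sketch of what the updated single-GPU branch in run.sh does, assuming
NUM_GPUS, MODEL_PATH and LOW_BIT are set earlier in the script (those assignments are
outside this hunk and are not shown in the patch):

    # ZE_AFFINITY_MASK is honoured by the Intel Level Zero driver; "0" exposes only
    # device 0 to the process, so the single-process serving.py stays on one GPU.
    export ZE_AFFINITY_MASK=0
    python serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT

    # With NUM_GPUS > 1, run.sh instead launches pipeline_serving.py via torchrun with
    # one rank per GPU, using CCL_ZE_IPC_EXCHANGE=sockets for the oneCCL exchange and
    # passing --max-num-seqs / --max-prefilled-seqs for pipeline-parallel batching.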