Commit

update
hzjane committed Jul 16, 2024
1 parent 9584742 commit 2d05730
Showing 3 changed files with 5 additions and 4 deletions.
@@ -34,7 +34,7 @@
local_rank = my_rank

async def main():
-    parser = argparse.ArgumentParser(description='Predict Tokens using fastapi by leveraging DeepSpeed-AutoTP')
+    parser = argparse.ArgumentParser(description='Predict Tokens using fastapi by leveraging Pipeline-Parallel')
     parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
                         help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-chat-hf`, `meta-llama/Llama-2-13b-chat-hf` and `meta-llama/Llama-2-70b-chat-hf`) to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
5 changes: 3 additions & 2 deletions python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
@@ -41,7 +41,8 @@ export MAX_NUM_SEQS="4"
export MAX_PREFILLED_SEQS=0

if [[ $NUM_GPUS -eq 1 ]]; then
-  python serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT
+  export ZE_AFFINITY_MASK=0
+  python serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT
else
-  CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT --max-num-seqs $MAX_NUM_SEQS --max-prefilled-seqs $MAX_PREFILLED_SEQS
+  CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT --max-num-seqs $MAX_NUM_SEQS --max-prefilled-seqs $MAX_PREFILLED_SEQS
fi
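
The single-GPU branch above now sets ZE_AFFINITY_MASK=0 so the Level Zero runtime exposes only the first GPU to serving.py. A minimal invocation sketch, assuming MODEL_PATH, LOW_BIT and NUM_GPUS are exported earlier in run.sh as the hunk suggests; the concrete values below are illustrative placeholders, not taken from this commit:

# Hypothetical single-GPU run; variable values are assumptions for illustration only.
export MODEL_PATH="meta-llama/Llama-2-7b-chat-hf"   # HF repo id or local checkpoint folder
export LOW_BIT="sym_int4"                           # assumed low-bit format passed to --low-bit
export NUM_GPUS=1                                   # takes the single-GPU branch above
export ZE_AFFINITY_MASK=0                           # expose only Level Zero device 0 to the process
python serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT
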
@@ -26,7 +26,7 @@
logger = logging.get_logger(__name__)

async def main():
-    parser = argparse.ArgumentParser(description='Predict Tokens using fastapi by leveraging DeepSpeed-AutoTP')
+    parser = argparse.ArgumentParser(description='Predict Tokens using fastapi by leveraging ipex-llm')
     parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
                         help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-chat-hf`, `meta-llama/Llama-2-13b-chat-hf` and `meta-llama/Llama-2-70b-chat-hf`) to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
