From 2d057305889b2aa571d8f3de405a029b679cd5dd Mon Sep 17 00:00:00 2001
From: hzjane
Date: Tue, 16 Jul 2024 10:13:18 +0800
Subject: [PATCH] update

---
 .../GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py       | 2 +-
 python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh     | 5 +++--
 python/llm/example/GPU/Pipeline-Parallel-FastAPI/serving.py | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py
index d6b0800e462..ec0ade3be02 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py
+++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py
@@ -34,7 +34,7 @@ local_rank = my_rank
 
 
 async def main():
-    parser = argparse.ArgumentParser(description='Predict Tokens using fastapi by leveraging DeepSpeed-AutoTP')
+    parser = argparse.ArgumentParser(description='Predict Tokens using fastapi by leveraging Pipeline-Parallel')
     parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
                         help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-chat-hf`, `meta-llama/Llama-2-13b-chat-hf` and `meta-llama/Llama-2-70b-chat-hf`) to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
index a4ce242f920..fc3c489ac8e 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
@@ -41,7 +41,8 @@ export MAX_NUM_SEQS="4"
 export MAX_PREFILLED_SEQS=0
 
 if [[ $NUM_GPUS -eq 1 ]]; then
-python serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT
+  export ZE_AFFINITY_MASK=0
+  python serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT
 else
-CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT --max-num-seqs $MAX_NUM_SEQS --max-prefilled-seqs $MAX_PREFILLED_SEQS
+  CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT --max-num-seqs $MAX_NUM_SEQS --max-prefilled-seqs $MAX_PREFILLED_SEQS
 fi
diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/serving.py b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/serving.py
index e2236277361..43f5888b401 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/serving.py
+++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/serving.py
@@ -26,7 +26,7 @@ logger = logging.get_logger(__name__)
 
 
 async def main():
-    parser = argparse.ArgumentParser(description='Predict Tokens using fastapi by leveraging DeepSpeed-AutoTP')
+    parser = argparse.ArgumentParser(description='Predict Tokens using fastapi by leveraging ipex-llm')
     parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
                         help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-chat-hf`, `meta-llama/Llama-2-13b-chat-hf` and `meta-llama/Llama-2-70b-chat-hf`) to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
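
Note: a minimal sketch of what the updated single-GPU branch in run.sh does, assuming
NUM_GPUS, MODEL_PATH and LOW_BIT are set earlier in the script (those assignments are
outside this hunk and are not shown in the patch):

    # ZE_AFFINITY_MASK is honoured by the Intel Level Zero driver; "0" exposes only
    # device 0 to the process, so the single-process serving.py stays on one GPU.
    export ZE_AFFINITY_MASK=0
    python serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT

    # With NUM_GPUS > 1, run.sh instead launches pipeline_serving.py via torchrun with
    # one rank per GPU, using CCL_ZE_IPC_EXCHANGE=sockets for the oneCCL exchange and
    # passing --max-num-seqs / --max-prefilled-seqs for pipeline-parallel batching.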