Skip to content

Commit

Permalink
update bash
Browse files Browse the repository at this point in the history
  • Loading branch information
hzjane committed Jul 16, 2024
1 parent 0beb240 commit 9584742
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import torch.distributed as dist
from ipex_llm.transformers import init_pipeline_parallel, ModelRunner
from ipex_llm.serving.api import FastApp
from transformers.utils import logging
from transformers import AutoTokenizer
import uvicorn
Expand Down Expand Up @@ -63,7 +64,6 @@ async def main():
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token

from source import FastApp
myapp = FastApp(local_model, tokenizer)
if local_rank == 0:
config = uvicorn.Config(app=myapp.app, host="0.0.0.0", port=args.port)
Expand Down
4 changes: 4 additions & 0 deletions python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,8 @@ export LOW_BIT="fp8"
export MAX_NUM_SEQS="4"
export MAX_PREFILLED_SEQS=0

# Dispatch the server launch based on the number of GPUs requested.
if [[ $NUM_GPUS -eq 1 ]]; then
# Single GPU: run the plain FastAPI serving script directly — no
# pipeline parallelism, so the batching/prefill knobs are not passed.
python serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT
else
# Multiple GPUs: launch one process per GPU with torchrun on a single
# node, running the pipeline-parallel serving script instead.
# CCL_ZE_IPC_EXCHANGE=sockets selects socket-based IPC exchange for
# oneCCL on Intel GPUs (Level Zero backend).
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT --max-num-seqs $MAX_NUM_SEQS --max-prefilled-seqs $MAX_PREFILLED_SEQS
fi

0 comments on commit 9584742

Please sign in to comment.