update worker name and add readme

intel-analytics · Jul 16, 2024 · 9cabbbc · 9cabbbc
1 parent 2d05730
commit 9cabbbc
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 6 deletions.
diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/README.md b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/README.md
@@ -50,7 +50,14 @@ pip install transformers==4.40.0
 pip install trl==0.8.1
 ```
 
-### 2. Run pipeline parallel serving on multiple GPUs
+### 2-1. Run ipex-llm serving on a single GPU
+
+```bash
+# Need to set NUM_GPUS=1 and MODEL_PATH in run.sh first
+bash run.sh
+```
+
+### 2-2. Run pipeline parallel serving on multiple GPUs
 
 ```bash
 # Need to set MODEL_PATH in run.sh first
@@ -99,7 +106,7 @@ Please change the test url accordingly.
 
 ```bash
 # Set -t (threads) and -c (connections) to the desired concurrency level to test full throughput.
-wrk -t1 -c1 -d5m -s ./wrk_script_1024.lua http://127.0.0.1:8000/generate/ --timeout 1m
+wrk -t1 -c1 -d5m -s ./wrk_script_1024.lua http://127.0.0.1:8000/generate --timeout 1m
 ```
 
 ## 5. Using the `benchmark.py` Script

diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py
@@ -15,7 +15,7 @@
 #
 
 import torch.distributed as dist
-from ipex_llm.transformers import init_pipeline_parallel, ModelRunner
+from ipex_llm.transformers import init_pipeline_parallel, PPModelWorker
 from ipex_llm.serving.api import FastApp
 from transformers.utils import logging
 from transformers import AutoTokenizer
@@ -57,7 +57,7 @@ async def main():
     for i in range(my_size):
         if my_rank == i:
             logger.info("start model initialization")
-            local_model = ModelRunner(model_path, my_rank, my_size, low_bit, max_num_seqs, max_prefilled_seqs)
+            local_model = PPModelWorker(model_path, my_rank, my_size, low_bit, max_num_seqs, max_prefilled_seqs)
             logger.info("model initialized")
         dist.barrier()
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, padding_side='left')

diff --git a/python/llm/src/ipex_llm/transformers/__init__.py b/python/llm/src/ipex_llm/transformers/__init__.py
@@ -22,4 +22,4 @@
         AutoModelForNextSentencePrediction, AutoModelForMultipleChoice, \
         AutoModelForTokenClassification
 from .modelling_bigdl import *
-from .pipeline_parallel import init_pipeline_parallel, ModelRunner
+from .pipeline_parallel import init_pipeline_parallel, PPModelWorker
diff --git a/python/llm/src/ipex_llm/transformers/pipeline_parallel.py b/python/llm/src/ipex_llm/transformers/pipeline_parallel.py
@@ -431,7 +431,7 @@ def make_attention_mask(prompt_lengths):
     return attention_mask
 
 
-class ModelRunner:
+class PPModelWorker:
     """Implementation for pipeline parallel multi-stage serving."""
     def __init__(self, checkpoint, rank, world_size, low_bit, max_num_seqs, max_prefilled_seqs,
                  torch_dtype=torch.float16):