Skip to content

Commit

Permalink
update worker name and add readme
Browse files: browse the repository at this point in the history
  • Loading branch information
hzjane committed Jul 16, 2024
1 parent 2d05730 commit 9cabbbc
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 6 deletions.
11 changes: 9 additions & 2 deletions python/llm/example/GPU/Pipeline-Parallel-FastAPI/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,14 @@ pip install transformers==4.40.0
pip install trl==0.8.1
```

### 2. Run pipeline parallel serving on multiple GPUs
### 2-1. Run ipex-llm serving on one GPU card

```bash
# Need to set NUM_GPUS=1 and MODEL_PATH in run.sh first
bash run.sh
```

### 2-2. Run pipeline parallel serving on multiple GPUs

```bash
# Need to set MODEL_PATH in run.sh first
Expand Down Expand Up @@ -99,7 +106,7 @@ Please change the test url accordingly.

```bash
# Set -t (threads) and -c (connections) to the desired concurrency level to test full throughput.
wrk -t1 -c1 -d5m -s ./wrk_script_1024.lua http://127.0.0.1:8000/generate/ --timeout 1m
wrk -t1 -c1 -d5m -s ./wrk_script_1024.lua http://127.0.0.1:8000/generate --timeout 1m
```

## 5. Using the `benchmark.py` Script
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#

import torch.distributed as dist
from ipex_llm.transformers import init_pipeline_parallel, ModelRunner
from ipex_llm.transformers import init_pipeline_parallel, PPModelWorker
from ipex_llm.serving.api import FastApp
from transformers.utils import logging
from transformers import AutoTokenizer
Expand Down Expand Up @@ -57,7 +57,7 @@ async def main():
for i in range(my_size):
if my_rank == i:
logger.info("start model initialization")
local_model = ModelRunner(model_path, my_rank, my_size, low_bit, max_num_seqs, max_prefilled_seqs)
local_model = PPModelWorker(model_path, my_rank, my_size, low_bit, max_num_seqs, max_prefilled_seqs)
logger.info("model initialized")
dist.barrier()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, padding_side='left')
Expand Down
2 changes: 1 addition & 1 deletion python/llm/src/ipex_llm/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@
AutoModelForNextSentencePrediction, AutoModelForMultipleChoice, \
AutoModelForTokenClassification
from .modelling_bigdl import *
from .pipeline_parallel import init_pipeline_parallel, ModelRunner
from .pipeline_parallel import init_pipeline_parallel, PPModelWorker
2 changes: 1 addition & 1 deletion python/llm/src/ipex_llm/transformers/pipeline_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,7 @@ def make_attention_mask(prompt_lengths):
return attention_mask


class ModelRunner:
class PPModelWorker:
"""Implementation for pipeline parallel multi-stage serving."""
def __init__(self, checkpoint, rank, world_size, low_bit, max_num_seqs, max_prefilled_seqs,
torch_dtype=torch.float16):
Expand Down

0 comments on commit 9cabbbc

Please sign in to comment.