Skip to content

Commit

Permalink
[Core] Increase the instance type when scaling up llumlet
Browse files Browse the repository at this point in the history
  • Loading branch information
Xinyi-ECNU authored and KuilongCui committed Jan 21, 2025
1 parent f12d6c3 commit 1da6e0a
Show file tree
Hide file tree
Showing 43 changed files with 1,385 additions and 982 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: e2e_test
name: correctness_test

on:
pull_request:
Expand All @@ -17,7 +17,7 @@ jobs:
with:
all_but_latest: true

e2e_tests:
correctness_tests:
needs: cancel_previous_workflows
runs-on: [self-hosted]
timeout-minutes: 30
Expand All @@ -28,4 +28,4 @@ jobs:
run: |
[[ -n $(docker ps -q) ]] && docker kill $(docker ps -q) || echo "No running containers to kill."
- name: Build And Test
run: ./tools/run_test.sh e2e_test
run: ./tools/run_test.sh correctness_test
8 changes: 4 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ proto-clean:
test: check_pytest_installed
@pytest -v --ignore=third_party --ignore=tests/e2e_test --disable-warnings
@python examlpes/offline_inference.py
@pytest -v -x -s --tb=long ./tests/e2e_test/test_e2e.py
@pytest -v -x -s --tb=long ./tests/e2e_test/test_correctness.py
@pytest -v -x -s --tb=long ./tests/e2e_test/test_bench.py
@pytest -v -x -s --tb=long ./tests/e2e_test/test_migration.py

Expand All @@ -67,9 +67,9 @@ unit_test: check_pytest_installed
offline_test:
@python examlpes/offline_inference.py

.PHONY: e2e_test
e2e_test:
@pytest -v -x -s --tb=long ./tests/e2e_test/test_e2e.py
.PHONY: correctness_test
correctness_test:
@pytest -v -x -s --tb=long ./tests/e2e_test/test_correctness.py

.PHONY: bench_test
bench_test:
Expand Down
44 changes: 31 additions & 13 deletions docs/Arguments.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
[--log-request-timestamps]
[--config-file CONFIG_FILE]
[--initial-instances INITIAL_INSTANCES]
[--load-metric {remaining_steps,usage_ratio}]
[--dispatch-load-metric {remaining_steps,usage_ratio}]
[--migration-load-metric {remaining_steps,usage_ratio}]
[--scaling-load-metric {remaining_steps,usage_ratio}]
[--polling-interval POLLING_INTERVAL]
[--dispatch-policy {balanced,load,queue,rr}]
[--enable-migration]
Expand Down Expand Up @@ -49,12 +51,14 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
[--migration-backend-transfer-type {cuda_ipc,rdma,}]
[--grpc-migration-backend-server-address GRPC_MIGRATION_BACKEND_SERVER_ADDRESS]
[--kvtransfer-migration-backend-naming-url KVTRANSFER_MIGRATION_BACKEND_NAMING_URL]
[--max-stages MAX_STAGES]
[--last-stage-max-blocks LAST_STAGE_MAX_BLOCKS]
[--migration-max-stages MIGRATION_MAX_STAGES]
[--migration-last-stage-max-blocks MIGRATION_LAST_STAGE_MAX_BLOCKS]
[--enable-pd-disagg]
[--num-dispatch-instances NUM_DISPATCH_INSTANCES]
[--pd-ratio PD_RATIO]
[--enable-port-increment]
[--enable-port-offset-store]
[--instance-type INSTANCE_TYPE]
```

`--host`
Expand Down Expand Up @@ -111,8 +115,18 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
- Number of instances created at initialization.
- Default: 1

`--load-metric`
- Instance load metric.
`--dispatch-load-metric`
- Instance dispatch load metric.
- Possible choices: remaining_steps, usage_ratio
- Default: "remaining_steps"

`--migration-load-metric`
- Instance migration load metric.
- Possible choices: remaining_steps, usage_ratio
- Default: "remaining_steps"

`--scaling-load-metric`
- Instance scaling load metric.
- Possible choices: remaining_steps, usage_ratio
- Default: "remaining_steps"

Expand Down Expand Up @@ -224,27 +238,31 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
- URL of naming server for kvtransfer migration backend
- Default: "file:/tmp/llumnix/naming/"

`--max-stages`
- Drop migration if the number of stages > max_stages.
`--migration-max-stages`
- Drop migration if the number of stages > migration_max_stages.
- Default: 3

`--last-stage-max-blocks`
- If the number of remaining blocks < last_stage_max_blocks, do last stage migration.
`--migration-last-stage-max-blocks`
- If the number of remaining blocks < migration_last_stage_max_blocks, do last stage migration.
- Default: 16

`--enable-pd-disagg`
- Enable prefill decoding disaggregation.

`--num-dispatch-instances`
- Number of available instances for dispatch.
- Default: math.inf
`--pd-ratio`
- The p:d ratio used in global launch mode.
- Default: "1:1"

`--enable-port-increment`
- Enable port increment when deploying multiple servers.

`--enable-port-offset-store`
- Enable storing the port offset when deploying multiple servers.

`--instance-type`
- Instance types for the engine.
- Possible choices: prefill, decode, no_constraints

# Unsupported vLLM feature options

`--device`
Expand Down
6 changes: 4 additions & 2 deletions examlpes/offline_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import ray

from llumnix import launch_ray_cluster, connect_to_ray_cluster, init_manager
from llumnix import (ManagerArgs, EngineArgs, Manager,
from llumnix import (ManagerArgs, InstanceArgs, EngineArgs, Manager,
Llumlet, ServerInfo, QueueType, BackendType,
SamplingParams)
from llumnix.utils import random_uuid
Expand Down Expand Up @@ -35,6 +35,7 @@

# Set manager args and engine args.
manager_args = ManagerArgs()
instance_args = InstanceArgs()
engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True,
trust_remote_code=True, max_model_len=370)

Expand All @@ -45,7 +46,8 @@
# Create instances.
instance_ids: List[str] = None
instances: List[Llumlet] = None
instance_ids, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args))
instance_ids, instances = ray.get(manager.init_instances.remote(
QueueType("rayqueue"), BackendType.VLLM, instance_args, engine_args))

# The requests' outputs will be put to the request_output_queue no matter which instance it's running in.
server_id = random_uuid()
Expand Down
3 changes: 2 additions & 1 deletion llumnix/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from llumnix.entrypoints.setup import (launch_ray_cluster,
connect_to_ray_cluster,
init_manager)
from llumnix.arg_utils import ManagerArgs
from llumnix.arg_utils import ManagerArgs, InstanceArgs
from llumnix.manager import Manager
from llumnix.llumlet.llumlet import Llumlet
from llumnix.queue.queue_type import QueueType
Expand All @@ -29,6 +29,7 @@
"connect_to_ray_cluster",
"init_manager",
"ManagerArgs",
"InstanceArgs",
"Manager",
"Llumlet",
"QueueType",
Expand Down
Loading

0 comments on commit 1da6e0a

Please sign in to comment.