[Core] Increase the instance type when scaling up llumlet

AlibabaPAI · Jan 21, 2025 · 1da6e0a · 1da6e0a
1 parent f12d6c3
commit 1da6e0a
Show file tree

Hide file tree

Showing 43 changed files with 1,385 additions and 982 deletions.
diff --git a/.github/workflows/e2e_test.yml → .github/workflows/correctness_test.yml b/.github/workflows/e2e_test.yml → .github/workflows/correctness_test.yml
@@ -1,4 +1,4 @@
-name: e2e_test
+name: correctness_test
 
 on:
   pull_request:
@@ -17,7 +17,7 @@ jobs:
       with:
         all_but_latest: true
 
-  e2e_tests:
+  correctness_tests:
     needs: cancel_previous_workflows
     runs-on: [self-hosted]
     timeout-minutes: 30
@@ -28,4 +28,4 @@ jobs:
       run: |
         [[ -n $(docker ps -q) ]] && docker kill $(docker ps -q) || echo "No running containers to kill."
     - name: Build And Test
-      run: ./tools/run_test.sh e2e_test
+      run: ./tools/run_test.sh correctness_test
diff --git a/Makefile b/Makefile
@@ -55,7 +55,7 @@ proto-clean:
 test: check_pytest_installed
 	@pytest -v --ignore=third_party --ignore=tests/e2e_test --disable-warnings
 	@python examlpes/offline_inference.py
-	@pytest -v -x -s --tb=long ./tests/e2e_test/test_e2e.py
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_correctness.py
 	@pytest -v -x -s --tb=long ./tests/e2e_test/test_bench.py
 	@pytest -v -x -s --tb=long ./tests/e2e_test/test_migration.py
 
@@ -67,9 +67,9 @@ unit_test: check_pytest_installed
 offline_test:
 	@python examlpes/offline_inference.py
 
-.PHONY: e2e_test
-e2e_test:
-	@pytest -v -x -s --tb=long ./tests/e2e_test/test_e2e.py
+.PHONY: correctness_test
+correctness_test:
+	@pytest -v -x -s --tb=long ./tests/e2e_test/test_correctness.py
 
 .PHONY: bench_test
 bench_test:

diff --git a/docs/Arguments.md b/docs/Arguments.md
@@ -20,7 +20,9 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
             [--log-request-timestamps]
             [--config-file CONFIG_FILE]
             [--initial-instances INITIAL_INSTANCES]
-            [--load-metric {remaining_steps,usage_ratio}]
+            [--dispatch-load-metric {remaining_steps,usage_ratio}]
+            [--migration-load-metric {remaining_steps,usage_ratio}]
+            [--scaling-load-metric {remaining_steps,usage_ratio}]
             [--polling-interval POLLING_INTERVAL]
             [--dispatch-policy {balanced,load,queue,rr}]
             [--enable-migration]
@@ -49,12 +51,14 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
             [--migration-backend-transfer-type {cuda_ipc,rdma,}]
             [--grpc-migration-backend-server-address GRPC_MIGRATION_BACKEND_SERVER_ADDRESS]
             [--kvtransfer-migration-backend-naming-url KVTRANSFER_MIGRATION_BACKEND_NAMING_URL]
-            [--max-stages MAX_STAGES]
-            [--last-stage-max-blocks LAST_STAGE_MAX_BLOCKS]
+            [--migration-max-stages MIGRATION_MAX_STAGES]
+            [--migration-last-stage-max-blocks MIGRATION_LAST_STAGE_MAX_BLOCKS]
             [--enable-pd-disagg]
-            [--num-dispatch-instances NUM_DISPATCH_INSTANCES]
+            [--pd-ratio PD_RATIO]
             [--enable-port-increment]
             [--enable-port-offset-store]
+            [--instance-type INSTANCE_TYPE]
+
 ```
 
 `--host`
@@ -111,8 +115,18 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 - Number of instances created at initialization.
 - Default: 1
 
-`--load-metric`
-- Instance load metric.
+`--dispatch-load-metric`
+- Instance dispatch load metric.
+- Possible choices: remaining_steps, usage_ratio
+- Default: "remaining_steps"
+
+`--migration-load-metric`
+- Instance migration load metric.
+- Possible choices: remaining_steps, usage_ratio
+- Default: "remaining_steps"
+
+`--scaling-load-metric`
+- Instance scaling load metric.
 - Possible choices: remaining_steps, usage_ratio
 - Default: "remaining_steps"
 
@@ -224,27 +238,31 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 - URL of naming server for kvtransfer migration backend
 - Default: "file:/tmp/llumnix/naming/"
 
-`--max-stages`
-- Drop migration if the number of stages > max_stages.
+`--migration-max-stages`
+- Drop migration if the number of stages > migration_max_stages.
 - Default: 3
 
-`--last-stage-max-blocks`
-- If the number of remaining blocks < last_stage_max_blocks, do last stage migration.
+`--migration-last-stage-max-blocks`
+- If the number of remaining blocks < migration_last_stage_max_blocks, do last stage migration.
 - Default: 16
 
 `--enable-pd-disagg`
 - Enable prefill decoding disaggregation.
 
-`--num-dispatch-instances`
-- Number of available instances for dispatch.
-- Default: math.inf
+`--pd-ratio`
+- The p:d ratio used in global launch model.
+- Default: "1:1"
 
 `--enable-port-increment`
 - Enable port increment when deploying multiple servers.
 
 `--enable-port-offset-store`
 - Enable storing the port offset when deploying multiple servers.
 
+`--instance-type`
+- Instance types for the engine.
+- Possible choices: prefill, decode, no_constraints
+
 # Unsupported vLLM feature options
 
 `--device`

diff --git a/examlpes/offline_inference.py b/examlpes/offline_inference.py
@@ -5,7 +5,7 @@
 import ray
 
 from llumnix import launch_ray_cluster, connect_to_ray_cluster, init_manager
-from llumnix import (ManagerArgs, EngineArgs, Manager,
+from llumnix import (ManagerArgs, InstanceArgs, EngineArgs, Manager,
                      Llumlet, ServerInfo, QueueType, BackendType,
                      SamplingParams)
 from llumnix.utils import random_uuid
@@ -35,6 +35,7 @@
 
 # Set manager args and engine args.
 manager_args = ManagerArgs()
+instance_args = InstanceArgs()
 engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True,
                          trust_remote_code=True, max_model_len=370)
 
@@ -45,7 +46,8 @@
 # Create instances.
 instance_ids: List[str] = None
 instances: List[Llumlet] = None
-instance_ids, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args))
+instance_ids, instances = ray.get(manager.init_instances.remote(
+    QueueType("rayqueue"), BackendType.VLLM, instance_args, engine_args))
 
 # The requests' outputs will be put into the request_output_queue no matter which instance they are running in.
 server_id = random_uuid()

diff --git a/llumnix/__init__.py b/llumnix/__init__.py
@@ -15,7 +15,7 @@
 from llumnix.entrypoints.setup import (launch_ray_cluster,
                                        connect_to_ray_cluster,
                                        init_manager)
-from llumnix.arg_utils import ManagerArgs
+from llumnix.arg_utils import ManagerArgs, InstanceArgs
 from llumnix.manager import Manager
 from llumnix.llumlet.llumlet import Llumlet
 from llumnix.queue.queue_type import QueueType
@@ -29,6 +29,7 @@
     "connect_to_ray_cluster",
     "init_manager",
     "ManagerArgs",
+    "InstanceArgs",
     "Manager",
     "Llumlet",
     "QueueType",