[Deployment] Support global launch in addition to local launch (#88)
s5u13b authored Jan 10, 2025
1 parent a98927c commit 3e319f0
Showing 57 changed files with 1,725 additions and 948 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unit_test.yml
@@ -20,7 +20,7 @@ jobs:
  unit_tests:
    needs: cancel_previous_workflows
    runs-on: [self-hosted]
-    timeout-minutes: 30
+    timeout-minutes: 45
    steps:
      - name: Checkout
        uses: actions/checkout@v4
8 changes: 4 additions & 4 deletions Makefile
@@ -21,8 +21,8 @@ install:

.PHONY: lint
lint: check_pylint_installed check_pytest_installed
@pylint --rcfile=.pylintrc -s n --jobs=128 ./llumnix

@pylint --rcfile=.pylintrc \
--disable=protected-access,super-init-not-called,unused-argument,redefined-outer-name,invalid-name \
-s n --jobs=128 ./tests
@@ -53,15 +53,15 @@ proto-clean:

.PHONY: test
test: check_pytest_installed
-@pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings
+@pytest -v --ignore=third_party --ignore=tests/e2e_test --disable-warnings
@python examlpes/offline_inference.py
@pytest -v -x -s --tb=long ./tests/e2e_test/test_e2e.py
@pytest -v -x -s --tb=long ./tests/e2e_test/test_bench.py
@pytest -v -x -s --tb=long ./tests/e2e_test/test_migration.py

.PHONY: unit_test
unit_test: check_pytest_installed
-@pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings
+@pytest -v --ignore=third_party --ignore=tests/e2e_test --disable-warnings

.PHONY: offline_test
offline_test:
2 changes: 1 addition & 1 deletion README.md
@@ -47,7 +47,7 @@ Llumnix is easy to use with:

## Getting Started

-If you are already utilizing vLLM for multi-instance LLM serving deployments, simply replace the vLLM serving deployment command `python -m vllm.entrypoints.api_server ...` for each instance with the command provided below:
+If you are already utilizing vLLM for multi-instance LLM serving deployments, simply replace the vLLM serving deployment command `python -m entrypoints.vllm.api_server ...` for each instance with the command provided below:
```
python -m llumnix.entrypoints.vllm.api_server \
--host $HOST \
    ...
```
135 changes: 94 additions & 41 deletions docs/Arguments.md
@@ -6,17 +6,28 @@ Note: since Llumnix is still in alpha stage, the interface and arguments are *subject to change*.

```
usage: -m llumnix.entrypoints.vllm.api_server [-h]
+           [--host HOST]
+           [--port PORT]
+           [--ssl-keyfile SSL_KEYFILE]
+           [--ssl-certfile SSL_CERTFILE]
+           [--log-level {debug,info,warning,error}]
+           [--launch-ray-cluster]
+           [--ray-cluster-port RAY_CLUSTER_PORT]
+           [--request-output-queue-type {rayqueue,zmq}]
+           [--request-output-queue-port REQUEST_OUTPUT_QUEUE_PORT]
+           [--disable-log-requests-server]
+           [--log-request-timestamps]
            [--config-file CONFIG_FILE]
            [--initial-instances INITIAL_INSTANCES]
            [--load-metric {remaining_steps,usage_ratio}]
+           [--polling-interval POLLING_INTERVAL]
            [--dispatch-policy {balanced,load,queue,rr}]
            [--enable-migration]
+           [--enable-defrag]
            [--pair-migration-frequency PAIR_MIGRATION_FREQUENCY]
            [--pair-migration-policy {balanced,defrag_constrained,defrag_relaxed}]
            [--migrate-out-threshold MIGRATE_OUT_THRESHOLD]
            [--request-migration-policy {LCR,SR,LR,FCW,FCWSR}]
-           [--enable-defrag ENABLE_DEFRAG]
            [--enable-scaling]
            [--min-instances MIN_INSTANCES]
            [--max-instances MAX_INSTANCES]
@@ -27,26 +38,69 @@
            [--disable-log-requests-manager]
            [--log-instance-info]
            [--log-filename LOG_FILENAME]
+           [--simulator-mode]
            [--profiling-result-file-path PROFILING_RESULT_FILE_PATH]
-           [--gpu-type GPU_TYPE]
-           [--polling-interval POLLING_INTERVAL]
            [--migration-backend {gloo,nccl,rayrpc,grpc,kvtransfer}]
            [--migration-buffer-blocks MIGRATION_BUFFER_BLOCKS]
-           [--migration-backend-transfer-type {cuda_ipc,rdma,}]
-           [--migration-backend-kvtransfer-naming-url MIGRATION_BACKEND_KVTRANSFER_NAMING_URL]
-           [--migration-backend-server-address MIGRATION_BACKEND_SERVER_ADDRESS]
-           [--migration-backend-init-timeout MIGRATION_BACKEND_INIT_TIMEOUT]
            [--migration-num-layers MIGRATION_NUM_LAYERS]
-           [--last-stage-max-blocks LAST_STAGE_MAX_BLOCKS]
+           [--migration-backend-init-timeout MIGRATION_BACKEND_INIT_TIMEOUT]
+           [--migration-backend-transfer-type {cuda_ipc,rdma,}]
+           [--grpc-migration-backend-server-address GRPC_MIGRATION_BACKEND_SERVER_ADDRESS]
+           [--kvtransfer-migration-backend-naming-url KVTRANSFER_MIGRATION_BACKEND_NAMING_URL]
            [--max-stages MAX_STAGES]
+           [--last-stage-max-blocks LAST_STAGE_MAX_BLOCKS]
            [--enable-pd-disagg]
            [--num-dispatch-instances NUM_DISPATCH_INSTANCES]
-           [--log-request-timestamps]
+           [--enable-port-increment]
```
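As an editor's illustration (not part of this commit's diff), a hypothetical invocation combining several of the arguments documented below; the flag values are illustrative, and the vLLM engine arguments (`--model`, `--worker-use-ray`, `--max-model-len`) are passed through as in the Quickstart:

```
python -m llumnix.entrypoints.vllm.api_server \
    --host 0.0.0.0 \
    --port 8000 \
    --request-output-queue-type zmq \
    --enable-migration \
    --migration-backend gloo \
    --model facebook/opt-125m \
    --worker-use-ray \
    --max-model-len 4096
```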

+`--host`
+- Hostname of the server.
+- Default: "localhost"

+`--port`
+- Port number of the server.
+- Default: 8000

+`--ssl-keyfile`
+- Path to SSL key file.
+- Default: None

+`--ssl-certfile`
+- Path to SSL certificate file.
+- Default: None

+`--log-level`
+- Log level for the server.
+- Possible choices: debug, info, warning, error
+- Default: "info"

+`--launch-ray-cluster`
+- Whether to launch a Ray cluster.

+`--ray-cluster-port`
+- Ray cluster port.
+- Default: 6379

+`--request-output-queue-type`
+- Queue type for the request output queue.
+- Possible choices: rayqueue, zmq
+- Default: "rayqueue"

+`--request-output-queue-port`
+- Port number for the zmq request output queue.
+- Default: 1234

+`--disable-log-requests-server`
+- Disable logging requests in the server.

+`--log-request-timestamps`
+- Whether to log request timestamps.

`--config-file`
-- Path to config file.
+- Path to config file of arguments.
- Default: None

`--initial-instances`
- Number of instances created at initialization.
@@ -69,6 +123,9 @@
`--enable-migration`
- Enable migrate requests between instances.

+`--enable-defrag`
+- Enable defragmentation through migration based on virtual usage.

`--pair-migration-frequency`
- Pair migration frequency.
- Default: 1
@@ -87,10 +144,6 @@
- Possible choices: LCR, SR, LR, FCW, FCWSR
- Default: "SR"

-`--enable-defrag`
-- Enable defragmentation through migration based on virtual usage.
-- Default: False

`--enable-scaling`
- Enable auto scaling.

@@ -129,60 +182,60 @@
- Log filename.
- Default: "server.log"

-`--profiling-result-file-path`
-- Profiling result file path.
-- Default: ""

+`--simulator-mode`
+- Enable simulator mode.

-`--gpu-type`
-- GPU type specified when using simulator.
-- Default: "a10"

+`--profiling-result-file-path`
+- Profiling result file path when using simulator.
+- Default: None

`--migration-backend`
- Communication backend of migration.
- Possible choices: gloo, rayrpc, nccl, grpc, kvtransfer. [gloo, rayrpc, nccl] are available for vllm and [grpc, kvtransfer] are available for bladellm.
- Default: "gloo"

-`--migration-backend-transfer-type`
-- Transfer type for migration backend kvTransfer.
-- Possible choices: cuda_ipc, rdma
-- Default: "rdma"

-`--migration-backend-server-address`
-- Address of grpc server for migration backend
-- Default: "127.0.0.1:50051"

-`--migration-backend-kvtransfer-naming-url`
-- URL of naming server for kvtransfer migration backend
-- Default: "file:/tmp/llumnix/naming/"

`--migration-buffer-blocks`
- Number of buffer blocks in migration.
- Default: 512

+`--migration-num-layers`
+- number of kv-cache layers to transfer in each round during migration
+- Default: 1

`--migration-backend-init-timeout`
- Timeout(s) for initializing migration backend.
- Default: 10.0

-`--migration-num-layers`
-- number of kv-cache layers to transfer in each round during migration
-- Default: 1
+`--migration-backend-transfer-type`
+- Transfer type for migration backend grpc and kvTransfer.
+- Possible choices: cuda_ipc, rdma
+- Default: "rdma"

-`--last-stage-max-blocks`
-- If the number of remaining blocks < last_stage_max_blocks, do last stage migration.
-- Default: 4
+`--grpc-migration-backend-server-address`
+- Address of grpc server for migration backend.
+- Default: "127.0.0.1:50051"

+`--kvtransfer-migration-backend-naming-url`
+- URL of naming server for kvtransfer migration backend.
+- Default: "file:/tmp/llumnix/naming/"

`--max-stages`
- Drop migration if the number of stages > max_stages.
- Default: 3

-`--log-request-timestamps`
-- Enable logging request timestamps.
+`--last-stage-max-blocks`
+- If the number of remaining blocks < last_stage_max_blocks, do last stage migration.
+- Default: 16

`--enable-pd-disagg`
- Enable prefill decoding disaggregation.

`--num-dispatch-instances`
- Number of available instances for dispatch.
- Default: math.inf

+`--enable-port-increment`
+- Enable port increment when deploying multiple servers.

# Unsupported vLLM feature options

21 changes: 18 additions & 3 deletions docs/Quickstart.md
@@ -34,7 +34,7 @@ After installation, you can follow this guide to use Llumnix for multi-instance LLM serving.

## Migrating from Existing Deployments

-Inference engines like vLLM provide an API server user interface, e.g., `python -m vllm.entrypoints.api_server`. To deploy multiple instances, people start multiple such API servers, each corresponding to one instance, on multiple nodes / containers / k8s pods.
+Inference engines like vLLM provide an API server user interface, e.g., `python -m entrypoints.vllm.api_server`. To deploy multiple instances, people start multiple such API servers, each corresponding to one instance, on multiple nodes / containers / k8s pods.

Llumnix provides a similar user interface to enable seamless integration with such existing multi-instance deployments.
You only need two simple steps to migrate from a deployed vLLM service to Llumnix:
@@ -62,11 +62,25 @@ export HEAD_NODE=1

During the execution of serving deployment, Llumnix will:
- Initiate the Ray cluster for distributed execution.
-- Start Llumnix actor components, including LLMEngineManager, Llumlet, among others.
+- Start Llumnix actor components, including Manager, Llumlet, among others.
- Launch the vLLM engine instances.

Following these steps, Llumnix acts as the request scheduling layer situated behind the multiple frontend API servers and above the multiple backend vLLM engine instances. This positioning allows Llumnix to significantly enhance serving performance through its dynamic, fine-grained, and KV-cache-aware request scheduling and rescheduling across instances.

+## Centralized Deployment

+Llumnix also supports deploying multiple servers and instances at once by running `python -m llumnix.entrypoints.vllm.serve`, which is referred to as centralized deployment.

+```
+python -m llumnix.entrypoints.vllm.serve \
+--config-file $CONFIG_PATH \
+# vLLM arguments ...
+# Llumnix arguments ...
+...
+```

+Centralized deployment assumes that the user has already launched a Ray cluster. Upon running the serve module, Llumnix will automatically connect to the existing Ray cluster, start the Llumnix components, and deploy multiple servers and instances to the Ray cluster until no more GPUs or CPUs are available.
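As an editorial aside, such a pre-existing cluster can be brought up with the standard Ray CLI before running the serve module (a minimal sketch; the port and address values are illustrative):

```
# On the head node:
ray start --head --port=6379
# On each additional node, joining the head:
ray start --address=<head-node-ip>:6379
```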

## Ray Cluster Notice
When you include the --launch-ray-cluster option in Llumnix's serving deployment command, Llumnix automatically builds a Ray cluster during the execution of serving deployment. This action will overwrite any existing Ray cluster. If this behavior is not desired, simply omit the --launch-ray-cluster option, and Llumnix will initiate its actor components within the current Ray cluster.

Expand All @@ -84,7 +98,8 @@ HEAD_NODE=1 python -m llumnix.entrypoints.vllm.api_server \
--model $MODEL_PATH \
--engine-use-ray \
--worker-use-ray \
---max-model-len 4096
+--max-model-len 4096 \
+--migration-backend rayrpc \
```
`CONFIG_PATH` is the path to the configuration file for Llumnix, and we give an example configuration file [here](../configs/base.yml). `MODEL_PATH` defines the location of your model. `INITIAL_INSTANCES` determines the number of instances to be launched on the current node,
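For illustration only, such a configuration file might take a shape like the following; the key names below are this editor's assumptions rather than contents of this commit, so treat the linked `base.yml` as authoritative:

```
SERVER:
  HOST: "localhost"
  PORT: 8000
MANAGER:
  DISPATCH_POLICY: "load"
  ENABLE_MIGRATION: True
  MIGRATION_BACKEND: "rayrpc"
```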

22 changes: 9 additions & 13 deletions examlpes/offline_inference.py
@@ -5,8 +5,9 @@
import ray

from llumnix import launch_ray_cluster, connect_to_ray_cluster, init_manager
-from llumnix import (SamplingParams, ServerInfo, EngineManagerArgs, LLMEngineManager, Llumlet,
-                     EngineArgs, QueueType, BackendType)
+from llumnix import (ManagerArgs, EngineArgs, Manager,
+                     Llumlet, ServerInfo, QueueType, BackendType,
+                     SamplingParams)
from llumnix.utils import random_uuid
from llumnix.queue.ray_queue_server import RayQueueServer

@@ -33,23 +34,18 @@
connect_to_ray_cluster(port=ray_cluster_port)

# Set manager args and engine args.
-manager_args = EngineManagerArgs()
+manager_args = ManagerArgs()
engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True,
trust_remote_code=True, max_model_len=370)

-# Create a manager. If the manager is created first, and then the llumlets are created, manager.scale_up
-# need to be called to add the newly created llumlets to the management of the manager.
-manager: LLMEngineManager = init_manager(manager_args)
+# Create a manager. Instances created through the manager are registered with it automatically.
+manager: Manager = init_manager(manager_args)
ray.get(manager.is_ready.remote())

-# Create llumlets.
+# Create instances.
instance_ids: List[str] = None
-llumlets: List[Llumlet] = None
-instance_ids, llumlets = ray.get(manager.init_llumlets.remote(
-    engine_args, QueueType("rayqueue"), BackendType.VLLM, 1,
-))
-
-ray.get(manager.scale_up.remote(instance_ids, llumlets))
+instances: List[Llumlet] = None
+instance_ids, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args))

# The requests' outputs will be put to the request_output_queue no matter which instance they run on.
server_id = random_uuid()
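The example is truncated here. As a hedged sketch of how it plausibly continues, using only names imported above — the `ServerInfo` signature and the `manager.generate` call are this editor's assumptions, not confirmed by this diff:

```
# Hypothetical continuation -- signatures are assumed, not taken from this commit.
request_output_queue = RayQueueServer()
server_info = ServerInfo(server_id, QueueType("rayqueue"), request_output_queue,
                         None, None)  # queue ip/port assumed unused for a Ray queue

# Submit one request through the manager; it dispatches to some instance, and the
# output arrives on request_output_queue regardless of which instance ran it.
request_id = random_uuid()
manager.generate.remote(request_id, server_info, "Hello, my name is",
                        SamplingParams(temperature=0.0))
```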
8 changes: 4 additions & 4 deletions llumnix/__init__.py
@@ -15,8 +15,8 @@
from llumnix.entrypoints.setup import (launch_ray_cluster,
                                       connect_to_ray_cluster,
                                       init_manager)
-from llumnix.arg_utils import EngineManagerArgs
-from llumnix.llm_engine_manager import LLMEngineManager
+from llumnix.arg_utils import ManagerArgs
+from llumnix.manager import Manager
from llumnix.llumlet.llumlet import Llumlet
from llumnix.queue.queue_type import QueueType
from llumnix.backends.backend_interface import BackendType
@@ -28,8 +28,8 @@
"launch_ray_cluster",
"connect_to_ray_cluster",
"init_manager",
"EngineManagerArgs",
"LLMEngineManager",
"ManagerArgs",
"Manager",
"Llumlet",
"QueueType",
"BackendType",
