Add global deployment mode in bench test
s5u13b committed Jan 7, 2025
1 parent 43b830e commit fd2bbf7
Showing 4 changed files with 67 additions and 17 deletions.
2 changes: 1 addition & 1 deletion llumnix/arg_utils.py
@@ -150,7 +150,7 @@ class ManagerArgs:

     enable_pd_disagg: bool = None

-    enbale_port_increment: bool = None
+    enable_port_increment: bool = None

     def __post_init__(self):
         # Check if all fields default to None
45 changes: 30 additions & 15 deletions tests/e2e_test/test_bench.py
@@ -23,7 +23,8 @@
 # pylint: disable=unused-import
 from tests.conftest import ray_env
 from .utils import (generate_launch_command, generate_bench_command, to_markdown_table,
-                    wait_for_llumnix_service_ready, shutdown_llumnix_service)
+                    wait_for_llumnix_service_ready, shutdown_llumnix_service,
+                    generate_serve_command)

 BENCH_TEST_TIMEOUT_MINS = 30

@@ -63,21 +64,34 @@ def get_markdown_data(key: str, head_name: str):
 @pytest.mark.asyncio
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="at least 1 gpus required for simple benchmark")
 @pytest.mark.parametrize("model", ['/mnt/model/Qwen-7B'])
-async def test_simple_benchmark(ray_env, shutdown_llumnix_service, model):
-    device_count = torch.cuda.device_count()
+@pytest.mark.parametrize("deployment_mode", ['global', 'local'])
+async def test_simple_benchmark(ray_env, shutdown_llumnix_service, model, deployment_mode):
     ip = "127.0.0.1"
     base_port = 37037
     ip_ports = []
-    for i in range(device_count):
-        port = base_port+i
-        ip_port = f"{ip}:{port}"
-        ip_ports.append(ip_port)
-        launch_command = generate_launch_command(result_filename=str(base_port+i)+".out",
-                                                 launch_ray_cluster=False,
-                                                 ip=ip,
-                                                 port=port,
-                                                 model=model)
-        subprocess.run(launch_command, shell=True, check=True)
+    if deployment_mode == 'local':
+        device_count = torch.cuda.device_count()
+        for i in range(device_count):
+            port = base_port+i
+            ip_port = f"{ip}:{port}"
+            ip_ports.append(ip_port)
+            launch_command = generate_launch_command(result_filename=str(base_port+i)+".out",
+                                                     launch_ray_cluster=False,
+                                                     ip=ip,
+                                                     port=port,
+                                                     model=model)
+            subprocess.run(launch_command, shell=True, check=True)
+    else: # global
+        device_count = torch.cuda.device_count()
+        for i in range(device_count):
+            port = base_port+i
+            ip_port = f"{ip}:{port}"
+            ip_ports.append(ip_port)
+        serve_command = generate_serve_command(result_filename=str(base_port)+".out",
+                                               ip=ip,
+                                               port=base_port,
+                                               model=model)
+        subprocess.run(serve_command, shell=True, check=True)

     wait_for_llumnix_service_ready(ip_ports)

@@ -113,7 +127,8 @@ def run_bench_command(command):
            process.kill()
            assert False, "bench_test timed out after {} minutes.".format(BENCH_TEST_TIMEOUT_MINS)

-    with open("performance.txt", "w", encoding="utf-8") as f:
-        f.write(parse_log_file())
+    if deployment_mode == 'local':
+        with open("performance.txt", "w", encoding="utf-8") as f:
+            f.write(parse_log_file())

     await asyncio.sleep(3)
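Note on the 'global' branch above: only a single serve command is launched, yet the readiness check still polls one ip:port per visible GPU. A minimal sketch of the list it waits on, assuming the consecutive-port layout implied by the enable_port_increment flag fixed in this commit (device_count is hard-coded here; the test derives it from torch.cuda.device_count()):

ip, base_port = "127.0.0.1", 37037
device_count = 2  # illustrative value only
# One entry per expected instance, on consecutive ports starting at base_port.
ip_ports = [f"{ip}:{base_port + i}" for i in range(device_count)]
assert ip_ports == ["127.0.0.1:37037", "127.0.0.1:37038"]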
35 changes: 35 additions & 0 deletions tests/e2e_test/utils.py
@@ -56,6 +56,40 @@ def generate_launch_command(result_filename: str = "",
     )
     return command

+def generate_serve_command(result_filename: str = "",
+                           ip: str = "127.0.0.1",
+                           port: int = 37000,
+                           dispatch_policy: str = "load",
+                           migration_backend = "gloo",
+                           model = "facebook/opt-125m",
+                           max_model_len: int = 4096,
+                           log_instance_info: bool = False,
+                           request_migration_policy: str = 'SR',
+                           max_num_batched_tokens: int = 16000):
+    command = (
+        f"RAY_DEDUP_LOGS=0 "
+        f"nohup python -u -m llumnix.entrypoints.vllm.serve "
+        f"--host {ip} "
+        f"--port {port} "
+        f"{'--log-filename manager ' if log_instance_info else ''}"
+        f"{'--log-instance-info ' if log_instance_info else ''}"
+        f"--enable-migration "
+        f"--model {model} "
+        f"--engine-use-ray "
+        f"--worker-use-ray "
+        f"--max-model-len {max_model_len} "
+        f"--dispatch-policy {dispatch_policy} "
+        f"--trust-remote-code "
+        f"--request-migration-policy {request_migration_policy} "
+        f"--migration-backend {migration_backend} "
+        f"--migration-buffer-blocks 32 "
+        f"--tensor-parallel-size 1 "
+        f"--request-output-queue-port {1234+port} "
+        f"--max-num-batched-tokens {max_num_batched_tokens} "
+        f"{'> instance_'+result_filename if len(result_filename) > 0 else ''} 2>&1 &"
+    )
+    return command
+
 def wait_for_llumnix_service_ready(ip_ports, timeout=120):
     start_time = time.time()
     while True:
@@ -112,6 +146,7 @@ def generate_bench_command(ip_ports: str,
 def shutdown_llumnix_service_func():
     subprocess.run('pkill -f llumnix.entrypoints.vllm.api_server', shell=True, check=False)
     subprocess.run('pkill -f benchmark_serving.py', shell=True, check=False)
+    subprocess.run('pkill -f llumnix.entrypoints.vllm.serve', shell=True, check=False)

 @pytest.fixture
 def shutdown_llumnix_service():
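For reference, a sketch of how the new generate_serve_command helper is driven by the global mode in test_bench.py above; the absolute import path and the model value (the helper's own default) are assumptions for illustration, not taken verbatim from the diff:

import subprocess

from tests.e2e_test.utils import generate_serve_command  # assumed import path

# One serve process covers the whole node; generate_serve_command embeds
# nohup and a trailing '&', so subprocess.run returns immediately while
# the manager keeps running in the background.
serve_command = generate_serve_command(result_filename="37037.out",
                                       ip="127.0.0.1",
                                       port=37037,
                                       model="facebook/opt-125m")
subprocess.run(serve_command, shell=True, check=True)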
2 changes: 1 addition & 1 deletion tests/unit_test/global_scheduler/test_manager.py
@@ -119,7 +119,7 @@ def init_manager():
     return manager

 def init_manager_with_deployment_mode(deployment_mode, request_output_queue_type="rayqueue"):
-    manager_args = ManagerArgs(migration_backend="rayrpc", enbale_port_increment=True)
+    manager_args = ManagerArgs(migration_backend="rayrpc", enable_port_increment=True)
     entrypoints_args = EntrypointsArgs(host="127.0.0.1", port=8000, request_output_queue_type=request_output_queue_type)
     engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True)
     deployment_args = DeploymentArgs(deployment_mode=deployment_mode, backend_type=BackendType.VLLM)
