From 8db0fa5c29ae6895d488ad7b9f5b65be05f739cc Mon Sep 17 00:00:00 2001
From: Thomas Parnell
Date: Fri, 22 Mar 2024 20:28:14 +0100
Subject: [PATCH] Dynamic scheduler delay to improve ITL performance (#3279)

Co-authored-by: Jan van Lunteren
---
 tests/core/test_scheduler.py | 34 ++++++++++++++++++++++++++++++++++
 vllm/config.py               |  4 ++++
 vllm/core/scheduler.py       | 26 +++++++++++++++++++++++++-
 vllm/engine/arg_utils.py     | 10 +++++++++-
 4 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index 397101fa8..4a690e24e 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -1,5 +1,6 @@
 from typing import List
 import pytest  # noqa
+import time

 from vllm.config import CacheConfig, SchedulerConfig
 from vllm.core.scheduler import Scheduler
@@ -168,3 +169,36 @@ def test_scheduler_max_seqs():
     # and one is prompting.
     _, out = scheduler.schedule()
     assert set(out.scheduled_seq_groups) == set([all_seq_groups[1]])
+
+
+def test_scheduler_delay_factor():
+
+    block_size = 4
+    scheduler_config = SchedulerConfig(100, 64, 16, delay_factor=0.5)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+
+    # schedule first prompt
+    _, seq_group = create_dummy_prompt("0", prompt_length=block_size)
+    scheduler.add_seq_group(seq_group)
+    seq_group_meta, out = scheduler.schedule()
+    assert out.prompt_run
+    assert seq_group_meta[0].request_id == '0'
+
+    # wait for a second before scheduling next prompt
+    time.sleep(1)
+    _, seq_group = create_dummy_prompt("1", prompt_length=block_size)
+    scheduler.add_seq_group(seq_group)
+
+    # second prompt should *not* be scheduled
+    seq_group_meta, out = scheduler.schedule()
+    assert not out.prompt_run
+    assert seq_group_meta[0].request_id == '0'
+
+    # wait for more than 0.5 second and try again
+    time.sleep(0.6)
+    seq_group_meta, out = scheduler.schedule()
+    assert out.prompt_run
+    assert seq_group_meta[0].request_id == '1'
diff --git a/vllm/config.py b/vllm/config.py
index 6dfb51586..2003563e4 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -517,6 +517,8 @@ class SchedulerConfig:
             iteration.
         max_model_len: Maximum length of a sequence (including prompt
             and generated text).
+        delay_factor: Apply a delay (of delay factor multiplied by previous
+            prompt latency) before scheduling next prompt.
     """

     def __init__(
@@ -524,6 +526,7 @@ def __init__(
         max_num_batched_tokens: Optional[int],
         max_num_seqs: int,
         max_model_len: int,
+        delay_factor: float = 0.0,
     ) -> None:
         if max_num_batched_tokens is not None:
             self.max_num_batched_tokens = max_num_batched_tokens
@@ -533,6 +536,7 @@ def __init__(
             self.max_num_batched_tokens = max(max_model_len, 2048)
         self.max_num_seqs = max_num_seqs
         self.max_model_len = max_model_len
+        self.delay_factor = delay_factor
         self._verify_args()

     def _verify_args(self) -> None:
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index be55e8520..4bd0ef360 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -103,6 +103,13 @@ def __init__(
         # Sequence groups in the SWAPPED state.
         self.swapped: Deque[SequenceGroup] = deque()

+        # Time at previous scheduling step
+        self.prev_time = 0.0
+        # Did we schedule a prompt at previous step?
+        self.prev_prompt = False
+        # Latency of the last prompt step
+        self.last_prompt_latency = 0.0
+
     @property
     def lora_enabled(self) -> bool:
         return bool(self.lora_config)
@@ -179,7 +186,7 @@ def _schedule(self) -> SchedulerOutputs:
             # are added to the back.
             leftover_waiting_sequences = deque()
             num_batched_tokens = 0
-            while self.waiting:
+            while self._passed_delay(now) and self.waiting:
                 seq_group = self.waiting[0]
                 waiting_seqs = seq_group.get_seqs(
                     status=SequenceStatus.WAITING)
@@ -246,6 +253,7 @@ def _schedule(self) -> SchedulerOutputs:
             self.waiting.extendleft(leftover_waiting_sequences)

             if scheduled or ignored_seq_groups:
+                self.prev_prompt = True
                 scheduler_outputs = SchedulerOutputs(
                     scheduled_seq_groups=scheduled,
                     prompt_run=True,
@@ -491,3 +499,19 @@ def _swap_out(

     def mark_blocks_as_computed(self, seq_group: SequenceGroup):
         self.block_manager.mark_blocks_as_computed(seq_group)
+
+    def _passed_delay(self, now: float) -> bool:
+        if self.prev_prompt:
+            self.last_prompt_latency = now - self.prev_time
+        self.prev_time, self.prev_prompt = now, False
+        # Delay scheduling prompts to let waiting queue fill up
+        if self.scheduler_config.delay_factor > 0 and self.waiting:
+            earliest_arrival_time = min(
+                [e.metrics.arrival_time for e in self.waiting])
+            passed_delay = (
+                (now - earliest_arrival_time) >
+                (self.scheduler_config.delay_factor * self.last_prompt_latency)
+                or not self.running)
+        else:
+            passed_delay = True
+        return passed_delay
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 94c80f428..2070686ea 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -51,6 +51,7 @@ class EngineArgs:
     max_cpu_loras: Optional[int] = None
     device: str = 'auto'
     ray_workers_use_nsight: bool = False
+    scheduler_delay_factor: float = 0.0

     def __post_init__(self):
         if self.tokenizer is None:
@@ -305,6 +306,12 @@ def add_cli_args(
                             default=EngineArgs.device,
                             choices=["auto", "cuda", "neuron"],
                             help='Device type for vLLM execution.')
+        parser.add_argument(
+            '--scheduler-delay-factor',
+            type=float,
+            default=EngineArgs.scheduler_delay_factor,
+            help='Apply a delay (of delay factor multiplied by previous '
+            'prompt latency) before scheduling next prompt.')
         return parser

     @classmethod
@@ -342,7 +349,8 @@ def create_engine_configs(
         ), self.ray_workers_use_nsight)
         scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                            self.max_num_seqs,
-                                           model_config.max_model_len)
+                                           model_config.max_model_len,
+                                           self.scheduler_delay_factor)
         lora_config = LoRAConfig(
             max_lora_rank=self.max_lora_rank,
             max_loras=self.max_loras,
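
Note on the behavior added above: with a non-zero delay factor, Scheduler._passed_delay only admits a new prompt batch once the oldest waiting request has been queued for longer than delay_factor times the latency of the previous prompt step (or when nothing is running), which lets the waiting queue fill up so more prompts are batched together. The following is a standalone Python sketch of that condition, not vLLM code; all values are made up for illustration.

    # Standalone sketch of the condition checked in Scheduler._passed_delay.
    # The numbers below are illustrative, not vLLM defaults.
    delay_factor = 0.5          # value passed via --scheduler-delay-factor
    last_prompt_latency = 0.2   # previous prompt step took 200 ms
    queue_age = 0.05            # oldest waiting request arrived 50 ms ago
    has_running = True          # decode-only sequence groups are still running

    passed_delay = (queue_age > delay_factor * last_prompt_latency
                    or not has_running)
    print(passed_delay)  # False: prompts are held until the queue is ~100 ms old

Holding prompts back this way means running sequences are interrupted by prompt batches less often, which is the intent behind the inter-token latency (ITL) improvement named in the subject line.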
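
A minimal usage sketch, assuming a vLLM build that includes this patch; the model name, prompt, and delay value are placeholders, and the offline LLM entrypoint is assumed to forward extra keyword arguments through to EngineArgs.

    from vllm import LLM, SamplingParams

    # scheduler_delay_factor corresponds to the new --scheduler-delay-factor flag.
    llm = LLM(model="facebook/opt-125m", scheduler_delay_factor=0.5)
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)

When serving rather than running offline, the same behavior is exposed through the --scheduler-delay-factor command-line argument added in vllm/engine/arg_utils.py above.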