
Commit bbf9258: formatter

Kacper-Pietkun committed Jan 29, 2025
1 parent cdebd5e
Showing 27 changed files with 951 additions and 867 deletions.
200 changes: 105 additions & 95 deletions tests/core/block/e2e/test_correctness.py
@@ -9,26 +9,28 @@
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+    [
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
 
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
 
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
-    },
-    {
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        "enforce_eager": False,
-
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
-    }])
+            # Allow only 5 sequences of ~1024 tokens in worst case.
+            "block_size": 16,
+            "num_gpu_blocks_override": 5 * (64 + 1),
+        },
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
+            "enforce_eager": False,
+
+            # Allow only 5 sequences of ~1024 tokens in worst case.
+            "block_size": 16,
+            "num_gpu_blocks_override": 5 * (64 + 1),
+        }
+    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{
@@ -94,26 +96,28 @@ def test_block_manager_with_preemption(baseline_llm_generator,
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+    [
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
 
-        # Our prompts will generate 128 tokens; since the prompts themselves are
-        # small, we don't need much KV space beyond 128.
-        "max_model_len": 160,
+            # Our prompts will generate 128 tokens; since the prompts
+            # themselves are small, we don't need much KV space beyond 128.
+            "max_model_len": 160,
 
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-    },
-    {
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-
-        # Our prompts will generate 128 tokens; since the prompts themselves are
-        # small, we don't need much KV space beyond 128.
-        "max_model_len": 160,
-        "enforce_eager": False,
-    }])
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
+        },
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
+
+            # Our prompts will generate 128 tokens; since the prompts
+            # themselves are small, we don't need much KV space beyond 128.
+            "max_model_len": 160,
+            "enforce_eager": False,
+        }
+    ])
 @pytest.mark.parametrize(
     "per_test_common_llm_kwargs",
     [
@@ -283,32 +287,34 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+    [
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
 
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
 
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
+            # Allow only 5 sequences of ~1024 tokens in worst case.
+            "block_size": 16,
+            "num_gpu_blocks_override": 5 * (64 + 1),
 
-        # Enable prefill cache
-        "enable_prefix_caching": True,
-    },
-    {
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        "enforce_eager": False,
+            # Enable prefill cache
+            "enable_prefix_caching": True,
+        },
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
+            "enforce_eager": False,
 
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
+            # Allow only 5 sequences of ~1024 tokens in worst case.
+            "block_size": 16,
+            "num_gpu_blocks_override": 5 * (64 + 1),
 
-        # Enable prefill cache
-        "enable_prefix_caching": True,
-    }])
+            # Enable prefill cache
+            "enable_prefix_caching": True,
+        }
+    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{
@@ -376,26 +382,28 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+    [
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
 
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
 
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
-    },
-    {
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        "enforce_eager": False,
-
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
-    }])
+            # Allow only 5 sequences of ~1024 tokens in worst case.
+            "block_size": 16,
+            "num_gpu_blocks_override": 5 * (64 + 1),
+        },
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
+            "enforce_eager": False,
+
+            # Allow only 5 sequences of ~1024 tokens in worst case.
+            "block_size": 16,
+            "num_gpu_blocks_override": 5 * (64 + 1),
+        }
+    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{
     "enable_prefix_caching": False
@@ -459,28 +467,30 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+    [
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
 
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
 
-        # we keep the blocks small, so that hit eviction quickly
-        "max_model_len": 48,
-        "block_size": 16,
-        "num_gpu_blocks_override": 3,
-    },
-    {
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        "enforce_eager": False,
-
-        # we keep the blocks small, so that hit eviction quickly
-        "max_model_len": 48,
-        "block_size": 16,
-        "num_gpu_blocks_override": 3,
-    }])
+            # we keep the blocks small, so that hit eviction quickly
+            "max_model_len": 48,
+            "block_size": 16,
+            "num_gpu_blocks_override": 3,
+        },
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
+            "enforce_eager": False,
+
+            # we keep the blocks small, so that hit eviction quickly
+            "max_model_len": 48,
+            "block_size": 16,
+            "num_gpu_blocks_override": 3,
+        }
+    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{
     "enable_prefix_caching": False
64 changes: 34 additions & 30 deletions tests/core/block/e2e/test_correctness_sliding_window.py
@@ -15,22 +15,24 @@
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        "model": MODEL,
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-        "block_size": BLOCK_SIZE,
-        # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
-        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
-    },
-    {
-        "model": MODEL,
-        "enforce_eager": False,
-        "block_size": BLOCK_SIZE,
-        # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
-        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
-    }])
+    [
+        {
+            "model": MODEL,
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
+            "block_size": BLOCK_SIZE,
+            # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
+            "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
+        },
+        {
+            "model": MODEL,
+            "enforce_eager": False,
+            "block_size": BLOCK_SIZE,
+            # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
+            "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
+        }
+    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{}])
@@ -83,20 +85,22 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        "model": MODEL,
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-        "block_size": BLOCK_SIZE,
-        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
-    },
-    {
-        "model": MODEL,
-        "enforce_eager": False,
-        "block_size": BLOCK_SIZE,
-        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
-    }])
+    [
+        {
+            "model": MODEL,
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
+            "block_size": BLOCK_SIZE,
+            "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
+        },
+        {
+            "model": MODEL,
+            "enforce_eager": False,
+            "block_size": BLOCK_SIZE,
+            "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
+        }
+    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
 @pytest.mark.parametrize("batch_size", [5])
3 changes: 2 additions & 1 deletion tests/engine/test_short_mm_context.py
@@ -14,7 +14,8 @@
 
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("enforce_eager", [False, True])
-def test_context_length_too_short(vllm_runner, image_assets, model, enforce_eager):
+def test_context_length_too_short(vllm_runner, image_assets, model,
+                                  enforce_eager):
     images = [asset.pil_image for asset in image_assets]
 
     with pytest.raises(ValueError, match="too long to fit into the model"):
2 changes: 1 addition & 1 deletion tests/entrypoints/llm/test_lazy_outlines.py
@@ -1,7 +1,7 @@
 import sys
-import pytest
 from contextlib import nullcontext
 
+import pytest
 from vllm_test_utils import BlameResult, blame
 
 from vllm import LLM, SamplingParams
3 changes: 2 additions & 1 deletion tests/lora/test_phi.py
@@ -1,7 +1,8 @@
 from typing import List
 
-import vllm
 import pytest
+
+import vllm
 from vllm.lora.request import LoRARequest
 
 MODEL_PATH = "microsoft/phi-2"
3 changes: 2 additions & 1 deletion tests/models/decoder_only/audio_language/test_ultravox.py
@@ -196,7 +196,8 @@ def run_multi_audio_test(
 ])
 @pytest.mark.parametrize("enforce_eager", [False, True])
 def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
-                num_logprobs: int, vllm_kwargs: dict, enforce_eager: bool) -> None:
+                num_logprobs: int, vllm_kwargs: dict,
+                enforce_eager: bool) -> None:
 
     vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER)
     hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER)
3 changes: 2 additions & 1 deletion tests/models/decoder_only/vision_language/test_awq.py
@@ -108,7 +108,8 @@ def run_awq_test(
 @pytest.mark.parametrize("enforce_eager", [False, True])
 @torch.inference_mode()
 def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
-                    size_factors, dtype, max_tokens, num_logprobs, enforce_eager) -> None:
+                    size_factors, dtype, max_tokens, num_logprobs,
+                    enforce_eager) -> None:
     run_awq_test(
         vllm_runner,
         image_assets,
7 changes: 4 additions & 3 deletions tests/models/decoder_only/vision_language/test_phi3v.py
@@ -150,7 +150,8 @@ def run_test(
 @pytest.mark.parametrize("num_logprobs", [10])
 @pytest.mark.parametrize("enforce_eager", [False, True])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
-                dtype: str, max_tokens: int, num_logprobs: int, enforce_eager: bool) -> None:
+                dtype: str, max_tokens: int, num_logprobs: int,
+                enforce_eager: bool) -> None:
     images = [asset.pil_image for asset in image_assets]
 
     inputs_per_image = [(
@@ -175,8 +176,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("enforce_eager", [False, True])
-def test_regression_7840(hf_runner, vllm_runner, image_assets, model,
-                         dtype, enforce_eager) -> None:
+def test_regression_7840(hf_runner, vllm_runner, image_assets, model, dtype,
+                         enforce_eager) -> None:
     images = [asset.pil_image for asset in image_assets]
 
     inputs_regresion_7840 = [

0 comments on commit bbf9258
