
Commit bbf9258: formatter

Kacper-Pietkun committed Jan 29, 2025
1 parent cdebd5e
Showing 27 changed files with 951 additions and 867 deletions.
200 changes: 105 additions & 95 deletions tests/core/block/e2e/test_correctness.py
@@ -9,26 +9,28 @@
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+    [
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
 
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
 
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
-    },
-    {
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        "enforce_eager": False,
-
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
-    }])
+            # Allow only 5 sequences of ~1024 tokens in worst case.
+            "block_size": 16,
+            "num_gpu_blocks_override": 5 * (64 + 1),
+        },
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
+            "enforce_eager": False,
+
+            # Allow only 5 sequences of ~1024 tokens in worst case.
+            "block_size": 16,
+            "num_gpu_blocks_override": 5 * (64 + 1),
+        }
+    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{
@@ -94,26 +96,28 @@ def test_block_manager_with_preemption(baseline_llm_generator,
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+    [
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
 
-        # Our prompts will generate 128 tokens; since the prompts themselves are
-        # small, we don't need much KV space beyond 128.
-        "max_model_len": 160,
+            # Our prompts will generate 128 tokens; since the prompts
+            # themselves are small, we don't need much KV space beyond 128.
+            "max_model_len": 160,
 
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-    },
-    {
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-
-        # Our prompts will generate 128 tokens; since the prompts themselves are
-        # small, we don't need much KV space beyond 128.
-        "max_model_len": 160,
-        "enforce_eager": False,
-    }])
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
+        },
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
+
+            # Our prompts will generate 128 tokens; since the prompts
+            # themselves are small, we don't need much KV space beyond 128.
+            "max_model_len": 160,
+            "enforce_eager": False,
+        }
+    ])
 @pytest.mark.parametrize(
     "per_test_common_llm_kwargs",
     [
@@ -283,32 +287,34 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+    [
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
 
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
 
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
+            # Allow only 5 sequences of ~1024 tokens in worst case.
+            "block_size": 16,
+            "num_gpu_blocks_override": 5 * (64 + 1),
 
-        # Enable prefill cache
-        "enable_prefix_caching": True,
-    },
-    {
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        "enforce_eager": False,
+            # Enable prefill cache
+            "enable_prefix_caching": True,
+        },
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
+            "enforce_eager": False,
 
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
+            # Allow only 5 sequences of ~1024 tokens in worst case.
+            "block_size": 16,
+            "num_gpu_blocks_override": 5 * (64 + 1),
 
-        # Enable prefill cache
-        "enable_prefix_caching": True,
-    }])
+            # Enable prefill cache
+            "enable_prefix_caching": True,
+        }
+    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{
@@ -376,26 +382,28 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+    [
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
 
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
 
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
-    },
-    {
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        "enforce_eager": False,
-
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
-    }])
+            # Allow only 5 sequences of ~1024 tokens in worst case.
+            "block_size": 16,
+            "num_gpu_blocks_override": 5 * (64 + 1),
+        },
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
+            "enforce_eager": False,
+
+            # Allow only 5 sequences of ~1024 tokens in worst case.
+            "block_size": 16,
+            "num_gpu_blocks_override": 5 * (64 + 1),
+        }
+    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{
     "enable_prefix_caching": False
@@ -459,28 +467,30 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+    [
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
 
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
 
-        # we keep the blocks small, so that hit eviction quickly
-        "max_model_len": 48,
-        "block_size": 16,
-        "num_gpu_blocks_override": 3,
-    },
-    {
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        "enforce_eager": False,
-
-        # we keep the blocks small, so that hit eviction quickly
-        "max_model_len": 48,
-        "block_size": 16,
-        "num_gpu_blocks_override": 3,
-    }])
+            # we keep the blocks small, so that hit eviction quickly
+            "max_model_len": 48,
+            "block_size": 16,
+            "num_gpu_blocks_override": 3,
+        },
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
+            "enforce_eager": False,
+
+            # we keep the blocks small, so that hit eviction quickly
+            "max_model_len": 48,
+            "block_size": 16,
+            "num_gpu_blocks_override": 3,
+        }
+    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{
     "enable_prefix_caching": False
64 changes: 34 additions & 30 deletions tests/core/block/e2e/test_correctness_sliding_window.py
@@ -15,22 +15,24 @@
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        "model": MODEL,
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-        "block_size": BLOCK_SIZE,
-        # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
-        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
-    },
-    {
-        "model": MODEL,
-        "enforce_eager": False,
-        "block_size": BLOCK_SIZE,
-        # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
-        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
-    }])
+    [
+        {
+            "model": MODEL,
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
+            "block_size": BLOCK_SIZE,
+            # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
+            "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
+        },
+        {
+            "model": MODEL,
+            "enforce_eager": False,
+            "block_size": BLOCK_SIZE,
+            # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
+            "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
+        }
+    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{}])
@@ -83,20 +85,22 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        "model": MODEL,
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-        "block_size": BLOCK_SIZE,
-        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
-    },
-    {
-        "model": MODEL,
-        "enforce_eager": False,
-        "block_size": BLOCK_SIZE,
-        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
-    }])
+    [
+        {
+            "model": MODEL,
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
+            "block_size": BLOCK_SIZE,
+            "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
+        },
+        {
+            "model": MODEL,
+            "enforce_eager": False,
+            "block_size": BLOCK_SIZE,
+            "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
+        }
+    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
 @pytest.mark.parametrize("batch_size", [5])
3 changes: 2 additions & 1 deletion tests/engine/test_short_mm_context.py
@@ -14,7 +14,8 @@
 
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("enforce_eager", [False, True])
-def test_context_length_too_short(vllm_runner, image_assets, model, enforce_eager):
+def test_context_length_too_short(vllm_runner, image_assets, model,
+                                  enforce_eager):
     images = [asset.pil_image for asset in image_assets]
 
     with pytest.raises(ValueError, match="too long to fit into the model"):
2 changes: 1 addition & 1 deletion tests/entrypoints/llm/test_lazy_outlines.py
@@ -1,7 +1,7 @@
 import sys
-import pytest
 from contextlib import nullcontext
 
+import pytest
 from vllm_test_utils import BlameResult, blame
 
 from vllm import LLM, SamplingParams
3 changes: 2 additions & 1 deletion tests/lora/test_phi.py
@@ -1,7 +1,8 @@
 from typing import List
 
-import vllm
 import pytest
+
+import vllm
 from vllm.lora.request import LoRARequest
 
 MODEL_PATH = "microsoft/phi-2"
3 changes: 2 additions & 1 deletion tests/models/decoder_only/audio_language/test_ultravox.py
@@ -196,7 +196,8 @@ def run_multi_audio_test(
 ])
 @pytest.mark.parametrize("enforce_eager", [False, True])
 def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
-                num_logprobs: int, vllm_kwargs: dict, enforce_eager: bool) -> None:
+                num_logprobs: int, vllm_kwargs: dict,
+                enforce_eager: bool) -> None:
 
     vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER)
     hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER)
3 changes: 2 additions & 1 deletion tests/models/decoder_only/vision_language/test_awq.py
@@ -108,7 +108,8 @@ def run_awq_test(
 @pytest.mark.parametrize("enforce_eager", [False, True])
 @torch.inference_mode()
 def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
-                    size_factors, dtype, max_tokens, num_logprobs, enforce_eager) -> None:
+                    size_factors, dtype, max_tokens, num_logprobs,
+                    enforce_eager) -> None:
     run_awq_test(
         vllm_runner,
         image_assets,
7 changes: 4 additions & 3 deletions tests/models/decoder_only/vision_language/test_phi3v.py
@@ -150,7 +150,8 @@ def run_test(
 @pytest.mark.parametrize("num_logprobs", [10])
 @pytest.mark.parametrize("enforce_eager", [False, True])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
-                dtype: str, max_tokens: int, num_logprobs: int, enforce_eager: bool) -> None:
+                dtype: str, max_tokens: int, num_logprobs: int,
+                enforce_eager: bool) -> None:
     images = [asset.pil_image for asset in image_assets]
 
     inputs_per_image = [(
@@ -175,8 +176,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("enforce_eager", [False, True])
-def test_regression_7840(hf_runner, vllm_runner, image_assets, model,
-                         dtype, enforce_eager) -> None:
+def test_regression_7840(hf_runner, vllm_runner, image_assets, model, dtype,
+                         enforce_eager) -> None:
     images = [asset.pil_image for asset in image_assets]
 
     inputs_regresion_7840 = [

0 comments on commit bbf9258
