From 58fd57ff1d6d4ed56bed40aaaf9fe133b93b2efa Mon Sep 17 00:00:00 2001
From: Wallas Henrique <wallashss@users.noreply.github.com>
Date: Fri, 17 Jan 2025 13:24:22 -0300
Subject: [PATCH 01/50] [Bugfix] Fix score api for missing max_model_len validation (#12119)

Signed-off-by: Wallas Santos <wallashss@ibm.com>
---
 tests/entrypoints/openai/test_score.py    | 45 ++++++++++++++++---
 vllm/entrypoints/openai/serving_engine.py | 14 +++---
 vllm/entrypoints/openai/serving_score.py  | 54 ++++++++++++++---------
 3 files changed, 80 insertions(+), 33 deletions(-)

diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py
index a803ea4a8d6ad..06e0f93dbe269 100644
--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
@@ -12,6 +12,9 @@ def server():
     args = [
         "--enforce-eager",
+        # Will be used in tests to compare prompt input length
+        "--max-model-len",
+        "100"
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -20,8 +23,7 @@ def server():
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_text_1_str_text_2_list(server: RemoteOpenAIServer,
-                                      model_name: str):
+def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str):
     text_1 = "What is the capital of France?"
     text_2 = [
         "The capital of Brazil is Brasilia.", "The capital of France is Paris."
     ]
@@ -45,8 +47,7 @@ async def test_text_1_str_text_2_list(server: RemoteOpenAIServer,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_text_1_list_text_2_list(server: RemoteOpenAIServer,
-                                       model_name: str):
+def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str):
     text_1 = [
         "What is the capital of the United States?",
         "What is the capital of France?"
     ]
@@ -73,8 +74,7 @@ async def test_text_1_list_text_2_list(server: RemoteOpenAIServer,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_text_1_str_text_2_str(server: RemoteOpenAIServer,
-                                     model_name: str):
+def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str):
     text_1 = "What is the capital of France?"
     text_2 = "The capital of France is Paris."
 
@@ -91,3 +91,36 @@ async def test_text_1_str_text_2_str(server: RemoteOpenAIServer,
     assert score.data is not None
     assert len(score.data) == 1
     assert score.data[0].score >= 0.9
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_score_max_model_len(server: RemoteOpenAIServer, model_name: str):
+
+    text_1 = "What is the capital of France?" * 20
+    text_2 = [
+        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+    ]
+
+    score_response = requests.post(server.url_for("score"),
+                                   json={
+                                       "model": model_name,
+                                       "text_1": text_1,
+                                       "text_2": text_2,
+                                   })
+    assert score_response.status_code == 400
+    # Assert just a small fragment of the response
+    assert "Please reduce the length of the input." in \
+        score_response.text
+
+    # Test truncation
+    score_response = requests.post(server.url_for("score"),
+                                   json={
+                                       "model": model_name,
+                                       "text_1": text_1,
+                                       "text_2": text_2,
+                                       "truncate_prompt_tokens": 101
+                                   })
+    assert score_response.status_code == 400
+    assert "Please, select a smaller truncation size." in \
+        score_response.text
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 88859255f202a..3da447be06430 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -203,15 +203,19 @@ def _validate_input(
     ) -> TextTokensPrompt:
         token_num = len(input_ids)
 
-        # Note: EmbeddingRequest doesn't have max_tokens
-        if isinstance(request,
-                      (EmbeddingChatRequest, EmbeddingCompletionRequest)):
+        # Note: EmbeddingRequest and ScoreRequest don't have max_tokens
+        if isinstance(
+                request,
+            (EmbeddingChatRequest, EmbeddingCompletionRequest, ScoreRequest)):
+
+            operation = "score" if isinstance(request, ScoreRequest) \
+                else "embedding generation"
             if token_num > self.max_model_len:
                 raise ValueError(
                     f"This model's maximum context length is "
                     f"{self.max_model_len} tokens. However, you requested "
-                    f"{token_num} tokens in the input for embedding "
-                    f"generation. Please reduce the length of the input.")
+                    f"{token_num} tokens in the input for {operation}. "
+                    f"Please reduce the length of the input.")
             return TextTokensPrompt(prompt=input_text,
                                     prompt_token_ids=input_ids)
 
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 5d3e7139d7a17..381edf8fac49e 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -101,6 +101,38 @@ async def create_score(
             if not self.model_config.is_cross_encoder:
                 raise ValueError("Model is not cross encoder.")
 
+            if truncate_prompt_tokens is not None and \
+                    truncate_prompt_tokens > self.max_model_len:
+                raise ValueError(
+                    f"truncate_prompt_tokens value ({truncate_prompt_tokens}) "
+                    f"is greater than max_model_len ({self.max_model_len})."
+                    f" Please, select a smaller truncation size.")
+
+            input_pairs = make_pairs(request.text_1, request.text_2)
+            for q, t in input_pairs:
+                request_prompt = f"{q}{tokenizer.sep_token}{t}"
+
+                tokenization_kwargs: Dict[str, Any] = {}
+                if truncate_prompt_tokens is not None:
+                    tokenization_kwargs["truncation"] = True
+                    tokenization_kwargs["max_length"] = truncate_prompt_tokens
+
+                tokenize_async = make_async(tokenizer.__call__,
+                                            executor=self._tokenizer_executor)
+                prompt_inputs = await tokenize_async(text=q,
+                                                     text_pair=t,
+                                                     **tokenization_kwargs)
+
+                input_ids = prompt_inputs["input_ids"]
+                text_token_prompt = \
+                    self._validate_input(request, input_ids, request_prompt)
+                engine_prompt = TokensPrompt(
+                    prompt_token_ids=text_token_prompt["prompt_token_ids"],
+                    token_type_ids=prompt_inputs.get("token_type_ids"))
+
+                request_prompts.append(request_prompt)
+                engine_prompts.append(engine_prompt)
+
         except ValueError as e:
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(str(e))
@@ -108,28 +140,6 @@
         # Schedule the request and get the result generator.
         generators: List[AsyncGenerator[PoolingRequestOutput, None]] = []
 
-        input_pairs = make_pairs(request.text_1, request.text_2)
-
-        for q, t in input_pairs:
-            request_prompt = f"{q}{tokenizer.sep_token}{t}"
-
-            tokenization_kwargs: Dict[str, Any] = {}
-            if truncate_prompt_tokens is not None:
-                tokenization_kwargs["truncation"] = True
-                tokenization_kwargs["max_length"] = truncate_prompt_tokens
-
-            tokenize_async = make_async(tokenizer.__call__,
-                                        executor=self._tokenizer_executor)
-            prompt_inputs = await tokenize_async(text=q,
-                                                 text_pair=t,
-                                                 **tokenization_kwargs)
-            engine_prompt = TokensPrompt(
-                prompt_token_ids=prompt_inputs["input_ids"],
-                token_type_ids=prompt_inputs.get("token_type_ids"))
-
-            request_prompts.append(request_prompt)
-            engine_prompts.append(engine_prompt)
-
         try:
             pooling_params = request.to_pooling_params()
 

From 54cacf008f00d35d46273fed4d538cf5740d0965 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Sat, 18 Jan 2025 00:47:53 +0800
Subject: [PATCH 02/50] [Bugfix] Mistral tokenizer encode accepts list of str (#12149)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
---
 vllm/transformers_utils/tokenizers/mistral.py | 38 +++++++++++++++----
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 17d722e3d88fe..d801cf4e4c7b1 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -18,6 +18,7 @@
                                                Tekkenizer)
 
 from vllm.logger import init_logger
+from vllm.utils import is_list_of
 
 if TYPE_CHECKING:
     from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
@@ -27,7 +28,7 @@
 
 @dataclass
 class Encoding:
-    input_ids: List[int]
+    input_ids: Union[List[int], List[List[int]]]
 
 
 def maybe_serialize_tool_calls(request: ChatCompletionRequest):
@@ -223,17 +224,25 @@ def __len__(self) -> int:
 
     def __call__(
         self,
-        prompt: str,
+        prompt: Union[str, List[str], List[int]],
         add_special_tokens: bool = False,
         truncation: bool = False,
         max_length: Optional[int] = None,
     ):
-        # Mistral Tokenizers should not add special tokens
-        input_ids = self.encode(prompt)
-
-        if truncation:
-            input_ids = input_ids[:max_length]
-
+        input_ids: Union[List[int], List[List[int]]]
+        # For List[str], original prompt text
+        if is_list_of(prompt, str):
+            input_ids_: List[List[int]] = []
+            for p in prompt:
+                each_input_ids = self.encode_one(p, truncation, max_length)
+                input_ids_.append(each_input_ids)
+            input_ids = input_ids_
+        # For List[int], already tokenized (e.g. chat template output)
+        elif is_list_of(prompt, int):
+            input_ids = prompt
+        # For str, single prompt text
+        else:
+            input_ids = self.encode_one(prompt, truncation, max_length)
         return Encoding(input_ids=input_ids)
 
     def get_vocab(self) -> Dict[str, int]:
@@ -245,6 +254,19 @@ def get_added_vocab(self) -> Dict[str, int]:
         # Mistral tokenizers have no added vocabulary
         return {}
 
+    def encode_one(
+        self,
+        prompt: str,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+    ) -> List[int]:
+        # Mistral Tokenizers should not add special tokens
+        input_ids = self.encode(prompt)
+
+        if truncation:
+            input_ids = input_ids[:max_length]
+        return input_ids
+
     def encode(self, prompt: str) -> List[int]:
         # `encode` should only be used for prompt completion
         # it should never be used for chat_completion.
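
To illustrate the dispatch behavior the patch above introduces, here is a standalone sketch. It is not vLLM code: `encode_one` below is a toy stand-in for the real tokenizer method and `is_list_of` is reimplemented inline, but the three branches (batch of strings, pre-tokenized ids, single string) mirror the new `__call__`.

```python
from typing import List, Optional, Union

def is_list_of(value, typ) -> bool:
    # True for a non-empty list whose items are all instances of `typ`
    return isinstance(value, list) and bool(value) and all(
        isinstance(v, typ) for v in value)

def encode_one(prompt: str,
               truncation: bool = False,
               max_length: Optional[int] = None) -> List[int]:
    # toy tokenizer: one token per character (its code point)
    input_ids = [ord(c) for c in prompt]
    if truncation:
        input_ids = input_ids[:max_length]
    return input_ids

def call(prompt: Union[str, List[str], List[int]],
         truncation: bool = False,
         max_length: Optional[int] = None):
    if is_list_of(prompt, str):
        # batch of prompt texts: encode each one
        return [encode_one(p, truncation, max_length) for p in prompt]
    if is_list_of(prompt, int):
        # already tokenized (e.g. chat template output): pass through
        return prompt
    # single prompt text
    return encode_one(prompt, truncation, max_length)

assert call(["ab", "cd"]) == [[97, 98], [99, 100]]
assert call([1, 2, 3]) == [1, 2, 3]
assert call("abcd", truncation=True, max_length=2) == [97, 98]
```
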
From b5b57e301e7bce3a90af7a3ed206414c46eb64e0 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Fri, 17 Jan 2025 12:12:26 -0500 Subject: [PATCH 03/50] [AMD][FP8] Using MI300 FP8 format on ROCm for block_quant (#12134) Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> --- .../model_executor/layers/quantization/fp8.py | 33 +++++++++++++++++++ .../layers/quantization/utils/fp8_utils.py | 14 ++++++-- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 4969ee559522e..26dd5df4e55b2 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -247,6 +247,15 @@ def create_weights( def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after loading if self.block_quant: + if current_platform.is_rocm(): + weight, weight_scale, _ = \ + normalize_e4m3fn_to_e4m3fnuz( + weight=layer.weight, + weight_scale=layer.weight_scale_inv, + input_scale=layer.input_scale) + layer.weight = Parameter(weight, requires_grad=False) + layer.weight_scale_inv = Parameter(weight_scale, + requires_grad=False) return layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False) @@ -495,6 +504,30 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int, def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after loading if self.block_quant: + if current_platform.is_rocm(): + w13_weight, w13_weight_scale_inv, w13_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w13_weight, layer.w13_weight_scale_inv, + layer.w13_input_scale) + w2_weight, w2_weight_scale_inv, w2_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w2_weight, layer.w2_weight_scale_inv, + layer.w2_input_scale) + # Reset the parameter + layer.w13_weight = torch.nn.Parameter(w13_weight, + requires_grad=False) + layer.w13_weight_scale_inv = torch.nn.Parameter( + w13_weight_scale_inv, requires_grad=False) + if w13_input_scale is not None: + layer.w13_input_scale = torch.nn.Parameter( + w13_input_scale, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(w2_weight, + requires_grad=False) + layer.w2_weight_scale_inv = torch.nn.Parameter( + w2_weight_scale_inv, requires_grad=False) + if w2_input_scale is not None: + layer.w2_input_scale = torch.nn.Parameter( + w2_input_scale, requires_grad=False) return # If checkpoint is fp16, quantize in place. 
if not self.quant_config.is_checkpoint_fp8_serialized: diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index f3c3e130e4161..b6882cc7c837c 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -5,6 +5,8 @@ import triton import triton.language as tl +from vllm.platforms import current_platform + def apply_w8a8_block_fp8_linear( input: torch.Tensor, @@ -33,11 +35,14 @@ def apply_w8a8_block_fp8_linear( def input_to_float8( - x: torch.Tensor, - dtype: torch.dtype = torch.float8_e4m3fn + x: torch.Tensor, + dtype: Optional[torch.dtype] = None ) -> Tuple[torch.Tensor, torch.Tensor]: """This function quantizes input values to float8 values " "with tensor-wise quantization.""" + if dtype is None: + dtype = (torch.float8_e4m3fnuz + if current_platform.is_rocm() else torch.float8_e4m3fn) finfo = torch.finfo(dtype) min_val, max_val = x.aminmax() amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) @@ -125,7 +130,7 @@ def per_token_group_quant_fp8( x: torch.Tensor, group_size: int, eps: float = 1e-10, - dtype: torch.dtype = torch.float8_e4m3fn, + dtype: Optional[torch.dtype] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """Function to perform per-token-group quantization on an input tensor `x`. It converts the tensor values into signed float8 values and returns the @@ -140,6 +145,9 @@ def per_token_group_quant_fp8( Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization. """ + if dtype is None: + dtype = (torch.float8_e4m3fnuz + if current_platform.is_rocm() else torch.float8_e4m3fn) assert (x.shape[-1] % group_size == 0), ( f"the last dimension of `x` {x.shape[-1]} must be divisible " f"by `group_size` {group_size}") From 7b98a65ae6f011fb31fefa1f563b7d6e554df434 Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Sat, 18 Jan 2025 04:29:31 +0800 Subject: [PATCH 04/50] [torch.compile] disable logging when cache is disabled (#12043) Signed-off-by: youkaichao <youkaichao@gmail.com> --- vllm/compilation/backends.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 87655530cead4..157e3f7f39c9c 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -251,15 +251,27 @@ def _check_can_cache(*args, **kwargs): def _get_shape_env() -> AlwaysHitShapeEnv: return AlwaysHitShapeEnv() - with patch(# for hijacking the hash of the compiled graph - "torch._inductor.codecache.compiled_fx_graph_hash", - hijack_compiled_fx_graph_hash), \ - patch(# for providing a dummy shape environment - "torch._inductor.codecache.FxGraphCache._get_shape_env", - _get_shape_env), \ - patch(# for forcing the graph to be cached - "torch._inductor.codecache.FxGraphCache._check_can_cache", - _check_can_cache): + with ExitStack() as stack: + if not cache_data.disabled: + # compilation cache is enabled, patch several functions + + # for hijacking the hash of the compiled graph + stack.enter_context( + patch("torch._inductor.codecache.compiled_fx_graph_hash", + hijack_compiled_fx_graph_hash)) + + # for providing a dummy shape environment + stack.enter_context( + patch( + "torch._inductor.codecache.FxGraphCache._get_shape_env", + _get_shape_env)) + + # for forcing the graph to be cached + stack.enter_context( + patch( + 
"torch._inductor.codecache.FxGraphCache._check_can_cache", + _check_can_cache)) + compiled_graph = compile_fx(graph, example_inputs, config_patches=current_config) From 2b835032275622d70b19b8cd834740336dc26138 Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Sat, 18 Jan 2025 10:53:27 +0800 Subject: [PATCH 05/50] [misc] fix cross-node TP (#12166) Signed-off-by: youkaichao <youkaichao@gmail.com> --- vllm/executor/mp_distributed_executor.py | 38 ++++++++++++++++++++++-- vllm/platforms/cuda.py | 22 -------------- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py index 8ae88e646aad6..a80b0ee8b3122 100644 --- a/vllm/executor/mp_distributed_executor.py +++ b/vllm/executor/mp_distributed_executor.py @@ -1,4 +1,5 @@ import asyncio +import os from typing import Any, Callable, List, Optional, Union import cloudpickle @@ -10,8 +11,9 @@ from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest -from vllm.utils import (_run_task_with_lock, get_distributed_init_method, - get_ip, get_open_port, make_async, run_method) +from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless, + get_distributed_init_method, get_ip, get_open_port, + make_async, run_method, update_environment_variables) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -22,7 +24,39 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase): uses_ray: bool = False + def _check_cuda(self) -> None: + """Check that the number of GPUs is sufficient for the parallel + configuration. Separate from _init_executor to reduce the number of + indented blocks. + """ + parallel_config = self.parallel_config + world_size = parallel_config.world_size + tensor_parallel_size = parallel_config.tensor_parallel_size + + cuda_device_count = cuda_device_count_stateless() + # Use confusing message for more common TP-only case. + if tensor_parallel_size > cuda_device_count: + raise RuntimeError( + f"please set tensor_parallel_size ({tensor_parallel_size}) " + f"to less than max local gpu count ({cuda_device_count})") + + if world_size > cuda_device_count: + raise RuntimeError( + f"please ensure that world_size ({world_size}) " + f"is less than than max local gpu count ({cuda_device_count})") + + # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers + if "CUDA_VISIBLE_DEVICES" not in os.environ: + update_environment_variables({ + "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) + }) + def _init_executor(self) -> None: + + from vllm.platforms import current_platform + if current_platform.is_cuda_alike(): + self._check_cuda() + # Create the parallel GPU workers. 
world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 8350177b68ade..2587e3a11dde3 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -139,28 +139,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: else: parallel_config.worker_cls = "vllm.worker.worker.Worker" - world_size = parallel_config.world_size - tensor_parallel_size = parallel_config.tensor_parallel_size - - from vllm.utils import (cuda_device_count_stateless, - update_environment_variables) - - # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers - if "CUDA_VISIBLE_DEVICES" not in os.environ: - update_environment_variables({ - "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) - }) - - cuda_device_count = cuda_device_count_stateless() - # Use confusing message for more common TP-only case. - assert tensor_parallel_size <= cuda_device_count, ( - f"please set tensor_parallel_size ({tensor_parallel_size}) " - f"to less than max local gpu count ({cuda_device_count})") - - assert world_size <= cuda_device_count, ( - f"please ensure that world_size ({world_size}) " - f"is less than than max local gpu count ({cuda_device_count})") - cache_config = vllm_config.cache_config if cache_config and cache_config.block_size is None: cache_config.block_size = 16 From c09503ddd657850c66548b5bb28e58bac1c4afb7 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Fri, 17 Jan 2025 22:15:53 -0500 Subject: [PATCH 06/50] [AMD][CI/Build][Bugfix] use pytorch stale wheel (#12172) Signed-off-by: hongxyan <hongxyan@amd.com> --- Dockerfile.rocm | 6 +++--- docs/source/getting_started/installation/gpu/rocm.inc.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index e733994f8c33e..e922cb207b899 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -51,10 +51,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \ *"rocm-6.2"*) \ python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --pre \ - torch==2.6.0.dev20241113+rocm6.2 \ + torch \ 'setuptools-scm>=8' \ - torchvision==0.20.0.dev20241113+rocm6.2 \ - --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \ + torchvision \ + --extra-index-url https://download.pytorch.org/whl/rocm6.2;; \ *) ;; esac ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md index f6f9d3c303f89..4256027e6c40e 100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -70,7 +70,7 @@ Currently, there are no pre-built ROCm wheels. 
# Install PyTorch $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 + $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.2 # Build & install AMD SMI $ pip install /opt/rocm/share/amd_smi From da02cb4b274ab8bcebb8b8e677ff4b43440bc499 Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Sat, 18 Jan 2025 12:25:08 +0800 Subject: [PATCH 07/50] [core] further polish memory profiling (#12126) Signed-off-by: youkaichao <youkaichao@gmail.com> --- tests/test_utils.py | 26 ++++++------ vllm/utils.py | 95 +++++++++++++++++++++++++------------------ vllm/worker/worker.py | 31 +++++++------- 3 files changed, 85 insertions(+), 67 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index c68d730af7f8a..d5dc4464e634d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -9,10 +9,10 @@ from vllm_test_utils import monitor from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config -from vllm.utils import (FlexibleArgumentParser, PlaceholderModule, - StoreBoolean, bind_kv_cache, deprecate_kwargs, - get_open_port, memory_profiling, merge_async_iterators, - supports_kw) +from vllm.utils import (FlexibleArgumentParser, MemorySnapshot, + PlaceholderModule, StoreBoolean, bind_kv_cache, + deprecate_kwargs, get_open_port, memory_profiling, + merge_async_iterators, supports_kw) from .utils import error_on_warning, fork_new_process_for_each_test @@ -284,14 +284,13 @@ def test_memory_profiling(): # 512 MiB allocation outside of this instance handle1 = lib.cudaMalloc(512 * 1024 * 1024) - baseline_memory_in_bytes = \ - torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0] + baseline_snapshot = MemorySnapshot() # load weights weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32) - weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB + weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB def measure_current_non_torch(): free, total = torch.cuda.mem_get_info() @@ -300,8 +299,8 @@ def measure_current_non_torch(): current_non_torch = current_used - current_torch return current_non_torch - with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes, - weights_memory_in_bytes=weights_memory_in_bytes) as result, \ + with memory_profiling(baseline_snapshot=baseline_snapshot, + weights_memory=weights_memory) as result, \ monitor(measure_current_non_torch) as monitored_values: # make a memory spike, 1 GiB spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32) @@ -316,13 +315,12 @@ def measure_current_non_torch(): assert measured_diff == 256 * 1024 * 1024 # Check that the memory usage is within 5% of the expected values - # 5% tolerance is caused by PyTorch caching allocator, - # we cannot control PyTorch's behavior of its internal buffers, + # 5% tolerance is caused by cuda runtime. 
+    # we cannot control the cuda runtime at the granularity of bytes,
    # which causes a small error (<10 MiB in practice)
-    non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa
-    torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa
+    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
     assert abs(non_torch_ratio - 1) <= 0.05
-    assert abs(torch_peak_ratio - 1) <= 0.05
+    assert result.torch_peak_increase == 1024 * 1024 * 1024
     del weights
     lib.cudaFree(handle1)
     lib.cudaFree(handle2)
diff --git a/vllm/utils.py b/vllm/utils.py
index 89ba119bb5e55..17bffd2846b46 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1923,36 +1923,57 @@ def kill_process_tree(pid: int):
 @dataclass
 class MemorySnapshot:
     """Memory snapshot."""
-    torch_peak_in_bytes: int = 0
-    torch_memory_in_bytes: int = 0
+    torch_peak: int = 0
+    cuda_memory: int = 0
+    torch_memory: int = 0
+    non_torch_memory: int = 0
     timestamp: float = 0.0
+    auto_measure: bool = True
+
+    def __post_init__(self):
+        if self.auto_measure:
+            self.measure()
 
     def measure(self):
-        self.torch_peak_in_bytes = torch.cuda.max_memory_reserved()
+        # we measure the torch peak memory usage via allocated_bytes,
+        # rather than `torch.cuda.memory_reserved()` .
+        # After `torch.cuda.reset_peak_memory_stats()`,
+        # `torch.cuda.memory_reserved()` will keep growing, and only shrink
+        # when we call `torch.cuda.empty_cache()` or OOM happens.
+        self.torch_peak = torch.cuda.memory_stats().get(
+            "allocated_bytes.all.peak", 0)
+
+        self.cuda_memory = torch.cuda.mem_get_info(
+        )[1] - torch.cuda.mem_get_info()[0]
+
         # torch.cuda.memory_reserved() is how many bytes
         # PyTorch gets from cuda (by calling cudaMalloc, etc.)
-        self.torch_memory_in_bytes = torch.cuda.memory_reserved()
+        # this is used to measure the non-torch memory usage
+        self.torch_memory = torch.cuda.memory_reserved()
+
+        self.non_torch_memory = self.cuda_memory - self.torch_memory
         self.timestamp = time.time()
 
     def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
-        """support a - b"""
         return MemorySnapshot(
-            torch_peak_in_bytes=self.torch_peak_in_bytes -
-            other.torch_peak_in_bytes,
-            torch_memory_in_bytes=self.torch_memory_in_bytes -
-            other.torch_memory_in_bytes,
-            timestamp=self.timestamp - other.timestamp)
+            torch_peak=self.torch_peak - other.torch_peak,
+            cuda_memory=self.cuda_memory - other.cuda_memory,
+            torch_memory=self.torch_memory - other.torch_memory,
+            non_torch_memory=self.non_torch_memory - other.non_torch_memory,
+            timestamp=self.timestamp - other.timestamp,
+            auto_measure=False,
+        )
 
 
 @dataclass
 class MemoryProfilingResult:
-    """Memory profiling result.
-    """ # noqa
-    baseline_memory_in_bytes: int = 0
-    non_kv_cache_memory_in_bytes: int = 0
-    torch_peak_increase_in_bytes: int = 0
-    non_torch_increase_in_bytes: int = 0
-    weights_memory_in_bytes: float = 0
+    """Memory profiling result. All numbers are in bytes.
+    """
+    non_kv_cache_memory: int = 0
+    torch_peak_increase: int = 0
+    non_torch_increase: int = 0
+    weights_memory: float = 0
+    before_create: MemorySnapshot = field(default_factory=MemorySnapshot)
     before_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
     after_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
     profile_time: float = 0.0
@@ -1960,18 +1981,14 @@ class MemoryProfilingResult:
 
 @contextlib.contextmanager
 def memory_profiling(
-    baseline_memory_in_bytes: int, weights_memory_in_bytes: int
-) -> Generator[MemoryProfilingResult, None, None]:
+        baseline_snapshot: MemorySnapshot,
+        weights_memory: int) -> Generator[MemoryProfilingResult, None, None]:
     """Memory profiling context manager.
-    baseline_memory_in_bytes: memory used by all the components other than
-        the current vLLM instance. It contains: memory used by other processes, memory
-        used by another vLLM instance in the same process, etc. It is usually measured
-        before the current vLLM instance initialize the device. And we assume it is
-        constant during the profiling of the current vLLM instance.
-    weights_memory_in_bytes: memory used by PyTorch when loading the model weights.
+    baseline_snapshot: the memory snapshot before the current vLLM instance.
+    weights_memory: memory used by PyTorch when loading the model weights.
        Note that, before loading the model weights, we also initialize the device
        and distributed environment, which may consume some memory. This part is not
-       included in the weights_memory_in_bytes because PyTorch does not control it.
+       included in the weights_memory because PyTorch does not control it.
 
     The memory in one GPU can be classified into 3 categories:
     1. memory used by anything other than the current vLLM instance.
@@ -2006,20 +2023,21 @@ def memory_profiling(
     b. 2 GiB reserved for the peak activation tensors (category 2)
     c. 1 GiB used by non-torch components (category 3)
 
-    The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`.
+    The memory used for loading weights (a.) is directly given from the argument `weights_memory`.
 
-    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.).
+    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.).
 
-    (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`),
-    subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_reserved()`.
+    The increase of `non_torch_memory` from creating the current vLLM instance until after profiling gives (c.).
""" # noqa + gc.collect() + torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() result = MemoryProfilingResult() - result.baseline_memory_in_bytes = baseline_memory_in_bytes + result.before_create = baseline_snapshot # the part of memory used for holding the model weights - result.weights_memory_in_bytes = weights_memory_in_bytes + result.weights_memory = weights_memory result.before_profile.measure() @@ -2030,13 +2048,12 @@ def memory_profiling( result.after_profile.measure() - diff = result.after_profile - result.before_profile - result.torch_peak_increase_in_bytes = diff.torch_peak_in_bytes - current_cuda_memory_bytes = torch.cuda.mem_get_info( - )[1] - torch.cuda.mem_get_info()[0] - result.non_torch_increase_in_bytes = current_cuda_memory_bytes - baseline_memory_in_bytes - weights_memory_in_bytes - diff.torch_memory_in_bytes # noqa - result.profile_time = diff.timestamp - result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes # noqa + diff_profile = result.after_profile - result.before_profile + diff_from_create = result.after_profile - result.before_create + result.torch_peak_increase = diff_profile.torch_peak + result.non_torch_increase = diff_from_create.non_torch_memory + result.profile_time = diff_profile.timestamp + result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 43eeb287d64eb..29d62ddda3dc0 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -21,7 +21,8 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, SequenceGroupMetadata, SequenceGroupMetadataDelta) -from vllm.utils import GiB_bytes, bind_kv_cache, memory_profiling +from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache, + memory_profiling) from vllm.worker.cache_engine import CacheEngine from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner @@ -137,7 +138,8 @@ def init_device(self) -> None: _check_if_gpu_supports_dtype(self.model_config.dtype) gc.collect() torch.cuda.empty_cache() - self.init_gpu_memory = torch.cuda.mem_get_info()[0] + torch.cuda.reset_peak_memory_stats() + self.baseline_snapshot = MemorySnapshot() else: raise RuntimeError( f"Not support device type: {self.device_config.device}") @@ -192,10 +194,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. - with memory_profiling(baseline_memory_in_bytes=total_gpu_memory - - self.init_gpu_memory, - weights_memory_in_bytes=self.model_runner. 
- model_memory_usage) as result: + with memory_profiling( + self.baseline_snapshot, + weights_memory=self.model_runner.model_memory_usage) as result: self.model_runner.profile_run() self._assert_memory_footprint_increased_during_profiling() @@ -203,7 +204,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: memory_for_current_instance = total_gpu_memory * \ self.cache_config.gpu_memory_utilization available_kv_cache_memory = (memory_for_current_instance - - result.non_kv_cache_memory_in_bytes) + result.non_kv_cache_memory) # Calculate the number of blocks that can be allocated with the # profiled peak memory. @@ -226,11 +227,11 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: f"({self.cache_config.gpu_memory_utilization:.2f})" f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n" "model weights take " - f"{(result.weights_memory_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.weights_memory / GiB_bytes):.2f}GiB;" " non_torch_memory takes " - f"{(result.non_torch_increase_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;" " PyTorch activation peak memory takes " - f"{(result.torch_peak_increase_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;" " the rest of the memory reserved for KV Cache is " f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB.") @@ -246,11 +247,13 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: def _assert_memory_footprint_increased_during_profiling(self): # NOTE(woosuk): Here we assume that the other processes using the same # GPU did not change their memory usage during the profiling. - free_gpu_memory, _ = torch.cuda.mem_get_info() - assert self.init_gpu_memory - free_gpu_memory > 0, ( + free_gpu_memory, total = torch.cuda.mem_get_info() + cuda_memory = total - free_gpu_memory + assert self.baseline_snapshot.cuda_memory < cuda_memory, ( "Error in memory profiling. " - f"Initial free memory {self.init_gpu_memory}, current free memory" - f" {free_gpu_memory}. This happens when the GPU memory was " + f"Initial used memory {self.baseline_snapshot.cuda_memory}, " + f"currently used memory {cuda_memory}. " + f"This happens when the GPU memory was " "not properly cleaned up before initializing the vLLM instance.") def initialize_cache(self, num_gpu_blocks: int, From 813f249f022a44aded2a843f0c7108ea0b7d1f6b Mon Sep 17 00:00:00 2001 From: Russell Bryant <rbryant@redhat.com> Date: Fri, 17 Jan 2025 23:35:21 -0500 Subject: [PATCH 08/50] [Docs] Fix broken link in SECURITY.md (#12175) Signed-off-by: Russell Bryant <rbryant@redhat.com> --- SECURITY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SECURITY.md b/SECURITY.md index de0032d26c87b..47196a1f1221e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,7 +4,7 @@ If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. -Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/). +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). 
Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). --- From 02798ecabed36f4c255f5a12ad6c271f95cd8c4e Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Sat, 18 Jan 2025 13:59:39 +0800 Subject: [PATCH 09/50] [Model] Port deepseek-vl2 processor, remove dependency (#12169) Signed-off-by: Isotr0py <2037008807@qq.com> --- .buildkite/test-pipeline.yaml | 1 - docs/source/models/supported_models.md | 10 +- .../vision_language_multi_image.py | 2 +- .../vision_language/test_models.py | 2 +- .../multimodal/processing/test_common.py | 3 + vllm/model_executor/models/deepseek_vl2.py | 51 +-- .../transformers_utils/processors/__init__.py | 4 + .../processors/deepseek_vl2.py | 361 ++++++++++++++++++ 8 files changed, 385 insertions(+), 49 deletions(-) create mode 100644 vllm/transformers_utils/processors/__init__.py create mode 100644 vllm/transformers_utils/processors/deepseek_vl2.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index bff557d7fc92f..d2b140e718501 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -52,7 +52,6 @@ steps: - tests/worker - tests/standalone_tests/lazy_torch_compile.py commands: - - pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git # Used by multimoda processing test - python3 standalone_tests/lazy_torch_compile.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index d07cde3db5c6e..2edb610ddf959 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -767,16 +767,10 @@ See [this page](#generative-models) for more information on how to use generativ <sup>E</sup> Pre-computed embeddings can be inputted for this modality. <sup>+</sup> Multiple items can be inputted per text prompt for this modality. -````{note} -To use `DeepSeek-VL2` series models, you need to install a fork version `deepseek_vl2` package: - -```shell -pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git +```{note} +To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. ``` -Besides, to run `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. -```` - ```{note} To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. 
``` diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index cf3c5dd4e0a2c..43c44fa867e0a 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -393,7 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: model_example_map = { "aria": load_aria, - "deepseek_vl2": load_deepseek_vl2, + "deepseek_vl_v2": load_deepseek_vl2, "h2ovl_chat": load_h2onvl, "idefics3": load_idefics3, "internvl_chat": load_internvl, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 5710303548c34..ca572cc39e538 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -190,7 +190,7 @@ dtype="bfloat16", ), "deepseek_vl_v2": VLMTestInfo( - models=["deepseek-ai/deepseek-vl2-tiny"], + models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501 max_model_len=4096, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 0a38779e0e4f0..1e3e7ea50b122 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -22,6 +22,8 @@ def _test_processing_correctness( ): if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3": hf_overrides = {"architectures": ["MantisForConditionalGeneration"]} + elif model_id == "deepseek-ai/deepseek-vl2-tiny": + hf_overrides = {"architectures": ["DeepseekVLV2ForCausalLM"]} else: hf_overrides = {} @@ -139,6 +141,7 @@ def _test_processing_correctness( ("rhymes-ai/Aria", {"image": True}), ("Salesforce/blip2-opt-2.7b", {"image": False}), ("facebook/chameleon-7b", {"image": False}), + ("deepseek-ai/deepseek-vl2-tiny", {"image": True}), ("adept/fuyu-8b", {"image": False}), ("llava-hf/llava-1.5-7b-hf", {"image": True}), ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 4553695022169..4d3d1c329a2c0 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -1,7 +1,7 @@ # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" import math -from functools import cached_property, partial +from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -9,7 +9,7 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from transformers import AutoProcessor, BatchFeature, ProcessorMixin +from transformers import BatchFeature from vllm.attention import AttentionMetadata from vllm.config import VllmConfig @@ -31,6 +31,8 @@ from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config, MlpProjectorConfig, VisionEncoderConfig) +from vllm.transformers_utils.processors.deepseek_vl2 import ( + DeepseekVLV2Processor) from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP @@ -129,25 +131,8 @@ class 
DeepseekVL2ProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(DeepseekVLV2Config) - def get_hf_processor(self) -> ProcessorMixin: - # TODO(Isotr0py): we should get rid of dependency on deepseek_vl2 - # in the future, because it's flasky and lack of maintenance. - try: - from deepseek_vl2.models.processing_deepseek_vl_v2 import ( - DeepseekVLV2Processor, select_best_resolution) - AutoProcessor.register("DeepseekVLV2Processor", - DeepseekVLV2Processor) - except ModuleNotFoundError as exc: - raise ModuleNotFoundError( - "You need to `pip install " - "git+https://github.com/deepseek-ai/DeepSeek-VL2.git` " - "to use this model") from exc - - processor = self.ctx.get_hf_processor(DeepseekVLV2Processor) - processor.select_best_resolution = partial( - select_best_resolution, - candidate_resolutions=processor.candidate_resolutions) - return processor + def get_hf_processor(self) -> DeepseekVLV2Processor: + return self.ctx.get_hf_processor(DeepseekVLV2Processor) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -224,31 +209,21 @@ def _call_hf_processor( mm_kwargs: Mapping[str, object], ) -> BatchFeature: if mm_data: - outputs = self.info.ctx.call_hf_processor( + processed_outputs = self.info.ctx.call_hf_processor( self.info.get_hf_processor(**mm_kwargs), dict(prompt=prompt, **mm_data), mm_kwargs, ) - - # Deepseek-vl2 processor don't return BatchFeature, - # we need to manually create it - processed_outputs = dict(input_ids=outputs["input_ids"]) - processed_outputs = BatchFeature(data=dict(processed_outputs), - tensor_type="pt") - - # Remove batch dimension from processor outputs, - # because we will try batch to create NestedTensors target_dtype = self.info.ctx.model_config.dtype - pixel_values = outputs["images"].to(target_dtype).squeeze(0) - images_spatial_crop = outputs["images_spatial_crop"].squeeze(0) + pixel_values = processed_outputs.pop("pixel_values").to( + target_dtype) + # split pixel values into patches corresponding to each image + images_spatial_crop = processed_outputs["images_spatial_crop"] patches_per_image = [ x.prod().item() + 1 for x in images_spatial_crop ] - - # Rename `images` -> `pixel_values` to avoid confusion - processed_outputs["pixel_values"] = list( - pixel_values.split(patches_per_image)) - processed_outputs["images_spatial_crop"] = images_spatial_crop + pixel_values = pixel_values.split(patches_per_image) + processed_outputs["pixel_values"] = pixel_values else: tokenizer = self.info.get_tokenizer() processed_outputs = tokenizer(prompt, diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py new file mode 100644 index 0000000000000..9c71b8cada32e --- /dev/null +++ b/vllm/transformers_utils/processors/__init__.py @@ -0,0 +1,4 @@ +from vllm.transformers_utils.processors.deepseek_vl2 import ( + DeepseekVLV2Processor) + +__all__ = ["DeepseekVLV2Processor"] diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py new file mode 100644 index 0000000000000..27cdf6bc22d0e --- /dev/null +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -0,0 +1,361 @@ +# yapf: disable +# ruff: noqa: E501 +# coding=utf-8 +# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py +# Copyright (c) 2023-2024 DeepSeek. 
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+import math
+from typing import List, Tuple
+
+import torch
+import torchvision.transforms as T
+from PIL import Image, ImageOps
+from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
+from transformers.processing_utils import ProcessorMixin
+
+
+class ImageTransform:
+
+    def __init__(self,
+                 mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+                 std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+                 normalize: bool = True):
+        self.mean = mean
+        self.std = std
+        self.normalize = normalize
+
+        transform_pipelines = [T.ToTensor()]
+
+        if normalize:
+            transform_pipelines.append(T.Normalize(mean, std))
+
+        self.transform = T.Compose(transform_pipelines)
+
+    def __call__(self, pil_img: Image.Image):
+        x = self.transform(pil_img)
+        return x
+
+
+class DeepseekVLV2Processor(ProcessorMixin):
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    attributes = ["tokenizer"]
+
+    def __init__(
+        self,
+        tokenizer: LlamaTokenizerFast,
+        candidate_resolutions: Tuple[Tuple[int, int]],
+        patch_size: int,
+        downsample_ratio: int,
+        image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+        image_token: str = "<image>",
+        pad_token: str = "<|▁pad▁|>",
+        add_special_token: bool = False,
+        sft_format: str = "deepseek",
+        mask_prompt: bool = True,
+        ignore_id: int = -100,
+        **kwargs,
+    ):
+
+        self.candidate_resolutions = candidate_resolutions
+        self.image_size = candidate_resolutions[0][0]
+        self.patch_size = patch_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.normalize = normalize
+        self.downsample_ratio = downsample_ratio
+
+        self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
+        self.tokenizer = tokenizer
+        self.tokenizer.padding_side = 'left'  # must set this; the padding side makes a difference in batch inference
+
+        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
+        if tokenizer.pad_token is None:
+            self.tokenizer.add_special_tokens({'pad_token': pad_token})
+
+        # add image token
+        image_token_id = self.tokenizer.vocab.get(image_token)
+        if image_token_id is None:
+            special_tokens = [image_token]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+        self.image_token_id = self.tokenizer.vocab.get(image_token)
+
+        # add five special tokens for grounding-related tasks
+        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
+        special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        # add special tokens for SFT data
+        special_tokens = ["<|User|>", "<|Assistant|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        self.image_token = image_token
+        self.pad_token = pad_token
+        self.add_special_token = add_special_token
+        self.sft_format = sft_format
+        self.mask_prompt = mask_prompt
+        self.ignore_id = ignore_id
+
+        super().__init__(
+            tokenizer,
+            **kwargs,
+        )
+
+    def select_best_resolution(self, image_size):
+        # used for cropping
+        original_width, original_height = image_size
+        best_fit = None
+        max_effective_resolution = 0
+        min_wasted_resolution = float("inf")
+
+        for width, height in self.candidate_resolutions:
+            scale = min(width / original_width, height / original_height)
+            downscaled_width, downscaled_height = int(
+                original_width * scale), int(original_height * scale)
+            effective_resolution = min(downscaled_width * downscaled_height,
+                                       original_width * original_height)
+            wasted_resolution = (width * height) - effective_resolution
+
+            if effective_resolution > max_effective_resolution or (
+                    effective_resolution == max_effective_resolution
+                    and wasted_resolution < min_wasted_resolution):
+                max_effective_resolution = effective_resolution
+                min_wasted_resolution = wasted_resolution
+                best_fit = (width, height)
+
+        return best_fit
+
+    @property
+    def bos_id(self):
+        return self.tokenizer.bos_token_id
+
+    @property
+    def eos_id(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def pad_id(self):
+        return self.tokenizer.pad_token_id
+
+    def encode(self, text: str, bos: bool = True, eos: bool = False):
+        t = self.tokenizer.encode(text, add_special_tokens=False)
+
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+
+        return t
+
+    def decode(self, t: List[int], **kwargs) -> str:
+        return self.tokenizer.decode(t, **kwargs)
+
+    def process_one(
+        self,
+        prompt: str,
+        images: List[Image.Image],
+        inference_mode: bool = True,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            images (List[ImageType]): the list of images;
+            inference_mode (bool): if True, then remove the last eos token;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - target_ids (torch.LongTensor): [N + image tokens]
+                - pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        assert (prompt is not None and images is not None
+                ), "prompt and images must be used at the same time."
+
+        sft_format = prompt
+        tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
+            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
+        masked_tokenized_str = []
+        for token_index in tokenized_str:
+            if token_index != self.image_token_id:
+                masked_tokenized_str.append(token_index)
+            else:
+                masked_tokenized_str.append(self.ignore_id)
+
+        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
+            (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+             f"images_seq_mask's length {len(images_seq_mask)}, are not equal")
+
+        input_ids = torch.LongTensor(tokenized_str)
+        target_ids = torch.LongTensor(masked_tokenized_str)
+        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+
+        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
+        target_ids[(input_ids < 0) |
+                   (input_ids == self.image_token_id)] = self.ignore_id
+        input_ids[input_ids < 0] = self.pad_id
+
+        if inference_mode:
+            # remove the trailing eos token
+            assert input_ids[-1] == self.eos_id
+            input_ids = input_ids[:-1]
+            target_ids = target_ids[:-1]
+            images_seq_mask = images_seq_mask[:-1]
+
+        if len(images_list) == 0:
+            pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
+            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
+        else:
+            pixel_values = torch.stack(images_list, dim=0)
+            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+
+        input_ids = input_ids.unsqueeze(0)
+
+        prepare = BatchFeature(
+            data=dict(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                images_seq_mask=images_seq_mask,
+                images_spatial_crop=images_spatial_crop,
+                num_image_tokens=num_image_tokens,
+            ),
+            tensor_type="pt",
+        )
+        return prepare
+
+    def __call__(
+        self,
+        *,
+        prompt: str,
+        images: List[Image.Image],
+        inference_mode: bool = True,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            images (List[ImageType]): the list of images;
+            inference_mode (bool): if True, then remove the last eos token;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        prepare = self.process_one(
+            prompt=prompt,
+            images=images,
+            inference_mode=inference_mode,
+        )
+
+        return prepare
+
+    def tokenize_with_images(
+        self,
+        conversation: str,
+        images: List[Image.Image],
+        bos: bool = True,
+        eos: bool = True,
+        cropping: bool = True,
+    ):
+        """Tokenize text with <image> tags."""
+        assert conversation.count(self.image_token) == len(images)
+        text_splits = conversation.split(self.image_token)
+        images_list, images_seq_mask, images_spatial_crop = [], [], []
+        num_image_tokens = []
+        tokenized_str = []
+        for text_sep, image in zip(text_splits, images):
+            """encode text_sep"""
+            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
+            tokenized_str += tokenized_sep
+            images_seq_mask += [False] * len(tokenized_sep)
+
+            """select best resolution for anyres"""
+            if cropping:
+                best_width, best_height = self.select_best_resolution(image.size)
+            else:
+                best_width, best_height = self.image_size, self.image_size
+
+            """process the global view"""
+            global_view = ImageOps.pad(image, (self.image_size, self.image_size),
+                                       color=tuple(int(x * 255) for x in self.image_transform.mean))
+            images_list.append(self.image_transform(global_view))
+
+            """process the local views"""
+            local_view = ImageOps.pad(image, (best_width, best_height),
+                                      color=tuple(int(x * 255) for x in self.image_transform.mean))
+            for i in range(0, best_height, self.image_size):
+                for j in range(0, best_width, self.image_size):
+                    images_list.append(
+                        self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
+
+            """record height / width crop num"""
+            num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
+            images_spatial_crop.append([num_width_tiles, num_height_tiles])
+
+            """add image tokens"""
+            h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
+            # global views tokens h * (w + 1), 1 is for line separator
+            tokenized_image = [self.image_token_id] * h * (w + 1)
+            # add a separator between global and local views
+            tokenized_image += [self.image_token_id]
+            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
+            tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)
+
+            tokenized_str += tokenized_image
+            images_seq_mask += [True] * len(tokenized_image)
+            num_image_tokens.append(len(tokenized_image))
+
+        """process the last text split"""
+        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
+        tokenized_str += tokenized_sep
+        images_seq_mask += [False] * len(tokenized_sep)
+
+        """add the bos and eos tokens"""
+        if bos:
+            tokenized_str = [self.bos_id] + tokenized_str
+            images_seq_mask = [False] + images_seq_mask
+        if eos:
+            tokenized_str = tokenized_str + [self.eos_id]
+            images_seq_mask = images_seq_mask + [False]
+
+        assert len(tokenized_str) == len(
+            images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to images_seq_mask's length {len(images_seq_mask)}"
+
+        return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
+
+
+AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)

From 6d0e3d372446cde48b387d4d3530e25fc6e06320 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 18 Jan 2025 14:35:15 +0800
Subject: [PATCH 10/50] [core] clean up executor class hierarchy between v1 and v0 (#12171)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/executor/executor_base.py         |  10 -
 vllm/v1/executor/abstract.py           |  87 ++++---
 vllm/v1/executor/multiproc_executor.py |  50 +---
 vllm/v1/executor/ray_executor.py       | 344 -------------------------
 vllm/v1/executor/ray_utils.py          | 280 --------------------
 vllm/v1/executor/uniproc_executor.py   |  88 -------
 6 files changed, 61 insertions(+), 798 deletions(-)
 delete mode 100644 vllm/v1/executor/ray_executor.py
 delete mode 100644 vllm/v1/executor/ray_utils.py
 delete mode 100644 vllm/v1/executor/uniproc_executor.py

diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index d8457cb693cdb..e5952b388c543 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -79,16 +79,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         b = min([r[1] for r in results])
         return a, b
 
-    def initialize(self, num_gpu_blocks: int) -> None:
-        """
-        Initialize the KV caches and begin the model execution loop of the
-        underlying workers.
-        For V1 compatibility.
- """ - logger.info("# GPU blocks: %d", num_gpu_blocks) - self.collective_rpc("initialize_cache", args=(num_gpu_blocks, )) - self.collective_rpc("compile_or_warm_up_model") - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: """Initialize the KV cache by invoking the underlying worker. """ diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 5240778ebf330..131be759842c7 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,63 +1,92 @@ -from abc import ABC, abstractmethod from typing import Type from vllm.config import VllmConfig +from vllm.executor.executor_base import ExecutorBase +from vllm.executor.ray_distributed_executor import ( # noqa + RayDistributedExecutor as RayDistributedExecutorV0) +from vllm.executor.uniproc_executor import ( # noqa + ExecutorWithExternalLauncher as ExecutorWithExternalLauncherV0) +from vllm.executor.uniproc_executor import ( # noqa + UniProcExecutor as UniProcExecutorV0) from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput -class Executor(ABC): - """Abstract class for executors.""" +class Executor(ExecutorBase): + """ + Abstract class for v1 executors, mainly define some methods for v1. + For methods shared by v0 and v1, define them in ExecutorBase""" @staticmethod def get_class(vllm_config: VllmConfig) -> Type["Executor"]: executor_class: Type[Executor] + parallel_config = vllm_config.parallel_config distributed_executor_backend = ( - vllm_config.parallel_config.distributed_executor_backend) + parallel_config.distributed_executor_backend) + if distributed_executor_backend is None: + # If the user does not specify the distributed executor backend, + # we will choose the backend based on the world size. + if parallel_config.world_size > 1: + distributed_executor_backend = "mp" + else: + distributed_executor_backend = "uni" + if distributed_executor_backend == "ray": - from vllm.executor.ray_distributed_executor import ( # noqa - RayDistributedExecutor) executor_class = RayDistributedExecutor elif distributed_executor_backend == "mp": from vllm.v1.executor.multiproc_executor import MultiprocExecutor executor_class = MultiprocExecutor + elif distributed_executor_backend == "uni": + executor_class = UniProcExecutor + elif distributed_executor_backend == "external_launcher": + # TODO: make v1 scheduling deterministic + # to support external launcher + executor_class = ExecutorWithExternalLauncher else: - assert (distributed_executor_backend is None) - from vllm.v1.executor.uniproc_executor import UniprocExecutor - executor_class = UniprocExecutor + raise ValueError("Unknown distributed executor backend: " + f"{distributed_executor_backend}") return executor_class - @abstractmethod - def __init__(self, vllm_config: VllmConfig) -> None: - raise NotImplementedError - - @abstractmethod def initialize(self, kv_cache_config: KVCacheConfig) -> None: - raise NotImplementedError + """ + Initialize the KV caches and begin the model execution loop of the + underlying workers. + """ + self.collective_rpc("initialize_cache", args=(kv_cache_config, )) + self.collective_rpc("compile_or_warm_up_model") - @abstractmethod def determine_available_memory(self) -> int: # in bytes - raise NotImplementedError + output = self.collective_rpc("determine_available_memory") + # Since we use a shared centralized controller, we take the minimum + # memory size across all workers to make sure all the memory + # operators can be applied to all workers. 
+ return min(output) - @abstractmethod def get_kv_cache_spec(self) -> KVCacheSpec: - raise NotImplementedError + output = self.collective_rpc("get_kv_cache_spec") + for x in output: + assert x == output[0] + return output[0] - @abstractmethod def execute_model( self, scheduler_output, ) -> ModelRunnerOutput: - raise NotImplementedError + output = self.collective_rpc("execute_model", + args=(scheduler_output, )) + return output[0] - @abstractmethod def profile(self, is_start: bool = True): - raise NotImplementedError + self.collective_rpc("profile", args=(is_start, )) + + +class UniProcExecutor(UniProcExecutorV0, Executor): + pass + + +class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor): + pass - @abstractmethod - def shutdown(self): - pass - @abstractmethod - def check_health(self) -> None: - raise NotImplementedError +class RayDistributedExecutor(RayDistributedExecutorV0, Executor): + pass diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index fd977d07e8d81..93026029ad13e 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -25,8 +25,6 @@ from vllm.utils import (get_distributed_init_method, get_mp_context, get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx) from vllm.v1.executor.abstract import Executor -from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec -from vllm.v1.outputs import ModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -37,7 +35,7 @@ class MultiprocExecutor(Executor): - def __init__(self, vllm_config: VllmConfig) -> None: + def _init_executor(self) -> None: # Call self.shutdown at exit to clean up # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) @@ -55,9 +53,6 @@ def sigusr1_handler(signum, frame): signal.signal(signal.SIGUSR1, sigusr1_handler) - self.vllm_config = vllm_config - self.parallel_config = vllm_config.parallel_config - self.world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size assert self.world_size == tensor_parallel_size, ( @@ -82,7 +77,8 @@ def sigusr1_handler(signum, frame): # Create workers self.workers: List[WorkerProcHandle] = [] for rank in range(self.world_size): - worker = WorkerProc.make_worker_process(vllm_config, rank, rank, + worker = WorkerProc.make_worker_process(self.vllm_config, rank, + rank, distributed_init_method, scheduler_output_handle) self.workers.append(worker) @@ -93,34 +89,6 @@ def sigusr1_handler(signum, frame): for w in self.workers: w.worker_response_mq.wait_until_ready() - def initialize(self, kv_cache_config: KVCacheConfig) -> None: - """ - Initialize the KV caches and begin the model execution loop of the - underlying workers. - """ - self.collective_rpc("initialize_cache", args=(kv_cache_config, )) - self.collective_rpc("compile_or_warm_up_model") - - def determine_available_memory(self) -> int: - """ - Determine the available memory (in bytes) for KV cache by invoking the - underlying worker. - """ - memory_sizes = self.collective_rpc("determine_available_memory") - - # Since we use a shared centralized controller, we take the minimum - # memory size across all workers to make sure all the memory - # operators can be applied to all workers. - return min(memory_sizes) - - def get_kv_cache_spec(self) -> KVCacheSpec: - """ - Get all kv cache needed by the model by invoking the underlying worker. 
- """ - kv_cache_specs = self.collective_rpc("get_kv_cache_spec") - assert all(s == kv_cache_specs[0] for s in kv_cache_specs) - return kv_cache_specs[0] - def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, @@ -172,18 +140,6 @@ def collective_rpc(self, # Re-raise any other exceptions raise e - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - model_output = self.collective_rpc("execute_model", - args=(scheduler_output, ))[0] - return model_output - - def profile(self, is_start: bool = True): - self.collective_rpc("profile", args=(is_start, )) - return - def _ensure_worker_termination(self): """Ensure that all worker processes are terminated. Assumes workers have received termination requests. Waits for processing, then sends diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py deleted file mode 100644 index fd67fa2235770..0000000000000 --- a/vllm/v1/executor/ray_executor.py +++ /dev/null @@ -1,344 +0,0 @@ -import os -from collections import defaultdict -from itertools import islice, repeat -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple - -import vllm.envs as envs -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.v1.executor.abstract import Executor -from vllm.v1.executor.ray_utils import (RayWorkerWrapper, - initialize_ray_cluster, ray) -from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec -from vllm.v1.outputs import ModelRunnerOutput - -if ray is not None: - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup - -logger = init_logger(__name__) - - -class RayExecutor(Executor): - - def __init__(self, vllm_config: VllmConfig) -> None: - self.vllm_config = vllm_config - self.parallel_config = vllm_config.parallel_config - self.model_config = vllm_config.model_config - self.forward_dag: Optional[ray.dag.CompiledDAG] = None - - # Disable Ray usage stats collection. - ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") - if ray_usage != "1": - os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - - initialize_ray_cluster(self.parallel_config) - placement_group = self.parallel_config.placement_group - - # Create the parallel GPU workers. - self._init_workers_ray(placement_group) - - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - # A list of workers to run a model. - self.workers: List[RayWorkerWrapper] = [] - if self.parallel_config.ray_workers_use_nsight: - ray_remote_kwargs = self._configure_ray_workers_use_nsight( - ray_remote_kwargs) - - # Create the workers. - driver_ip = get_ip() - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("GPU", 0): - # Skip bundles that don't have GPUs, - # as each worker needs one GPU. 
- continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - - worker = ray.remote( - num_cpus=0, - num_gpus=1, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerWrapper).remote(vllm_config=self.vllm_config) - self.workers.append(worker) - - logger.debug("workers: %s", self.workers) - worker_ips = [ - ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined] - for worker in self.workers - ] - ip_counts: Dict[str, int] = {} - for ip in worker_ips: - ip_counts[ip] = ip_counts.get(ip, 0) + 1 - - worker_to_ip = dict(zip(self.workers, worker_ips)) - - def sort_by_driver_then_worker_ip(worker): - """ - Sort the workers based on 3 properties: - 1. If the worker is on the same node as the driver (vllm engine), - it should be placed first. - 2. Then, if the worker is on a node with fewer workers, it should - be placed first. - 3. Finally, if the work is on a node with smaller IP address, it - should be placed first. This is simply a tiebreaker to make - sure the workers are sorted in a deterministic way. - """ - ip = worker_to_ip[worker] - return (ip != driver_ip, ip_counts[ip], ip) - - # After sorting, the workers on the same node will be - # close to each other, and the workers on the driver - # node will be placed first. - self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) - - # Get the set of GPU IDs used on each node. - worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids") - - node_workers = defaultdict(list) # node id -> list of worker ranks - node_gpus = defaultdict(list) # node id -> list of gpu ids - - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): - node_workers[node_id].append(i) - # `gpu_ids` can be a list of strings or integers. - # convert them to integers for consistency. - # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), - # string sorting is not sufficient. - # see https://github.com/vllm-project/vllm/issues/5590 - gpu_ids = [int(x) for x in gpu_ids] - node_gpus[node_id].extend(gpu_ids) - - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - all_ips = set(worker_ips) - n_ips = len(all_ips) - n_nodes = len(node_workers) - - if n_nodes != n_ips: - raise RuntimeError( - f"Every node should have a unique IP address. Got {n_nodes}" - f" nodes with node ids {list(node_workers.keys())} and " - f"{n_ips} unique IP addresses {all_ips}. Please check your" - " network configuration. If you set `VLLM_HOST_IP` or " - "`HOST_IP` environment variable, make sure it is unique for" - " each node.") - - # Set environment variables for the driver and workers. - all_args_to_update_environment_variables = [({ - "CUDA_VISIBLE_DEVICES": - ",".join(map(str, node_gpus[node_id])), - "VLLM_TRACE_FUNCTION": - str(envs.VLLM_TRACE_FUNCTION), - "VLLM_USE_V1": - str(int(envs.VLLM_USE_V1)), - **({ - "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND - } if envs.VLLM_ATTENTION_BACKEND is not None else {}) - }, ) for (node_id, _) in worker_node_and_gpu_ids] - - self._env_vars_for_all_workers = ( - all_args_to_update_environment_variables) - - self._run_workers("update_environment_variables", - all_args=self._get_env_vars_to_be_updated()) - - if len(node_gpus) == 1: - # in single node case, we don't need to get the IP address. - # the loopback address is sufficient - # NOTE: a node may have several IP addresses, one for each - # network interface. 
`get_ip()` might return any of them, - # while they might not work for communication inside the node - # if the network setup is complicated. Using the loopback address - # solves this issue, as it always works for communication inside - # the node. - driver_ip = "127.0.0.1" - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Initialize the actual workers inside worker wrapper. - init_worker_all_kwargs = [ - self._get_worker_kwargs( - local_rank=node_workers[node_id].index(rank), - rank=rank, - distributed_init_method=distributed_init_method, - ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) - ] - self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) - self._run_workers("initialize") - self._run_workers("load_model") - - def _configure_ray_workers_use_nsight(self, - ray_remote_kwargs) -> Dict[str, Any]: - # If nsight profiling is enabled, we need to set the profiling - # configuration for the ray workers as runtime env. - runtime_env = ray_remote_kwargs.setdefault("runtime_env", {}) - runtime_env.update({ - "nsight": { - "t": "cuda,cudnn,cublas", - "o": "'worker_process_%p'", - "cuda-graph-trace": "node", - } - }) - - return ray_remote_kwargs - - def _get_env_vars_to_be_updated(self): - return self._env_vars_for_all_workers - - def _get_worker_kwargs( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None) -> Dict[str, Any]: - """ - Return worker init args for a given rank. - """ - if distributed_init_method is None: - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - return dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - ) - - def determine_available_memory(self) -> int: - """ - Determine the available GPU memory in bytes. - - This invokes `determine_available_memory` on each worker and takes - the min of the results, guaranteeing that the selected cache sizes are - compatible with all workers. - """ - - memory_sizes = self._run_workers("determine_available_memory") - - # Since we use a shared centralized controller, we take the minimum - # memory size across all workers to make sure all the memory - # operators can be applied to all workers. - return min(memory_sizes) - - def initialize(self, kv_cache_config: KVCacheConfig) -> None: - """ - Initialize the KV cache in all workers. - """ - self._run_workers("initialize_cache", kv_cache_config) - self._run_workers("compile_or_warm_up_model") - - def get_kv_cache_spec(self) -> KVCacheSpec: - """ - Get all kv cache needed by the model - - This invokes `get_kv_cache_spec` on each worker and asserts that - they are identical. The KVCacheSpec is then returned. - """ - kv_cache_specs = self._run_workers("get_kv_cache_spec") - assert all(s == kv_cache_specs[0] for s in kv_cache_specs) - return kv_cache_specs[0] - - def _run_workers( - self, - method: str, - *args, - all_args: Optional[List[Tuple[Any, ...]]] = None, - all_kwargs: Optional[List[Dict[str, Any]]] = None, - **kwargs, - ) -> Any: - """ - Runs the given method on all workers. 
Can be used in the following - ways: - - Args: - - args/kwargs: All workers share the same args/kwargs - - all_args/all_kwargs: args/kwargs for each worker are specified - individually - """ - count = len(self.workers) - all_worker_args = repeat(args, count) if all_args is None \ - else islice(all_args, 0, None) - all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ - else islice(all_kwargs, 0, None) - - ray_worker_refs = [ - worker.execute_method.remote( # type: ignore[attr-defined] - method, *worker_args, **worker_kwargs) - for (worker, worker_args, worker_kwargs - ) in zip(self.workers, all_worker_args, all_worker_kwargs) - ] - return ray.get(ray_worker_refs) - - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - if self.forward_dag is None: - self.forward_dag = self._compiled_ray_dag() - # Only the first worker (with rank 0) returns the execution result. - # Others return None. - output = ray.get(self.forward_dag.execute(scheduler_output))[0] - return output - - def profile(self, is_start=True): - raise NotImplementedError - - def shutdown(self): - if hasattr(self, "forward_dag") and self.forward_dag is not None: - self.forward_dag.teardown() - import ray - for worker in self.workers: - ray.kill(worker) - self.forward_dag = None - - def check_health(self) -> None: - logger.debug("Called check_health.") - - def _check_ray_compiled_graph_installation(self): - import pkg_resources - from packaging import version - - required_version = version.parse("2.39") - current_version = version.parse( - pkg_resources.get_distribution("ray").version) - if current_version < required_version: - raise ValueError(f"Ray version {required_version} is " - f"required, but found {current_version}") - - import importlib.util - raycg = importlib.util.find_spec("ray.experimental.compiled_dag_ref") - if raycg is None: - raise ValueError("Ray Compiled Graph is not installed. " - "Run `pip install ray[adag]` to install it.") - - cupy_spec = importlib.util.find_spec("cupy") - if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: - raise ValueError( - "cupy is not installed but required since " - "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set." 
- "Run `pip install ray[adag]` and check cupy installation.") - - def _compiled_ray_dag(self): - assert self.parallel_config.use_ray - self._check_ray_compiled_graph_installation() - from ray.dag import InputNode, MultiOutputNode - - with InputNode() as input_batches: - outputs = [ - worker.execute_model.bind( # type: ignore[attr-defined] - input_batches) for worker in self.workers - ] - forward_dag = MultiOutputNode(outputs) - - return forward_dag.experimental_compile() - - def __del__(self): - self.shutdown() diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py deleted file mode 100644 index fc9715b7a5909..0000000000000 --- a/vllm/v1/executor/ray_utils.py +++ /dev/null @@ -1,280 +0,0 @@ -import time -from collections import defaultdict -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple - -from vllm.config import ParallelConfig -from vllm.logger import init_logger -from vllm.platforms import current_platform -from vllm.utils import get_ip -from vllm.v1.outputs import ModelRunnerOutput -from vllm.worker.worker_base import WorkerWrapperBase - -if TYPE_CHECKING: - from vllm.v1.core.scheduler import SchedulerOutput - -logger = init_logger(__name__) -PG_WAIT_TIMEOUT = 60 - -try: - import ray - from ray.util import placement_group_table - from ray.util.placement_group import PlacementGroup - try: - from ray._private.state import available_resources_per_node - except ImportError: - # Ray 2.9.x doesn't expose `available_resources_per_node` - from ray._private.state import state as _state - available_resources_per_node = _state._available_resources_per_node - - class RayWorkerWrapper(WorkerWrapperBase): - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - # Since the compiled DAG runs a main execution - # in a different thread that calls cuda.set_device. - # The flag indicates is set_device is called on - # that thread. It will be removed soon. - self.compiled_dag_cuda_device_set = False - - def get_node_ip(self) -> str: - return get_ip() - - def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: - node_id = ray.get_runtime_context().get_node_id() - device_key = current_platform.ray_device_key - if not device_key: - raise RuntimeError("current platform %s does not support ray.", - current_platform.device_name) - gpu_ids = ray.get_runtime_context().get_accelerator_ids( - )[device_key] - return node_id, gpu_ids - - def setup_device_if_necessary(self): - # TODO(swang): This is needed right now because Ray CG executes - # on a background thread, so we need to reset torch's current - # device. - # We can remove this API after it is fixed in compiled graph. - import torch - assert self.worker is not None, "Worker is not initialized" - if not self.compiled_dag_cuda_device_set: - torch.cuda.set_device(self.worker.device) - self.compiled_dag_cuda_device_set = True - - def execute_model( - self, - scheduler_output: "SchedulerOutput", - ) -> ModelRunnerOutput: - self.setup_device_if_necessary() - assert self.worker is not None, "Worker is not initialized" - output = self.worker.model_runner.execute_model(scheduler_output) - return output - - ray_import_err = None - -except ImportError as e: - ray = None # type: ignore - ray_import_err = e - RayWorkerWrapper = None # type: ignore - - -def ray_is_available() -> bool: - """Returns True if Ray is available.""" - return ray is not None - - -def assert_ray_available(): - """ - Raise an exception if Ray is not available. 
- """ - if ray is None: - raise ValueError("Failed to import Ray, please install Ray with " - "`pip install ray`.") from ray_import_err - - -def _verify_bundles(placement_group: "PlacementGroup", - parallel_config: ParallelConfig, device_str: str): - """ - Verify a given placement group has bundles located in the right place. - - There are 2 rules. - - Warn if all tensor parallel workers cannot fit in a single node. - - Fail if driver node is not included in a placement group. - - Args: - placement_group: The placement group to verify. - parallel_config: The parallel configuration. - device_str: The required device. - """ - assert ray.is_initialized(), ( - "Ray is not initialized although distributed-executor-backend is ray.") - pg_data = placement_group_table(placement_group) - # bundle_idx -> node_id - bundle_to_node_ids = pg_data["bundles_to_node_id"] - # bundle_idx -> bundle (e.g., {"GPU": 1}) - bundles = pg_data["bundles"] - # node_id -> List of bundle (e.g., {"GPU": 1}) - node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list) - - for bundle_idx, node_id in bundle_to_node_ids.items(): - node_id_to_bundle[node_id].append(bundles[bundle_idx]) - driver_node_id = ray.get_runtime_context().get_node_id() - - if driver_node_id not in node_id_to_bundle: - raise RuntimeError( - f"driver node id {driver_node_id} is not included in a placement " - f"group {placement_group.id}. Node id -> bundles " - f"{node_id_to_bundle}. " - "You don't have enough GPUs available in a current node. Check " - "`ray status` to see if you have available GPUs in a node " - f"{driver_node_id} before starting an vLLM engine.") - - for node_id, bundles in node_id_to_bundle.items(): - if len(bundles) < parallel_config.tensor_parallel_size: - logger.warning( - "tensor_parallel_size=%d " - "is bigger than a reserved number of %ss (%d " - "%ss) in a node %s. Tensor parallel workers can be " - "spread out to 2+ nodes which can degrade the performance " - "unless you have fast interconnect across nodes, like " - "Infiniband. To resolve this issue, make sure you have more " - "than %d GPUs available at each node.", - parallel_config.tensor_parallel_size, device_str, len(bundles), - device_str, node_id, parallel_config.tensor_parallel_size) - - -def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): - """Wait until a placement group is ready. - - It prints the informative log messages if the placement group is - not created within time. - - """ - # Wait until PG is ready - this will block until all - # requested resources are available, and will timeout - # if they cannot be provisioned. - placement_group_specs = current_placement_group.bundle_specs - - s = time.time() - pg_ready_ref = current_placement_group.ready() - wait_interval = 10 - while time.time() - s < PG_WAIT_TIMEOUT: - ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval) - if len(ready) > 0: - break - - # Exponential backoff for warning print. - wait_interval *= 2 - logger.info( - "Waiting for creating a placement group of specs for " - "%d seconds. specs=%s. Check " - "`ray status` to see if you have enough resources.", - int(time.time() - s), placement_group_specs) - - try: - ray.get(pg_ready_ref, timeout=0) - except ray.exceptions.GetTimeoutError: - raise ValueError( - "Cannot provide a placement group of " - f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See " - "`ray status` to make sure the cluster has enough resources." 
- ) from None - - -def initialize_ray_cluster( - parallel_config: ParallelConfig, - ray_address: Optional[str] = None, -): - """Initialize the distributed cluster with Ray. - - it will connect to the Ray cluster and create a placement group - for the workers, which includes the specification of the resources - for each distributed worker. - - Args: - parallel_config: The configurations for parallel execution. - ray_address: The address of the Ray cluster. If None, uses - the default Ray cluster address. - """ - assert_ray_available() - - # Connect to a ray cluster. - if current_platform.is_rocm() or current_platform.is_xpu(): - # Try to connect existing ray instance and create a new one if not found - try: - ray.init("auto") - except ConnectionError: - logger.warning( - "No existing RAY instance detected. " - "A new instance will be launched with current node resources.") - ray.init(address=ray_address, - ignore_reinit_error=True, - num_gpus=parallel_config.world_size) - else: - ray.init(address=ray_address, ignore_reinit_error=True) - - if parallel_config.placement_group: - # Placement group is already set. - return - - device_str = current_platform.ray_device_key - if not device_str: - raise ValueError( - f"current platform {current_platform.device_name} does not " - "support ray.") - # Create placement group for worker processes - current_placement_group = ray.util.get_current_placement_group() - if current_placement_group: - # We are in a placement group - bundles = current_placement_group.bundle_specs - # Verify that we can use the placement group. - device_bundles = 0 - for bundle in bundles: - bundle_devices = bundle.get(device_str, 0) - if bundle_devices > 1: - raise ValueError( - "Placement group bundle cannot have more than 1 " - f"{device_str}.") - if bundle_devices: - device_bundles += 1 - if parallel_config.world_size > device_bundles: - raise ValueError( - f"The number of required {device_str}s exceeds the total " - f"number of available {device_str}s in the placement group." - f"Required number of devices: {parallel_config.world_size}. " - f"Total number of devices: {device_bundles}.") - else: - num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) - if parallel_config.world_size > num_devices_in_cluster: - raise ValueError( - f"The number of required {device_str}s exceeds the total " - f"number of available {device_str}s in the placement group.") - # Create a new placement group - placement_group_specs: List[Dict[str, float]] = ([{ - device_str: 1.0 - } for _ in range(parallel_config.world_size)]) - - # vLLM engine is also a worker to execute model with an accelerator, - # so it requires to have the device in a current node. Check if - # the current node has at least one device. - current_ip = get_ip() - current_node_id = ray.get_runtime_context().get_node_id() - current_node_resource = available_resources_per_node()[current_node_id] - if current_node_resource.get(device_str, 0) < 1: - raise ValueError( - f"Current node has no {device_str} available. " - f"{current_node_resource=}. vLLM engine cannot start without " - f"{device_str}. Make sure you have at least 1 {device_str} " - f"available in a node {current_node_id=} {current_ip=}.") - # This way, at least bundle is required to be created in a current - # node. - placement_group_specs[0][f"node:{current_ip}"] = 0.001 - - # By default, Ray packs resources as much as possible. 
- current_placement_group = ray.util.placement_group( - placement_group_specs, strategy="PACK") - _wait_until_pg_ready(current_placement_group) - - assert current_placement_group is not None - _verify_bundles(current_placement_group, parallel_config, device_str) - # Set the placement group in the parallel config - parallel_config.placement_group = current_placement_group diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py deleted file mode 100644 index b3997caac726b..0000000000000 --- a/vllm/v1/executor/uniproc_executor.py +++ /dev/null @@ -1,88 +0,0 @@ -import os -from typing import Optional - -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.v1.executor.abstract import Executor -from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec -from vllm.v1.outputs import ModelRunnerOutput -from vllm.v1.worker.gpu_worker import Worker - -logger = init_logger(__name__) - - -class UniprocExecutor(Executor): - - def __init__(self, vllm_config: VllmConfig) -> None: - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - self.cache_config = vllm_config.cache_config - self.lora_config = vllm_config.lora_config - self.load_config = vllm_config.load_config - self.parallel_config = vllm_config.parallel_config - self.scheduler_config = vllm_config.scheduler_config - self.device_config = vllm_config.device_config - self.speculative_config = vllm_config.speculative_config - self.prompt_adapter_config = vllm_config.prompt_adapter_config - self.observability_config = vllm_config.observability_config - - self.worker: Worker = self._create_worker() - self.worker.init_device() - self.worker.load_model() - - def _create_worker( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None) -> Worker: - """Return worker init args for a given rank.""" - # see https://github.com/NVIDIA/nccl/issues/1234 - os.environ['NCCL_CUMEM_ENABLE'] = '0' - - if distributed_init_method is None: - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - return Worker( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - ) - - def determine_available_memory(self) -> int: - """Determine the available memory (in bytes) for KV cache by invoking - the underlying worker. - """ - return self.worker.determine_available_memory() - - def get_kv_cache_spec(self) -> KVCacheSpec: - """Get all kv cache needed by the model by invoking the underlying - worker. - """ - return self.worker.get_kv_cache_spec() - - def initialize(self, kv_cache_config: KVCacheConfig) -> None: - """Initialize the KV cache by invoking the underlying worker. - """ - self.worker.initialize_cache(kv_cache_config) - self.worker.compile_or_warm_up_model() - - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - output = self.worker.execute_model(scheduler_output) - assert output is not None - return output - - def profile(self, is_start: bool = True): - self.worker.profile(is_start) - - def shutdown(self): - pass - - def check_health(self) -> None: - # UniprocExecutor will always be healthy as long as - # it's running. 
- return From 32eb0da808ea162a2d6758ff0bd9bdd0934b5fd5 Mon Sep 17 00:00:00 2001 From: yancong <32220263+ice-tong@users.noreply.github.com> Date: Sun, 19 Jan 2025 08:13:16 +0800 Subject: [PATCH 11/50] [Misc] Support register quantization method out-of-tree (#11969) --- .../test_register_quantization_config.py | 117 ++++++++++++++++++ .../layers/quantization/__init__.py | 41 ++++++ 2 files changed, 158 insertions(+) create mode 100644 tests/quantization/test_register_quantization_config.py diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py new file mode 100644 index 0000000000000..8e7f44a399ddf --- /dev/null +++ b/tests/quantization/test_register_quantization_config.py @@ -0,0 +1,117 @@ +"""Tests register custom quantization config. + +See https://github.com/vllm-project/vllm/issues/11926 for more details. + +Run `pytest tests/quantization/test_register_quantization_config.py`. +""" +from typing import Any, Dict, List, Optional + +import pytest +import torch +import torch.nn.functional as F + +from vllm.model_executor.layers.linear import LinearBase # noqa: E501 +from vllm.model_executor.layers.linear import UnquantizedLinearMethod +from vllm.model_executor.layers.quantization import ( + get_quantization_config, register_quantization_config) +from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 + QuantizationConfig) + + +class FakeQuantLinearMethod(UnquantizedLinearMethod): + """Fake quantization linear method for per-token dynamic quantization.""" + + def __init__(self, num_bits: int = 8) -> None: + """Initialize the quantization method.""" + super().__init__() + self.num_bits = num_bits + + def apply(self, + layer: "torch.nn.Module", + x: "torch.Tensor", + bias: Optional["torch.Tensor"] = None) -> "torch.Tensor": + """Perform fake quantization before the linear layer.""" + + # Calculate the scales dynamically + max_val = torch.amax(x, dim=(0, -1), keepdims=True) + min_val = torch.amin(x, dim=(0, -1), keepdims=True) + scales = (max_val - min_val) / (2**self.num_bits - 1) + + # Fake quantize the input + quant_x = torch.clamp(torch.round(x / scales), -2**(self.num_bits - 1), + 2**(self.num_bits - 1) - 1) + dequant_x = quant_x * scales + + return F.linear(dequant_x, layer.weight, bias) + + +@register_quantization_config("custom_quant") +class CustomQuantConfig(QuantizationConfig): + """Custom quantization config for per-token dynamic fake quantization.""" + + def __init__(self, num_bits: int = 8) -> None: + """Initialize the quantization config.""" + self.num_bits = num_bits + + def get_name(self) -> str: + """Name of the quantization method.""" + return "custom_quant" + + def get_supported_act_dtypes(self) -> List["torch.dtype"]: + """List of supported activation dtypes.""" + return [torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + """Minimum GPU capability to support the quantization method.""" + return -1 + + @staticmethod + def get_config_filenames() -> List[str]: + """List of filenames to search for in the model directory.""" + return [] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "CustomQuantConfig": + """Create a config class from the model's quantization config.""" + return CustomQuantConfig(num_bits=config.get("num_bits", 8)) + + def get_quant_method(self, layer: "torch.nn.Module", + prefix: str) -> Optional["FakeQuantLinearMethod"]: + """Get the quantize method to use for the quantized layer.""" + if isinstance(layer, 
+                      LinearBase):
+            return FakeQuantLinearMethod(num_bits=self.num_bits)
+        return None
+
+
+def test_register_quantization_config():
+    """Test registering a custom quantization config."""
+
+    # The quantization method `custom_quant` should be registered.
+    assert get_quantization_config("custom_quant") == CustomQuantConfig
+
+    # The quantization method `custom_quant` already exists,
+    # so registering it again should raise an error.
+    with pytest.raises(ValueError):
+        register_quantization_config("custom_quant")(CustomQuantConfig)
+
+
+@pytest.mark.parametrize(argnames="model",
+                         argvalues=[
+                             "meta-llama/Meta-Llama-3-8B-Instruct",
+                         ])
+def test_custom_quant(vllm_runner, model):
+    """Test inference with the custom quantization method."""
+    with vllm_runner(model_name=model,
+                     quantization="custom_quant",
+                     enforce_eager=True) as llm:
+
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        layer = model.model.layers[0]
+        qkv_proj = layer.self_attn.qkv_proj
+
+        # Check the quantization method is FakeQuantLinearMethod
+        assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod)
+
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        assert output
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index caeb8b95e02f2..d2bde13fcf546 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -29,6 +29,45 @@
     "quark"
 ]
 
+# Customized quantization methods registered by users are added to this dict.
+_CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {}
+
+
+def register_quantization_config(quantization: str):
+    """Register a customized vllm quantization config.
+
+    When a quantization method is not supported by vllm, you can register a
+    customized quantization config to support it.
+
+    Args:
+        quantization (str): The quantization method name.
+
+    Examples:
+        >>> from vllm.model_executor.layers.quantization import register_quantization_config
+        >>> from vllm.model_executor.layers.quantization import get_quantization_config
+        >>> from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+        >>>
+        >>> @register_quantization_config("my_quant")
+        ... class MyQuantConfig(QuantizationConfig):
+        ...     pass
+        >>>
+        >>> get_quantization_config("my_quant")
+        <class 'MyQuantConfig'>
+    """  # noqa: E501
+
+    def _wrapper(quant_config_cls):
+        if quantization in QUANTIZATION_METHODS:
+            raise ValueError(
+                f"The quantization method `{quantization}` already exists.")
+        if not issubclass(quant_config_cls, QuantizationConfig):
+            raise ValueError("The quantization config must be a subclass of "
+                             "`QuantizationConfig`.")
+        _CUSTOMIZED_METHOD_TO_QUANT_CONFIG[quantization] = quant_config_cls
+        QUANTIZATION_METHODS.append(quantization)
+        return quant_config_cls
+
+    return _wrapper
+
 
 def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization not in QUANTIZATION_METHODS:
@@ -84,6 +123,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
         "ipex": IPEXConfig,
         "quark": QuarkConfig
     }
+    # Update the `method_to_config` with customized quantization methods.
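+    # (collisions with built-in methods cannot happen here:
+    # register_quantization_config raises if the name is already taken)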
+    method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG)
 
     return method_to_config[quantization]
 

From 7a8a48d51e51554645b233f870b71ef43bc70177 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Sat, 18 Jan 2025 19:07:15 -0800
Subject: [PATCH 12/50] [V1] Collect env var for usage stats (#12115)

---
 vllm/usage/usage_lib.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index a9deee881f41a..841df3994fba2 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -27,6 +27,17 @@
 
 _GLOBAL_RUNTIME_DATA: Dict[str, Union[str, int, bool]] = {}
 
+_USAGE_ENV_VARS_TO_COLLECT = [
+    "VLLM_USE_MODELSCOPE",
+    "VLLM_USE_TRITON_FLASH_ATTN",
+    "VLLM_ATTENTION_BACKEND",
+    "VLLM_USE_FLASHINFER_SAMPLER",
+    "VLLM_PP_LAYER_PARTITION",
+    "VLLM_USE_TRITON_AWQ",
+    "VLLM_USE_V1",
+    "VLLM_ENABLE_V1_MULTIPROCESSING",
+]
+
 
 def set_runtime_usage_data(key: str, value: Union[str, int, bool]) -> None:
     """Set global usage data that will be sent with every usage heartbeat."""
@@ -122,6 +133,7 @@ def __init__(self) -> None:
         self.gpu_count: Optional[int] = None
         self.gpu_type: Optional[str] = None
         self.gpu_memory_per_device: Optional[int] = None
+        self.env_var_json: Optional[str] = None
 
         # vLLM Information
         self.model_architecture: Optional[str] = None
@@ -176,6 +188,12 @@ def _report_usage_once(self, model_architecture: str,
         self.vllm_version = VLLM_VERSION
         self.model_architecture = model_architecture
 
+        # Environment variables
+        self.env_var_json = json.dumps({
+            env_var: getattr(envs, env_var)
+            for env_var in _USAGE_ENV_VARS_TO_COLLECT
+        })
+
         # Metadata
         self.log_time = _get_current_timestamp_ns()
         self.source = envs.VLLM_USAGE_SOURCE

From 4e94951bb16282c36de6d12ef14a1500f25a3bdf Mon Sep 17 00:00:00 2001
From: Michal Adamczyk <madamczyk@habana.ai>
Date: Sun, 19 Jan 2025 04:12:05 +0100
Subject: [PATCH 13/50] [BUGFIX] Move scores to float32 in case of running xgrammar on cpu (#12152)

Signed-off-by: Michal Adamczyk <madamczyk@habana.ai>
---
 vllm/model_executor/guided_decoding/xgrammar_decoding.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index f10a8fb8e03cf..2d8594cb8aafa 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -298,8 +298,11 @@ def __call__(self, input_ids: list[int],
         # token_bitmask is a CPU tensor for use with accept_token and
         # fill_next_token_bitmask so we move it to the device of scores
         device_type = scores.device.type
+        dtype = scores.dtype
         if device_type != "cuda":
+            # xgrammar on cpu only supports float32 scores
+            # see: https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22
-            scores = scores.to("cpu").unsqueeze(0)
+            scores = scores.to("cpu").float().unsqueeze(0)
 
         # Note: In this method, if the tensors have different dimensions, the
         # call fails on CPU but runs without error on GPU.
Hence the @@ -307,7 +310,7 @@ def __call__(self, input_ids: list[int], xgr.apply_token_bitmask_inplace(scores, self.token_bitmask.to(scores.device)) if device_type != "cuda": - scores = scores.to(device_type).squeeze() + scores = scores.to(dtype).to(device_type).squeeze() return scores From 630eb5b5ce6ea59b6480440b7f6064be5ca71ae1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sun, 19 Jan 2025 11:16:34 +0800 Subject: [PATCH 14/50] [Bugfix] Fix multi-modal processors for transformers 4.48 (#12187) --- vllm/model_executor/models/llava.py | 25 ++++- vllm/model_executor/models/qwen2_audio.py | 72 ++++++++---- vllm/model_executor/models/ultravox.py | 9 +- vllm/transformers_utils/config.py | 9 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/aria.py | 118 ++++++++++++++++++++ 6 files changed, 199 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 722fff98d5c19..6cceded43a79d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -5,9 +5,11 @@ import torch import torch.nn as nn +from packaging.version import Version from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, SiglipVisionConfig) +from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.llava import LlavaProcessor from transformers.models.pixtral import PixtralProcessor @@ -716,6 +718,27 @@ def load_weights(self, weights: Iterable[Tuple[str, return loader.load_weights(weights) +class MantisProcessingInfo(LlavaProcessingInfo): + + def get_hf_processor(self): + hf_config = self.get_hf_config() + vision_info = self.get_vision_encoder_info() + + if Version(TRANSFORMERS_VERSION) < Version("4.48"): + # BUG: num_additional_image_tokens = 0 but treated as 1, + # so we set vision_feature_select_strategy to None to offset this + vision_feature_select_strategy = None + else: + # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150 + vision_feature_select_strategy = hf_config.vision_feature_select_strategy # noqa: E501 + + return self.ctx.get_hf_processor( + LlavaProcessor, + patch_size=vision_info.get_patch_size(), + vision_feature_select_strategy=vision_feature_select_strategy, + ) + + class MantisMultiModalProcessor(LlavaMultiModalProcessor): def apply( @@ -794,7 +817,7 @@ def get_replacement_mantis(item_idx: int): # To use this model, please use # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` @MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor, - info=LlavaProcessingInfo, + info=MantisProcessingInfo, dummy_inputs=LlavaDummyInputsBuilder) class MantisForConditionalGeneration(LlavaForConditionalGeneration): pass diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 0dff9595c6c08..47d56175261e4 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -36,8 +36,9 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + 
NestedTensors, PlaceholderRange) from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -153,29 +154,24 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, Any], ) -> BatchFeature: - mm_data = dict(mm_data) - audios = mm_data.pop("audios", []) - - if audios: - mm_data["audios"] = audios - - feature_extractor = self.info.get_feature_extractor(**mm_kwargs) - mm_kwargs = dict( - **mm_kwargs, - sampling_rate=feature_extractor.sampling_rate, - ) - else: - # NOTE: WhisperFeatureExtractor cannot handle empty list of audios - pass + # Text-only input not supported in composite processor + if not mm_data or not mm_data.get("audios", []): + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) + mm_kwargs = dict( + **mm_kwargs, + sampling_rate=feature_extractor.sampling_rate, + ) - processed_outputs = super()._call_hf_processor( + return super()._call_hf_processor( prompt=prompt, mm_data=mm_data, mm_kwargs=mm_kwargs, ) - return processed_outputs - def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -192,8 +188,14 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.info.get_hf_config() - placeholder = hf_config.audio_token_index + processor = self.info.get_hf_processor() + + # Use getattr with default to be compatible with transformers<4.48 + audio_token = getattr(processor, "audio_token", "<|AUDIO|>") + audio_bos_token = getattr(processor, "audio_bos_token", + "<|audio_bos|>") + audio_eos_token = getattr(processor, "audio_eos_token", + "<|audio_eos|>") feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") if feature_attention_mask is None: @@ -214,12 +216,16 @@ def get_replacement_qwen2_audio(item_idx: int): f"The audio {audio} (len={len(audio)}) is too short " "to be represented inside the model") - return [placeholder] * num_placeholders + return "".join([ + audio_bos_token, + audio_token * num_placeholders, + audio_eos_token, + ]) return [ PromptReplacement( modality="audio", - target=[placeholder], + target=audio_token, replacement=get_replacement_qwen2_audio, ) ] @@ -234,6 +240,26 @@ def _always_apply_prompt_replacements(self) -> bool: # tokens than the number of audio items) return not hasattr(self.info.get_hf_processor(), "audio_token") + def apply( + self, + prompt: Union[str, list[int]], + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) + + # Only <|AUDIO|> tokens should be considered as placeholders, + # so we ignore the audio_bos_token and audio_eos_token + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"] + 1, + length=p["length"] - 2) for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + @MULTIMODAL_REGISTRY.register_processor( Qwen2AudioMultiModalProcessor, diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 587f18ccaf98f..9301422383696 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -137,7 +137,7 @@ def _call_hf_processor( 
mm_kwargs: Mapping[str, object], ) -> BatchFeature: # Text-only input not supported in composite processor - if not mm_data: + if not mm_data or not mm_data.get("audios", []): prompt_ids = self.info.get_tokenizer().encode(prompt) prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") @@ -146,13 +146,6 @@ def _call_hf_processor( audios = mm_data.pop("audios", []) assert isinstance(audios, list) - if not audios: - return super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - ) - feature_extractor = self.info.get_feature_extractor() mm_kwargs = dict( **mm_kwargs, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index c97acffa1a719..f57dfded0a62f 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -22,10 +22,10 @@ from vllm.logger import init_logger # yapf conflicts with isort for this block # yapf: disable -from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, - DbrxConfig, DeepseekVLV2Config, - EAGLEConfig, ExaoneConfig, - H2OVLChatConfig, +from vllm.transformers_utils.configs import (AriaConfig, ChatGLMConfig, + Cohere2Config, DbrxConfig, + DeepseekVLV2Config, EAGLEConfig, + ExaoneConfig, H2OVLChatConfig, InternVLChatConfig, JAISConfig, MedusaConfig, MllamaConfig, MLPSpeculatorConfig, MPTConfig, @@ -52,6 +52,7 @@ } _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { + "aria": AriaConfig, "chatglm": ChatGLMConfig, "cohere2": Cohere2Config, "dbrx": DbrxConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index f065c56124605..807ef4fbfd0c0 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,3 +1,4 @@ +from vllm.transformers_utils.configs.aria import AriaConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.cohere2 import Cohere2Config from vllm.transformers_utils.configs.dbrx import DbrxConfig @@ -23,6 +24,7 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ + "AriaConfig", "ChatGLMConfig", "Cohere2Config", "DbrxConfig", diff --git a/vllm/transformers_utils/configs/aria.py b/vllm/transformers_utils/configs/aria.py index d253da0d96a34..f4b531225b5d0 100644 --- a/vllm/transformers_utils/configs/aria.py +++ b/vllm/transformers_utils/configs/aria.py @@ -1,7 +1,32 @@ +# Copyright 2024 Rhymes AI. All rights reserved. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
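+#
+# Configuration classes for the Aria multi-modal model: an Idefics2-based
+# vision encoder config plus a Llama-based MoE language-model config,
+# registered under the "aria" model type in vLLM's config registry.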
+from typing import Mapping + +from transformers import PretrainedConfig from transformers.models.idefics2.configuration_idefics2 import ( Idefics2VisionConfig) from transformers.models.llama.configuration_llama import LlamaConfig +from vllm.logger import init_logger + +logger = init_logger(__name__) + class AriaVisionConfig(Idefics2VisionConfig): model_type = "aria_vision_model" @@ -45,3 +70,96 @@ def __init__( self.moe_num_experts = moe_num_experts self.moe_topk = moe_topk self.moe_num_shared_experts = moe_num_shared_experts + + +class AriaConfig(PretrainedConfig): + """ + Configuration class for Aria model. + This class handles the configuration for both vision and text components of + the Aria model, + as well as additional parameters for image token handling and projector + mapping. + + Args: + vision_config (AriaVisionConfig or dict): Configuration for the vision + component. + text_config (AriaMoELMConfig or dict): Configuration for the text + component. + projector_patch_to_query_dict (dict): Mapping of patch sizes to query + dimensions. + ignore_index (int): Index to ignore in loss calculation. + image_token_index (int): Index used to represent image tokens. + **kwargs: Additional keyword arguments passed to the parent class. + Attributes: + model_type (str): Type of the model, set to "aria". + is_composition (bool): Whether the model is a composition of multiple + components. + ignore_index (int): Index to ignore in loss calculation. + image_token_index (int): Index used to represent image tokens. + projector_patch_to_query_dict (dict): Mapping of patch sizes to query + dimensions. + vision_config (AriaVisionConfig): Configuration for the vision + component. + text_config (AriaMoELMConfig): Configuration for the text component. + """ + + model_type = "aria" + is_composition = False + + def __init__( + self, + vision_config: AriaVisionConfig = AriaVisionConfig(), # noqa: B008 + text_config: AriaMoELMConfig = AriaMoELMConfig(), # noqa: B008 + projector_patch_to_query_dict: Mapping[int, int] = { + 1225: 128, + 4900: 256, + }, + ignore_index=-100, + image_token_index=32000, + tie_word_embeddings=False, + **kwargs, + ): + super().__init__(**kwargs) + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.tie_word_embeddings = tie_word_embeddings + attn_implementation = kwargs.pop("attn_implementation", None) + + # Set the default attention implementation to flash_attention_2 if not + # specified + self._attn_implementation = ("flash_attention_2" + if attn_implementation is None else + attn_implementation) + + # Convert the keys and values of projector_patch_to_query_dict to + # integers + # This ensures consistency even if they were provided as strings + self.projector_patch_to_query_dict = { + int(k): int(v) + for k, v in projector_patch_to_query_dict.items() + } + + if isinstance(vision_config, dict) and "model_type" in vision_config: + vision_config = AriaVisionConfig(**vision_config) + if attn_implementation is None: + vision_attn_implementation = "flash_attention_2" + elif attn_implementation == "sdpa": + logger.warning("SDPA is not supported for vit, using " + "flash_attention_2 instead") + vision_attn_implementation = "flash_attention_2" + else: + vision_attn_implementation = attn_implementation + vision_config._attn_implementation = vision_attn_implementation + + self.vision_config = vision_config + + if isinstance(text_config, dict) and "model_type" in text_config: + text_attn_implementation = ("sdpa" if attn_implementation is None + else 
+                                        attn_implementation)
+            text_config = AriaMoELMConfig(**text_config)
+            text_config._attn_implementation = text_attn_implementation
+
+        self.text_config = text_config
+
+        # This is needed for the static kv cache
+        self.num_hidden_layers = self.text_config.num_hidden_layers

From e66faf4809cebf0b2169887151f782fd99bf208f Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 19 Jan 2025 16:27:26 +0800
Subject: [PATCH 15/50] [torch.compile] store inductor compiled Python file (#12182)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/compilation/backends.py | 80 ++++++++++++++++++++++++++----------
 vllm/config.py | 13 +-----
 2 files changed, 60 insertions(+), 33 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 157e3f7f39c9c..d7f4dcb7a20fc 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -25,23 +25,30 @@
 logger = init_logger(__name__)
 
 
+@dataclasses.dataclass
+class InductorArtifact:
+    hash_str: str = ""
+    file_path: str = ""
+
+
 class InductorHashCache:
     """
     Disk format: a Python list of tuples, each tuple is
-    (runtime_shape, graph_index, hash_str)
+    (runtime_shape, graph_index, hash_str, file_path)
     We use list of tuple for readability.
 
     In-memory format: a defaultdict of dict, where the key is
     runtime_shape, and the value is a dict of graph_index to hash_str.
 
-    The data is essentially `Dict[Optional[int], Dict[int, str]]`,
+    The data is essentially `Dict[Optional[int], Dict[int, InductorArtifact]]`,
     we don't use json here because json doesn't support int as key.
 
     TODO: better off-the-shelf solution to serialize the data?
     """
 
     def __init__(self, cache_dir: str, disabled: bool = False):
-        self.cache: defaultdict = defaultdict(dict)
+        self.cache: Dict[Optional[int],
+                         Dict[int, InductorArtifact]] = defaultdict(dict)
         self.disabled = disabled
         self.cache_dir = cache_dir
         self.cache_file_path = os.path.join(cache_dir,
@@ -66,14 +73,25 @@ def deserialize(self, data: str):
         # because it is a safe way to parse Python literals.
         # do not use eval(), it is unsafe.
         list_data = ast.literal_eval(data)
-        for runtime_shape, graph_index, hash_str in list_data:
-            self.cache[runtime_shape][graph_index] = hash_str
+        for item in list_data:
+            runtime_shape = item[0]
+            graph_index = item[1]
+            hash_str = item[2]
+            # for compatibility with the old format,
+            # which doesn't have file_path.
+            # NOTE: after running the new code, the file_path
+            # will be updated.
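+            # (e.g. an old entry looks like (None, 0, "abc..."),
+            # a new one like (None, 0, "abc...", ".../output_code.py"))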
+ file_path = "" if len(item) == 3 else item[3] + self.cache[runtime_shape][graph_index] = InductorArtifact( + hash_str=hash_str, file_path=file_path) def serialize(self) -> str: data = [] - for runtime_shape, graph_index_to_hash_str in self.cache.items(): - for graph_index, hash_str in graph_index_to_hash_str.items(): - data.append((runtime_shape, graph_index, hash_str)) + for runtime_shape, value in self.cache.items(): + for graph_index, inductor_artifact in value.items(): + data.append( + (runtime_shape, graph_index, inductor_artifact.hash_str, + inductor_artifact.file_path)) printer = pprint.PrettyPrinter(indent=4) return printer.pformat(data) @@ -90,13 +108,14 @@ def __contains__(self, key: Tuple[Optional[int], int]) -> bool: return runtime_shape in self.cache and graph_index in self.cache[ runtime_shape] - def __getitem__(self, key: Tuple[Optional[int], int]) -> str: + def __getitem__(self, key: Tuple[Optional[int], int]) -> InductorArtifact: if self.disabled: raise KeyError("cannot read from disabled cache") runtime_shape, graph_index = key return self.cache[runtime_shape][graph_index] - def __setitem__(self, key: Tuple[Optional[int], int], value: str): + def __setitem__(self, key: Tuple[Optional[int], int], + value: InductorArtifact): # setitem for disabled cache is fine, because we # don't actually write to the disk runtime_shape, graph_index = key @@ -181,7 +200,8 @@ def wrap_inductor(graph: fx.GraphModule, if (runtime_shape, graph_index) in cache_data: # we compiled this graph before # so we can directly lookup the compiled graph via hash - hash_str = cache_data[(runtime_shape, graph_index)] + inductor_artifact = cache_data[(runtime_shape, graph_index)] + hash_str = inductor_artifact.hash_str if graph_index == 0: # adds some info logging for the first graph logger.info( @@ -199,6 +219,7 @@ def wrap_inductor(graph: fx.GraphModule, "Inductor cache lookup failed. Please remove" f"the cache file {cache_data.cache_file_path} and try again." # noqa ) + inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa # Inductor calling convention (function signature): # f(list) -> tuple @@ -224,19 +245,20 @@ def compiled_graph(*args): # the assumption is that we don't have nested Inductor compilation. # compiled_fx_graph_hash will only be called once, and we can hook # it to get the hash of the compiled graph directly. 
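        # (Both hooks below are installed with stack.enter_context(patch(...)),
        # so they stay active only for the duration of this compile_fx call.)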
- from torch._inductor.codecache import compiled_fx_graph_hash + + inductor_artifact = InductorArtifact() + from torch._inductor.codecache import (FxGraphCache, + compiled_fx_graph_hash) + original_load = FxGraphCache.load + + def hijack_load(*args, **kwargs): + inductor_compiled_graph = original_load(*args, **kwargs) + inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa + return inductor_compiled_graph def hijack_compiled_fx_graph_hash(*args, **kwargs): out = compiled_fx_graph_hash(*args, **kwargs) - # store the hash in the cache - nonlocal cache_data - cache_data[(runtime_shape, graph_index)] = out[0] - if graph_index == 0: - # adds some info logging for the first graph - logger.info("Cache the graph of shape %s for later use", - str(runtime_shape)) - logger.debug("store the %s-th graph for shape %s via hash %s", - graph_index, str(runtime_shape), out[0]) + inductor_artifact.hash_str = out[0] return out def _check_can_cache(*args, **kwargs): @@ -255,6 +277,11 @@ def _get_shape_env() -> AlwaysHitShapeEnv: if not cache_data.disabled: # compilation cache is enabled, patch several functions + # hijack to get the compiled graph itself + stack.enter_context( + patch("torch._inductor.codecache.FxGraphCache.load", + hijack_load)) + # for hijacking the hash of the compiled graph stack.enter_context( patch("torch._inductor.codecache.compiled_fx_graph_hash", @@ -275,7 +302,16 @@ def _get_shape_env() -> AlwaysHitShapeEnv: compiled_graph = compile_fx(graph, example_inputs, config_patches=current_config) - + # store the inductor_artifact in the cache + cache_data[(runtime_shape, graph_index)] = inductor_artifact + if graph_index == 0: + # adds some info logging for the first graph + logger.info("Cache the graph of shape %s for later use", + str(runtime_shape)) + logger.debug( + "store the %s-th graph for shape %s via hash %s from file %s", + graph_index, str(runtime_shape), inductor_artifact.hash_str, + inductor_artifact.file_path) # after compiling the last graph, record the end time if graph_index == num_graphs - 1: now = time.time() diff --git a/vllm/config.py b/vllm/config.py index ac5a4c91b1738..4698a05020332 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2862,17 +2862,8 @@ def model_post_init(self, __context: Any) -> None: "vllm.unified_attention_with_output", ] else: - # v0 can use full graph compilation without splitting, - # splitting is optional. - # right now we still need it. kv cache shape - # will be included in the graph if we don't split - # the graph. - # TODO: hide kv cache in static forward context - # so that inductor does not see it. 
- self.splitting_ops = [ - "vllm.unified_attention", - "vllm.unified_attention_with_output", - ] + # v0 uses full graph compilation + self.splitting_ops = [] for k, v in self.inductor_passes.items(): if not isinstance(v, str): From 936db119ed390fadc7de448261226358e153e46c Mon Sep 17 00:00:00 2001 From: gujing <925973396@qq.com> Date: Sun, 19 Jan 2025 17:59:56 +0800 Subject: [PATCH 16/50] benchmark_serving support --served-model-name param (#12109) Signed-off-by: zibai <zibai.gj@alibaba-inc.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- benchmarks/backend_request_func.py | 9 ++++++--- benchmarks/benchmark_serving.py | 13 +++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 9d71e4ecc4a37..a9ab4fc9b621e 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -22,6 +22,7 @@ class RequestFuncInput: prompt_len: int output_len: int model: str + model_name: Optional[str] = None best_of: int = 1 logprobs: Optional[int] = None extra_body: Optional[dict] = None @@ -78,7 +79,7 @@ async def async_request_tgi( continue chunk_bytes = chunk_bytes.decode("utf-8") - #NOTE: Sometimes TGI returns a ping response without + # NOTE: Sometimes TGI returns a ping response without # any data, we should skip it. if chunk_bytes.startswith(":"): continue @@ -235,7 +236,8 @@ async def async_request_openai_completions( async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, "best_of": request_func_input.best_of, @@ -328,7 +330,8 @@ async def async_request_openai_chat_completions( if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "messages": [ { "role": "user", diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 4eb0e1f8ac903..53186e10c5452 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -525,6 +525,7 @@ async def benchmark( api_url: str, base_url: str, model_id: str, + model_name: str, tokenizer: PreTrainedTokenizerBase, input_requests: List[Tuple[str, int, int]], logprobs: Optional[int], @@ -553,6 +554,7 @@ async def benchmark( "Multi-modal content is only supported on 'openai-chat' backend.") test_input = RequestFuncInput( model=model_id, + model_name=model_name, prompt=test_prompt, api_url=api_url, prompt_len=test_prompt_len, @@ -573,6 +575,7 @@ async def benchmark( if profile: print("Starting profiler...") profile_input = RequestFuncInput(model=model_id, + model_name=model_name, prompt=test_prompt, api_url=base_url + "/start_profile", prompt_len=test_prompt_len, @@ -616,6 +619,7 @@ async def limited_request_func(request_func_input, pbar): async for request in get_request(input_requests, request_rate, burstiness): prompt, prompt_len, output_len, mm_content = request request_func_input = RequestFuncInput(model=model_id, + model_name=model_name, prompt=prompt, api_url=api_url, prompt_len=prompt_len, @@ -780,6 +784,7 @@ def main(args: argparse.Namespace): backend = args.backend model_id = args.model + model_name = args.served_model_name tokenizer_id = 
args.tokenizer if args.tokenizer is not None else args.model tokenizer_mode = args.tokenizer_mode @@ -877,6 +882,7 @@ def main(args: argparse.Namespace): api_url=api_url, base_url=base_url, model_id=model_id, + model_name=model_name, tokenizer=tokenizer, input_requests=input_requests, logprobs=args.logprobs, @@ -1222,5 +1228,12 @@ def main(args: argparse.Namespace): 'always use the slow tokenizer. \n* ' '"mistral" will always use the `mistral_common` tokenizer.') + parser.add_argument("--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. ") + args = parser.parse_args() main(args) From edaae198e72d36e22a10e9e76198fac32f670b49 Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Sun, 19 Jan 2025 19:49:22 +0800 Subject: [PATCH 17/50] [Misc] Add BNB support to GLM4-V model (#12184) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/model_loader/loader.py | 15 ++- vllm/model_executor/models/chatglm.py | 95 +++++++++---------- .../models/glm4_vision_encoder.py | 3 +- 3 files changed, 60 insertions(+), 53 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 9fe0db62435a0..e6e37358482fc 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1105,15 +1105,22 @@ def _load_weights(self, model_config: ModelConfig, weight_name, index, ) in self.modules_mapping.inverse_packed_mapping.items(): - shard_pos = quant_param_name.find(shard_name) # Some models, such as MiniCPM V2.5/2.6, contain both # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj' # from being incorrectly identified as being present in # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight - if shard_pos > 0 and quant_param_name[shard_pos - 1] == ".": + shard_pos = quant_param_name.find(shard_name) + can_correct_rename = (shard_pos > 0) and ( + quant_param_name[shard_pos - 1] == ".") + # If the quant_param_name is packed, it won't occur in the + # param_dict before renaming. 
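+                    # e.g. a checkpoint key containing "q_proj" is renamed to
+                    # the fused "qkv_proj" only when the fused key actually
+                    # exists in param_dict (shard names here are illustrative).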
+ new_quant_param_name = quant_param_name.replace( + shard_name, weight_name) + need_rename = (quant_param_name not in param_dict) \ + and (new_quant_param_name in param_dict) + if can_correct_rename and need_rename: shard_index = index - quant_param_name = quant_param_name.replace( - shard_name, weight_name) + quant_param_name = new_quant_param_name break # Models like Clip/Siglip may skip some layers in initialization, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 7e37ce3086e6b..d5f9b4d19e5ca 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -41,7 +41,7 @@ from vllm.transformers_utils.configs import ChatGLMConfig from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -605,9 +605,50 @@ def forward( return IntermediateTensors({"hidden_states": hidden_states}) return hidden_states + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("linear_proj.merged_proj", "linear_proj.gate_proj", 0), + ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if "rotary_pos_emb.inv_freq" in name: + continue + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={".word_embeddings": ""}, ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -660,52 +701,9 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - # Merge two ColumnParallelLinear into one MergedColumnParallelLinear - merged_weights_dict: Dict[str, Dict[str, Optional[torch.Tensor]]] = { - "transformer.vision.linear_proj.merged_proj.weight": { - "transformer.vision.linear_proj.gate_proj.weight": None, - "transformer.vision.linear_proj.dense_h_to_4h.weight": None, - } - } - - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - is_weight_to_be_merge = False - for _, merged_weight_dict in merged_weights_dict.items(): - if name in merged_weight_dict: - assert merged_weight_dict[name] is None - merged_weight_dict[name] = loaded_weight - is_weight_to_be_merge = True - if is_weight_to_be_merge: - continue - if 
"rotary_pos_emb.inv_freq" in name: - continue - if "word_embeddings" in name: - name = name.replace(".word_embeddings", "") - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - - for combined_name, merged_weight_dict in merged_weights_dict.items(): - if combined_name in params_dict: - param = params_dict[combined_name] - combined_weight = torch.cat(list(merged_weight_dict.values()), - dim=0) - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, combined_weight) - loaded_params.add(combined_name) - return loaded_params + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) class ChatGLM(ChatGLMBaseModel): @@ -726,6 +724,7 @@ class ChatGLM(ChatGLMBaseModel): class ChatGLMV(ChatGLMBaseModel, SupportsMultiModal): + packed_modules_mapping = { "query_key_value": ["query_key_value"], "dense_h_to_4h": ["dense_h_to_4h"], @@ -777,7 +776,7 @@ def __new__( ) -> None: config = vllm_config.model_config.hf_config # Initialize VL - if hasattr(config, "visual"): + if hasattr(config, "vision_config"): return ChatGLMV(vllm_config=vllm_config, prefix=prefix) # Initialize LLM else: diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index 39a5736eb199b..51922e6f2d03d 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -42,7 +42,8 @@ def forward(self, images: torch.Tensor) -> torch.Tensor: torch.Tensor Transformed tensor with shape (B, L, D) """ - images = images.to(self.proj.weight.device) + images = images.to(device=self.proj.weight.device, + dtype=self.proj.weight.dtype) x = self.proj(images) x = x.flatten(2).transpose(1, 2) cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) From 81763c58a01eda9205f3750177358acc79613e65 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sun, 19 Jan 2025 03:52:13 -0800 Subject: [PATCH 18/50] [V1] Add V1 support of Qwen2-VL (#12128) Signed-off-by: Roger Wang <ywang@roblox.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: imkero <kerorek@outlook.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> --- docs/source/models/supported_models.md | 2 +- .../vision_language/test_qwen2_vl.py | 18 +-- vllm/compilation/decorators.py | 14 +- .../model_executor/layers/rotary_embedding.py | 44 +++++- vllm/model_executor/models/llava_onevision.py | 6 +- vllm/model_executor/models/qwen2.py | 10 +- vllm/model_executor/models/qwen2_vl.py | 142 ++++++++++-------- vllm/v1/worker/gpu_input_batch.py | 3 + vllm/v1/worker/gpu_model_runner.py | 138 ++++++++++++++++- 9 files changed, 292 insertions(+), 85 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 2edb610ddf959..eb1bde9ec0089 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -754,7 +754,7 @@ See [this page](#generative-models) for more information on how to use generativ - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. 
- ✅︎ - ✅︎ - - + - ✅︎ * - `UltravoxModel` - Ultravox - T + A<sup>E+</sup> diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 16e256e040a74..2fd22f0cc88ec 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -105,7 +105,7 @@ def batch_make_image_embeddings( pixel_values = preprocess_result["pixel_values"] image_grid_thw = preprocess_result["image_grid_thw"] - # pixel values to embeddinds & grid_thws + # pixel values to embeddings & grid_thws with torch.no_grad(): visual = llm.llm_engine.model_executor.driver_worker. \ model_runner.model.visual @@ -124,11 +124,10 @@ def batch_make_image_embeddings( for image_batch in image_batches_: cur_batch_image_count = len(image_batch) merge_size = image_processor.merge_size - cur_batch_embed_len = sum([ - grid_thw.prod() // merge_size // merge_size + cur_batch_embed_len = sum( + grid_thw.prod(-1) // merge_size // merge_size for grid_thw in image_grid_thw[image_counter:image_counter + - cur_batch_image_count] - ]) + cur_batch_image_count]) result.append({ "image_embeds": @@ -187,7 +186,7 @@ def batch_make_video_embeddings( pixel_values = preprocess_result["pixel_values_videos"] video_grid_thw = preprocess_result["video_grid_thw"] - # pixel values to embeddinds & grid_thws + # pixel values to embeddings & grid_thws with torch.no_grad(): visual = llm.llm_engine.model_executor.driver_worker.\ model_runner.model.visual @@ -206,11 +205,10 @@ def batch_make_video_embeddings( for video_batch in video_batches_: cur_batch_video_count = len(video_batch) merge_size = image_processor.merge_size - cur_batch_embed_len = sum([ - grid_thw.prod() // merge_size // merge_size + cur_batch_embed_len = sum( + grid_thw.prod(-1) // merge_size // merge_size for grid_thw in video_grid_thw[video_counter:video_counter + - cur_batch_video_count] - ]) + cur_batch_video_count]) result.append({ "video_embeds": diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 10513111ea7f1..38f284794b8db 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -76,8 +76,8 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): During runtime, when we actually mark dimensions of tensors, it depends on the value of arguments: - - if it is a single integer, the corresponding dimension of the argument - will be marked as dynamic. + - if it is a single integer (can be negative), the corresponding dimension + of the argument will be marked as dynamic. - if it is `None`, ignored. - if it is `IntermediateTensors`, all the tensors in the intermediate tensors will be marked as dynamic. 
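    For example, the Qwen2 text backbone later in this series passes
    `{"positions": -1}` so that the trailing dimension stays dynamic for both
    the 1D `(seq_len, )` and M-RoPE `(3, seq_len)` position layouts.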
@@ -177,10 +177,20 @@ def __call__(self, *args, **kwargs): for k, dims in dynamic_arg_dims.items(): arg = bound_args.arguments.get(k) if arg is not None: + dims = [dims] if isinstance(dims, int) else dims if isinstance(arg, torch.Tensor): + # In case dims is specified with negative indexing + dims = [ + arg.ndim + dim if dim < 0 else dim for dim in dims + ] torch._dynamo.mark_dynamic(arg, dims) elif isinstance(arg, IntermediateTensors): for tensor in arg.tensors.values(): + # In case dims is specified with negative indexing + dims = [ + tensor.ndim + dim if dim < 0 else dim + for dim in dims + ] torch._dynamo.mark_dynamic(tensor, dims) else: raise ValueError( diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 3fcd81a3c4213..d071cfe888f05 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -841,6 +841,37 @@ def get_input_positions( ) -> Tuple[List[List[int]], int]: """Get mrope input positions and delta value.""" + llm_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + input_tokens, + image_grid_thw, + video_grid_thw, + image_token_id, + video_token_id, + vision_start_token_id, + vision_end_token_id, + spatial_merge_size, + context_len, + seq_len, + ) + + return llm_positions.tolist(), mrope_position_delta + + @staticmethod + def get_input_positions_tensor( + input_tokens: List[int], + image_grid_thw: Union[List[List[int]], torch.Tensor], + video_grid_thw: Union[List[List[int]], torch.Tensor], + image_token_id: int, + video_token_id: int, + vision_start_token_id: int, + vision_end_token_id: int, + spatial_merge_size: int, + context_len: int = 0, + seq_len: Optional[int] = None, + ) -> Tuple[torch.Tensor, int]: + """Get mrope input positions and delta value.""" + if isinstance(image_grid_thw, torch.Tensor): image_grid_thw = image_grid_thw.tolist() if isinstance(video_grid_thw, torch.Tensor): @@ -916,7 +947,7 @@ def get_input_positions( len(input_tokens)).item() llm_positions = llm_positions[:, context_len:seq_len] - return llm_positions.tolist(), mrope_position_delta + return llm_positions, mrope_position_delta @staticmethod def get_next_input_positions( @@ -930,6 +961,17 @@ def get_next_input_positions( seq_len + mrope_position_delta)) for _ in range(3) ] + @staticmethod + def get_next_input_positions_tensor( + mrope_position_delta: int, + context_len: int, + seq_len: int, + ) -> torch.Tensor: + return torch.arange( + mrope_position_delta + context_len, + mrope_position_delta + seq_len, + ).expand(3, -1) + _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {} diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index c9283e0c5ba20..6faa79f65d8de 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -554,10 +554,12 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: # Preserve the order of modalities if there are multiple of them # from the order of kwargs. 
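        # (Python dicts preserve insertion order, so iterating over kwargs
        # yields the modalities in the order the caller passed them in.)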
for input_key in kwargs: - if input_key == "pixel_values" and "images" not in modalities: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: modalities["images"] = self._parse_and_validate_image_input( **kwargs) - if input_key == "pixel_values_videos" and "videos" not in modalities: # noqa E501 + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: modalities["videos"] = self._parse_and_validate_video_input( **kwargs) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index d015f60c6d065..82de1c3574090 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -256,7 +256,15 @@ def forward( return hidden_states, residual -@support_torch_compile +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + }) class Qwen2Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d00e5d362c8bc..34d5c8ad089a3 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -67,11 +67,15 @@ from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, - init_vllm_registered_model, maybe_prefix) + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) from .vision import get_vit_attn_backend logger = init_logger(__name__) +# For profile run +_MAX_FRAMES_PER_VIDEO = 16 + # === Vision Inputs === # @@ -135,7 +139,7 @@ class Qwen2VLVideoEmbeddingInputs(TypedDict): - List[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features - (concatenation of all videos' feature tensors). + (concatenation of all videos' feature tensors). 
Tensor shape: `(num_image_features, hidden_size)` - `num_image_features` varies based on @@ -611,6 +615,7 @@ def forward( # adapter x = self.merger(x) + return x def load_weights(self, weights: Iterable[Tuple[str, @@ -874,8 +879,8 @@ def get_num_frames_with_most_features(self, seq_len: int) -> int: max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens) - - num_frames = max(max_total_frames // max(max_videos, 1), 1) + num_frames = min(max(max_total_frames // max(max_videos, 1), 1), + _MAX_FRAMES_PER_VIDEO) # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 if num_frames > 1 and num_frames % 2 == 1: @@ -955,13 +960,14 @@ def _get_prompt_replacements( "image": hf_processor.image_token, "video": hf_processor.video_token, } + merge_length = image_processor.merge_size**2 def get_replacement_qwen2vl(item_idx: int, modality: str): grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx] assert isinstance(grid_thw, torch.Tensor) - num_tokens = grid_thw.prod() // merge_length + num_tokens = grid_thw.prod().item() // merge_length return placeholder[modality] * num_tokens return [ @@ -1047,11 +1053,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: Qwen2VLConfig = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config - assert not cache_config.enable_prefix_caching, \ - "Qwen2-VL currently does not support prefix caching" self.config = config self.multimodal_config = multimodal_config @@ -1173,59 +1176,82 @@ def _parse_and_validate_video_input( video_embeds=video_embeds, video_grid_thw=video_grid_thw) - def _process_image_input(self, - image_input: Qwen2VLImageInputs) -> torch.Tensor: + def _process_image_input( + self, image_input: Qwen2VLImageInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + if image_input["type"] == "image_embeds": - return image_input["image_embeds"].type(self.visual.dtype) + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + + # Split concatenated embeddings for each image item. 
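+        # Each item occupies grid_t * grid_h * grid_w patches, and the merger
+        # fuses merge_size**2 patches into one embedding, so
+        # grid_thw.prod(-1) // merge_size // merge_size tokens remain apiece.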
+        merge_size = self.visual.spatial_merge_size
+        sizes = grid_thw.prod(-1) // merge_size // merge_size
+
+        return image_embeds.split(sizes.tolist())
+
+    def _process_video_input(
+            self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]:
 
-        pixel_values = image_input["pixel_values"].type(self.visual.dtype)
-        image_embeds = self.visual(pixel_values,
-                                   grid_thw=image_input["image_grid_thw"])
-        return image_embeds
+        grid_thw = video_input["video_grid_thw"]
+        assert grid_thw.ndim == 2
 
-    def _process_video_input(self,
-                             video_input: Qwen2VLVideoInputs) -> torch.Tensor:
         if video_input["type"] == "video_embeds":
-            return video_input["video_embeds"].type(self.visual.dtype)
+            video_embeds = video_input["video_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values_videos = video_input["pixel_values_videos"].type(
+                self.visual.dtype)
+            video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
 
-        pixel_values_videos = video_input["pixel_values_videos"].type(
-            self.visual.dtype)
-        video_embeds = self.visual(pixel_values_videos,
-                                   grid_thw=video_input["video_grid_thw"])
-        return video_embeds
+        # Split concatenated embeddings for each video item.
+        merge_size = self.visual.spatial_merge_size
+        sizes = grid_thw.prod(-1) // merge_size // merge_size
 
-    def _merge_multimodal_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        inputs_embeds: torch.Tensor,
-        multimodal_embeddings: torch.Tensor,
-        placeholder_token_id: int,
-    ) -> torch.Tensor:
-        mask = (input_ids == placeholder_token_id)
-        inputs_embeds[mask, :] = multimodal_embeddings
-        return inputs_embeds
+        return video_embeds.split(sizes.tolist())
+
+    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+        modalities = {}
+
+        # Preserve the order of modalities if there are multiple of them
+        # from the order of kwargs.
+        for input_key in kwargs:
+            if input_key in ("pixel_values",
+                             "image_embeds") and "images" not in modalities:
+                modalities["images"] = self._parse_and_validate_image_input(
+                    **kwargs)
+            if input_key in ("pixel_values_videos",
+                             "video_embeds") and "videos" not in modalities:
+                modalities["videos"] = self._parse_and_validate_video_input(
+                    **kwargs)
+
+        return modalities
 
     def get_multimodal_embeddings(
             self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]:
 
-        image_input = self._parse_and_validate_image_input(**kwargs)
-        video_input = self._parse_and_validate_video_input(**kwargs)
-        if image_input is None and video_input is None:
+        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if not modalities:
             return None
 
-        # We make a tuple of each embedding with its modality string. This is a
-        # temporary workaround for models to handle mixed modalities when
-        # get_multimodal_embeddings and get_input_embeddings are called
-        # separately.
-        # TODO(ywang96): Add support for mixed-modality inference for v1.
-        multimodal_embeddings: List[Tuple[NestedTensors, str]] = []
-
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            multimodal_embeddings.append((image_embeds, "image"))
-        if video_input is not None:
-            video_embeds = self._process_video_input(video_input)
-            multimodal_embeddings.append((video_embeds, "video"))
+        # The result multimodal_embeddings is a tuple of tensors, with each
+        # tensor corresponding to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
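+        # (merge_multimodal_embeddings later scatters this tuple into the
+        # placeholder positions of input_ids, so prompt order must be kept.)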
+ for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_video_input(video_input) + multimodal_embeddings += video_embeddings return multimodal_embeddings @@ -1237,21 +1263,9 @@ def get_input_embeddings( ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: - for embeddings, modality in multimodal_embeddings: - if modality == "image": - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - embeddings, - placeholder_token_id=self.config.image_token_id, - ) - if modality == "video": - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - embeddings, - placeholder_token_id=self.config.video_token_id, - ) + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + [self.config.image_token_id, self.config.video_token_id]) return inputs_embeds def forward( diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 40494e64b22f0..28d8e39053874 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -30,6 +30,9 @@ class CachedRequestState: num_computed_tokens: int output_token_ids: List[int] + mrope_positions: Optional[torch.Tensor] = None + mrope_position_delta: Optional[int] = None + @property def num_tokens(self) -> int: return len(self.prompt_token_ids) + len(self.output_token_ids) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index aa63d9414c296..87a1cd7f9e627 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -14,6 +14,7 @@ from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY from vllm.logger import init_logger +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import get_model from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.sampling_params import SamplingType @@ -139,6 +140,32 @@ def __init__( self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + # NOTE: `mrope_positions` is implemented as a permuted tensor to + # satisfy the following properties to allow `torch.compile` to work + # properly: + # - shape: (3, <variable>) + # - stride: (1, 3) + # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1921022256 + + # NOTE: When M-RoPE is enabled, position ids are 3D regardless of + # the modality of inputs. For text-only inputs, each dimension has + # identical position IDs, making M-RoPE functionally equivalent to + # 1D-RoPE. 
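+            # e.g. a text-only prompt of length 4 gets positions
+            # [[0, 1, 2, 3]] * 3; only vision tokens receive distinct
+            # temporal/height/width rows (example values for illustration).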
+ # See page 5 of https://arxiv.org/abs/2409.12191 + self.mrope_positions = torch.zeros((self.max_num_tokens, 3), + dtype=torch.int64, + device=self.device) + self.mrope_positions_cpu = torch.zeros((self.max_num_tokens, 3), + dtype=torch.int64, + device="cpu", + pin_memory=self.pin_memory) + + self.mrope_positions = self.mrope_positions.permute((1, 0)) + self.mrope_positions_cpu = self.mrope_positions_cpu.permute((1, 0)) + self.inputs_embeds = torch.zeros( (self.max_num_tokens, self.hidden_size), dtype=self.dtype, @@ -246,6 +273,35 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: num_computed_tokens=new_req_data.num_computed_tokens, output_token_ids=[], ) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + image_grid_thw = [] + video_grid_thw = [] + for mm_input in self.requests[req_id].mm_inputs: + if mm_input.get("image_grid_thw") is not None: + image_grid_thw.extend( + mm_input["image_grid_thw"].tolist()) + if mm_input.get("video_grid_thw") is not None: + video_grid_thw.extend( + mm_input["video_grid_thw"].tolist()) + + hf_config = self.model_config.hf_config + + self.requests[req_id].mrope_positions, \ + self.requests[req_id].mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + self.requests[req_id].prompt_token_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + image_token_id=hf_config.image_token_id, + video_token_id=hf_config.video_token_id, + vision_start_token_id=hf_config.vision_start_token_id, + vision_end_token_id=hf_config.vision_end_token_id, + spatial_merge_size=hf_config.vision_config. + spatial_merge_size, + ) + req_ids_to_add.append(req_id) # Update the cached states of the resumed requests. @@ -313,6 +369,11 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): arange, out=positions_np) + # Calculate M-RoPE positions. + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + self._calc_mrope_positions(scheduler_output) + # Get token indices. # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] @@ -359,8 +420,16 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Copy the tensors to the GPU. 
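        # (The CPU staging buffers are allocated with pin_memory where
        # available, so the non_blocking=True copies below can overlap with
        # host-side work.)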
self.input_ids[:total_num_scheduled_tokens].copy_( self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) - self.positions[:total_num_scheduled_tokens].copy_( - self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True) + if self.model_config.uses_mrope: + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + self.mrope_positions[:, :total_num_scheduled_tokens].copy_( + self.mrope_positions_cpu[:, :total_num_scheduled_tokens], + non_blocking=True) + else: + # Common case (1D positions) + self.positions[:total_num_scheduled_tokens].copy_( + self.positions_cpu[:total_num_scheduled_tokens], + non_blocking=True) query_start_loc = self.query_start_loc_cpu[:num_reqs + 1].to( self.device, non_blocking=True) seq_start_loc = self.seq_start_loc_cpu[:num_reqs + 1].to( @@ -472,6 +541,61 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): logits_indices = query_start_loc[1:] - 1 return attn_metadata, logits_indices + def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): + mrope_pos_ptr = 0 + num_reqs = self.input_batch.num_reqs + for index, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): + assert req_id is not None + + req = self.requests[req_id] + assert req.mrope_positions is not None + + num_computed_tokens = \ + self.input_batch.num_computed_tokens_cpu[index] + num_scheduled_tokens = \ + scheduler_output.num_scheduled_tokens[req_id] + num_prompt_tokens = len(req.prompt_token_ids) + + if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens: + prompt_part_len = max(0, + num_prompt_tokens - num_computed_tokens) + completion_part_len = max( + 0, num_scheduled_tokens - prompt_part_len) + else: + prompt_part_len = num_scheduled_tokens + completion_part_len = 0 + + assert num_scheduled_tokens == prompt_part_len + completion_part_len + + if prompt_part_len > 0: + # prompt's mrope_positions are pre-computed + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + prompt_part_len + src_start = num_computed_tokens + src_end = num_computed_tokens + prompt_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + req.mrope_positions[:,src_start:src_end] + + mrope_pos_ptr += prompt_part_len + + if completion_part_len > 0: + # compute completion's mrope_positions on-the-fly + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + completion_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + MRotaryEmbedding.get_next_input_positions_tensor( + req.mrope_position_delta, + context_len=num_computed_tokens + + prompt_part_len, + seq_len=num_computed_tokens + + prompt_part_len + + completion_part_len, + ) + + mrope_pos_ptr += completion_part_len + def _prepare_sampling( self, scheduler_output: "SchedulerOutput", @@ -618,9 +742,12 @@ def execute_model( # Run the decoder. # Use persistent buffers for CUDA graphs. 
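        # (CUDA graph replay reuses fixed device addresses, so inputs must be
        # written into the same preallocated tensors on every step.)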
with set_forward_context(attn_metadata, self.vllm_config): + positions = self.mrope_positions[:, :num_input_tokens] \ + if self.model_config.uses_mrope \ + else self.positions[:num_input_tokens] hidden_states = self.model( input_ids=input_ids, - positions=self.positions[:num_input_tokens], + positions=positions, kv_caches=self.kv_caches, attn_metadata=None, inputs_embeds=inputs_embeds, @@ -707,9 +834,12 @@ def _dummy_run( input_ids = self.input_ids[:num_tokens] inputs_embeds = None with set_forward_context(None, self.vllm_config): + positions = self.mrope_positions[:, :num_tokens] \ + if self.model_config.uses_mrope \ + else self.positions[:num_tokens] hidden_states = model( input_ids=input_ids, - positions=self.positions[:num_tokens], + positions=positions, kv_caches=kv_caches, attn_metadata=None, inputs_embeds=inputs_embeds, From bbe5f9de7dab1fa905807577faa185d85040213a Mon Sep 17 00:00:00 2001 From: Martin Gleize <mgleize@meta.com> Date: Sun, 19 Jan 2025 19:40:40 +0100 Subject: [PATCH 19/50] [Model] Support for fairseq2 Llama (#11442) Signed-off-by: Martin Gleize <mgleize@meta.com> Co-authored-by: mgleize user <mgleize@a100-st-p4de24xlarge-4.fair-a100.hpcaas> --- tests/models/registry.py | 1 + tests/weight_loading/models.txt | 3 +- tests/weight_loading/test_weight_loading.py | 13 +- vllm/model_executor/layers/linear.py | 34 +++-- vllm/model_executor/model_loader/loader.py | 15 +- vllm/model_executor/models/fairseq2_llama.py | 151 +++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 7 files changed, 197 insertions(+), 21 deletions(-) create mode 100644 vllm/model_executor/models/fairseq2_llama.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 938c838617e8b..cb0521cfe80a7 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -69,6 +69,7 @@ class _HfExamplesInfo: "DeepseekV3ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3", # noqa: E501 trust_remote_code=True), "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 + "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index a06956ce18a93..272206d4502e9 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -30,4 +30,5 @@ marlin, nm-testing/zephyr-beta-7b-marlin-g128, main marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main qqq, HandH1998/QQQ-Llama-3-8b-g128, main qqq, HandH1998/QQQ-Llama-3-8b, main -hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main \ No newline at end of file +hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main +None, mgleize/fairseq2-dummy-Llama-3.2-1B, main \ No newline at end of file diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py index 199731bdc21fe..7a3786456d0d6 100644 --- a/tests/weight_loading/test_weight_loading.py +++ b/tests/weight_loading/test_weight_loading.py @@ -20,12 +20,13 @@ def test_weight_loading(vllm_runner): """ Test parameter weight loading with tp>1. 
""" - with vllm_runner(model_name=MODEL_NAME, - revision=REVISION, - dtype=torch.half if QUANTIZATION == "gptq" else "auto", - quantization=QUANTIZATION, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=2) as model: + with vllm_runner( + model_name=MODEL_NAME, + revision=REVISION, + dtype=torch.half if QUANTIZATION == "gptq" else "auto", + quantization=None if QUANTIZATION == "None" else QUANTIZATION, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=2) as model: output = model.generate_greedy("Hello world!", max_tokens=20) print(output) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 00ae64bbe6388..52263e96fb9f9 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -344,11 +344,13 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit param_data = param.data - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if output_dim is not None and not use_bitsandbytes_4bit: + if output_dim is not None and not is_sharded_weight: shard_size = param_data.shape[output_dim] start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, @@ -546,6 +548,11 @@ def weight_loader(self, use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + if use_bitsandbytes_4bit: shard_size = loaded_weight.shape[output_dim] shard_offset = loaded_weight.shape[output_dim] * \ @@ -554,9 +561,7 @@ def weight_loader(self, param_data = param_data.narrow(output_dim, shard_offset, shard_size) start_idx = tp_rank * shard_size - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if not use_bitsandbytes_4bit: + if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) # Special case for AQLM codebooks. 
@@ -941,6 +946,11 @@ def weight_loader(self, use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + if use_bitsandbytes_4bit: orig_qkv_offsets = { "q": (0, self.num_heads * self.head_size), @@ -964,9 +974,7 @@ def weight_loader(self, shard_id = tp_rank // self.num_kv_head_replicas start_idx = shard_id * shard_size - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if not use_bitsandbytes_4bit: + if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) @@ -1070,6 +1078,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): tp_size = get_tensor_model_parallel_world_size() input_dim = getattr(param, "input_dim", None) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit # Special case for GGUF is_gguf_weight = getattr(param, "is_gguf_weight", False) @@ -1085,9 +1097,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype) param_data = param.data - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if input_dim is not None and not use_bitsandbytes_4bit: + if input_dim is not None and not is_sharded_weight: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(input_dim, start_idx, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index e6e37358482fc..f697c3245f098 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -182,6 +182,9 @@ class Source: fall_back_to_pt: bool = True """Whether .pt weights can be used.""" + allow_patterns_overrides: Optional[list[str]] = None + """If defined, weights will load exclusively using these patterns.""" + def __init__(self, load_config: LoadConfig): super().__init__(load_config) if load_config.model_loader_extra_config: @@ -218,6 +221,7 @@ def _prepare_weights( model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool, + allow_patterns_overrides: Optional[list[str]], ) -> Tuple[str, List[str], bool]: """Prepare weights for the model. 
@@ -249,6 +253,9 @@ def _prepare_weights( if fall_back_to_pt: allow_patterns += ["*.pt"] + if allow_patterns_overrides is not None: + allow_patterns = allow_patterns_overrides + if not is_local: hf_folder = download_weights_from_hf( model_name_or_path, @@ -298,7 +305,8 @@ def _get_weights_iterator( ) -> Generator[Tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_folder, hf_weights_files, use_safetensors = self._prepare_weights( - source.model_or_path, source.revision, source.fall_back_to_pt) + source.model_or_path, source.revision, source.fall_back_to_pt, + source.allow_patterns_overrides) if self.load_config.load_format == LoadFormat.NPCACHE: # Currently np_cache only support *.bin checkpoints assert use_safetensors is False @@ -340,6 +348,8 @@ def _get_all_weights( prefix="", fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True), + allow_patterns_overrides=getattr(model, "allow_patterns_overrides", + None), ) yield from self._get_weights_iterator(primary_weights) @@ -353,7 +363,8 @@ def _get_all_weights( def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision, - fall_back_to_pt=True) + fall_back_to_pt=True, + allow_patterns_overrides=None) def load_model(self, vllm_config: VllmConfig) -> nn.Module: device_config = vllm_config.device_config diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py new file mode 100644 index 0000000000000..b93a68680375d --- /dev/null +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -0,0 +1,151 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Llama model for fairseq2 weights.""" + +from typing import Iterable, Set, Tuple + +import torch +from torch.nn import Parameter + +from vllm.config import VllmConfig +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.linear import set_weight_attrs +from vllm.model_executor.models.llama import LlamaForCausalLM + +from .utils import AutoWeightsLoader, WeightsMapper + + +class Fairseq2LlamaForCausalLM(LlamaForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + # For the model loader to read only the relevant checkpoint files + self.allow_patterns_overrides = [ + # either the full checkpoint + "model.pt", + # or the tp-sharded checkpoint of the current rank + f"model.{self.tp_rank}.pt", + ] + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + # fairseq2's serialization adds a wrapper to usual .pt state_dict's: + # { "model_key": my_model_name, "my_model_name": state_dict } + # which we first need to unpack + weights_wrapped = dict(weights) + weights = weights_wrapped[ + weights_wrapped["model_key"]].items() # type: ignore + + # remap keys + fs2_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "decoder_frontend.embed.": "model.embed_tokens.", + "decoder.": "model.", + "final_proj.": "lm_head.", + }, + orig_to_new_substr={ + ".self_attn_layer_norm.": ".input_layernorm.", + ".ffn_layer_norm.": ".post_attention_layernorm.", + ".self_attn.output_proj.": ".self_attn.o_proj.", + ".ffn.gate_proj.": ".mlp.gate_proj.", + ".ffn.inner_proj.": ".mlp.up_proj.", + ".ffn.output_proj.": ".mlp.down_proj.", + ".layer_norm.": ".norm.", + }, + ) + weights = fs2_to_vllm_mapper.apply(weights) + + params = dict(self.named_parameters()) + + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights( + (self.reshape_fairseq2_weights(name, loaded_weight, params) + for name, loaded_weight in weights)) + + def flag_sharded_weights(self, params: dict[str, Parameter]): + """Sets the `is_sharded_weight` flag to True for all sharded weights""" + for name, param in params.items(): + modules = name.split(".") + if "norm" in name and len(param.size()) < 2: + # layer norms are not sharded + continue + elif any(emb in modules for emb in ["embed_tokens", "lm_head"]): + # for now we repeat embedding layers for compatibility + continue + else: + # all other layers are sharded + set_weight_attrs(param, {"is_sharded_weight": True}) + + def reshape_fairseq2_weights( + self, + name: str, + loaded_weight: torch.Tensor, + params: dict[str, Parameter], + ) -> Tuple[str, torch.Tensor]: + """Reshape fairseq2's weights.""" + + def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor: + attn_in = self.config.head_dim * n_heads + # check for a sharded weight on dim 0 + if attn_in // self.tp_size == w.size()[0]: + attn_in //= self.tp_size + n_heads //= self.tp_size + attn_out = self.config.hidden_size + return (w.view(n_heads, attn_in // n_heads // 2, 2, + attn_out).transpose(1, + 2).reshape(attn_in, attn_out)) + + modules = name.split(".") + + # rotary embeds should be sliced + if "k_proj" in modules: + loaded_weight = permute(loaded_weight, + self.config.num_key_value_heads) + + elif "q_proj" in modules: + loaded_weight = 
permute(loaded_weight, + self.config.num_attention_heads) + + # We make the loaded weights compatible with both + # full checkpoints and tp sharded checkpoints. + # Embeddings are repeated to fit the vocab size. + # Other weights are flagged for the weight_loader calls. + if any(emb in modules for emb in ["embed_tokens", "lm_head"]): + # Embeddings are sharded on dim 0 + dim = 0 + # In fairseq2, vocab size has to be divisible by tp_size + # so we don't worry about padding + if self.tp_size > 1 and loaded_weight.shape[ + dim] < self.config.vocab_size: + assert loaded_weight.shape[ + dim] * self.tp_size == self.config.vocab_size, \ + "vocab_size should be divisible by tp_size." + repeats = [1] * len(loaded_weight.size()) + repeats[dim] = self.tp_size + # repeat to match vocab size and to be easily 'narrow'able + loaded_weight = loaded_weight.repeat(repeats) + set_weight_attrs(params[name], {"is_sharded_weight": False}) + # if embeddings are sharded, the rest is too + if "embed_tokens" in modules: + self.flag_sharded_weights(params) + + return name, loaded_weight diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a71f7f7029c7d..311f91472783b 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -47,6 +47,7 @@ "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"), "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), + "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), From df450aa5671bd3a48929686eb14d8a4324afd91a Mon Sep 17 00:00:00 2001 From: shangmingc <caishangming@linux.alibaba.com> Date: Mon, 20 Jan 2025 10:56:43 +0800 Subject: [PATCH 20/50] [Bugfix] Fix num_heads value for simple connector when tp enabled (#12074) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> --- vllm/distributed/kv_transfer/kv_connector/simple_connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index 4ace03ff1184e..7780e2dfa317d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -35,6 +35,7 @@ def __init__( ): self.config = config.kv_transfer_config + self.tp_size = config.parallel_config.tensor_parallel_size if self.config.kv_connector == "PyNcclConnector": from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import ( @@ -161,7 +162,7 @@ def send_kv_caches_and_hidden_states( end_layer = model_executable.model.end_layer model_config = model_executable.model.config - num_heads = model_config.num_key_value_heads + num_heads = int(model_config.num_key_value_heads / self.tp_size) hidden_size = model_config.hidden_size num_attention_heads = model_config.num_attention_heads head_size = int(hidden_size / num_attention_heads) From 51ef828f10acddbe941c38255c5de7f61738abad Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Mon, 20 Jan 2025 11:37:50 +0800 Subject: [PATCH 21/50] [torch.compile] fix sym_tensor_indices (#12191) Signed-off-by: youkaichao <youkaichao@gmail.com> --- vllm/compilation/backends.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/backends.py 
b/vllm/compilation/backends.py index d7f4dcb7a20fc..955c25f300512 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -624,9 +624,13 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: ] # index of tensors that have symbolic shapes (batch size) + # for weights and static buffers, they will have concrete shapes. + # symbolic shape only happens for input tensors. + from torch.fx.experimental.symbolic_shapes import is_symbolic self.sym_tensor_indices = [ i for i, x in enumerate(fake_args) - if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) + if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) and \ + any(is_symbolic(d) for d in x.size()) ] # compiler managed cudagraph input buffers From 3ea7b94523f748faf464293ca5fdc4c94e3a3a89 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 20 Jan 2025 06:58:01 +0000 Subject: [PATCH 22/50] Move linting to `pre-commit` (#11975) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../scripts/nightly-annotate.sh | 2 +- .github/workflows/actionlint.yml | 40 -- .github/workflows/clang-format.yml | 53 -- .github/workflows/codespell.yml | 45 -- .github/workflows/doc-lint.yml | 32 - .github/workflows/dummy.yml | 20 + .github/workflows/matchers/ruff.json | 17 - .github/workflows/mypy.yaml | 51 -- .github/workflows/png-lint.yml | 37 -- .github/workflows/pre-commit.yml | 17 + .github/workflows/ruff.yml | 52 -- .github/workflows/shellcheck.yml | 37 -- .github/workflows/yapf.yml | 38 -- .pre-commit-config.yaml | 73 +++ csrc/core/scalar_type.hpp | 2 +- csrc/cpu/cpu_types.hpp | 6 +- csrc/cpu/cpu_types_arm.hpp | 549 +++++++++--------- csrc/cpu/cpu_types_vsx.hpp | 254 ++++---- csrc/cpu/cpu_types_x86.hpp | 311 +++++----- csrc/cutlass_extensions/common.hpp | 3 +- docs/source/contributing/overview.md | 13 +- format.sh | 321 ---------- pyproject.toml | 8 + requirements-lint.txt | 15 +- tools/actionlint.sh | 13 - tools/doc-lint.sh | 3 - 26 files changed, 725 insertions(+), 1287 deletions(-) delete mode 100644 .github/workflows/actionlint.yml delete mode 100644 .github/workflows/clang-format.yml delete mode 100644 .github/workflows/codespell.yml delete mode 100644 .github/workflows/doc-lint.yml create mode 100644 .github/workflows/dummy.yml delete mode 100644 .github/workflows/matchers/ruff.json delete mode 100644 .github/workflows/mypy.yaml delete mode 100644 .github/workflows/png-lint.yml create mode 100644 .github/workflows/pre-commit.yml delete mode 100644 .github/workflows/ruff.yml delete mode 100644 .github/workflows/shellcheck.yml delete mode 100644 .github/workflows/yapf.yml create mode 100644 .pre-commit-config.yaml delete mode 100755 format.sh delete mode 100755 tools/actionlint.sh delete mode 100755 tools/doc-lint.sh diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 686f70dbece6c..69b6b146b3549 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -43,7 +43,7 @@ main() { - # The figures should be genereated by a separate process outside the CI/CD pipeline + # The figures should be generated by a separate process outside the CI/CD pipeline # # generate figures # python3 -m pip install tabulate pandas matplotlib diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml deleted file mode 100644 index 0226cf0ca00e9..0000000000000 --- 
a/.github/workflows/actionlint.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Lint GitHub Actions workflows -on: - push: - branches: - - "main" - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - pull_request: - branches: - - "main" - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run actionlint" - run: | - echo "::add-matcher::.github/workflows/matchers/actionlint.json" - tools/actionlint.sh -color diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml deleted file mode 100644 index 68149d2dc019f..0000000000000 --- a/.github/workflows/clang-format.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: clang-format - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - '**/*.h' - - '**/*.cpp' - - '**/*.cu' - - '**/*.cuh' - - '.github/workflows/clang-format.yml' - pull_request: - branches: - - main - paths: - - '**/*.h' - - '**/*.cpp' - - '**/*.cu' - - '**/*.cuh' - - '.github/workflows/clang-format.yml' - -jobs: - clang-format: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.11"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install clang-format==18.1.5 - - name: Running clang-format - run: | - EXCLUDES=( - 'csrc/moe/topk_softmax_kernels.cu' - 'csrc/quantization/gguf/ggml-common.h' - 'csrc/quantization/gguf/dequantize.cuh' - 'csrc/quantization/gguf/vecdotq.cuh' - 'csrc/quantization/gguf/mmq.cuh' - 'csrc/quantization/gguf/mmvq.cuh' - ) - find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ - | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ - | xargs clang-format --dry-run --Werror diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml deleted file mode 100644 index 68887adaae54b..0000000000000 --- a/.github/workflows/codespell.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: codespell - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - "**/*.md" - - "**/*.rst" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/codespell.yml - pull_request: - branches: - - main - paths: - - "**/*.py" - - "**/*.md" - - "**/*.rst" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/codespell.yml - -jobs: - codespell: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Spelling 
check with codespell - run: | - codespell --toml pyproject.toml diff --git a/.github/workflows/doc-lint.yml b/.github/workflows/doc-lint.yml deleted file mode 100644 index 2f5ee8bbfd8c5..0000000000000 --- a/.github/workflows/doc-lint.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Lint documentation - -on: - push: - branches: - - main - paths: - - "docs/**" - pull_request: - branches: - - main - paths: - - "docs/**" - -jobs: - doc-lint: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Linting docs - run: tools/doc-lint.sh diff --git a/.github/workflows/dummy.yml b/.github/workflows/dummy.yml new file mode 100644 index 0000000000000..ea507fab6b2de --- /dev/null +++ b/.github/workflows/dummy.yml @@ -0,0 +1,20 @@ +name: dummy-checks + +on: + pull_request: + +jobs: + mypy: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - run: echo "This is a dummy step that always passes" + ruff: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - run: echo "This is a dummy step that always passes" diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json deleted file mode 100644 index f6d4479ee1996..0000000000000 --- a/.github/workflows/matchers/ruff.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "problemMatcher": [ - { - "owner": "ruff", - "pattern": [ - { - "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", - "file": 1, - "line": 2, - "column": 3, - "code": 4, - "message": 5 - } - ] - } - ] - } diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml deleted file mode 100644 index 73eeacf1fa562..0000000000000 --- a/.github/workflows/mypy.yaml +++ /dev/null @@ -1,51 +0,0 @@ -name: mypy - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - '**/*.py' - - '.github/workflows/mypy.yaml' - - 'tools/mypy.sh' - - 'pyproject.toml' - pull_request: - branches: - - main - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. 
- #paths: - # - '**/*.py' - # - '.github/workflows/mypy.yaml' - # - 'tools/mypy.sh' - # - 'pyproject.toml' - -jobs: - mypy: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install mypy==1.11.1 - pip install types-setuptools - pip install types-PyYAML - pip install types-requests - pip install types-setuptools - - name: Mypy - run: | - echo "::add-matcher::.github/workflows/matchers/mypy.json" - tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/png-lint.yml b/.github/workflows/png-lint.yml deleted file mode 100644 index 4932af943a07b..0000000000000 --- a/.github/workflows/png-lint.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Lint PNG exports from excalidraw -on: - push: - branches: - - "main" - paths: - - '*.excalidraw.png' - - '.github/workflows/png-lint.yml' - pull_request: - branches: - - "main" - paths: - - '*.excalidraw.png' - - '.github/workflows/png-lint.yml' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run png-lint.sh to check excalidraw exported images" - run: | - tools/png-lint.sh diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000000..8c72a709cf330 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,17 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: "3.12" + - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" + - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml deleted file mode 100644 index 7266cc378cfb0..0000000000000 --- a/.github/workflows/ruff.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: ruff - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/matchers/ruff.json - - .github/workflows/ruff.yml - pull_request: - branches: - - main - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. 
- #paths: - # - "**/*.py" - # - pyproject.toml - # - requirements-lint.txt - # - .github/workflows/matchers/ruff.json - # - .github/workflows/ruff.yml - -jobs: - ruff: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Run isort - run: | - isort . --check-only diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml deleted file mode 100644 index 4b1587e373e17..0000000000000 --- a/.github/workflows/shellcheck.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Lint shell scripts -on: - push: - branches: - - "main" - paths: - - '**/*.sh' - - '.github/workflows/shellcheck.yml' - pull_request: - branches: - - "main" - paths: - - '**/*.sh' - - '.github/workflows/shellcheck.yml' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - shellcheck: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Check shell scripts" - run: | - tools/shellcheck.sh diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml deleted file mode 100644 index ff441f94435ad..0000000000000 --- a/.github/workflows/yapf.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: yapf - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - .github/workflows/yapf.yml - pull_request: - branches: - - main - paths: - - "**/*.py" - - .github/workflows/yapf.yml - -jobs: - yapf: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install yapf==0.32.0 - pip install toml==0.10.2 - - name: Running yapf - run: | - yapf --diff --recursive . 
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000..8ea0f37885d9f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,73 @@ +repos: +- repo: https://github.com/google/yapf + rev: v0.32.0 + hooks: + - id: yapf + args: [--in-place, --verbose] + additional_dependencies: [toml] # TODO: Remove when yapf is upgraded +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.5 + hooks: + - id: ruff + args: [--output-format, github] +- repo: https://github.com/codespell-project/codespell + rev: v2.3.0 + hooks: + - id: codespell + exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*' +- repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v18.1.5 + hooks: + - id: clang-format + exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))' + types_or: [c++, cuda] + args: [--style=file, --verbose] +- repo: https://github.com/jackdewinter/pymarkdown + rev: v0.9.27 + hooks: + - id: pymarkdown + files: docs/.* +- repo: local + hooks: + - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.9 + entry: tools/mypy.sh 1 "3.9" + language: python + types: [python] + additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests] + - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.10 + entry: tools/mypy.sh 1 "3.10" + language: python + types: [python] + additional_dependencies: *mypy_deps + - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.11 + entry: tools/mypy.sh 1 "3.11" + language: python + types: [python] + additional_dependencies: *mypy_deps + - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.12 + entry: tools/mypy.sh 1 "3.12" + language: python + types: [python] + additional_dependencies: *mypy_deps + - id: shellcheck + name: Lint shell scripts + entry: tools/shellcheck.sh + language: script + types: [shell] + - id: png-lint + name: Lint PNG exports from excalidraw + entry: tools/png-lint.sh + language: script + types: [png] +- repo: https://github.com/rhysd/actionlint + rev: v1.7.6 + hooks: + - id: actionlint diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp index 408e736d5bc0f..c2ae554c9f8e8 100644 --- a/csrc/core/scalar_type.hpp +++ b/csrc/core/scalar_type.hpp @@ -32,7 +32,7 @@ class ScalarType { signed_(signed_), bias(bias), finite_values_only(finite_values_only), - nan_repr(nan_repr){}; + nan_repr(nan_repr) {}; static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) { return ScalarType(0, size_bits - 1, true, bias); diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 28db0479748bf..a71815106133a 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -2,13 +2,13 @@ #define CPU_TYPES_HPP #if defined(__x86_64__) - //x86 implementation + // x86 implementation #include "cpu_types_x86.hpp" #elif defined(__POWER9_VECTOR__) - //ppc implementation + // ppc implementation #include "cpu_types_vsx.hpp" #elif defined(__aarch64__) - //arm implementation + // arm implementation #include "cpu_types_arm.hpp" #else #warning "unsupported vLLM cpu implementation" 
diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp index ae062a5b86892..990e99f2fc069 100644 --- a/csrc/cpu/cpu_types_arm.hpp +++ b/csrc/cpu/cpu_types_arm.hpp @@ -1,48 +1,50 @@ #include <arm_neon.h> -#include <torch/all.h> +#include <torch/all.h> #include <cmath> namespace vec_op { #ifdef ARM_BF16_SUPPORT - #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) #else - #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) #endif -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { - template <typename T, T... indexes, typename F> - constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) { - (f(std::integral_constant<T, indexes>{}), ...); - }; -}; +template <typename T, T... 
indexes, typename F> +constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) { + (f(std::integral_constant<T, indexes>{}), ...); +}; +}; // namespace template <typename T, T count, typename F, typename = std::enable_if_t<std::is_invocable_v<F, T>>> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f)); } -template <typename T> struct Vec { +template <typename T> +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }; }; @@ -54,127 +56,124 @@ struct FP16Vec8 : public Vec<FP16Vec8> { float16x8_t reg; - explicit FP16Vec8(const void *ptr) - : reg(vld1q_f16(static_cast<const __fp16 *>(ptr))) {}; + explicit FP16Vec8(const void* ptr) + : reg(vld1q_f16(static_cast<const __fp16*>(ptr))) {}; - explicit FP16Vec8(const FP32Vec8 &); + explicit FP16Vec8(const FP32Vec8&); - void save(void *ptr) const { - vst1q_f16(static_cast<__fp16 *>(ptr), reg); - } + void save(void* ptr) const { vst1q_f16(static_cast<__fp16*>(ptr), reg); } }; struct FP16Vec16 : public Vec<FP16Vec16> { - constexpr static int VEC_ELEM_NUM = 16; - - float16x8x2_t reg; - - explicit FP16Vec16(const void *ptr) { - reg.val[0] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr)); - reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr) + 8); - } - - explicit FP16Vec16(const FP32Vec16& vec); - - void save(void *ptr) const { - vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); - vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + constexpr static int VEC_ELEM_NUM = 16; + + float16x8x2_t reg; + + explicit FP16Vec16(const void* ptr) { + reg.val[0] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr)); + reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr) + 8); + } + + explicit FP16Vec16(const FP32Vec16& vec); + + void save(void* ptr) const { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + + void save(void* ptr, const int elem_num) const { + int full_blocks = elem_num / 8; + int remainder = elem_num % 8; + + if (full_blocks > 0) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + if (full_blocks > 1) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } } - - void save(void *ptr, const int elem_num) const { - int full_blocks = elem_num / 8; - int remainder = elem_num % 8; - - if (full_blocks > 0) { - vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); - if (full_blocks > 1) { - vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); - } - } - - // Note: below is the unrolled version of the following code: - // - // for (int i = 0; i < remainder; ++i) { - // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = - // vgetq_lane_f16(temp, i); - // } - // - // For macOS build (Clang), the arm/neon intrinsics function - // `vgetq_lane_f16` needs the parameter `i` to be constant at compile - // time. 
- - if (remainder > 0) { - float16x8_t temp = reg.val[full_blocks]; - __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); - switch (remainder) - { - case 1: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - break; - case 2: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - break; - case 3: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - break; - case 4: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - break; - case 5: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - break; - case 6: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); - break; - case 7: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); - fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); - break; - - default: - break; - } - } + + // Note: below is the unrolled version of the following code: + // + // for (int i = 0; i < remainder; ++i) { + // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = + // vgetq_lane_f16(temp, i); + // } + // + // For macOS build (Clang), the arm/neon intrinsics function + // `vgetq_lane_f16` needs the parameter `i` to be constant at compile + // time. 
+ + if (remainder > 0) { + float16x8_t temp = reg.val[full_blocks]; + __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); + switch (remainder) { + case 1: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + break; + case 2: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + break; + case 3: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + break; + case 4: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + break; + case 5: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + break; + case 6: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + break; + case 7: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); + break; + + default: + break; + } } + } }; - #ifdef ARM_BF16_SUPPORT struct BF16Vec8 : public Vec<BF16Vec8> { constexpr static int VEC_ELEM_NUM = 8; bfloat16x8_t reg; - explicit BF16Vec8(const void *ptr) - : reg(*reinterpret_cast<const bfloat16x8_t *>(ptr)) {}; + explicit BF16Vec8(const void* ptr) + : reg(*reinterpret_cast<const bfloat16x8_t*>(ptr)) {}; explicit BF16Vec8(bfloat16x8_t data) : reg(data) {}; - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; + explicit BF16Vec8(float32x4x2_t v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; - void save(void *ptr) const { *reinterpret_cast<bfloat16x8_t *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<bfloat16x8_t*>(ptr) = reg; } }; struct BF16Vec16 : public Vec<BF16Vec16> { @@ -182,19 +181,18 @@ struct BF16Vec16 : public Vec<BF16Vec16> { bfloat16x8x2_t reg; - explicit BF16Vec16(const void *ptr) - : reg(*reinterpret_cast<const bfloat16x8x2_t *>(ptr)) {}; + explicit BF16Vec16(const void* ptr) + : reg(*reinterpret_cast<const bfloat16x8x2_t*>(ptr)) {}; explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - explicit BF16Vec16(float32x4x4_t v) : reg({ - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3]) - }){}; + explicit BF16Vec16(float32x4x4_t v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), + 
vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])}) {}; - void save(void *ptr) const { *reinterpret_cast<bfloat16x8x2_t *>(ptr) = reg; }; + void save(void* ptr) const { *reinterpret_cast<bfloat16x8x2_t*>(ptr) = reg; }; }; struct BF16Vec32 : public Vec<BF16Vec32> { @@ -202,19 +200,15 @@ struct BF16Vec32 : public Vec<BF16Vec32> { bfloat16x8x4_t reg; - explicit BF16Vec32(const void *ptr) - : reg(*reinterpret_cast<const bfloat16x8x4_t *>(ptr)) {}; + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast<const bfloat16x8x4_t*>(ptr)) {}; explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {}; - explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ - vec8_data.reg, - vec8_data.reg, - vec8_data.reg, - vec8_data.reg - }) {}; + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}; - void save(void *ptr) const { *reinterpret_cast<bfloat16x8x4_t *>(ptr) = reg; }; + void save(void* ptr) const { *reinterpret_cast<bfloat16x8x4_t*>(ptr) = reg; }; }; #endif @@ -232,11 +226,11 @@ struct FP32Vec4 : public Vec<FP32Vec4> { explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {}; - explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {}; + explicit FP32Vec4(const float* ptr) : reg(vld1q_f32(ptr)) {}; explicit FP32Vec4(float32x4_t data) : reg(data) {}; - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}; + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}; }; struct FP32Vec8 : public Vec<FP32Vec8> { @@ -252,32 +246,37 @@ struct FP32Vec8 : public Vec<FP32Vec8> { explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}; - explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; + explicit FP32Vec8(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; explicit FP32Vec8(float32x4x2_t data) : reg(data) {}; - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}; + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}; - explicit FP32Vec8(const FP16Vec8 &v) { - reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); - reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); - }; + explicit FP32Vec8(const FP16Vec8& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); + }; - explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; + explicit FP32Vec8(float16x8_t v) + : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; - #ifdef ARM_BF16_SUPPORT +#ifdef ARM_BF16_SUPPORT - explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; + explicit FP32Vec8(bfloat16x8_t v) + : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; - explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; + explicit FP32Vec8(const BF16Vec8& v) + : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; - #endif +#endif float reduce_sum() const { AliasReg ar; ar.reg = reg; float answer = 0; - unroll_loop<int, VEC_ELEM_NUM>([&answer, &ar](int i) { answer += ar.values[i]; }); + unroll_loop<int, VEC_ELEM_NUM>( + [&answer, &ar](int i) { answer += ar.values[i]; }); return answer; } @@ -324,10 +323,14 @@ struct FP32Vec8 : public Vec<FP32Vec8> { AliasReg ar; ar.reg = reg; - float32x2_t er_vec0 = {static_cast<float32_t>(erf(ar.values[0])), static_cast<float32_t>(erf(ar.values[1]))}; - float32x2_t er_vec1 = {static_cast<float32_t>(erf(ar.values[2])), static_cast<float32_t>(erf(ar.values[3]))}; - 
float32x2_t er_vec2 = {static_cast<float32_t>(erf(ar.values[4])), static_cast<float32_t>(erf(ar.values[5]))}; - float32x2_t er_vec3 = {static_cast<float32_t>(erf(ar.values[6])), static_cast<float32_t>(erf(ar.values[7]))}; + float32x2_t er_vec0 = {static_cast<float32_t>(erf(ar.values[0])), + static_cast<float32_t>(erf(ar.values[1]))}; + float32x2_t er_vec1 = {static_cast<float32_t>(erf(ar.values[2])), + static_cast<float32_t>(erf(ar.values[3]))}; + float32x2_t er_vec2 = {static_cast<float32_t>(erf(ar.values[4])), + static_cast<float32_t>(erf(ar.values[5]))}; + float32x2_t er_vec3 = {static_cast<float32_t>(erf(ar.values[6])), + static_cast<float32_t>(erf(ar.values[7]))}; float32x4_t result0 = vcombine_f32(er_vec0, er_vec1); float32x4_t result1 = vcombine_f32(er_vec2, er_vec3); @@ -337,25 +340,29 @@ struct FP32Vec8 : public Vec<FP32Vec8> { result.val[1] = result1; return FP32Vec8(result); - } + } - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1])})); } - void save(float *ptr) const { + void save(float* ptr) const { vst1q_f32(ptr, reg.val[0]); vst1q_f32(ptr + 4, reg.val[1]); } @@ -370,103 +377,100 @@ struct FP32Vec16 : public Vec<FP32Vec16> { float32x4x4_t reg; - explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} + explicit FP32Vec16(float v) + : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} - explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {} + explicit FP32Vec16() + : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), + vmovq_n_f32(0.0)}) {} - explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {} + explicit FP32Vec16(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), + vld1q_f32(ptr + 12)}) {} explicit FP32Vec16(float32x4x4_t data) : reg(data) {} - explicit FP32Vec16(const FP32Vec8 &data) { - reg.val[0] = data.reg.val[0]; - reg.val[1] = data.reg.val[1]; - reg.val[2] = data.reg.val[0]; - reg.val[3] = data.reg.val[1]; + explicit FP32Vec16(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; } - explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + 
explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {} - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v.reg)) {} - #ifdef ARM_BF16_SUPPORT - explicit FP32Vec16(bfloat16x8x2_t v) : reg({ - vcvtq_low_f32_bf16(v.val[0]), - vcvtq_high_f32_bf16(v.val[0]), - vcvtq_low_f32_bf16(v.val[1]), - vcvtq_high_f32_bf16(v.val[1]) - }) {}; - #endif +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(bfloat16x8x2_t v) + : reg({vcvtq_low_f32_bf16(v.val[0]), vcvtq_high_f32_bf16(v.val[0]), + vcvtq_low_f32_bf16(v.val[1]), vcvtq_high_f32_bf16(v.val[1])}) {}; +#endif - explicit FP32Vec16(const FP32Vec4 &data) { + explicit FP32Vec16(const FP32Vec4& data) { reg.val[0] = data.reg; reg.val[1] = data.reg; reg.val[2] = data.reg; reg.val[3] = data.reg; }; - #ifdef ARM_BF16_SUPPORT - explicit FP32Vec16(const BF16Vec16 &v) : reg({ - vcvtq_low_f32_bf16(v.reg.val[0]), - vcvtq_high_f32_bf16(v.reg.val[0]), - vcvtq_low_f32_bf16(v.reg.val[1]), - vcvtq_high_f32_bf16(v.reg.val[1]) - }) {}; - - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}; - #endif - - explicit FP32Vec16(const FP16Vec16 &v) { - reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); - reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); - reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); - reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(const BF16Vec16& v) + : reg({vcvtq_low_f32_bf16(v.reg.val[0]), + vcvtq_high_f32_bf16(v.reg.val[0]), + vcvtq_low_f32_bf16(v.reg.val[1]), + vcvtq_high_f32_bf16(v.reg.val[1])}) {}; + + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; +#endif + + explicit FP32Vec16(const FP16Vec16& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); + reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); + reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); }; - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vaddq_f32(reg.val[0], b.reg.val[0]), - vaddq_f32(reg.val[1], b.reg.val[1]), - vaddq_f32(reg.val[2], b.reg.val[2]), - vaddq_f32(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1]), + vaddq_f32(reg.val[2], b.reg.val[2]), + vaddq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vmulq_f32(reg.val[0], b.reg.val[0]), - vmulq_f32(reg.val[1], b.reg.val[1]), - vmulq_f32(reg.val[2], b.reg.val[2]), - vmulq_f32(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1]), + vmulq_f32(reg.val[2], b.reg.val[2]), + vmulq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vsubq_f32(reg.val[0], b.reg.val[0]), - vsubq_f32(reg.val[1], b.reg.val[1]), - vsubq_f32(reg.val[2], b.reg.val[2]), - vsubq_f32(reg.val[3], b.reg.val[3]) - })); + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1]), + vsubq_f32(reg.val[2], b.reg.val[2]), + vsubq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - 
vdivq_f32(reg.val[0], b.reg.val[0]), - vdivq_f32(reg.val[1], b.reg.val[1]), - vdivq_f32(reg.val[2], b.reg.val[2]), - vdivq_f32(reg.val[3], b.reg.val[3]) - })); + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1]), + vdivq_f32(reg.val[2], b.reg.val[2]), + vdivq_f32(reg.val[3], b.reg.val[3])})); }; float reduce_sum() const { AliasReg ar; ar.reg = reg; float answer = 0; - unroll_loop<int, VEC_ELEM_NUM>([&answer, &ar](int i) { answer += ar.values[i]; }); + unroll_loop<int, VEC_ELEM_NUM>( + [&answer, &ar](int i) { answer += ar.values[i]; }); return answer; }; - template <int group_size> float reduce_sub_sum(int idx) { + template <int group_size> + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); AliasReg ar; @@ -479,7 +483,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { return answer; }; - void save(float *ptr) const { + void save(float* ptr) const { vst1q_f32(ptr, reg.val[0]); vst1q_f32(ptr + 4, reg.val[1]); vst1q_f32(ptr + 8, reg.val[2]); @@ -487,43 +491,59 @@ struct FP32Vec16 : public Vec<FP32Vec16> { }; }; -template <typename T> struct VecType { using vec_type = void; }; +template <typename T> +struct VecType { + using vec_type = void; +}; -template <typename T> using vec_t = typename VecType<T>::vec_type; +template <typename T> +using vec_t = typename VecType<T>::vec_type; -template <> struct VecType<float> { using vec_type = FP32Vec8; }; +template <> +struct VecType<float> { + using vec_type = FP32Vec8; +}; -template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; }; +template <> +struct VecType<c10::Half> { + using vec_type = FP16Vec8; +}; #ifdef ARM_BF16_SUPPORT -template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; }; +template <> +struct VecType<c10::BFloat16> { + using vec_type = BF16Vec8; +}; #endif -template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; } +template <typename T> +void storeFP32(float v, T* ptr) { + *ptr = v; +} -template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) { - *reinterpret_cast<__fp16 *>(ptr) = v; +template <> +inline void storeFP32<c10::Half>(float v, c10::Half* ptr) { + *reinterpret_cast<__fp16*>(ptr) = v; } -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) { - float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); - float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); - float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); - float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { + float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); + float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); + float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); + float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); - reg.val[0] = vcombine_f16(low_0, high_0); - reg.val[1] = vcombine_f16(low_1, high_1); + reg.val[0] = vcombine_f16(low_0, high_0); + reg.val[1] = vcombine_f16(low_1, high_1); }; -inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) { - float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); - float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); +inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) { + float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); + float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); - reg = vcombine_f16(lower_half, upper_half); + reg = vcombine_f16(lower_half, upper_half); }; -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { - +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], 
b.reg.val[0]); acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]); @@ -531,8 +551,7 @@ inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { }; #ifdef ARM_BF16_SUPPORT -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { - +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0])); float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1])); @@ -551,22 +570,22 @@ inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { #endif #ifdef ARM_BF16_SUPPORT -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {}; - -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({ - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3]) - }){}; +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) { + }; + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), + v.reg.val[3])}) {}; #endif -inline void prefetch(const void *addr) { - __builtin_prefetch(addr, 0, 1); -}; +inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); }; #ifdef ARM_BF16_SUPPORT template <> -inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v); +inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bf16*>(ptr) = vcvth_bf16_f32(v); }; #endif -}; \ No newline at end of file +}; // namespace vec_op \ No newline at end of file diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp index b50bdadc5713d..a8e1be37eb418 100644 --- a/csrc/cpu/cpu_types_vsx.hpp +++ b/csrc/cpu/cpu_types_vsx.hpp @@ -9,38 +9,40 @@ namespace vec_op { // FIXME: FP16 is not fully supported in Torch-CPU -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { template <typename T, T... 
indexes, typename F> -constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) { +constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) { (f(std::integral_constant<T, indexes>{}), ...); } -}; // namespace +}; // namespace template <typename T, T count, typename F, typename = std::enable_if_t<std::is_invocable_v<F, T>>> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f)); } -template <typename T> struct Vec { +template <typename T> +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } }; @@ -68,12 +70,14 @@ struct BF16Vec8 : public Vec<BF16Vec8> { __vector signed short reg; - explicit BF16Vec8(const void *ptr) - : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {} + explicit BF16Vec8(const void* ptr) + : reg((__vector signed short)vec_xl(0, (__vector signed short*)ptr)) {} - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; } + void save(void* ptr) const { + *reinterpret_cast<__vector signed short*>(ptr) = reg; + } }; struct BF16Vec16 : public Vec<BF16Vec16> { @@ -81,18 +85,18 @@ struct BF16Vec16 : public Vec<BF16Vec16> { ss16x8x2_t reg; - explicit BF16Vec16(const void *ptr) { + explicit BF16Vec16(const void* ptr) { // Load 256 bits in two parts - reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr); - reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr); + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); } - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - void save(void *ptr) const { + void save(void* ptr) const { // Save 256 bits in two parts - vec_xst(reg.val[0], 0, (signed short *)ptr); - vec_xst(reg.val[1], 16, (signed short *)ptr); + vec_xst(reg.val[0], 0, (signed short*)ptr); + vec_xst(reg.val[1], 16, (signed short*)ptr); } }; @@ -102,19 +106,15 @@ struct BF16Vec32 : public Vec<BF16Vec32> { constexpr static int VEC_ELEM_NUM = 32; ss16x8x4_t reg; - explicit BF16Vec32(const void *ptr) - : reg(*reinterpret_cast<const ss16x8x4_t *>(ptr)) {} + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast<const ss16x8x4_t*>(ptr)) {} explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} - explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ - vec8_data.reg, - vec8_data.reg, - vec8_data.reg, - vec8_data.reg - }) {} + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {} - void save(void *ptr) const { *reinterpret_cast<ss16x8x4_t *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<ss16x8x4_t*>(ptr) = reg; } }; struct FP32Vec4 : public Vec<FP32Vec4> { @@ -130,11 +130,11 @@ struct FP32Vec4 : public Vec<FP32Vec4> { explicit FP32Vec4() : reg(vec_splats(0.0f)) {} - explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {} + explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {} explicit FP32Vec4(__vector float data) : reg(data) {} - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} }; struct FP32Vec8 : public Vec<FP32Vec8> { @@ -156,19 +156,19 @@ struct FP32Vec8 : public Vec<FP32Vec8> { reg.val[1] = vec_splats(0.0f); } - explicit FP32Vec8(const float *ptr) { + explicit FP32Vec8(const float* ptr) { 
reg.val[0] = vec_xl(0, ptr); reg.val[1] = vec_xl(16, ptr); } explicit FP32Vec8(f32x4x2_t data) : reg(data) {} - explicit FP32Vec8(const FP32Vec8 &data) { + explicit FP32Vec8(const FP32Vec8& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; } - explicit FP32Vec8(const BF16Vec8 &v) { + explicit FP32Vec8(const BF16Vec8& v) { reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); reg.val[1] = (__vector float)vec_mergel(zero, v.reg); } @@ -177,7 +177,8 @@ struct FP32Vec8 : public Vec<FP32Vec8> { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop<int, VEC_ELEM_NUM>( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } @@ -230,23 +231,27 @@ struct FP32Vec8 : public Vec<FP32Vec8> { return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); } - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8( + {vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8( + {vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8( + {vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8( + {vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); } - void save(float *ptr) const { + void save(float* ptr) const { vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[1], 16, ptr); } @@ -275,7 +280,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { reg.val[3] = vec_splats(0.0f); } - explicit FP32Vec16(const float *ptr) { + explicit FP32Vec16(const float* ptr) { reg.val[0] = vec_xl(0, ptr); reg.val[1] = vec_xl(16, ptr); reg.val[2] = vec_xl(32, ptr); @@ -284,78 +289,76 @@ struct FP32Vec16 : public Vec<FP32Vec16> { explicit FP32Vec16(f32x4x4_t data) : reg(data) {} - explicit FP32Vec16(const FP32Vec16 &data) { + explicit FP32Vec16(const FP32Vec16& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; reg.val[2] = data.reg.val[2]; reg.val[3] = data.reg.val[3]; } - explicit FP32Vec16(const FP32Vec4 &data) { + explicit FP32Vec16(const FP32Vec4& data) { reg.val[0] = data.reg; reg.val[1] = data.reg; reg.val[2] = data.reg; reg.val[3] = data.reg; } - explicit FP32Vec16(const FP32Vec8 &data) { + explicit FP32Vec16(const FP32Vec8& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; reg.val[2] = data.reg.val[0]; reg.val[3] = data.reg.val[1]; } - explicit FP32Vec16(const BF16Vec16 &v) { + explicit FP32Vec16(const BF16Vec16& v) { reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); } - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& 
v) : FP32Vec16(FP32Vec8(v)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_mul(reg.val[0], b.reg.val[0]), - vec_mul(reg.val[1], b.reg.val[1]), - vec_mul(reg.val[2], b.reg.val[2]), - vec_mul(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_add(reg.val[0], b.reg.val[0]), - vec_add(reg.val[1], b.reg.val[1]), - vec_add(reg.val[2], b.reg.val[2]), - vec_add(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_sub(reg.val[0], b.reg.val[0]), - vec_sub(reg.val[1], b.reg.val[1]), - vec_sub(reg.val[2], b.reg.val[2]), - vec_sub(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_div(reg.val[0], b.reg.val[0]), - vec_div(reg.val[1], b.reg.val[1]), - vec_div(reg.val[2], b.reg.val[2]), - vec_div(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); } float reduce_sum() const { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop<int, VEC_ELEM_NUM>( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } - template <int group_size> float reduce_sub_sum(int idx) { + template <int group_size> + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); AliasReg ar; @@ -368,7 +371,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { return result; } - void save(float *ptr) const { + void save(float* ptr) const { vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[1], 16, ptr); vec_xst(reg.val[2], 32, ptr); @@ -376,43 +379,62 @@ struct FP32Vec16 : public Vec<FP32Vec16> { } }; -template <typename T> struct VecType { using vec_type = void; }; +template <typename T> +struct VecType { + using vec_type = void; +}; -template <typename T> using vec_t = typename VecType<T>::vec_type; +template <typename T> +using vec_t = typename VecType<T>::vec_type; -template <> struct VecType<float> { using vec_type = FP32Vec8; }; +template <> +struct VecType<float> { + using vec_type = FP32Vec8; +}; -template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; }; +template <> +struct VecType<c10::BFloat16> { + using vec_type = BF16Vec8; +}; -template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; } +template <typename T> +void storeFP32(float v, T* ptr) { + *ptr = v; +} -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc = acc + a * b; } -template <> inline void storeFP32<c10::BFloat16>(float v, 
c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast<c10::BFloat16 *>(&v); +template <> +inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast<c10::BFloat16*>(&v); *ptr = *(v_ptr + 1); } #ifndef __VEC_CLASS_FP_NAN -#define __VEC_CLASS_FP_NAN (1 << 6) + #define __VEC_CLASS_FP_NAN (1 << 6) #endif -const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +const static __vector unsigned char omask = {0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 20, 21, 24, 25, 28, 29}; #ifndef _ARCH_PWR10 -const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff }; -const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; -const static __vector unsigned int sh16 = { 16, 16, 16, 16 }; -const static __vector unsigned int one = { 1, 1, 1, 1 }; +const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff, + 0x00007fff}; +const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000, + 0x7fc00000}; +const static __vector unsigned int sh16 = {16, 16, 16, 16}; +const static __vector unsigned int one = {1, 1, 1, 1}; #endif -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { #ifdef _ARCH_PWR10 __vector signed short ret[2]; - ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); - ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); reg = vec_perm(ret[0], ret[1], omask); #elif defined(_ARCH_PWR9) __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); @@ -425,8 +447,10 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { __vector unsigned int rnd1 = vec_add(lsb1, bias); inp0 = vec_add(inp0, rnd0); inp1 = vec_add(inp1, rnd1); - __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); - __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); inp0 = vec_sel(inp0, nan, sel0); inp1 = vec_sel(inp1, nan, sel1); inp0 = vec_sr(inp0, sh16); @@ -435,13 +459,17 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { #endif } -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { #ifdef _ARCH_PWR10 __vector signed short ret[4]; - ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); - ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); - ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]); - ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]); + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); + ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[2]); + ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16( + 
(__vector unsigned char)v.reg.val[3]); reg.val[0] = vec_perm(ret[0], ret[1], omask); reg.val[1] = vec_perm(ret[2], ret[3], omask); #elif defined(_ARCH_PWR9) @@ -465,10 +493,14 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { inp1 = vec_add(inp1, rnd1); inp2 = vec_add(inp2, rnd2); inp3 = vec_add(inp3, rnd3); - __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); - __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); - __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); - __vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel2 = + vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); + __vector __bool int sel3 = + vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); inp0 = vec_sel(inp0, nan, sel0); inp1 = vec_sel(inp1, nan, sel1); inp2 = vec_sel(inp2, nan, sel2); @@ -482,10 +514,10 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { #endif } -inline void prefetch(const void *addr) { +inline void prefetch(const void* addr) { __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); } -}; // namespace vec_op +}; // namespace vec_op #endif diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 4bb4eb0f491ac..a4ef2be2a58ca 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -11,39 +11,40 @@ static_assert(false, "AVX2 must be supported for the current implementation."); namespace vec_op { -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({})); -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) \ + RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({})); + #define CPU_KERNEL_GUARD_OUT(NAME) #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { template <typename T, T... 
indexes, typename F> -constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) { +constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) { (f(std::integral_constant<T, indexes>{}), ...); } -}; // namespace +}; // namespace template <typename T, T count, typename F, typename = std::enable_if_t<std::is_invocable_v<F, T>>> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f)); } -template <typename T> struct Vec { +template <typename T> +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } }; @@ -55,12 +56,12 @@ struct FP16Vec8 : public Vec<FP16Vec8> { __m128i reg; - explicit FP16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + explicit FP16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} - explicit FP16Vec8(const FP32Vec8 &); + explicit FP16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } }; struct FP16Vec16 : public Vec<FP16Vec16> { @@ -68,12 +69,12 @@ struct FP16Vec16 : public Vec<FP16Vec16> { __m256i reg; - explicit FP16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + explicit FP16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - explicit FP16Vec16(const FP32Vec16 &); + explicit FP16Vec16(const FP32Vec16&); - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -87,12 +88,12 @@ struct BF16Vec8 : public Vec<BF16Vec8> { __m128i reg; - explicit BF16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + explicit BF16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } }; struct BF16Vec16 : public Vec<BF16Vec16> { @@ -100,12 +101,12 @@ struct BF16Vec16 : public Vec<BF16Vec16> { __m256i reg; - explicit BF16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + explicit BF16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -120,11 +121,11 @@ struct BF16Vec32 : public Vec<BF16Vec32> { __m512i reg; - explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} + explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} explicit BF16Vec32(__m512i data) : reg(data) {} - explicit BF16Vec32(BF16Vec8 &vec8_data) + explicit BF16Vec32(BF16Vec8& vec8_data) : reg((__m512i)_mm512_inserti32x4( _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( (__m128i)vec8_data.reg), @@ -132,7 +133,7 @@ struct BF16Vec32 : public Vec<BF16Vec32> { (__m128i)vec8_data.reg, 2), (__m128i)vec8_data.reg, 3)) {} - void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } + void 
save(void* ptr) const { *reinterpret_cast<__m512i*>(ptr) = reg; } }; #else struct BF16Vec32 : public Vec<BF16Vec32> { @@ -141,24 +142,24 @@ struct BF16Vec32 : public Vec<BF16Vec32> { __m256i reg_low; __m256i reg_high; - explicit BF16Vec32(const void *ptr) - : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), - reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} + explicit BF16Vec32(const void* ptr) + : reg_low(_mm256_loadu_si256((__m256i const*)ptr)), + reg_high(_mm256_loadu_si256((__m256i const*)ptr + 1)) {} - explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), - reg_high(high) {} + explicit BF16Vec32(__m256i low, __m256i high) + : reg_low(low), reg_high(high) {} - explicit BF16Vec32(BF16Vec8 &vec8_data) + explicit BF16Vec32(BF16Vec8& vec8_data) : reg_low((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)), + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)), reg_high((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)) {} + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)) {} - void save(void *ptr) const { - *reinterpret_cast<__m256i *>(ptr) = reg_low; - *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; + void save(void* ptr) const { + *reinterpret_cast<__m256i*>(ptr) = reg_low; + *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high; } }; #endif @@ -176,11 +177,11 @@ struct FP32Vec4 : public Vec<FP32Vec4> { explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} - explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} + explicit FP32Vec4(const float* ptr) : reg(_mm_loadu_ps(ptr)) {} explicit FP32Vec4(__m128 data) : reg(data) {} - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} }; struct FP32Vec8 : public Vec<FP32Vec8> { @@ -196,15 +197,15 @@ struct FP32Vec8 : public Vec<FP32Vec8> { explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} - explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} + explicit FP32Vec8(const float* ptr) : reg(_mm256_loadu_ps(ptr)) {} explicit FP32Vec8(__m256 data) : reg(data) {} - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {} - explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {} + explicit FP32Vec8(const FP16Vec8& v) : reg(_mm256_cvtph_ps(v.reg)) {} - explicit FP32Vec8(const BF16Vec8 &v) + explicit FP32Vec8(const BF16Vec8& v) : reg(_mm256_castsi256_ps( _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} @@ -212,7 +213,8 @@ struct FP32Vec8 : public Vec<FP32Vec8> { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop<int, VEC_ELEM_NUM>( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } @@ -244,27 +246,27 @@ struct FP32Vec8 : public Vec<FP32Vec8> { erf(ar.values[1]), erf(ar.values[0]))); } - FP32Vec8 operator*(const FP32Vec8 &b) const { + FP32Vec8 operator*(const FP32Vec8& b) const { return FP32Vec8(_mm256_mul_ps(reg, b.reg)); } - FP32Vec8 operator+(const FP32Vec8 &b) const { + FP32Vec8 operator+(const FP32Vec8& b) const { return FP32Vec8(_mm256_add_ps(reg, b.reg)); } - FP32Vec8 operator-(const FP32Vec8 &b) const { + FP32Vec8 operator-(const FP32Vec8& b) const { return FP32Vec8(_mm256_sub_ps(reg, b.reg)); } - FP32Vec8 operator/(const FP32Vec8 &b) const { + FP32Vec8 operator/(const 
FP32Vec8& b) const { return FP32Vec8(_mm256_div_ps(reg, b.reg)); } - void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } + void save(float* ptr) const { _mm256_storeu_ps(ptr, reg); } }; #ifdef __AVX512F__ -struct INT32Vec16: public Vec<INT32Vec16> { +struct INT32Vec16 : public Vec<INT32Vec16> { constexpr static int VEC_ELEM_NUM = 16; union AliasReg { __m512i reg; @@ -272,12 +274,11 @@ struct INT32Vec16: public Vec<INT32Vec16> { }; __m512i reg; - - explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {} - void save(int32_t* ptr) const { - _mm512_storeu_epi32(ptr, reg); - } + explicit INT32Vec16(const void* data_ptr) + : reg(_mm512_loadu_epi32(data_ptr)) {} + + void save(int32_t* ptr) const { _mm512_storeu_epi32(ptr, reg); } void save(int32_t* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -301,11 +302,11 @@ struct FP32Vec16 : public Vec<FP32Vec16> { explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} - explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} + explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} explicit FP32Vec16(__m512 data) : reg(data) {} - explicit FP32Vec16(const FP32Vec4 &data) + explicit FP32Vec16(const FP32Vec4& data) : reg((__m512)_mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), @@ -313,36 +314,37 @@ struct FP32Vec16 : public Vec<FP32Vec16> { (__m128i)data.reg, 2), (__m128i)data.reg, 3)) {} - explicit FP32Vec16(const FP32Vec8 &data) + explicit FP32Vec16(const FP32Vec8& data) : reg((__m512)_mm512_inserti32x8( _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} - explicit FP32Vec16(const BF16Vec16 &v) + explicit FP32Vec16(const BF16Vec16& v) : reg(_mm512_castsi512_ps( _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} - explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {} + explicit FP32Vec16(const FP16Vec16& v) : reg(_mm512_cvtph_ps(v.reg)) {} - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const INT32Vec16 &v) - : reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)) {} + explicit FP32Vec16(const INT32Vec16& v) + : reg(_mm512_cvt_roundepi32_ps( + v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { + FP32Vec16 operator*(const FP32Vec16& b) const { return FP32Vec16(_mm512_mul_ps(reg, b.reg)); } - FP32Vec16 operator+(const FP32Vec16 &b) const { + FP32Vec16 operator+(const FP32Vec16& b) const { return FP32Vec16(_mm512_add_ps(reg, b.reg)); } - FP32Vec16 operator-(const FP32Vec16 &b) const { + FP32Vec16 operator-(const FP32Vec16& b) const { return FP32Vec16(_mm512_sub_ps(reg, b.reg)); } - FP32Vec16 operator/(const FP32Vec16 &b) const { + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm512_div_ps(reg, b.reg)); } @@ -370,9 +372,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg)); } - FP32Vec16 abs() const { - return FP32Vec16(_mm512_abs_ps(reg)); - } + FP32Vec16 abs() const { return FP32Vec16(_mm512_abs_ps(reg)); } float reduce_sum() const { return _mm512_reduce_add_ps(reg); } @@ -380,14 +380,15 @@ struct FP32Vec16 : public Vec<FP32Vec16> { float reduce_min() const { return _mm512_reduce_min_ps(reg); 
} - template <int group_size> float reduce_sub_sum(int idx) { + template <int group_size> + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); return _mm512_mask_reduce_add_ps(mask, reg); } - void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } + void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); } void save(float* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -407,32 +408,30 @@ struct FP32Vec16 : public Vec<FP32Vec16> { __m256 reg_low; __m256 reg_high; - explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), - reg_high(_mm256_set1_ps(v)) {} + explicit FP32Vec16(float v) + : reg_low(_mm256_set1_ps(v)), reg_high(_mm256_set1_ps(v)) {} - explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), - reg_high(_mm256_set1_ps(0.0)) {} + explicit FP32Vec16() + : reg_low(_mm256_set1_ps(0.0)), reg_high(_mm256_set1_ps(0.0)) {} - explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), - reg_high(_mm256_loadu_ps(ptr + 8)) {} + explicit FP32Vec16(const float* ptr) + : reg_low(_mm256_loadu_ps(ptr)), reg_high(_mm256_loadu_ps(ptr + 8)) {} explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} - explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), - reg_high(data.reg_high) {} + explicit FP32Vec16(const FP32Vec16& data) + : reg_low(data.reg_low), reg_high(data.reg_high) {} - explicit FP32Vec16(const FP32Vec4 &data) + explicit FP32Vec16(const FP32Vec4& data) : reg_low((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)), + _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)), reg_high((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)) {} + _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)) {} - explicit FP32Vec16(const FP32Vec8 &data) + explicit FP32Vec16(const FP32Vec8& data) : reg_low(data.reg), reg_high(data.reg) {} - explicit FP32Vec16(const FP16Vec16 &v) { + explicit FP32Vec16(const FP16Vec16& v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -440,9 +439,9 @@ struct FP32Vec16 : public Vec<FP32Vec16> { reg_high = _mm256_cvtph_ps(high); } - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const BF16Vec16 &v) { + explicit FP32Vec16(const BF16Vec16& v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -456,24 +455,24 @@ struct FP32Vec16 : public Vec<FP32Vec16> { reg_high = _mm256_castsi256_ps(v_high_shifted); } - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { + FP32Vec16 operator*(const FP32Vec16& b) const { return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), _mm256_mul_ps(reg_high, b.reg_high)); } - FP32Vec16 operator+(const FP32Vec16 &b) const { + FP32Vec16 operator+(const FP32Vec16& b) const { return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), _mm256_add_ps(reg_high, b.reg_high)); } - FP32Vec16 operator-(const FP32Vec16 &b) const { + FP32Vec16 operator-(const FP32Vec16& b) const { return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), _mm256_sub_ps(reg_high, b.reg_high)); } - FP32Vec16 
operator/(const FP32Vec16 &b) const { + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), _mm256_div_ps(reg_high, b.reg_high)); } @@ -484,7 +483,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> { return low.reduce_sum() + high.reduce_sum(); } - template <int group_size> float reduce_sub_sum(int idx) { + template <int group_size> + float reduce_sub_sum(int idx) { float sum = 0.0; static_assert(VEC_ELEM_NUM % group_size == 0); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); @@ -507,7 +507,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { return sum; } - void save(float *ptr) const { + void save(float* ptr) const { _mm256_storeu_ps(ptr, reg_low); _mm256_storeu_ps(ptr + 8, reg_high); } @@ -515,7 +515,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { #endif #ifdef __AVX512F__ -struct INT8Vec16: public Vec<INT8Vec16> { +struct INT8Vec16 : public Vec<INT8Vec16> { constexpr static int VEC_ELEM_NUM = 16; union AliasReg { __m128i reg; @@ -523,14 +523,12 @@ struct INT8Vec16: public Vec<INT8Vec16> { }; __m128i reg; - - explicit INT8Vec16(const FP32Vec16& vec) : reg( - _mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) - ) {} - void save(int8_t* ptr) const { - _mm_storeu_epi8(ptr, reg); - } + explicit INT8Vec16(const FP32Vec16& vec) + : reg(_mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32( + vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))) {} + + void save(int8_t* ptr) const { _mm_storeu_epi8(ptr, reg); } void save(int8_t* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -540,71 +538,92 @@ struct INT8Vec16: public Vec<INT8Vec16> { }; #endif -template <typename T> struct VecType { using vec_type = void; }; +template <typename T> +struct VecType { + using vec_type = void; +}; -template <typename T> using vec_t = typename VecType<T>::vec_type; +template <typename T> +using vec_t = typename VecType<T>::vec_type; -template <> struct VecType<float> { using vec_type = FP32Vec8; }; +template <> +struct VecType<float> { + using vec_type = FP32Vec8; +}; -template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; }; +template <> +struct VecType<c10::Half> { + using vec_type = FP16Vec8; +}; -template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; }; +template <> +struct VecType<c10::BFloat16> { + using vec_type = BF16Vec8; +}; -template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; } +template <typename T> +void storeFP32(float v, T* ptr) { + *ptr = v; +} -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc = acc + a * b; } -template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) { - *reinterpret_cast<unsigned short *>(ptr) = +template <> +inline void storeFP32<c10::Half>(float v, c10::Half* ptr) { + *reinterpret_cast<unsigned short*>(ptr) = _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } -inline FP16Vec8::FP16Vec8(const FP32Vec8 &v) +inline FP16Vec8::FP16Vec8(const FP32Vec8& v) : reg(_mm256_cvtps_ph(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} #ifdef __AVX512F__ -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) : reg(_mm512_cvtps_ph(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} #else -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) - : reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} +inline 
FP16Vec16::FP16Vec16(const FP32Vec16& v) + : reg(_mm256_insertf128_si256( + _mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), + FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} #endif #ifdef __AVX512BF16__ -template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); +template <> +inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bfloat16*>(ptr) = _mm_cvtness_sbh(v); } -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); } #else -template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast<c10::BFloat16 *>(&v); +template <> +inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast<c10::BFloat16*>(&v); *ptr = *(v_ptr + 1); } -#ifdef __AVX512F__ -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + #ifdef __AVX512F__ +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg(_mm256_cvtepi32_epi16( _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg(_mm512_cvtepi32_epi16( _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} -#else -namespace{ + #else +namespace { __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { __m256i ai = _mm256_castps_si256(a); ai = _mm256_srli_epi32(ai, 16); @@ -612,21 +631,21 @@ __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { ai = _mm256_permute4x64_epi64(ai, 0b00111001); return _mm256_extracti128_si256(ai, 0); } -} +} // namespace -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low)); BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); } -#endif // __AVX512F__ -#endif // __AVX512BF16__ + #endif // __AVX512F__ +#endif // __AVX512BF16__ -inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } +inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); } -}; // namespace vec_op +}; // namespace vec_op #endif diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index 85e359aa57113..07c9e46c27b06 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -27,8 +27,7 @@ inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { int max_shared_mem_per_block_opt_in = 0; cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, - cudaDevAttrMaxSharedMemoryPerBlockOptin, - device); + cudaDevAttrMaxSharedMemoryPerBlockOptin, device); return max_shared_mem_per_block_opt_in; } diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index e92104399342d..36cf8e7440eca 100644 --- a/docs/source/contributing/overview.md +++ 
b/docs/source/contributing/overview.md @@ -25,10 +25,12 @@ Check out the [building from source](#build-from-source) documentation for detai ```bash pip install -r requirements-dev.txt -# linting and formatting -bash format.sh -# Static type checking -mypy +# Linting, formatting and static type checking +pre-commit install + +# You can manually run pre-commit with +pre-commit run --all-files + # Unit tests pytest tests/ ``` @@ -88,7 +90,8 @@ If the PR spans more than one category, please include all relevant prefixes. The PR needs to meet the following code quality standards: - We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). -- Pass all linter checks. Please use <gh-file:format.sh> to format your code. +- Pass all linter checks. Please use `pre-commit` to format your code. See + <https://pre-commit.com/#usage> if `pre-commit` is new to you. - The code needs to be well-documented to ensure future contributors can easily understand the code. - Include sufficient tests to ensure the project stays correct and robust. This diff --git a/format.sh b/format.sh deleted file mode 100755 index 2277eef93c745..0000000000000 --- a/format.sh +++ /dev/null @@ -1,321 +0,0 @@ -#!/usr/bin/env bash -# YAPF formatter, adapted from ray and skypilot. -# -# Usage: -# # Do work and commit your work. - -# # Format files that differ from origin/main. -# bash format.sh - -# # Commit changed files with message 'Run yapf and ruff' -# -# -# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. -# You are encouraged to run this locally before pushing changes for review. - -# Cause the script to exit if a single command fails -set -eo pipefail - -# this stops git rev-parse from failing if we run this from the .git directory -builtin cd "$(dirname "${BASH_SOURCE:-$0}")" -ROOT="$(git rev-parse --show-toplevel)" -builtin cd "$ROOT" || exit 1 - -check_command() { - if ! command -v "$1" &> /dev/null; then - echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`" - exit 1 - fi -} - -check_command yapf -check_command ruff -check_command mypy -check_command codespell -check_command isort -check_command clang-format - -YAPF_VERSION=$(yapf --version | awk '{print $2}') -RUFF_VERSION=$(ruff --version | awk '{print $2}') -MYPY_VERSION=$(mypy --version | awk '{print $2}') -CODESPELL_VERSION=$(codespell --version) -ISORT_VERSION=$(isort --vn) -CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') -PYMARKDOWNLNT_VERSION=$(pymarkdownlnt version | awk '{print $1}') - -# # params: tool name, tool version, required version -tool_version_check() { - expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3) - if [[ "$2" != "$expected" ]]; then - echo "❓❓Wrong $1 version installed: $expected is required, not $2." - exit 1 - fi -} - -tool_version_check "yapf" "$YAPF_VERSION" -tool_version_check "ruff" "$RUFF_VERSION" -tool_version_check "mypy" "$MYPY_VERSION" -tool_version_check "isort" "$ISORT_VERSION" -tool_version_check "codespell" "$CODESPELL_VERSION" -tool_version_check "clang-format" "$CLANGFORMAT_VERSION" -tool_version_check "pymarkdownlnt" "$PYMARKDOWNLNT_VERSION" - -YAPF_FLAGS=( - '--recursive' - '--parallel' -) - -YAPF_EXCLUDES=( - '--exclude' 'build/**' -) - -# Format specified files -format() { - yapf --in-place "${YAPF_FLAGS[@]}" "$@" -} - -# Format files that differ from main branch. 
Ignores dirs that are not slated -# for autoformat yet. -format_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause yapf to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" - fi - -} - -# Format all files -format_all() { - yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" . -} - -## This flag formats individual files. --files *must* be the first command line -## arg to use this option. -if [[ "$1" == '--files' ]]; then - format "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is formatted. -elif [[ "$1" == '--all' ]]; then - format_all -else - # Format only the files that changed in last commit. - format_changed -fi -echo 'vLLM yapf: Done' - -# Run mypy -echo 'vLLM mypy:' -tools/mypy.sh -echo 'vLLM mypy: Done' - - -# If git diff returns a file that is in the skip list, the file may be checked anyway: -# https://github.com/codespell-project/codespell/issues/1915 -# Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem -CODESPELL_EXCLUDES=( - '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**' -) - -# check spelling of specified files -spell_check() { - codespell "$@" -} - -spell_check_all(){ - codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" -} - -# Spelling check of files that differ from main branch. -spell_check_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - codespell "${CODESPELL_EXCLUDES[@]}" - fi -} - -# Run Codespell -## This flag runs spell check of individual files. --files *must* be the first command line -## arg to use this option. -if [[ "$1" == '--files' ]]; then - spell_check "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - spell_check_all -else - # Check spelling only of the files that changed in last commit. - spell_check_changed -fi -echo 'vLLM codespell: Done' - - -# Lint specified files -lint() { - ruff check "$@" -} - -# Lint files that differ from main branch. Ignores dirs that are not slated -# for autolint yet. -lint_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! 
git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - ruff check - fi - -} - -# Run Ruff -### This flag lints individual files. --files *must* be the first command line -### arg to use this option. -if [[ "$1" == '--files' ]]; then - lint "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - lint vllm tests -else - # Format only the files that changed in last commit. - lint_changed -fi -echo 'vLLM ruff: Done' - -# check spelling of specified files -isort_check() { - isort "$@" -} - -isort_check_all(){ - isort . -} - -# Spelling check of files that differ from main branch. -isort_check_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - isort - fi -} - -# Run Isort -# This flag runs spell check of individual files. --files *must* be the first command line -# arg to use this option. -if [[ "$1" == '--files' ]]; then - isort_check "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - isort_check_all -else - # Check spelling only of the files that changed in last commit. - isort_check_changed -fi -echo 'vLLM isort: Done' - -# Clang-format section -# Exclude some files for formatting because they are vendored -# NOTE: Keep up to date with .github/workflows/clang-format.yml -CLANG_FORMAT_EXCLUDES=( - 'csrc/moe/topk_softmax_kernels.cu' - 'csrc/quantization/gguf/ggml-common.h' - 'csrc/quantization/gguf/dequantize.cuh' - 'csrc/quantization/gguf/vecdotq.cuh' - 'csrc/quantization/gguf/mmq.cuh' - 'csrc/quantization/gguf/mmvq.cuh' -) - -# Format specified files with clang-format -clang_format() { - clang-format -i "$@" -} - -# Format files that differ from main branch with clang-format. -clang_format_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause clang-format to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that - # exist on both branches. 
- MERGEBASE="$(git merge-base origin/main HEAD)" - - # Get the list of changed files, excluding the specified ones - changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | (grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") || echo -e)) - if [ -n "$changed_files" ]; then - echo "$changed_files" | xargs -P 5 clang-format -i - fi -} - -# Format all files with clang-format -clang_format_all() { - find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ - | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") \ - | xargs clang-format -i -} - -# Run clang-format -if [[ "$1" == '--files' ]]; then - clang_format "${@:2}" -elif [[ "$1" == '--all' ]]; then - clang_format_all -else - clang_format_changed -fi -echo 'vLLM clang-format: Done' - -echo 'vLLM actionlint:' -tools/actionlint.sh -color -echo 'vLLM actionlint: Done' - -echo 'vLLM shellcheck:' -tools/shellcheck.sh -echo 'vLLM shellcheck: Done' - -echo 'excalidraw png check:' -tools/png-lint.sh -echo 'excalidraw png check: Done' - -if ! git diff --quiet &>/dev/null; then - echo - echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" - git --no-pager diff --name-only - echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker." - - exit 1 -else - echo "✨🎉 Format check passed! Congratulations! 🎉✨" -fi - -echo 'vLLM doc-lint:' -tools/doc-lint.sh -echo 'vLLM doc-lint: Done' diff --git a/pyproject.toml b/pyproject.toml index 82275ccafb572..8f2e20d0f5800 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,11 @@ build-backend = "setuptools.build_meta" [tool.setuptools_scm] # version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()` +[tool.yapfignore] +ignore_patterns = [ + "build/**", +] + [tool.ruff] # Allow lines to be as long as 80. 
line-length = 80 @@ -52,6 +57,9 @@ ignore = [ "B007", # f-string format "UP032", + # Python 3.8 typing + "UP006", "UP035", + ] [tool.mypy] diff --git a/requirements-lint.txt b/requirements-lint.txt index ffc73f90a0d48..62446f94048df 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -1,15 +1,2 @@ # formatting -yapf==0.32.0 -toml==0.10.2 -tomli==2.0.2 -ruff==0.6.5 -codespell==2.3.0 -isort==5.13.2 -clang-format==18.1.5 -pymarkdownlnt==0.9.26 - -# type checking -mypy==1.11.1 -types-PyYAML -types-requests -types-setuptools +pre-commit==4.0.1 diff --git a/tools/actionlint.sh b/tools/actionlint.sh deleted file mode 100755 index f6a8b5e83a2de..0000000000000 --- a/tools/actionlint.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -if command -v actionlint &> /dev/null; then - actionlint "$@" - exit 0 -elif [ -x ./actionlint ]; then - ./actionlint "$@" - exit 0 -fi - -# download a binary to the current directory - v1.7.3 -bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash) -./actionlint "$@" diff --git a/tools/doc-lint.sh b/tools/doc-lint.sh deleted file mode 100755 index 19a55ddfa91c4..0000000000000 --- a/tools/doc-lint.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -pymarkdownlnt scan docs -r From c5c06209ec1d90146dd12095d7bff3326aa6dd15 Mon Sep 17 00:00:00 2001 From: Yuan Tang <terrytangyuan@gmail.com> Date: Mon, 20 Jan 2025 01:58:29 -0500 Subject: [PATCH 23/50] [DOC] Fix typo in docstring and assert message (#12194) Signed-off-by: Yuan Tang <terrytangyuan@gmail.com> --- vllm/engine/output_processor/single_step.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index da3185f33dbe9..55c56abea0da3 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -102,9 +102,9 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, Args: seq_group: the output is associated with this :class:`SequenceGroup` - output: the :class:`SequenceGroupOutput` for a single scheduler step + outputs: the :class:`SequenceGroupOutput` for a single scheduler step """ - assert len(outputs) == 1, ("Single step should only has 1 output.") + assert len(outputs) == 1, "Single step should only have 1 output." output = outputs[0] assert isinstance(output, CompletionSequenceGroupOutput) single_step_process_prompt_logprob(self, seq_group, output) From d2643128f7741b937435b00fecde7d6b2e351d0c Mon Sep 17 00:00:00 2001 From: Yuan Tang <terrytangyuan@gmail.com> Date: Mon, 20 Jan 2025 01:59:00 -0500 Subject: [PATCH 24/50] [DOC] Add missing docstring in LLMEngine.add_request() (#12195) Signed-off-by: Yuan Tang <terrytangyuan@gmail.com> --- vllm/engine/llm_engine.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 88c21f9a6d31b..b6bba1d67b408 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -689,7 +689,9 @@ def add_request( :class:`~vllm.PoolingParams` for pooling. arrival_time: The arrival time of the request. If None, we use the current monotonic time. + lora_request: The LoRA request to add. trace_headers: OpenTelemetry trace headers. + prompt_adapter_request: The prompt adapter request to add. priority: The priority of the request. Only applicable with priority scheduling. 
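
For context, the `add_request()` parameters documented in the patch above are typically exercised as in the following minimal sketch. The model name, prompt, and sampling settings are illustrative placeholders only, and the optional `lora_request`, `trace_headers`, and `prompt_adapter_request` arguments simply default to `None` when omitted.

```python
# Minimal usage sketch for LLMEngine.add_request(); the model and
# sampling values are placeholders, not part of the patch itself.
from vllm import EngineArgs, LLMEngine, SamplingParams

engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))

engine.add_request(
    request_id="req-0",                    # unique identifier per request
    prompt="The capital of France is",
    params=SamplingParams(max_tokens=16),
    priority=0,  # only honored when priority scheduling is enabled
)

# Drive the engine until the request completes.
while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)
```

Each call to `step()` runs one scheduler iteration and returns the `RequestOutput` objects produced so far, which is why the loop above polls until `has_unfinished_requests()` is false.
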
From 0974c9bc5c0252ecb25f440139936529657452ab Mon Sep 17 00:00:00 2001 From: Yuan Tang <terrytangyuan@gmail.com> Date: Mon, 20 Jan 2025 01:59:20 -0500 Subject: [PATCH 25/50] [Bugfix] Fix incorrect types in LayerwiseProfileResults (#12196) Signed-off-by: Yuan Tang <terrytangyuan@gmail.com> --- vllm/profiler/layerwise_profile.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 33babfebdca1e..29c0edd0ee535 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -1,7 +1,7 @@ import copy from collections import defaultdict from dataclasses import asdict, dataclass, field -from typing import Callable, Dict, List, Optional, Tuple, TypeAlias, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, TypeAlias, Union import pandas as pd from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult @@ -128,7 +128,7 @@ def export_summary_stats_table_csv(self, filename: str): ]) df.to_csv(filename) - def convert_stats_to_dict(self) -> str: + def convert_stats_to_dict(self) -> dict[str, Any]: return { "metadata": { "num_running_seqs": self.num_running_seqs @@ -227,7 +227,7 @@ def _total_cuda_time(self): [self._cumulative_cuda_time(root) for root in self._module_tree]) def _build_stats_trees(self): - summary_dict: Dict[str, self.StatsTreeNode] = {} + summary_dict: Dict[str, _StatsTreeNode] = {} total_cuda_time = self._total_cuda_time() def pct_cuda_time(cuda_time_us): From 83609791d2ceeb628e0d1f5ea60a64c132eb083c Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Mon, 20 Jan 2025 14:59:46 +0800 Subject: [PATCH 26/50] [Model] Add Qwen2 PRM model support (#12202) Signed-off-by: Isotr0py <2037008807@qq.com> --- docs/source/models/supported_models.md | 5 +++ .../embedding/language/test_embedding.py | 9 ++-- tests/models/registry.py | 1 + vllm/model_executor/models/qwen2_rm.py | 42 +++++++++++++++---- vllm/model_executor/models/registry.py | 1 + 5 files changed, 45 insertions(+), 13 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index eb1bde9ec0089..3da5aaf713c1f 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -470,6 +470,11 @@ of the whole prompt are extracted from the normalized hidden state corresponding - `Qwen/Qwen2.5-Math-RM-72B`, etc. - ✅︎ - ✅︎ +* - `Qwen2ForProcessRewardModel` + - Qwen2-based + - `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc. 
+ - ✅︎ + - ✅︎ ``` If your model is not in the above list, we will try to automatically convert the model using diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 04ab4dd7371a3..bb47d14807b55 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -17,14 +17,15 @@ marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-large"), - # [Encoder-decoder] - pytest.param("intfloat/e5-mistral-7b-instruct", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + # [Decoder-only] pytest.param("BAAI/bge-multilingual-gemma2", marks=[pytest.mark.core_model]), - pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + pytest.param("intfloat/e5-mistral-7b-instruct", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), + pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + # [Encoder-decoder] pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], ) diff --git a/tests/models/registry.py b/tests/models/registry.py index cb0521cfe80a7..9603ea8817cac 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -155,6 +155,7 @@ class _HfExamplesInfo: "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), + "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"), "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501 "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), # noqa: E501 diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 988d682d36be3..593ce4857af0f 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -12,7 +12,7 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.pooler import Pooler, PoolingType, SimplePooler from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput @@ -32,7 +32,7 @@ def forward(self, input): return self.activation(input) -class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): +class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -60,7 +60,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config @@ -74,14 +73,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size, quant_config=quant_config), ReLU(), - RowParallelLinear(config.hidden_size, 1, + RowParallelLinear(config.hidden_size, + config.num_labels, quant_config=quant_config), ) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - 
pooling_type=PoolingType.ALL, - normalize=False, - softmax=False) + self._pooler: SimplePooler self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -115,3 +111,31 @@ def load_weights(self, weights: Iterable[Tuple[str, loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) return loader.load_weights(weights) + + +class Qwen2ForRewardModel(Qwen2RewardBaseModel): + + def __init__(self, *, vllm_config, prefix=""): + vllm_config.model_config.hf_config.num_labels = 1 + super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.ALL, + normalize=False, + softmax=False) + + +class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): + + def __init__(self, *, vllm_config, prefix=""): + vllm_config.model_config.hf_config.num_labels = 2 + super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.STEP, + normalize=False, + softmax=True, + step_tag_id=151651, + ) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 311f91472783b..8d2719ca2d00d 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -127,6 +127,7 @@ "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), + "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"), "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 From 59a0192fb9bef026086d0a2ed32705d870a9466a Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 20 Jan 2025 15:00:59 +0800 Subject: [PATCH 27/50] [Core] Interface for accessing model from `VllmRunner` (#10353) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- tests/conftest.py | 5 + tests/engine/test_custom_executor.py | 4 +- .../test_model_load_with_params.py | 64 ++--- .../decoder_only/language/test_jamba.py | 7 +- .../decoder_only/language/test_mamba.py | 7 +- .../decoder_only/language/test_models.py | 7 +- .../vision_language/test_qwen2_vl.py | 49 ++-- .../embedding/language/test_cls_models.py | 7 +- .../embedding/language/test_embedding.py | 7 +- tests/quantization/test_compressed_tensors.py | 242 ++++++++++-------- tests/quantization/test_fp8.py | 52 ++-- tests/quantization/test_lm_head.py | 37 +-- tests/quantization/test_quark.py | 23 +- tests/tensorizer_loader/test_tensorizer.py | 34 ++- vllm/engine/llm_engine.py | 17 +- vllm/entrypoints/llm.py | 52 ++-- vllm/executor/executor_base.py | 50 +++- vllm/executor/mp_distributed_executor.py | 2 +- .../model_executor/model_loader/tensorizer.py | 17 +- vllm/spec_decode/ngram_worker.py | 12 +- .../spec_decode/smaller_tp_proposer_worker.py | 12 + vllm/spec_decode/spec_decode_worker.py | 4 + vllm/v1/executor/multiproc_executor.py | 16 +- vllm/v1/worker/gpu_model_runner.py | 3 + vllm/v1/worker/gpu_worker.py | 4 + vllm/worker/cpu_model_runner.py | 3 + vllm/worker/hpu_model_runner.py | 4 + vllm/worker/model_runner.py | 3 + vllm/worker/model_runner_base.py | 9 +- vllm/worker/neuron_model_runner.py | 3 + vllm/worker/openvino_model_runner.py | 3 + 
vllm/worker/openvino_worker.py | 4 + vllm/worker/tpu_model_runner.py | 3 + vllm/worker/worker_base.py | 12 + vllm/worker/xpu_model_runner.py | 3 + 35 files changed, 474 insertions(+), 307 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 95af4ac1eb17b..279c1bf9a3776 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -244,6 +244,7 @@ def video_assets() -> _VideoAssets: _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) +_R = TypeVar("_R") class HfRunner: @@ -930,6 +931,10 @@ def score( req_outputs = self.model.score(text_1, text_2) return [req_output.outputs.score for req_output in req_outputs] + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + executor = self.model.llm_engine.model_executor + return executor.apply_model(func) + def __enter__(self): return self diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py index fdfcd4f4c9d50..0e33f3662da82 100644 --- a/tests/engine/test_custom_executor.py +++ b/tests/engine/test_custom_executor.py @@ -51,7 +51,9 @@ def test_custom_executor(model, tmp_path): assert not os.path.exists(".marker") engine_args = EngineArgs( - model=model, distributed_executor_backend=CustomUniExecutor) + model=model, + distributed_executor_backend=CustomUniExecutor, + ) engine = LLMEngine.from_engine_args(engine_args) sampling_params = SamplingParams(max_tokens=1) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 0609fd96825e3..9c1f784c1c93b 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -25,13 +25,12 @@ def test_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME, revision=REVISION, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -46,11 +45,13 @@ def test_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_config["do_lower_case"] assert model_tokenizer.tokenizer.model_max_length == 512 - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, BertEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.CLS - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, BertEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.CLS + assert model._pooler.normalize + + vllm_model.apply_model(check_model) + # assert output assert output @@ -64,13 +65,12 @@ def test_roberta_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME_ROBERTA, revision=REVISION_ROBERTA, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - 
model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -84,11 +84,12 @@ def test_roberta_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-large" assert not model_tokenizer.tokenizer_config["do_lower_case"] - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, RobertaEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.MEAN - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.MEAN + assert model._pooler.normalize + + vllm_model.apply_model(check_model) # assert output assert output @@ -103,17 +104,18 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner): model_name = "FacebookAI/roberta-base" with vllm_runner(model_name=model_name, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_tokenizer = model.model.llm_engine.tokenizer + model_tokenizer = vllm_model.model.llm_engine.tokenizer assert model_tokenizer.tokenizer_id == model_name - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert not hasattr(model, "lm_head") - assert isinstance(model, RobertaEmbeddingModel) - assert isinstance(model._pooler, CLSPool) + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert not hasattr(model, "lm_head") + assert isinstance(model._pooler, CLSPool) + + vllm_model.apply_model(check_model) assert output diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 057b04349e8b7..2e06b10fbb827 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -33,10 +33,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 06739e8f02253..1ad4f5aae8f5b 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -51,10 +51,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 4e110366a09f3..c7efa4edbbc0a 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -73,10 +73,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_logprobs_close( outputs_0_lst=hf_outputs, diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 2fd22f0cc88ec..5a485f3d81747 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -5,7 +5,6 @@ import torch from PIL import Image -from vllm.entrypoints.llm import LLM from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video @@ -69,7 +68,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): def batch_make_image_embeddings( image_batches: List[Union[Image.Image, List[Image.Image]]], processor, - llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]: """batched image embeddings for Qwen2-VL This will infer all images' embeddings in a single batch, @@ -106,16 +105,18 @@ def batch_make_image_embeddings( image_grid_thw = preprocess_result["image_grid_thw"] # pixel values to embeddings & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker. \ - model_runner.model.visual + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - image_grid_thw_on_device = image_grid_thw.to(visual.device, - dtype=torch.int64) - image_embeds = visual(pixel_values_on_device, - grid_thw=image_grid_thw_on_device) + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + image_grid_thw_on_device = image_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=image_grid_thw_on_device) + + image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptImageEmbeddingInput] = [] @@ -150,7 +151,7 @@ def batch_make_image_embeddings( def batch_make_video_embeddings( video_batches: PromptVideoInput, processor, - llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]: """batched video embeddings for Qwen2-VL A NDArray represents a single video's all frames. 
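One subtlety in the Qwen2-VL refactor above: `apply_model` returns a list with one result per worker rather than a bare tensor, which is why these call sites now wrap the result in `torch.concat`. A minimal, runnable sketch of that contract (the tensor shape is illustrative, and a single-worker setup is assumed):

```python
import torch

# apply_model hands back [result_from_worker_0, result_from_worker_1, ...];
# in the common single-worker test setup this is a one-element list.
per_worker_embeds = [torch.ones(3, 8)]

# Concatenating collapses the per-worker list back into one tensor; with a
# single worker this is effectively a no-op.
image_embeds = torch.concat(per_worker_embeds)
assert image_embeds.shape == (3, 8)
```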
@@ -187,16 +188,18 @@ def batch_make_video_embeddings( video_grid_thw = preprocess_result["video_grid_thw"] # pixel values to embeddings & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker.\ - model_runner.model.visual + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + video_grid_thw_on_device = video_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=video_grid_thw_on_device) - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - video_grid_thw_on_device = video_grid_thw.to(visual.device, - dtype=torch.int64) - video_embeds = visual(pixel_values_on_device, - grid_thw=video_grid_thw_on_device) + video_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptVideoEmbeddingInput] = [] @@ -278,9 +281,9 @@ def run_embedding_input_test( max_tokens, num_logprobs=num_logprobs, images=batch_make_image_embeddings( - images, processor, vllm_model.model) if images else None, + images, processor, vllm_model) if images else None, videos=batch_make_video_embeddings( - videos, processor, vllm_model.model) if videos else None) + videos, processor, vllm_model) if videos else None) for prompts, images, videos in inputs ] diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index 6673a9fc22f69..0cbe4afe96c0a 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -24,10 +24,13 @@ def test_classification_models( ) -> None: with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) with hf_runner(model, dtype=dtype, diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index bb47d14807b55..e17198e385475 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -62,10 +62,13 @@ def test_models( max_model_len=None, **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_embeddings_close( embeddings_0_lst=hf_outputs, diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 92436889ecffe..0cd86cef0a475 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -30,50 +30,55 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): model_path, strategy, quant_type, shape_0, is_symmetric = model_args with vllm_runner(model_path, enforce_eager=True) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - - qkv_proj = layer.self_attn.qkv_proj - o_proj = layer.self_attn.o_proj - gate_up_proj = layer.mlp.gate_up_proj - down_proj = layer.mlp.down_proj - - # assert zp for symmetric and asymmetric cases - def zp_valid(zp: Optional[torch.Tensor]): - if is_symmetric: - return zp is None - - return zp is not None and zp.dtype is torch.int32 - - assert zp_valid(qkv_proj.input_zero_point) - assert zp_valid(o_proj.input_zero_point) - assert zp_valid(gate_up_proj.input_zero_point) - assert zp_valid(down_proj.input_zero_point) - - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(gate_up_proj.quant_method, - CompressedTensorsLinearMethod) - assert isinstance(down_proj.quant_method, - CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) - - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.scheme.is_static_input_scheme - expected_type = torch.int8 - - assert qkv_proj.weight.dtype is expected_type - assert o_proj.weight.dtype is expected_type - assert gate_up_proj.weight.dtype is expected_type - - if qkv_proj.scheme.strategy == "tensor": - # Make sure it is a channelwise buffer - # After running process_weights_after_loading - assert len(qkv_proj.weight_scale.shape) == 2 - assert qkv_proj.weight_scale.shape[0] == shape_0 - assert qkv_proj.weight_scale.shape[1] == 1 - assert qkv_proj.weight_scale.dtype is torch.float32 - assert qkv_proj.input_scale.dtype is torch.float32 + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + # assert zp for symmetric and asymmetric cases + def zp_valid(zp: Optional[torch.Tensor]): + if is_symmetric: + return zp is None + + return zp is not None and zp.dtype is torch.int32 + + assert zp_valid(qkv_proj.input_zero_point) + assert zp_valid(o_proj.input_zero_point) + assert zp_valid(gate_up_proj.input_zero_point) + assert zp_valid(down_proj.input_zero_point) + + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(o_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(gate_up_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(down_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) + + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.scheme.is_static_input_scheme + expected_type = torch.int8 + + assert qkv_proj.weight.dtype is expected_type + assert o_proj.weight.dtype is expected_type + assert gate_up_proj.weight.dtype is expected_type + + if 
qkv_proj.scheme.strategy == "tensor": + # Make sure it is a channelwise buffer + # After running process_weights_after_loading + assert len(qkv_proj.weight_scale.shape) == 2 + assert qkv_proj.weight_scale.shape[0] == shape_0 + assert qkv_proj.weight_scale.shape[1] == 1 + assert qkv_proj.weight_scale.dtype is torch.float32 + assert qkv_proj.input_scale.dtype is torch.float32 + + llm.apply_model(check_model) output = llm.generate_greedy(["Hello my name is"], max_tokens=20) assert output @@ -129,16 +134,20 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): model_path, strategy = model_args with vllm_runner(model_path, dtype=torch.float16) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) + assert not qkv_proj.scheme.is_static_input_scheme + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.weight.dtype is torch.int8 - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) - assert not qkv_proj.scheme.is_static_input_scheme - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.weight.dtype is torch.int8 + llm.apply_model(check_model) output = llm.generate_greedy(["Hello my name is"], max_tokens=20) assert output @@ -152,19 +161,24 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): model, strategy, group, pack_factor = wNa16_args with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) + def check_model(model): + layer = model.model.layers[0] - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.scheme.group_size == (-1 if group is None else group) + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) - assert qkv_proj.weight_packed.dtype is torch.int32 - assert qkv_proj.weight_scale.dtype is torch.float16 - assert qkv_proj.scheme.pack_factor == pack_factor + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.scheme.group_size == (-1 + if group is None else group) + + assert qkv_proj.weight_packed.dtype is torch.int32 + assert qkv_proj.weight_scale.dtype is torch.float16 + assert qkv_proj.scheme.pack_factor == pack_factor + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -173,14 +187,18 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): def test_compressed_tensors_w4a16_marlin24(vllm_runner): model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = 
model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) - assert qkv_proj.weight_packed.dtype is torch.int32 + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) + assert qkv_proj.weight_packed.dtype is torch.int32 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -189,23 +207,27 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner): def test_compressed_tensors_fp8(vllm_runner): model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance( - qkv_proj.scheme, - (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) + qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.input_scale.dtype is torch.float32 + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance( + qkv_proj.scheme, + (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) - if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): - assert len(qkv_proj.input_scale.shape) == 0 - assert qkv_proj.weight.dtype is torch.float8_e4m3fn - assert qkv_proj.weight_scale.dtype is torch.float32 - assert len(qkv_proj.weight_scale.shape) == 0 + assert qkv_proj.input_scale.dtype is torch.float32 + + if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + assert qkv_proj.weight_scale.dtype is torch.float32 + assert len(qkv_proj.weight_scale.shape) == 0 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -248,12 +270,15 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy): def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn - _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn + _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) @@ -273,12 +298,15 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.scheme.weights_dtype == torch.int8 - _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + def 
check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.int8 + _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) @@ -293,20 +321,24 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4): model = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - - qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensors24) - - assert qkv_proj.scheme.weight_quant is None - assert qkv_proj.scheme.input_quant is None - assert not qkv_proj.scheme.quantized - assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map - sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 - assert sparsity_map.get("Linear").format == "dense" - assert sparsity_map.get("Linear").sparsity_structure == "2:4" + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensors24) + + assert qkv_proj.scheme.weight_quant is None + assert qkv_proj.scheme.input_quant is None + assert not qkv_proj.scheme.quantized + assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map + sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 + assert sparsity_map.get("Linear").format == "dense" + assert sparsity_map.get("Linear").sparsity_structure == "2:4" + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index a0c1d7e24c503..4bff734746297 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -49,13 +49,17 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - attn = model.model.layers[0].self_attn.attn - assert isinstance(attn.quant_method, Fp8KVCacheMethod) - # NOTE: it is valid for scales to be 1.0 (default value), but we know - # these checkpoints have scales < 1.0 - assert 0.0 < attn._k_scale < 1.0 - assert 0.0 < attn._v_scale < 1.0 + def check_model(model): + attn = model.model.layers[0].self_attn.attn + + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + + # NOTE: it is valid for scales to be 1.0 (default value), but + # we know these checkpoints have scales < 1.0 + assert 0.0 < attn._k_scale < 1.0 + assert 0.0 < attn._v_scale < 1.0 + + llm.apply_model(check_model) # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy @@ -77,22 +81,24 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, quantization="fp8", kv_cache_dtype=kv_cache_dtype) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - fc1 = model.model.decoder.layers[0].fc1 - assert 
isinstance(fc1.quant_method, Fp8LinearMethod) - if kv_cache_dtype == "fp8": - attn = model.model.decoder.layers[0].self_attn.attn - assert isinstance(attn.quant_method, Fp8KVCacheMethod) - assert attn._k_scale == 1.0 - assert attn._v_scale == 1.0 - - if current_platform.has_device_capability(89) and not force_marlin: - # For GPUs with hardware support, we keep weights in fp8 - assert fc1.weight.dtype == torch.float8_e4m3fn - else: - # For GPUs without hardware support, we pack the fp8 weights - # for weight-only quantization using Marlin kernels - assert fc1.weight.dtype == torch.int32 + def check_model(model): + fc1 = model.model.decoder.layers[0].fc1 + assert isinstance(fc1.quant_method, Fp8LinearMethod) + if kv_cache_dtype == "fp8": + attn = model.model.decoder.layers[0].self_attn.attn + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + assert attn._k_scale == 1.0 + assert attn._v_scale == 1.0 + + if current_platform.has_device_capability(89) and not force_marlin: + # For GPUs with hardware support, we keep weights in fp8 + assert fc1.weight.dtype == torch.float8_e4m3fn + else: + # For GPUs without hardware support, we pack the fp8 weights + # for weight-only quantization using Marlin kernels + assert fc1.weight.dtype == torch.int32 + + llm.apply_model(check_model) @pytest.mark.skipif(not is_quant_method_supported("fp8"), diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index ad526a4065101..fa2d9645ea47f 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -28,20 +28,23 @@ def test_lm_head( model_lm_head_quant: Tuple[str, bool], ) -> None: model, lm_head_quantized = model_lm_head_quant - vllm_model = vllm_runner(model, dtype=torch.float16, max_model_len=2048) - - lm_head_layer = (vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model.lm_head) - - if lm_head_quantized: - assert isinstance( - lm_head_layer.linear_method, - (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod)) - else: - assert isinstance(lm_head_layer.linear_method, - UnquantizedEmbeddingMethod) - - print( - vllm_model.generate_greedy(prompts=["Hello my name is"], - max_tokens=10)[0][1]) - del vllm_model + + with vllm_runner(model, dtype=torch.float16, + max_model_len=2048) as vllm_model: + + def check_model(model): + lm_head_layer = model.lm_head + + if lm_head_quantized: + assert isinstance(lm_head_layer.linear_method, + (GPTQLinearMethod, GPTQMarlinLinearMethod, + MarlinLinearMethod)) + else: + assert isinstance(lm_head_layer.linear_method, + UnquantizedEmbeddingMethod) + + vllm_model.apply_model(check_model) + + print( + vllm_model.generate_greedy(prompts=["Hello my name is"], + max_tokens=10)[0][1]) diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 27493a682b746..11382ad708faa 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -12,19 +12,22 @@ def test_quark_fp8(vllm_runner): model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] - assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) - assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8) + qkv_proj = layer.self_attn.qkv_proj - if isinstance(qkv_proj.scheme, QuarkW8A8Fp8): - assert len(qkv_proj.input_scale.shape) == 0 - assert qkv_proj.weight.dtype is torch.float8_e4m3fn - #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz - assert len(qkv_proj.weight_scale.shape) == 0 + assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) + assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8) + + if isinstance(qkv_proj.scheme, QuarkW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz + assert len(qkv_proj.weight_scale.shape) == 0 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index bf409d2d97aa1..6e7eec1c6ab34 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -3,6 +3,7 @@ import os import pathlib import subprocess +from functools import partial from unittest.mock import MagicMock, patch import openai @@ -24,7 +25,6 @@ # yapf: enable from vllm.utils import PlaceholderModule, import_from_path -from ..conftest import VllmRunner from ..utils import VLLM_PATH, RemoteOpenAIServer from .conftest import retry_until_skip @@ -58,16 +58,6 @@ def is_curl_installed(): return False -def get_torch_model(vllm_runner: VllmRunner): - return vllm_runner \ - .model \ - .llm_engine \ - .model_executor \ - .driver_worker \ - .model_runner \ - .model - - def write_keyfile(keyfile_path: str): encryption_params = EncryptionParams.random() pathlib.Path(keyfile_path).parent.mkdir(parents=True, exist_ok=True) @@ -121,8 +111,10 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( config_for_serializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) - serialize_vllm_model(get_torch_model(vllm_model), - 
config_for_serializing) + + vllm_model.apply_model( + partial(serialize_vllm_model, + tensorizer_config=config_for_serializing)) config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) @@ -175,8 +167,10 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(get_torch_model(vllm_model), - TensorizerConfig(tensorizer_uri=model_path)) + vllm_model.apply_model( + partial( + serialize_vllm_model, + tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) with vllm_runner( model_ref, @@ -215,8 +209,10 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(get_torch_model(vllm_model), - TensorizerConfig(tensorizer_uri=model_path)) + vllm_model.apply_model( + partial( + serialize_vllm_model, + tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) model_loader_extra_config = { "tensorizer_uri": str(model_path), @@ -337,7 +333,9 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): with vllm_runner(model_ref) as vllm_model: outputs = vllm_model.generate(prompts, sampling_params) - serialize_vllm_model(get_torch_model(vllm_model), config) + + vllm_model.apply_model( + partial(serialize_vllm_model, tensorizer_config=config)) assert is_vllm_tensorized(config) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index b6bba1d67b408..6a6b4a14a4c49 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -5,10 +5,10 @@ from contextlib import contextmanager from dataclasses import dataclass from functools import partial -from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict, - Iterable, List, Mapping, NamedTuple, Optional) +from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable, + List, Mapping, NamedTuple, Optional) from typing import Sequence as GenericSequence -from typing import Set, Tuple, Type, Union, cast, overload +from typing import Set, Type, Union, cast, overload import torch from typing_extensions import TypeVar, deprecated @@ -1818,17 +1818,6 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.model_executor.stop_profile() - def collective_rpc(self, - method: Union[str, Callable], - timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: - """ - See LLM.collective_rpc for more details. - """ - return self.model_executor.collective_rpc(method, timeout, args, - kwargs) - def check_health(self) -> None: if self.tokenizer: self.tokenizer.check_health() diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0cfe6be9ac767..27386daa4bbc9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -5,8 +5,9 @@ Tuple, Type, Union, cast, overload) import cloudpickle +import torch.nn as nn from tqdm import tqdm -from typing_extensions import deprecated +from typing_extensions import TypeVar, deprecated from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, @@ -42,6 +43,8 @@ logger = init_logger(__name__) +_R = TypeVar("_R", default=Any) + class LLM: """An LLM for generating texts from given prompts and sampling parameters. 
@@ -464,25 +467,42 @@ def generate( return self.engine_class.validate_outputs(outputs, RequestOutput) def collective_rpc(self, - method: Union[str, Callable], + method: Union[str, Callable[..., _R]], timeout: Optional[float] = None, args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + """ + Execute an RPC call on all workers. + + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. + + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. + + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. + """ + executor = self.llm_engine.model_executor + return executor.collective_rpc(method, timeout, args, kwargs) + + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: """ - Run a method on all workers, with homogeneous arguments. - The main extension point for the LLM entrypoint. - Users can provide custom worker class through `worker_cls` - argument, and implement new methods in the worker class. - Then, users can call the new methods through this API. - It is recommended to use this API to only pass control messages, - and set up data-plane communication to pass data. - The method can also be a callable, which will be serialized - and sent to all workers to execute. - If the method is a callable, it should accept an additional - `self` argument, in addition to the arguments passed in `args` - and `kwargs`. The `self` argument will be the worker object. + Run a function directly on the model inside each worker, + returning the result for each of them. """ - return self.llm_engine.collective_rpc(method, timeout, args, kwargs) + executor = self.llm_engine.model_executor + return executor.apply_model(func) def beam_search( self, diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index e5952b388c543..859e105f15d97 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -3,6 +3,9 @@ from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, Union) +import torch.nn as nn +from typing_extensions import TypeVar + from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -11,9 +14,12 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest, PoolerOutput from vllm.utils import make_async +from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) +_R = TypeVar("_R", default=Any) + class ExecutorBase(ABC): """Base class for all executors. 
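Taken together, the `vllm/entrypoints/llm.py` changes above replace ad-hoc `driver_worker.model_runner.model` access with two public entry points. A hedged sketch of how they might be exercised (the model choice, the probe lambda, and the assumption that each worker exposes a `rank` attribute are illustrative, not part of this patch):

```python
from vllm import LLM

llm = LLM(model="facebook/opt-125m")  # any small model works for the sketch

# apply_model runs the callable on the nn.Module held by each worker and
# returns one result per worker.
param_counts = llm.apply_model(
    lambda model: sum(p.numel() for p in model.parameters()))

# collective_rpc can invoke a worker method by name, or ship a serialized
# callable that receives the worker object itself as its first argument.
ranks = llm.collective_rpc(lambda worker: worker.rank)
```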
@@ -44,22 +50,37 @@ def __init__( @abstractmethod def _init_executor(self) -> None: - pass + raise NotImplementedError @abstractmethod def collective_rpc(self, - method: Union[str, Callable], + method: Union[str, Callable[..., _R]], timeout: Optional[float] = None, args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: """ - The main interface of the executor to run a method on all workers, - with homogeneous arguments. - If the args are heterogeneous, then we can pack them into a list, - and unpack them in the method of every worker, because every worker - knows their own rank. + Execute an RPC call on all workers. + + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. + + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. + + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. """ - pass + raise NotImplementedError def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available blocks for the GPU KV cache and @@ -97,6 +118,17 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks)) + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + """ + Run a function directly on the model inside each worker, + returning the result for each of them. + """ + + def rpc_func(worker: WorkerBase) -> _R: + return func(worker.get_model()) + + return self.collective_rpc(rpc_func) + def execute_model( self, execute_model_req: ExecuteModelRequest ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]: diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py index a80b0ee8b3122..78c86321d861d 100644 --- a/vllm/executor/mp_distributed_executor.py +++ b/vllm/executor/mp_distributed_executor.py @@ -148,7 +148,7 @@ def _run_workers( async_run_tensor_parallel_workers_only: bool = False, max_concurrent_workers: Optional[int] = None, **kwargs, - ) -> Any: + ) -> List[Any]: """Runs the given method on all workers. Args: diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index fbd4937112e11..5b4757072353f 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -459,16 +459,7 @@ def tensorize_vllm_model(engine_args: EngineArgs, stream.write(encryption_params.key) engine = LLMEngine.from_engine_args(engine_args) - if tensorizer_config._is_sharded: - # if the engine is a distributed engine (for tensor parallel) then each - # worker shard needs to serialize its part of the model. 
- engine.model_executor._run_workers( - "save_tensorized_model", - tensorizer_config=tensorizer_config, - ) - else: - # with a single worker, we can get to the underlying model directly - serialize_vllm_model( - engine.model_executor.driver_worker.model_runner.model, - tensorizer_config, - ) + engine.model_executor.collective_rpc( + "save_tensorized_model", + kwargs=dict(tensorizer_config=tensorizer_config), + ) diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index bb6b99135580e..e906b1789cde8 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -2,6 +2,7 @@ from typing import List, Optional, Set, Tuple import torch +import torch.nn as nn from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest @@ -10,6 +11,10 @@ from vllm.spec_decode.top1_proposer import Top1Proposer +class _DummyModel(nn.Module): + pass + + class NGramWorker(NonLLMProposerWorkerBase): """NGramWorker provides a light drafter without need for model. @@ -36,7 +41,6 @@ def set_ngram_window_size(self, ngram_prompt_lookup_min: int, def init_device(self): self.device = torch.device(f"{self.device_type}:{self.local_rank}") - self.load_model = lambda *args, **kwargs: None # Current NGramWorker only supports Top1Proposer self._proposer = Top1Proposer( @@ -45,6 +49,12 @@ def init_device(self): vocab_size=self.vocab_size, ) + def load_model(self) -> None: + pass # Dummy + + def get_model(self) -> nn.Module: + return _DummyModel() + def sampler_output( self, execute_model_req: ExecuteModelRequest, diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 8896b7dbc6b8a..c6ff5e52f9388 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,6 +1,7 @@ from typing import List, Optional, Set, Tuple import torch +import torch.nn as nn from vllm.distributed.parallel_state import (get_tp_group, init_model_parallel_group, @@ -15,6 +16,10 @@ logger = init_logger(__name__) +class _DummyModel(nn.Module): + pass + + class SmallerTpProposerWorker(ProposerWorkerBase): """Class which allows a speculative draft model to run with smaller tensor parallel degree than target model. 
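The speculative-decoding changes above make `get_model()` part of the worker contract even for drafters that own no real `nn.Module`, with `_DummyModel` serving as the placeholder. A sketch of a hypothetical model-free worker following the same pattern (the class name and the omitted machinery are illustrative):

```python
import torch.nn as nn


class _DummyModel(nn.Module):
    pass


class ModelFreeDrafterWorker:  # hypothetical drafter, mirroring NGramWorker
    def load_model(self) -> None:
        pass  # nothing to load for a model-free drafter

    def get_model(self) -> nn.Module:
        # Returning a placeholder keeps callers such as
        # executor.apply_model() working even without real weights.
        return _DummyModel()
```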
@@ -139,6 +144,13 @@ def get_spec_proposals( return self._worker.get_spec_proposals( execute_model_req, seq_ids_with_bonus_token_in_last_step) + def get_model(self) -> nn.Module: + if self._is_dummy: + return _DummyModel() + + with self._patch_tensor_parallel_group(): + return self._worker.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 540d118d65ecb..0d66ede3d907a 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Type import torch +import torch.nn as nn from vllm.config import ParallelConfig, SpeculativeConfig, VllmConfig from vllm.distributed.communication_op import broadcast_tensor_dict @@ -403,6 +404,9 @@ def initialize_cache(self, num_gpu_blocks: int, self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) + def get_model(self) -> nn.Module: + return self.scorer_worker.get_model() + @torch.inference_mode() def execute_model( self, diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 93026029ad13e..f6cf35da0106b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -94,22 +94,12 @@ def collective_rpc(self, timeout: Optional[float] = None, args: Tuple = (), kwargs: Optional[Dict] = None) -> List[Any]: - """ - Execute an RPC call on workers. - - Args: - method: Name of the worker method to execute - timeout: Maximum time in seconds to wait for execution. Rases a - TimeoutError on timeout. None means wait indefinitely. - args: Positional arguments to pass to the worker method - kwargs: Keyword arguments to pass to the worker method - - Returns: - List of results from each worker - """ start_time = time.monotonic() kwargs = kwargs or {} + # NOTE: If the args are heterogeneous, then we pack them into a list, + # and unpack them in the method of every worker, because every worker + # knows their own rank. try: if isinstance(method, str): send_method = method diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 87a1cd7f9e627..2350074c23a59 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -689,6 +689,9 @@ def _gather_encoder_outputs( encoder_outputs.append(encoder_output[start_idx:end_idx]) return encoder_outputs + def get_model(self) -> nn.Module: + return self.model + @torch.inference_mode() def execute_model( self, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 4fb4197f1822f..0929e64d58f1e 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -5,6 +5,7 @@ import torch import torch.distributed +import torch.nn as nn import vllm.envs as envs from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig @@ -176,6 +177,9 @@ def compile_or_warm_up_model(self) -> None: # the model initialization and profiling. 
set_random_seed(self.model_config.seed) + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + @torch.inference_mode() def execute_model( self, diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 303d9a15e9c3c..abbf6450ab7f6 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -509,6 +509,9 @@ def load_model(self) -> None: ) self.model = self.lora_manager.create_lora_manager(self.model) + def get_model(self) -> nn.Module: + return self.model + def _prepare_model_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 260ffaf27f9a1..4c8f69e449393 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -21,6 +21,7 @@ import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc import torch +import torch.nn as nn from vllm_hpu_extension.ops import LoraMask as LoraMask from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler, HabanaMemoryProfiler, format_bytes) @@ -676,6 +677,9 @@ def load_model(self) -> None: msg = f"Loading model weights took in total {m.get_summary_string()}" logger.info(msg) + def get_model(self) -> nn.Module: + return self.model + def _use_graphs(self, batch_size, seq_len, is_prompt): if self.enforce_eager: return False diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ae8b7f97c827d..cb2ff0c934da3 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1176,6 +1176,9 @@ def load_model(self) -> None: fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, backend=backend) + def get_model(self) -> nn.Module: + return self.model + def save_sharded_state( self, path: str, diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index c7abad7e0258d..acfd6d0b03f62 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -7,6 +7,7 @@ Optional, Type, TypeVar) import torch +import torch.nn as nn from torch import is_tensor from vllm.config import VllmConfig @@ -264,6 +265,10 @@ def prepare_model_input( """ raise NotImplementedError + @abstractmethod + def get_model(self) -> nn.Module: + raise NotImplementedError + def execute_model( self, model_input: T, @@ -297,9 +302,9 @@ class ModelRunnerWrapperBase: def __init__( self, - moderl_runner: ModelRunnerBase, + model_runner: ModelRunnerBase, ) -> None: - self.model_runner: ModelRunnerBase = moderl_runner + self.model_runner: ModelRunnerBase = model_runner def __getattr__(self, attr): return getattr(self.model_runner, attr) diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index a35f5467e1a1f..596c26eac28bd 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -113,6 +113,9 @@ def load_model(self) -> None: raise NotImplementedError( "Supports only Transformer-NeuronX based models.") + def get_model(self) -> nn.Module: + return self.model + def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index a38b5a4e6e8d5..9d0a759ca2f21 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -84,6 +84,9 @@ def load_model(self) -> None: kv_cache_dtype=self.kv_cache_dtype, ov_core=self.ov_core) + def get_model(self) -> nn.Module: + return self.model + def 
_prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 50a155d22c666..f5b46cde3969c 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -4,6 +4,7 @@ import openvino as ov import torch import torch.distributed +import torch.nn as nn import vllm.envs as envs from vllm.attention import get_attn_backend @@ -362,6 +363,9 @@ def cache_copy( ) -> None: self.cache_engine.copy(blocks_to_copy) # type: ignore + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + @torch.inference_mode() def execute_model( self, diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 52c577bccab9c..f5c7bc955a673 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -158,6 +158,9 @@ def load_model(self) -> None: fullgraph=True, dynamic=False) + def get_model(self) -> nn.Module: + return self.model.model + def _dummy_run( self, batch_size: int, diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index fb9919f7a7b6a..1104eceef72a3 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -6,6 +6,7 @@ import cloudpickle import torch +import torch.nn as nn from vllm.config import ObservabilityConfig, VllmConfig from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group @@ -90,6 +91,11 @@ def start_worker_execution_loop(self) -> None: if output is None: return None + @abstractmethod + def get_model(self) -> nn.Module: + raise NotImplementedError + + @abstractmethod def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None @@ -147,6 +153,9 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + def get_model(self) -> nn.Module: + return self.worker.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None @@ -363,6 +372,9 @@ def prepare_input( else: return self._get_worker_input_from_broadcast() + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 82b8f22a5af33..25a2fea1e8eac 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -416,6 +416,9 @@ def load_model(self) -> None: logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30)) + def get_model(self) -> nn.Module: + return self.model + @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() From 5c89a29c22471a0ad5bb05dea9cb891ff97f9623 Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Mon, 20 Jan 2025 16:04:49 +0800 Subject: [PATCH 28/50] [misc] add placeholder format.sh (#12206) Signed-off-by: youkaichao <youkaichao@gmail.com> --- format.sh | 5 +++++ tools/shellcheck.sh | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100755 format.sh diff --git a/format.sh b/format.sh new file mode 100755 index 0000000000000..4bcd0be0c96e5 --- /dev/null +++ b/format.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +echo "vLLM linting system has been moved from format.sh to pre-commit hook." +echo "Please run 'pip install -r requirements-lint.txt' and 'pre-commit install' to install the pre-commit hook." +echo "Then linters will run automatically before each commit." 
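For contributors who hit the placeholder script, the workflow it points to is the stock pre-commit CLI (the commands below are standard pre-commit usage, not part of this patch):

```bash
pip install -r requirements-lint.txt
pre-commit install           # register the git hook once per clone
pre-commit run --all-files   # optional: lint the whole tree immediately
```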
diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh index d99fa77b96351..7efb3cabc64fe 100755 --- a/tools/shellcheck.sh +++ b/tools/shellcheck.sh @@ -19,4 +19,4 @@ if ! [ -x "$(command -v shellcheck)" ]; then fi # TODO - fix warnings in .buildkite/run-amd-test.sh -find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"' +find . -name "*.sh" ".git" -prune -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck -s bash "{}"' From 4001ea126692d9c4e6872936a791a1999c826156 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 20 Jan 2025 16:41:57 +0800 Subject: [PATCH 29/50] [CI/Build] Remove dummy CI steps (#12208) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .github/workflows/dummy.yml | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 .github/workflows/dummy.yml diff --git a/.github/workflows/dummy.yml b/.github/workflows/dummy.yml deleted file mode 100644 index ea507fab6b2de..0000000000000 --- a/.github/workflows/dummy.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: dummy-checks - -on: - pull_request: - -jobs: - mypy: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - run: echo "This is a dummy step that always passes" - ruff: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - run: echo "This is a dummy step that always passes" From 3127e975fb9417d10513e25b80820870f594c627 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 20 Jan 2025 17:36:24 +0800 Subject: [PATCH 30/50] [CI/Build] Make pre-commit faster (#12212) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .github/workflows/pre-commit.yml | 2 ++ .pre-commit-config.yaml | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 8c72a709cf330..bf9460151ec1b 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -15,3 +15,5 @@ jobs: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + with: + extra_args: --hook-stage manual diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8ea0f37885d9f..47eddb345edbd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,6 @@ +default_stages: + - pre-commit # Run locally + - manual # Run in CI repos: - repo: https://github.com/google/yapf rev: v0.32.0 @@ -33,30 +36,41 @@ repos: files: docs/.* - repo: local hooks: + - id: mypy-local + name: Run mypy for local Python installation + entry: tools/mypy.sh + language: python + types: [python] + additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests] + stages: [pre-commit] # Don't run in CI - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.9 entry: tools/mypy.sh 1 "3.9" language: python types: [python] - additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests] + additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.10 entry: tools/mypy.sh 1 "3.10" 
language: python types: [python] additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.11 entry: tools/mypy.sh 1 "3.11" language: python types: [python] additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.12 entry: tools/mypy.sh 1 "3.12" language: python types: [python] additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI - id: shellcheck name: Lint shell scripts entry: tools/shellcheck.sh From b37d82791e3c9f7d492db81493d920004de59a26 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 20 Jan 2025 17:58:48 +0800 Subject: [PATCH 31/50] [Model] Upgrade Aria to transformers 4.48 (#12203) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- examples/offline_inference/vision_language.py | 3 - .../vision_language/test_models.py | 7 +- .../multimodal/processing/test_common.py | 12 +- tests/models/registry.py | 67 ++++- tests/models/test_initialization.py | 14 +- tests/models/test_registry.py | 3 + vllm/model_executor/models/aria.py | 275 +++++++----------- vllm/transformers_utils/config.py | 9 +- vllm/transformers_utils/configs/__init__.py | 2 - vllm/transformers_utils/configs/aria.py | 165 ----------- 10 files changed, 178 insertions(+), 379 deletions(-) delete mode 100644 vllm/transformers_utils/configs/aria.py diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 69228bbf22949..f9048c7735ebf 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -26,11 +26,8 @@ def run_aria(question: str, modality: str): # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM(model=model_name, - tokenizer_mode="slow", - dtype="bfloat16", max_model_len=4096, max_num_seqs=2, - trust_remote_code=True, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>\n{question}" diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index ca572cc39e538..14d9a739be318 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -10,7 +10,6 @@ import pytest from transformers import AutoModelForVision2Seq from transformers import __version__ as TRANSFORMERS_VERSION -from transformers.utils import is_flash_attn_2_available from vllm.platforms import current_platform from vllm.utils import identity @@ -140,9 +139,7 @@ #### Extended model tests "aria": VLMTestInfo( models=["rhymes-ai/Aria"], - tokenizer_mode="slow", test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - dtype="bfloat16", prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n", max_model_len=4096, @@ -158,8 +155,8 @@ max_tokens=64, marks=[ pytest.mark.skipif( - not is_flash_attn_2_available(), - reason="Model needs flash-attn for numeric convergence.", + TRANSFORMERS_VERSION < "4.48.0", + reason="HF model requires transformers>=4.48.0", ), large_gpu_mark(min_gb=64), ], diff --git a/tests/models/multimodal/processing/test_common.py 
b/tests/models/multimodal/processing/test_common.py index 1e3e7ea50b122..d6d3d3b34ad46 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -11,6 +11,7 @@ from vllm.multimodal.utils import cached_get_tokenizer from ....multimodal.utils import random_audio, random_image, random_video +from ...registry import HF_EXAMPLE_MODELS def _test_processing_correctness( @@ -20,12 +21,9 @@ def _test_processing_correctness( num_batches: int, simplify_rate: float, ): - if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3": - hf_overrides = {"architectures": ["MantisForConditionalGeneration"]} - elif model_id == "deepseek-ai/deepseek-vl2-tiny": - hf_overrides = {"architectures": ["DeepseekVLV2ForCausalLM"]} - else: - hf_overrides = {} + model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") limit_mm_per_prompt = { modality: 3 if supports_multi else 1 @@ -41,7 +39,7 @@ def _test_processing_correctness( seed=0, dtype="float16", revision=None, - hf_overrides=hf_overrides, + hf_overrides=model_info.hf_overrides, limit_mm_per_prompt=limit_mm_per_prompt, ) diff --git a/tests/models/registry.py b/tests/models/registry.py index 9603ea8817cac..23227ea6b9714 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -1,5 +1,9 @@ from dataclasses import dataclass, field -from typing import AbstractSet, Mapping, Optional +from typing import AbstractSet, Any, Literal, Mapping, Optional + +import pytest +from packaging.version import Version +from transformers import __version__ as TRANSFORMERS_VERSION @dataclass(frozen=True) @@ -38,6 +42,50 @@ class _HfExamplesInfo: trust_remote_code: bool = False """The ``trust_remote_code`` level required to load the model.""" + hf_overrides: dict[str, Any] = field(default_factory=dict) + """The ``hf_overrides`` required to load the model.""" + + def check_transformers_version( + self, + *, + on_fail: Literal["error", "skip"], + ) -> None: + """ + If the installed transformers version does not meet the requirements, + perform the given action. + """ + if self.min_transformers_version is None: + return + + current_version = TRANSFORMERS_VERSION + required_version = self.min_transformers_version + if Version(current_version) < Version(required_version): + msg = ( + f"You have `transformers=={current_version}` installed, but " + f"`transformers>={required_version}` is required to run this " + "model") + + if on_fail == "error": + raise RuntimeError(msg) + else: + pytest.skip(msg) + + def check_available_online( + self, + *, + on_fail: Literal["error", "skip"], + ) -> None: + """ + If the model is not available online, perform the given action. 
+ """ + if not self.is_available_online: + msg = "Model is not available online" + + if on_fail == "error": + raise RuntimeError(msg) + else: + pytest.skip(msg) + # yapf: disable _TEXT_GENERATION_EXAMPLE_MODELS = { @@ -48,8 +96,6 @@ class _HfExamplesInfo: trust_remote_code=True), "ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct", trust_remote_code=True), - "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria", - trust_remote_code=True), "BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B", trust_remote_code=True), "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat", @@ -176,6 +222,8 @@ class _HfExamplesInfo: _MULTIMODAL_EXAMPLE_MODELS = { # [Decoder-only] + "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria", + min_transformers_version="4.48"), "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b", @@ -183,7 +231,8 @@ class _HfExamplesInfo: trust_remote_code=True), "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b", is_available_online=False), - "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny"), # noqa: E501 + "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501 + hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"), "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", @@ -194,7 +243,8 @@ class _HfExamplesInfo: "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 - "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3"), # noqa: E501 + "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3", # noqa: E501 + hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501 "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", trust_remote_code=True), "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", @@ -247,5 +297,12 @@ def get_supported_archs(self) -> AbstractSet[str]: def get_hf_info(self, model_arch: str) -> _HfExamplesInfo: return self.hf_models[model_arch] + def find_hf_info(self, model_id: str) -> _HfExamplesInfo: + for info in self.hf_models.values(): + if info.default == model_id: + return info + + raise ValueError(f"No example model defined for {model_id}") + HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index daece7c93c0ef..d3a3aaf670c23 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,9 +1,7 @@ from unittest.mock import patch import pytest -from packaging.version import Version from transformers import PretrainedConfig -from transformers import __version__ as TRANSFORMERS_VERSION from vllm import LLM @@ -13,16 +11,8 @@ @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) def test_can_initialize(model_arch): model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) - if not 
model_info.is_available_online: - pytest.skip("Model is not available online") - if model_info.min_transformers_version is not None: - current_version = TRANSFORMERS_VERSION - required_version = model_info.min_transformers_version - if Version(current_version) < Version(required_version): - pytest.skip( - f"You have `transformers=={current_version}` installed, but " - f"`transformers>={required_version}` is required to run this " - "model") + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") # Avoid OOM def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 73b70d65e8e0b..ac0366847e334 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -21,6 +21,9 @@ @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs()) def test_registry_imports(model_arch): + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + model_info.check_transformers_version(on_fail="skip") + # Ensure all model classes can be imported successfully model_cls, _ = ModelRegistry.resolve_model_cls(model_arch) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 5b97eced62df0..503d1a38d9ee9 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,9 +1,11 @@ -from typing import (Callable, Iterable, List, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import torch import torch.nn as nn -from transformers import BatchFeature, PretrainedConfig +from transformers import AriaConfig, AriaTextConfig, BatchFeature +from transformers.models.aria.modeling_aria import AriaCrossAttention +from transformers.models.aria.processing_aria import AriaProcessor from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, QuantizationConfig, VllmConfig @@ -26,10 +28,11 @@ BaseProcessingInfo, PromptReplacement) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, - AriaVisionConfig) -from .idefics2_vision_model import Idefics2VisionTransformer +# yapf: disable +from .idefics2_vision_model import ( + Idefics2VisionTransformer as Idefics3VisionTransformer) +# yapf: enable from .interfaces import SupportsMultiModal from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, @@ -47,87 +50,22 @@ class AriaImagePixelInputs(TypedDict): """ -class AriaVisionTransformer(Idefics2VisionTransformer): - """ - AriaVisionTransformer is a modified version of Idefics2VisionTransformer - that replaces the post-layernorm with an identity layer. 
- """ - - def __init__( - self, - config: AriaVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__(config, quant_config, prefix) - self.post_layernorm = nn.Identity() - - -class AriaVisionModel(nn.Module): - config_class = AriaVisionConfig +class AriaProjectorMLP(nn.Module): def __init__( self, - config: AriaVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - *, - prefix: str = "", + in_features: int, + hidden_features: int, + output_dim: int, ) -> None: super().__init__() - self.vision_model = AriaVisionTransformer( - config, - quant_config, - prefix=f"{prefix}.vision_model", - ) - - def forward( - self, - pixel_values: torch.Tensor, - pixel_mask: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - patch_attention_mask = self._create_patch_attention_mask(pixel_mask) - - vit_oup = self.vision_model( - pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask, - ) - - image_atts = self._create_image_attention_mask(patch_attention_mask) - - return vit_oup, image_atts - - def _create_patch_attention_mask( - self, pixel_mask: Optional[torch.Tensor]) -> torch.Tensor: - if pixel_mask is None: - return None - - patches_subgrid = pixel_mask.unfold( - dimension=1, - size=self.vision_model.config.patch_size, - step=self.vision_model.config.patch_size, - ).unfold( - dimension=2, - size=self.vision_model.config.patch_size, - step=self.vision_model.config.patch_size, - ) - return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() - - def _create_image_attention_mask( - self, patch_attention_mask: torch.Tensor) -> torch.Tensor: - if patch_attention_mask is None: - return None - - flattened_mask = patch_attention_mask.flatten(1) - return torch.logical_not(flattened_mask) - - -class FFN(nn.Module): - - def __init__(self, embed_dim: int, ff_dim: int, output_dim: int) -> None: - super().__init__() - self.linear_in = ColumnParallelLinear(embed_dim, ff_dim, bias=False) - self.linear_out = RowParallelLinear(ff_dim, output_dim, bias=False) + self.linear_in = ColumnParallelLinear(in_features, + hidden_features, + bias=False) + self.linear_out = RowParallelLinear(hidden_features, + output_dim, + bias=False) self.act = get_act_fn("gelu_new") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -137,46 +75,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -class CrossAttention(nn.Module): - - def __init__(self, kv_dim: int, embed_dim: int, num_heads: int) -> None: - super().__init__() - self.num_heads = num_heads - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) - self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False) - self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False) - - self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) - self.linear = nn.Linear(embed_dim, embed_dim) - - self.layer_norm = nn.LayerNorm(embed_dim) - self.ln_kv = nn.LayerNorm(kv_dim) - - def forward( - self, - x: torch.Tensor, - hidden_states: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - normed_hidden_states = self.layer_norm(hidden_states) - query = self.q_proj(normed_hidden_states).permute(1, 0, 2) - - x = self.ln_kv(x) - key = self.k_proj(x).permute(1, 0, 2) - value = self.v_proj(x).permute(1, 0, 2) - - attn_output, _ = self.multihead_attn(query, - key, - value, - attn_mask=attn_mask) - - attn_output = attn_output.permute(1, 0, 2) - - attn_output = self.linear(attn_output) - - return attn_output - - class 
AriaProjector(nn.Module): """ A projection module with one cross attention layer and one FFN layer, which @@ -198,42 +96,42 @@ class AriaProjector(nn.Module): A tensor with the shape of (batch_size, query_number, output_dim) """ - def __init__( - self, - patch_to_query_dict: dict[int, int], - embed_dim: int, - num_heads: int, - kv_dim: int, - ff_dim: int, - output_dim: int, - norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, - ) -> None: + def __init__(self, config: AriaConfig) -> None: super().__init__() - self.patch_to_query_dict = patch_to_query_dict - self.embed_dim = embed_dim - self.num_heads = num_heads + + self.patch_to_query_dict = config.projector_patch_to_query_dict + self.in_features = config.vision_config.hidden_size + self.num_heads = config.vision_config.num_attention_heads + self.kv_dim = config.vision_config.hidden_size + self.hidden_features = config.text_config.hidden_size + self.output_dim = config.text_config.hidden_size self.query = nn.Parameter( - torch.empty(max(patch_to_query_dict.values()), self.embed_dim)) + torch.empty(config.max_value_projector_patch_to_query_dict, + self.in_features)) - self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) + self.cross_attn = AriaCrossAttention(config) - self.ln_ffn = norm_layer(embed_dim) - self.ffn = FFN(embed_dim, ff_dim, output_dim) + self.layer_norm = nn.LayerNorm(self.in_features) + self.feed_forward = AriaProjectorMLP(self.in_features, + self.hidden_features, + self.output_dim) def forward( self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: - bs = x.shape[0] - queries = self.query.unsqueeze(0).repeat(bs, 1, 1) + batch_size, num_patches = x.shape[0], x.shape[1] - query_num = self.patch_to_query_dict.get(x.shape[1], None) - assert (query_num is not None - ), f"Query number for {x.shape[1]} patches is not provided" + if num_patches not in self.patch_to_query_dict: + raise KeyError(f"Number of patches {num_patches} not found in " + "patch_to_query_dict amongst possible values " + f"{self.patch_to_query_dict.keys()}.") - queries = queries[:, :query_num, :] + query_num = self.patch_to_query_dict[num_patches] + + queries = self.query[:query_num].unsqueeze(0).repeat(batch_size, 1, 1) if attn_mask is not None: attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) @@ -241,7 +139,7 @@ def forward( attention_out = self.cross_attn(x, queries, attn_mask=attn_mask) - out = self.ffn(self.ln_ffn(attention_out)) + out = self.feed_forward(self.layer_norm(attention_out)) return out @@ -278,7 +176,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, param.data.copy_(loaded_weight.transpose(1, 2)) -class MoELayer(nn.Module): +class AriaTextMoELayer(nn.Module): """ Mixture of Experts (MoE) Layer for the AriaMoE model. 
@@ -289,7 +187,7 @@ class MoELayer(nn.Module): def __init__( self, - config: AriaMoELMConfig, + config: AriaTextConfig, quant_config: Optional[QuantizationConfig], ) -> None: super().__init__() @@ -303,15 +201,16 @@ def __init__( num_experts=config.moe_num_experts, top_k=config.moe_topk, hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, + intermediate_size=config.intermediate_size, quant_config=quant_config, reduce_results=True, ) self.shared_experts = LlamaMLP( config.hidden_size, - config.moe_intermediate_size * config.moe_num_shared_experts, + config.intermediate_size * config.moe_num_shared_experts, "silu", quant_config=quant_config, + bias=config.mlp_bias, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -329,13 +228,13 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_output = torch.nn.functional.linear(hidden_states, self.router_weight) - shared_expert_output = self.shared_experts(hidden_states) sparse_expert_output = self.experts(hidden_states, router_output) + shared_expert_output = self.shared_experts(hidden_states) return sparse_expert_output + shared_expert_output -class MoEDecoderLayer(LlamaDecoderLayer): +class AriaTextDecoderLayer(LlamaDecoderLayer): """ Custom Decoder Layer for the AriaMoE model which modifies the standard `LlamaDecoderLayer` by replacing the traditional MLP with a Mixture of @@ -344,16 +243,16 @@ class MoEDecoderLayer(LlamaDecoderLayer): def __init__( self, - config: AriaMoELMConfig, + config: AriaTextConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: super().__init__(config, cache_config, quant_config, prefix) - self.mlp = MoELayer(config, quant_config=quant_config) + self.mlp = AriaTextMoELayer(config, quant_config=quant_config) -class AriaMoELMModel(LlamaModel): +class AriaTextModel(LlamaModel): """ Custom LlamaModel for the AriaMoE model which modifies the standard LlamaModel by replacing the `LlamaDecoderLayer` with `MoEDecoderLayer`. 
@@ -362,7 +261,7 @@ class AriaMoELMModel(LlamaModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix, - layer_type=MoEDecoderLayer) + layer_type=AriaTextDecoderLayer) # Adapted from LlamaModel.load_weights with the modification of adding # the expert weights mapping to `stacked_params_mapping` @@ -434,25 +333,23 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -def build_mm_projector(config: PretrainedConfig): - return AriaProjector( - patch_to_query_dict=config.projector_patch_to_query_dict, - embed_dim=config.vision_config.hidden_size, - num_heads=config.vision_config.num_attention_heads, - kv_dim=config.vision_config.hidden_size, - ff_dim=config.text_config.hidden_size, - output_dim=config.text_config.hidden_size, - ) - - class AriaProcessingInfo(BaseProcessingInfo): def get_hf_config(self): - return self.ctx.get_hf_config() + return self.ctx.get_hf_config(AriaConfig) - def get_vision_config(self) -> AriaVisionConfig: + def get_vision_config(self): return self.get_hf_config().vision_config + def get_hf_processor(self): + processor = self.ctx.get_hf_processor(AriaProcessor) + + # Patch for https://github.com/huggingface/transformers/issues/35768 + processor.tokenizer.image_token = "<|img|>" + processor.image_token = "<|img|>" + + return processor + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -554,10 +451,14 @@ def __init__( quant_config = vllm_config.quant_config self.config = config - self.vision_tower = AriaVisionModel(config.vision_config) - self.multi_modal_projector = build_mm_projector(config) + self.vision_tower = Idefics3VisionTransformer( + config.vision_config, + quant_config, + prefix=f"{prefix}.vision_tower", + ) + self.multi_modal_projector = AriaProjector(config) self.vocab_size = config.text_config.vocab_size - self.language_model = AriaMoELMModel( + self.language_model = AriaTextModel( vllm_config=vllm_config.with_hf_config(config.text_config), prefix=maybe_prefix(prefix, "language_model.model"), ) @@ -608,6 +509,22 @@ def _parse_and_validate_image_input( pixel_mask=pixel_mask, ) + def _create_patch_attention_mask( + self, pixel_mask: Optional[torch.Tensor]) -> torch.Tensor: + if pixel_mask is None: + return None + + patches_subgrid = pixel_mask.unfold( + dimension=1, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ).unfold( + dimension=2, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + def _process_image_input( self, image_input: AriaImagePixelInputs ) -> Tuple[torch.Tensor, torch.Tensor]: @@ -616,9 +533,18 @@ def _process_image_input( pixel_values = image_input['pixel_values'] pixel_mask = image_input['pixel_mask'] - image_feature, image_attn_mask = self.vision_tower( - pixel_values, pixel_mask=pixel_mask) - return self.multi_modal_projector(image_feature, image_attn_mask) + patch_attention_mask = self._create_patch_attention_mask(pixel_mask) + + image_outputs = self.vision_tower( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + image_attn_mask = None + if patch_attention_mask is not None: + flattened_mask = patch_attention_mask.flatten(1) + image_attn_mask = torch.logical_not(flattened_mask) + + return self.multi_modal_projector(image_outputs, image_attn_mask) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = 
self._parse_and_validate_image_input(**kwargs) @@ -683,6 +609,5 @@ def sample( return next_tokens def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - loader = AutoWeightsLoader(self) loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index f57dfded0a62f..c97acffa1a719 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -22,10 +22,10 @@ from vllm.logger import init_logger # yapf conflicts with isort for this block # yapf: disable -from vllm.transformers_utils.configs import (AriaConfig, ChatGLMConfig, - Cohere2Config, DbrxConfig, - DeepseekVLV2Config, EAGLEConfig, - ExaoneConfig, H2OVLChatConfig, +from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, + DbrxConfig, DeepseekVLV2Config, + EAGLEConfig, ExaoneConfig, + H2OVLChatConfig, InternVLChatConfig, JAISConfig, MedusaConfig, MllamaConfig, MLPSpeculatorConfig, MPTConfig, @@ -52,7 +52,6 @@ } _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { - "aria": AriaConfig, "chatglm": ChatGLMConfig, "cohere2": Cohere2Config, "dbrx": DbrxConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 807ef4fbfd0c0..f065c56124605 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,4 +1,3 @@ -from vllm.transformers_utils.configs.aria import AriaConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.cohere2 import Cohere2Config from vllm.transformers_utils.configs.dbrx import DbrxConfig @@ -24,7 +23,6 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ - "AriaConfig", "ChatGLMConfig", "Cohere2Config", "DbrxConfig", diff --git a/vllm/transformers_utils/configs/aria.py b/vllm/transformers_utils/configs/aria.py deleted file mode 100644 index f4b531225b5d0..0000000000000 --- a/vllm/transformers_utils/configs/aria.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright 2024 Rhymes AI. All rights reserved. -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -from typing import Mapping - -from transformers import PretrainedConfig -from transformers.models.idefics2.configuration_idefics2 import ( - Idefics2VisionConfig) -from transformers.models.llama.configuration_llama import LlamaConfig - -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -class AriaVisionConfig(Idefics2VisionConfig): - model_type = "aria_vision_model" - - -class AriaMoELMConfig(LlamaConfig): - """ - Configuration class for AriaMoE language model. - - This class extends the LlamaConfig to include additional parameters specific - to the Mixture of Experts (MoE) architecture. 
- """ - - model_type = "aria_moe_lm" - - def __init__( - self, - moe_intermediate_size: int = 4096, - moe_num_experts: int = 8, - moe_topk: int = 2, - moe_num_shared_experts: int = 2, - **kwargs, - ): - """ - Initialize the AriaMoELMConfig. - - Args: - moe_intermediate_size (int): The intermediate size for MoE layers. - Default is 4096. - moe_num_experts (int): The number of experts in the MoE layer. - Default is 8. - moe_topk (int): The number of top experts to route to for each - token. Default is 2. - moe_num_shared_experts (int): The number of shared experts. Default - is 2. - **kwargs: Additional keyword arguments to be passed to the parent - LlamaConfig. - """ - super().__init__(**kwargs) - self.moe_intermediate_size = moe_intermediate_size - self.moe_num_experts = moe_num_experts - self.moe_topk = moe_topk - self.moe_num_shared_experts = moe_num_shared_experts - - -class AriaConfig(PretrainedConfig): - """ - Configuration class for Aria model. - This class handles the configuration for both vision and text components of - the Aria model, - as well as additional parameters for image token handling and projector - mapping. - - Args: - vision_config (AriaVisionConfig or dict): Configuration for the vision - component. - text_config (AriaMoELMConfig or dict): Configuration for the text - component. - projector_patch_to_query_dict (dict): Mapping of patch sizes to query - dimensions. - ignore_index (int): Index to ignore in loss calculation. - image_token_index (int): Index used to represent image tokens. - **kwargs: Additional keyword arguments passed to the parent class. - Attributes: - model_type (str): Type of the model, set to "aria". - is_composition (bool): Whether the model is a composition of multiple - components. - ignore_index (int): Index to ignore in loss calculation. - image_token_index (int): Index used to represent image tokens. - projector_patch_to_query_dict (dict): Mapping of patch sizes to query - dimensions. - vision_config (AriaVisionConfig): Configuration for the vision - component. - text_config (AriaMoELMConfig): Configuration for the text component. 
- """ - - model_type = "aria" - is_composition = False - - def __init__( - self, - vision_config: AriaVisionConfig = AriaVisionConfig(), # noqa: B008 - text_config: AriaMoELMConfig = AriaMoELMConfig(), # noqa: B008 - projector_patch_to_query_dict: Mapping[int, int] = { - 1225: 128, - 4900: 256, - }, - ignore_index=-100, - image_token_index=32000, - tie_word_embeddings=False, - **kwargs, - ): - super().__init__(**kwargs) - self.ignore_index = ignore_index - self.image_token_index = image_token_index - self.tie_word_embeddings = tie_word_embeddings - attn_implementation = kwargs.pop("attn_implementation", None) - - # Set the default attention implementation to flash_attention_2 if not - # specified - self._attn_implementation = ("flash_attention_2" - if attn_implementation is None else - attn_implementation) - - # Convert the keys and values of projector_patch_to_query_dict to - # integers - # This ensures consistency even if they were provided as strings - self.projector_patch_to_query_dict = { - int(k): int(v) - for k, v in projector_patch_to_query_dict.items() - } - - if isinstance(vision_config, dict) and "model_type" in vision_config: - vision_config = AriaVisionConfig(**vision_config) - if attn_implementation is None: - vision_attn_implementation = "flash_attention_2" - elif attn_implementation == "sdpa": - logger.warning("SDPA is not supported for vit, using " - "flash_attention_2 instead") - vision_attn_implementation = "flash_attention_2" - else: - vision_attn_implementation = attn_implementation - vision_config._attn_implementation = vision_attn_implementation - - self.vision_config = vision_config - - if isinstance(text_config, dict) and "model_type" in text_config: - text_attn_implementation = ("sdpa" if attn_implementation is None - else attn_implementation) - text_config = AriaMoELMConfig(**text_config) - text_config._attn_implementation = text_attn_implementation - - self.text_config = text_config - - # This is needed for the static kv cache - self.num_hidden_layers = self.text_config.num_hidden_layers From 170eb350793a04ceb18ae86be4ccf97d02ad199f Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Mon, 20 Jan 2025 18:06:24 +0800 Subject: [PATCH 32/50] [misc] print a message to suggest how to bypass commit hooks (#12217) Signed-off-by: youkaichao <youkaichao@gmail.com> --- .pre-commit-config.yaml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 47eddb345edbd..8d1fc257388a8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,6 +34,10 @@ repos: hooks: - id: pymarkdown files: docs/.* +- repo: https://github.com/rhysd/actionlint + rev: v1.7.6 + hooks: + - id: actionlint - repo: local hooks: - id: mypy-local @@ -81,7 +85,8 @@ repos: entry: tools/png-lint.sh language: script types: [png] -- repo: https://github.com/rhysd/actionlint - rev: v1.7.6 - hooks: - - id: actionlint + - id: suggestion + name: Suggestion + entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."' + language: system + verbose: true From c222f47992ce0bbcd3ccbce24736e045d8689be8 Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Mon, 20 Jan 2025 19:35:59 +0800 Subject: [PATCH 33/50] [core][bugfix] configure env var during import vllm (#12209) Signed-off-by: youkaichao <youkaichao@gmail.com> --- examples/offline_inference/rlhf.py | 7 +---- vllm/__init__.py | 49 ++++++++---------------------- vllm/plugins/__init__.py | 23 ++++++++++++++ 
vllm/worker/worker_base.py | 3 -- 4 files changed, 37 insertions(+), 45 deletions(-) diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index 3bc303dad277f..5c4918008dcb3 100644 --- a/examples/offline_inference/rlhf.py +++ b/examples/offline_inference/rlhf.py @@ -19,7 +19,7 @@ from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from transformers import AutoModelForCausalLM -from vllm import LLM, SamplingParams, configure_as_vllm_process +from vllm import LLM, SamplingParams from vllm.utils import get_ip, get_open_port from vllm.worker.worker import Worker @@ -98,12 +98,7 @@ def __init__(self, *args, **kwargs): """ Start the training process, here we use huggingface transformers as an example to hold a model on GPU 0. - -It is important for all the processes outside of vLLM to call -`configure_as_vllm_process` to set some common environment variables -the same as vLLM workers. """ -configure_as_vllm_process() train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") train_model.to("cuda:0") diff --git a/vllm/__init__.py b/vllm/__init__.py index a533dba561c00..2aabe820d9a84 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,4 +1,7 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" +import os + +import torch from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -17,43 +20,18 @@ from .version import __version__, __version_tuple__ +# set some common config/environment variables that should be set +# for all processes created by vllm and all processes +# that interact with vllm workers. +# they are executed whenever `import vllm` is called. -def configure_as_vllm_process(): - """ - set some common config/environment variables that should be set - for all processes created by vllm and all processes - that interact with vllm workers. 
- """ - import os - - import torch - - # see https://github.com/NVIDIA/nccl/issues/1234 - os.environ['NCCL_CUMEM_ENABLE'] = '0' - - # see https://github.com/vllm-project/vllm/issues/10480 - os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' - # see https://github.com/vllm-project/vllm/issues/10619 - torch._inductor.config.compile_threads = 1 - - from vllm.platforms import current_platform - - if current_platform.is_xpu(): - # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 - torch._dynamo.config.disable = True - elif current_platform.is_hpu(): - # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) - # does not support torch.compile - # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for - # torch.compile support - is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1' - if is_lazy: - torch._dynamo.config.disable = True - # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only) - # requires enabling lazy collectives - # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 - os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' +# see https://github.com/NVIDIA/nccl/issues/1234 +os.environ['NCCL_CUMEM_ENABLE'] = '0' +# see https://github.com/vllm-project/vllm/issues/10480 +os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' +# see https://github.com/vllm-project/vllm/issues/10619 +torch._inductor.config.compile_threads = 1 __all__ = [ "__version__", @@ -80,5 +58,4 @@ def configure_as_vllm_process(): "AsyncEngineArgs", "initialize_ray_cluster", "PoolingParams", - "configure_as_vllm_process", ] diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index ff54174f634af..a78a054917756 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,6 +1,9 @@ import logging +import os from typing import Callable, Dict +import torch + import vllm.envs as envs logger = logging.getLogger(__name__) @@ -51,6 +54,26 @@ def load_general_plugins(): if plugins_loaded: return plugins_loaded = True + + # some platform-specific configurations + from vllm.platforms import current_platform + + if current_platform.is_xpu(): + # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 + torch._dynamo.config.disable = True + elif current_platform.is_hpu(): + # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) + # does not support torch.compile + # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for + # torch.compile support + is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1' + if is_lazy: + torch._dynamo.config.disable = True + # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only) + # requires enabling lazy collectives + # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 + os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' + plugins = load_plugins_by_group(group='vllm.general_plugins') # general plugins, we only need to execute the loaded functions for func in plugins.values(): diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 1104eceef72a3..c6e6693c54f57 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -535,9 +535,6 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: kwargs = all_kwargs[self.rpc_rank] enable_trace_function_call_for_thread(self.vllm_config) - from vllm import configure_as_vllm_process - configure_as_vllm_process() - from vllm.plugins import load_general_plugins load_general_plugins() 
From 5f0ec3935a0118fee8cf2764728f765c8cc53d2a Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Mon, 20 Jan 2025 21:54:16 +0800 Subject: [PATCH 34/50] [V1] Remove `_get_cache_block_size` (#12214) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- vllm/v1/worker/gpu_worker.py | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 0929e64d58f1e..bd40112aea5e8 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -8,14 +8,13 @@ import torch.nn as nn import vllm.envs as envs -from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig +from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.platforms import current_platform -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, get_dtype_size from vllm.v1.core.scheduler import SchedulerOutput from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput @@ -235,24 +234,3 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): f"of at least 8.0. Your {gpu_name} GPU {compute_str}. " "You can use float16 instead by explicitly setting the" "`dtype` flag in CLI, for example: --dtype=half.") - - -def _get_cache_block_size( - cache_config: CacheConfig, - model_config: ModelConfig, - parallel_config: ParallelConfig, -) -> int: - head_size = model_config.get_head_size() - num_heads = model_config.get_num_kv_heads(parallel_config) - num_attention_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention) - - key_cache_block = cache_config.block_size * num_heads * head_size - value_cache_block = key_cache_block - total = num_attention_layers * (key_cache_block + value_cache_block) - if cache_config.cache_dtype == "auto": - dtype = model_config.dtype - else: - dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - dtype_size = get_dtype_size(dtype) - return dtype_size * total From 86bfb6dba7c6e0650e7d7498cbd46b49155b2a42 Mon Sep 17 00:00:00 2001 From: wangxiyuan <wangxiyuan1007@gmail.com> Date: Mon, 20 Jan 2025 23:25:28 +0800 Subject: [PATCH 35/50] [Misc] Pass `attention` to impl backend (#12218) Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> --- vllm/attention/backends/abstract.py | 23 +++++++++++++++++---- vllm/attention/backends/blocksparse_attn.py | 12 +++++------ vllm/attention/backends/flash_attn.py | 10 ++++----- vllm/attention/backends/flashinfer.py | 16 +++++++------- vllm/attention/backends/hpu_attn.py | 4 ++-- vllm/attention/backends/ipex_attn.py | 18 ++++++++-------- vllm/attention/backends/pallas.py | 6 +++--- vllm/attention/backends/rocm_flash_attn.py | 20 +++++++++--------- vllm/attention/backends/torch_sdpa.py | 18 +++++++--------- vllm/attention/backends/xformers.py | 20 ++++++++---------- vllm/attention/layer.py | 8 +++---- vllm/v1/attention/backends/flash_attn.py | 9 ++++---- 12 files changed, 86 insertions(+), 78 deletions(-) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 737559bfe70ca..e6ddca69bf01b 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -1,8 +1,8 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import 
dataclass, fields -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Set, - Tuple, Type, TypeVar) +from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, + Protocol, Set, Tuple, Type, TypeVar) import torch @@ -223,6 +223,22 @@ def build(self, seq_lens: List[int], query_lens: List[int], raise NotImplementedError +class AttentionLayer(Protocol): + + _k_scale: float + _v_scale: float + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + ... + + class AttentionImpl(ABC, Generic[T]): @abstractmethod @@ -244,13 +260,12 @@ def __init__( @abstractmethod def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: T, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 77cfa8490172b..9089db1126c94 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -4,6 +4,7 @@ import torch from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import (CommonAttentionState, CommonMetadataBuilder) @@ -358,13 +359,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: BlocksparseFlashAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. @@ -401,8 +401,8 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) if prefill_meta := attn_metadata.prefill_metadata: @@ -439,8 +439,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, tp_rank=self.tp_rank, blocksparse_local_blocks=self.local_blocks, blocksparse_vert_stride=self.vert_stride, diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 48b3e8d177ec9..40250ef08b595 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -8,6 +8,7 @@ from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionMetadataBuilder, AttentionType) @@ -634,13 +635,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: FlashAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -657,7 +657,7 @@ def forward( NOTE: It in-place updates the output tensor. """ # NOTE(woosuk): FlashAttention does not support FP8 KV cache. - assert k_scale == 1.0 and v_scale == 1.0, ( + assert layer._k_scale == 1.0 and layer._v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") assert output is not None, "Output tensor must be provided." 
@@ -709,8 +709,8 @@ def forward( kv_cache[1], updated_slot_mapping.flatten(), # type: ignore[union-attr] kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) (num_prefill_query_tokens, num_prefill_kv_tokens, diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 6ca75fabdfc38..b9cd805e81b45 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -23,6 +23,7 @@ import vllm.envs as envs from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionMetadataBuilder, AttentionState, AttentionType) @@ -792,13 +793,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: FlashInferMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -826,8 +826,8 @@ def forward( kv_cache[:, 1], attn_metadata.slot_mapping.flatten(), kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # The FlashInfer api requires data to be in fp8_e4m3 or fp8_e5m2 # to process the cache when the kv_cache_dtype is fp8 @@ -886,8 +886,8 @@ def forward( kv_cache, logits_soft_cap=logits_soft_cap, causal=True, - k_scale=k_scale, - v_scale=v_scale, + k_scale=layer._k_scale, + v_scale=layer._v_scale, window_left=window_left) if decode_meta := attn_metadata.decode_metadata: assert decode_meta is not None @@ -897,8 +897,8 @@ def forward( kv_cache, sm_scale=softmax_scale, logits_soft_cap=logits_soft_cap, - k_scale=k_scale, - v_scale=v_scale, + k_scale=layer._k_scale, + v_scale=layer._v_scale, window_left=window_left) if prefill_output is None and decode_output is not None: diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 94a461e0c8c29..80c132c0a8c05 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -11,6 +11,7 @@ from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention, @@ -152,13 +153,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: HPUAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. 
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index da1d307daa517..cd729a1c8b274 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -7,6 +7,7 @@ from vllm._ipex_ops import ipex_ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.paged_attn import (PagedAttention, @@ -171,13 +172,12 @@ def split_kv_cache( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: IpexAttnMetadata, # type: ignore - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with IPEX varlen_attention and PagedAttention. @@ -193,7 +193,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - assert k_scale == 1.0 and v_scale == 1.0 + assert layer._k_scale == 1.0 and layer._v_scale == 1.0 num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) @@ -210,8 +210,8 @@ def forward( value_cache, attn_metadata.slot_mapping.flatten(), self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) if attn_metadata.is_prompt: @@ -296,8 +296,8 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) else: # Run PagedAttention V2. @@ -329,8 +329,8 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Reshape the output tensor. diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 2ac492dd8ae54..f5bf390df6afb 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -5,6 +5,7 @@ import torch_xla.experimental.custom_kernel # Required to register custom ops. from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import CommonAttentionState @@ -150,13 +151,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: Tuple[torch.Tensor, torch.Tensor], attn_metadata: PallasMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with Pallas attention. 
@@ -173,7 +173,7 @@ def forward( Returns: shape = [batch_size, seq_len, num_heads * head_size] """ - assert k_scale == 1.0 and v_scale == 1.0 + assert layer._k_scale == 1.0 and layer._v_scale == 1.0 batch_size, seq_len, hidden_size = query.shape query = query.view(batch_size, seq_len, self.num_heads, self.head_size) key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index a91a5af5c3d58..e9f2808ff1674 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -7,6 +7,7 @@ import vllm.envs as envs from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import (CommonAttentionState, CommonMetadataBuilder) @@ -414,13 +415,12 @@ def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor: def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: ROCmFlashAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. @@ -458,8 +458,8 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) num_prefill_tokens = attn_metadata.num_prefill_tokens @@ -567,8 +567,8 @@ def forward( prefill_meta.max_query_len, self.alibi_slopes, self.sliding_window[0], - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) if decode_meta := attn_metadata.decode_metadata: @@ -613,8 +613,8 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) else: output[num_prefill_tokens:] = PagedAttention.forward_decode( @@ -628,8 +628,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Reshape the output tensor. diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index ca1c4618615de..7cd2049f0c0a5 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -7,6 +7,7 @@ from torch.nn.functional import scaled_dot_product_attention from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionMetadataBuilder, AttentionType) @@ -429,13 +430,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: TorchSDPAMetadata, # type: ignore - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with torch SDPA and PagedAttention. 
@@ -451,7 +451,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - assert k_scale == 1.0 and v_scale == 1.0 + assert layer._k_scale == 1.0 and layer._v_scale == 1.0 attn_type = self.attn_type if (attn_type == AttentionType.ENCODER and (not attn_metadata.is_all_encoder_attn_metadata_set)): @@ -493,11 +493,9 @@ def forward( # Update self-attention KV cache (prefill/decode) updated_slot_mapping = attn_metadata.slot_mapping - PagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, - updated_slot_mapping, - self.kv_cache_dtype, - k_scale, v_scale) + PagedAttention.write_to_paged_cache( + key, value, key_cache, value_cache, updated_slot_mapping, + self.kv_cache_dtype, layer._k_scale, layer._v_scale) if attn_type != AttentionType.ENCODER: # Decoder self-attention supports chunked prefill. @@ -571,8 +569,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Reshape the output tensor. diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 8c8ca8520a9db..38e27434dab2c 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -10,6 +10,7 @@ LowerTriangularMaskWithTensorBias) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import ( CommonAttentionState, CommonMetadataBuilder, @@ -412,13 +413,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: Optional[torch.Tensor], value: Optional[torch.Tensor], kv_cache: torch.Tensor, attn_metadata: "XFormersMetadata", - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -524,11 +524,9 @@ def forward( # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory # profiling run. - PagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, - updated_slot_mapping, - self.kv_cache_dtype, - k_scale, v_scale) + PagedAttention.write_to_paged_cache( + key, value, key_cache, value_cache, updated_slot_mapping, + self.kv_cache_dtype, layer._k_scale, layer._v_scale) (num_prefill_query_tokens, num_prefill_kv_tokens, num_decode_query_tokens) = \ get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) @@ -580,8 +578,8 @@ def forward( prefill_meta.max_query_len, self.alibi_slopes, self.sliding_window, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) assert output[:num_prefill_query_tokens].shape == out.shape output[:num_prefill_query_tokens] = out @@ -607,8 +605,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Reshape the output tensor. 
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index e2403306950a3..c36f8d08eb4a7 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -243,8 +243,7 @@ def unified_attention( attn_metadata = forward_context.attn_metadata self = forward_context.attn_layers[layer_name] kv_cache = self.kv_cache[forward_context.virtual_engine] - return self.impl.forward(query, key, value, kv_cache, attn_metadata, - self._k_scale, self._v_scale) + return self.impl.forward(self, query, key, value, kv_cache, attn_metadata) def unified_attention_fake( @@ -276,13 +275,12 @@ def unified_attention_with_output( attn_metadata = forward_context.attn_metadata self = forward_context.attn_layers[layer_name] kv_cache = self.kv_cache[forward_context.virtual_engine] - self.impl.forward(query, + self.impl.forward(self, + query, key, value, kv_cache, attn_metadata, - self._k_scale, - self._v_scale, output=output) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 7b0786261a6a6..fd36ea8d8806b 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -130,13 +130,12 @@ def __init__( def forward( self, + layer: torch.nn.Module, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: FlashAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -151,7 +150,7 @@ def forward( shape = [num_tokens, num_heads * head_size] """ # NOTE(woosuk): FlashAttention does not support FP8 KV cache. - assert k_scale == 1.0 and v_scale == 1.0, ( + assert layer._k_scale == 1.0 and layer._v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") assert output is not None, "Output tensor must be provided." @@ -183,8 +182,8 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Compute attention and update output up to `num_actual_tokens`. 
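The signature change in PATCH 35 is uniform across the backends: instead of threading `k_scale`/`v_scale` floats through every `forward`, each impl receives the owning layer and reads the scales off it, and the `AttentionLayer` Protocol in abstract.py keeps impls structurally typed rather than importing the concrete `Attention` class. A runnable toy sketch of the shape follows; the Toy classes are invented for illustration, and only the protocol attributes `_k_scale`/`_v_scale` and the pass-`self`-through call come from the diff:

    from typing import List, Protocol


    class AttentionLayer(Protocol):
        # Structural type: any object carrying these attributes
        # satisfies the protocol, no inheritance required.
        _k_scale: float
        _v_scale: float


    class ToyImpl:

        # The impl reads quantization scales from the layer instead of
        # taking k_scale/v_scale as extra positional floats per call.
        def forward(self, layer: AttentionLayer,
                    query: List[float]) -> List[float]:
            return [q * layer._k_scale * layer._v_scale for q in query]


    class ToyAttention:

        def __init__(self, k_scale: float = 1.0,
                     v_scale: float = 1.0) -> None:
            self._k_scale = k_scale
            self._v_scale = v_scale
            self.impl = ToyImpl()

        def forward(self, query: List[float]) -> List[float]:
            # Mirrors unified_attention: the layer passes itself through.
            return self.impl.forward(self, query)


    assert ToyAttention(k_scale=0.5).forward([2.0, 4.0]) == [1.0, 2.0]

One practical payoff visible in the flashinfer and torch_sdpa hunks: any future per-layer state reaches the backend through the layer object, without another round of signature churn across every attention implementation.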
From 18572e3384a6f55a7589dd81e1f3f70f7dd73e3a Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 20 Jan 2025 23:35:36 +0800 Subject: [PATCH 36/50] [Bugfix] Fix `HfExampleModels.find_hf_info` (#12223) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- tests/models/registry.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/models/registry.py b/tests/models/registry.py index 23227ea6b9714..e99dbd16c47b9 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -302,6 +302,11 @@ def find_hf_info(self, model_id: str) -> _HfExamplesInfo: if info.default == model_id: return info + # Fallback to extras + for info in self.hf_models.values(): + if any(extra == model_id for extra in info.extras.values()): + return info + raise ValueError(f"No example model defined for {model_id}") From 96663699b2f78eecd44d1d1de9d93c7e054aabc2 Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Mon, 20 Jan 2025 23:49:18 +0800 Subject: [PATCH 37/50] [CI] Pass local python version explicitly to pre-commit mypy.sh (#12224) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- .pre-commit-config.yaml | 2 +- tools/mypy.sh | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8d1fc257388a8..432bf5ed18dbc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -42,7 +42,7 @@ repos: hooks: - id: mypy-local name: Run mypy for local Python installation - entry: tools/mypy.sh + entry: tools/mypy.sh 0 "local" language: python types: [python] additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests] diff --git a/tools/mypy.sh b/tools/mypy.sh index bf95e4c526fd1..77d342da1ec82 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -1,12 +1,16 @@ #!/bin/bash CI=${1:-0} -PYTHON_VERSION=${2:-3.9} +PYTHON_VERSION=${2:-local} if [ "$CI" -eq 1 ]; then set -e fi +if [ $PYTHON_VERSION == "local" ]; then + PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') +fi + run_mypy() { echo "Running mypy on $1" if [ "$CI" -eq 1 ] && [ -z "$1" ]; then From 7bd36300679a0876c16a905e2baea41dd59a60a2 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Mon, 20 Jan 2025 14:19:09 -0800 Subject: [PATCH 38/50] [Misc] Update CODEOWNERS (#12229) --- .github/CODEOWNERS | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 3cb91fc0f8232..37733ebacbc7a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,32 +2,33 @@ # for more info about CODEOWNERS file # This lists cover the "core" components of vLLM that require careful review -/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill 
+/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/multimodal @DarkLight1337 @ywang96
 CMakeLists.txt @tlrmchlsmth

 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic
+/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat

 # Test ownership
-/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
+/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/kernels @tlrmchlsmth @WoosukKwon
-/tests/quantization @mgoin @robertgshaw2-neuralmagic
+/tests/quantization @mgoin @robertgshaw2-redhat
 /.buildkite/lm-eval-harness @mgoin @simon-mo
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/multi_step @alexm-neuralmagic @comaniac
+/tests/multi_step @alexm-redhat @comaniac
 /tests/weight_loading @mgoin @youkaichao
 /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac

From af69a6aded526343db8d4a199cdfd1bb84134201 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?I=C5=9F=C4=B1k?= <41375111+isikhi@users.noreply.github.com>
Date: Mon, 20 Jan 2025 22:23:28 +0000
Subject: [PATCH 39/50] fix: update platform detection for M-series arm based
 MacBook processors (#12227)

Signed-off-by: isikhi <huseyin.isik000@gmail.com>
---
 vllm/platforms/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 6ca95b41dbb07..6033a806d2023 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -101,6 +101,10 @@ def cpu_platform_plugin() -> Optional[str]:
     try:
         from importlib.metadata import version
         is_cpu = "cpu" in version("vllm")
+        if not is_cpu:
+            import platform
+            is_cpu = platform.machine().lower().startswith("arm")
+
     except Exception:
         pass

From da7512215f0b5c589c2747303b171357940c0614 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 21 Jan 2025 08:31:01 +0800
Subject: [PATCH 40/50] [misc] add cuda runtime version to usage data (#12190)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 vllm/usage/usage_lib.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 841df3994fba2..7f5cc906382af 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -130,6 +130,7 @@ def __init__(self) -> None:
         self.total_memory: Optional[int] = None
         self.architecture: Optional[str] = None
         self.platform: Optional[str] = None
+        self.cuda_runtime: Optional[str] = None
         self.gpu_count: Optional[int] = None
         self.gpu_type: Optional[str] = None
         self.gpu_memory_per_device: Optional[int] = None
@@ -169,6
+170,8 @@ def _report_usage_once(self, model_architecture: str, self.gpu_count = torch.cuda.device_count() self.gpu_type = device_property.name self.gpu_memory_per_device = device_property.total_memory + if current_platform.is_cuda(): + self.cuda_runtime = torch.version.cuda self.provider = _detect_cloud_provider() self.architecture = platform.machine() self.platform = platform.platform() From 06a760d6e8bcd60dc98775678b5b12eef01d82bb Mon Sep 17 00:00:00 2001 From: Cheng Kuan Yong Jason <jasoncky96@gmail.com> Date: Tue, 21 Jan 2025 08:42:02 +0800 Subject: [PATCH 41/50] [bugfix] catch xgrammar unsupported array constraints (#12210) Signed-off-by: Jason Cheng <jasoncky96@gmail.com> --- vllm/model_executor/guided_decoding/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py index 20abaefbacc51..90dfa62ec4670 100644 --- a/vllm/model_executor/guided_decoding/utils.py +++ b/vllm/model_executor/guided_decoding/utils.py @@ -20,6 +20,13 @@ def check_object(obj: dict) -> bool: ]): return True + # Check for array unsupported keywords + if obj.get("type") == "array" and any(key in obj for key in [ + "uniqueItems", "contains", "minContains", "maxContains", + "minItems", "maxItems" + ]): + return True + # Recursively check all nested objects and arrays for value in obj.values(): if isinstance(value, dict): From 750f4cabfac4bfed679d95074d9550b043e3f8d5 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin <linjinzhen@hotmail.com> Date: Tue, 21 Jan 2025 08:42:16 +0800 Subject: [PATCH 42/50] [Kernel] optimize moe_align_block_size for cuda graph and large num_experts (e.g. DeepSeek-V3) (#12222) Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com> Co-authored-by: Michael Goin <mgoin@redhat.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> --- csrc/moe/moe_align_sum_kernels.cu | 93 +++++++++++++++++++------------ vllm/config.py | 2 +- 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index 24341d63fb1f8..715a1b42841f2 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -21,7 +21,7 @@ __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, } } // namespace -template <typename scalar_t> +template <typename scalar_t, typename token_cnts_t> __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids, int32_t* expert_ids, @@ -32,12 +32,8 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, const size_t start_idx = threadIdx.x * tokens_per_thread; extern __shared__ int32_t shared_mem[]; - - int32_t* tokens_cnts = - shared_mem; // 2d tensor with shape (blockDim.x + 1, num_experts) - int32_t* cumsum = - shared_mem + - (blockDim.x + 1) * num_experts; // 1d tensor with shape (num_experts + 1) + int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1) + token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + blockDim.x + 1); for (int i = 0; i < num_experts; ++i) { tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; @@ -74,7 +70,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, block_size) * block_size; } - *total_tokens_post_pad = cumsum[num_experts]; + *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]); } __syncthreads(); @@ -224,26 +220,44 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, torch::Tensor 
num_tokens_post_pad) {
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  // If we have very large number of experts, we can no longer use shared
-  // memory.
-  // TODO(simon): the right solution should be calculating the exact right
-  // amount of shared memory and use that. The num_experts >= 256 is just a
-  // temporary solution to unblock Deepseek V3.
-  if (num_experts >= 256) {
+  int device_max_shared_mem;
+  auto dev = topk_ids.get_device();
+  cudaDeviceGetAttribute(&device_max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+
+  const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
+  const int32_t shared_mem_i32 =
+      ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
+  const int32_t shared_mem_i16 =
+      ((num_thread + 1) * num_experts) * sizeof(uint16_t) +
+      (num_experts + 1) * sizeof(int32_t);
+
+  bool use_global_memory = false;
+  bool use_i16 = false;  // Use uint16_t for shared memory token counts
+  if (shared_mem_i16 > device_max_shared_mem) {
+    use_global_memory = true;
+  } else if (shared_mem_i32 > device_max_shared_mem &&
+             topk_ids.numel() <= 65535) {
+    // when the number of elements of topk_ids is smaller than 65535 (the
+    // max value of uint16), the element values of token_cnts would also be
+    // smaller than 65535, so we can use uint16 as the dtype of token_cnts
+    use_i16 = true;
+  }
+
+  if (use_global_memory) {
     VLLM_DISPATCH_INTEGRAL_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
           // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
           // tensors
           const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
-          const int32_t mem_tokens_cnts =
-              ((num_experts + 1) * num_experts) * sizeof(int32_t);
-          const int32_t mem_cumsum = (num_experts + 1) * sizeof(int32_t);
-          // allocate global memory
-          int32_t* tokens_cnts;
-          int32_t* cumsum;
-          cudaMalloc(&tokens_cnts, mem_tokens_cnts);
-          cudaMalloc(&cumsum, mem_cumsum);
+          auto options_int = torch::TensorOptions()
+                                 .dtype(torch::kInt)
+                                 .device(topk_ids.device());
+          torch::Tensor token_cnts_buffer =
+              torch::empty({(num_experts + 1) * num_experts}, options_int);
+          torch::Tensor cumsum_buffer =
+              torch::empty({num_experts + 1}, options_int);
 
           auto kernel =
               vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
@@ -252,25 +266,32 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
               sorted_token_ids.data_ptr<int32_t>(),
               experts_ids.data_ptr<int32_t>(),
               num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel(), tokens_cnts, cumsum);
-          cudaFree(tokens_cnts);
-          cudaFree(cumsum);
+              topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(),
+              cumsum_buffer.data_ptr<int32_t>());
         });
-  } else {
+  } else if (use_i16) {
     VLLM_DISPATCH_INTEGRAL_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
-          // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
-          // tensors
-          const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
-          const int32_t shared_mem =
-              ((num_thread + 1) * num_experts + (num_experts + 1)) *
-              sizeof(int32_t);
 
-          // set dynamic shared mem
-          auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
+          auto kernel =
+              vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
           AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+              (void*)kernel, shared_mem_i16));
+          kernel<<<1, num_thread, shared_mem_i16, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              sorted_token_ids.data_ptr<int32_t>(),
+              experts_ids.data_ptr<int32_t>(),
+
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size, + topk_ids.numel()); + }); + } else { + VLLM_DISPATCH_INTEGRAL_TYPES( + topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { + auto kernel = + vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>; AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( - (void*)kernel, shared_mem)); - kernel<<<1, num_thread, shared_mem, stream>>>( + (void*)kernel, shared_mem_i32)); + kernel<<<1, num_thread, shared_mem_i32, stream>>>( topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(), experts_ids.data_ptr<int32_t>(), diff --git a/vllm/config.py b/vllm/config.py index 4698a05020332..b0a92b2e21343 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -607,7 +607,7 @@ def _verify_cuda_graph(self) -> None: self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_model_len) - MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama'] + MODEL_NOT_SUPPORT_CUDA_GRAPH = ['mllama'] if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH and not self.enforce_eager): logger.warning( From ecf67814f1a9e31e9802d93e8bd8b11a1c2810e7 Mon Sep 17 00:00:00 2001 From: Michael Goin <michael@neuralmagic.com> Date: Mon, 20 Jan 2025 20:23:40 -0500 Subject: [PATCH 43/50] Add quantization and guided decoding CODEOWNERS (#12228) Signed-off-by: mgoin <michael@neuralmagic.com> --- .github/CODEOWNERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 37733ebacbc7a..bc324d8b988b1 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -9,6 +9,8 @@ /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth +/vllm/model_executor/guided_decoding @mgoin /vllm/multimodal @DarkLight1337 @ywang96 CMakeLists.txt @tlrmchlsmth From d4b62d4641377c104baa2de7807f2c61d091cfe2 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Mon, 20 Jan 2025 23:22:23 -0500 Subject: [PATCH 44/50] [AMD][Build] Porting dockerfiles from the ROCm/vllm fork (#11777) Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> --- Dockerfile.rocm | 258 +++++++----------- Dockerfile.rocm_base | 158 +++++++++++ .../installation/gpu/rocm.inc.md | 13 +- ...14336,device_name=AMD_Instinct_MI300X.json | 36 +-- ...=1792,device_name=AMD_Instinct_MI300X.json | 36 +-- ...=3584,device_name=AMD_Instinct_MI300X.json | 36 +-- ...=7168,device_name=AMD_Instinct_MI300X.json | 36 +-- 7 files changed, 337 insertions(+), 236 deletions(-) create mode 100644 Dockerfile.rocm_base diff --git a/Dockerfile.rocm b/Dockerfile.rocm index e922cb207b899..7213a15a2e005 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,174 +1,118 @@ -# Default ROCm 6.2 base image -ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0" +# default base image +ARG REMOTE_VLLM="0" +ARG USE_CYTHON="0" +ARG BUILD_RPD="1" +ARG COMMON_WORKDIR=/app +ARG BASE_IMAGE=rocm/vllm-dev:base -# Default ROCm ARCHes to build vLLM for. 
-ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" +FROM ${BASE_IMAGE} AS base -# Whether to install CK-based flash-attention -# If 0, will not install flash-attention -ARG BUILD_FA="1" -ARG FA_GFX_ARCHS="gfx90a;gfx942" -ARG FA_BRANCH="3cea2fb" - -# Whether to build triton on rocm -ARG BUILD_TRITON="1" -ARG TRITON_BRANCH="e192dba" - -### Base image build stage -FROM $BASE_IMAGE AS base - -# Import arg(s) defined before this build stage -ARG PYTORCH_ROCM_ARCH +ARG ARG_PYTORCH_ROCM_ARCH +ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} # Install some basic utilities -RUN apt-get update && apt-get install python3 python3-pip -y -RUN apt-get update && apt-get install -y \ - curl \ - ca-certificates \ - sudo \ - git \ - bzip2 \ - libx11-6 \ - build-essential \ - wget \ - unzip \ - tmux \ - ccache \ - && rm -rf /var/lib/apt/lists/* - -# When launching the container, mount the code directory to /vllm-workspace -ARG APP_MOUNT=/vllm-workspace -WORKDIR ${APP_MOUNT} - -RUN python3 -m pip install --upgrade pip -# Remove sccache so it doesn't interfere with ccache -# TODO: implement sccache support across components +RUN apt-get update -q -y && apt-get install -q -y \ + sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev +# Remove sccache +RUN python3 -m pip install --upgrade pip && pip install setuptools_scm RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" +ARG COMMON_WORKDIR +WORKDIR ${COMMON_WORKDIR} + + +# ----------------------- +# vLLM fetch stages +FROM base AS fetch_vllm_0 +ONBUILD COPY ./ vllm/ +FROM base AS fetch_vllm_1 +ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" +ARG VLLM_BRANCH="main" +ONBUILD RUN git clone ${VLLM_REPO} \ + && cd vllm \ + && git checkout ${VLLM_BRANCH} +FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm + +# ----------------------- +# vLLM build stages +FROM fetch_vllm AS build_vllm +ARG USE_CYTHON +# Build vLLM +RUN cd vllm \ + && python3 -m pip install -r requirements-rocm.txt \ + && python3 setup.py clean --all \ + && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \ + && python3 setup.py bdist_wheel --dist-dir=dist +FROM scratch AS export_vllm +ARG COMMON_WORKDIR +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl / +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt / +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite + +# ----------------------- +# Test vLLM image +FROM base AS test + +RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* + +# Install vLLM +RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ + cd /install \ + && pip install -U -r requirements-rocm.txt \ + && pip uninstall -y vllm \ + && pip install *.whl + +WORKDIR /vllm-workspace +ARG COMMON_WORKDIR +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace -# Install torch == 2.6.0 on ROCm -RUN --mount=type=cache,target=/root/.cache/pip \ - case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ - *"rocm-6.2"*) \ - python3 -m pip uninstall -y torch torchvision \ - && python3 -m pip install --pre \ - torch \ - 'setuptools-scm>=8' \ - torchvision \ - --extra-index-url https://download.pytorch.org/whl/rocm6.2;; \ - *) ;; esac +# install development dependencies (for testing) +RUN cd /vllm-workspace \ + && rm -rf vllm \ 
+ && python3 -m pip install -e tests/vllm_test_utils \ + && python3 -m pip install lm-eval[api]==0.4.4 -ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer -ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: -ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: - -ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} -ENV CCACHE_DIR=/root/.cache/ccache - - -### AMD-SMI build stage -FROM base AS build_amdsmi -# Build amdsmi wheel always -RUN cd /opt/rocm/share/amd_smi \ - && python3 -m pip wheel . --wheel-dir=/install - - -### Flash-Attention wheel build stage -FROM base AS build_fa -ARG BUILD_FA -ARG FA_GFX_ARCHS -ARG FA_BRANCH -# Build ROCm flash-attention wheel if `BUILD_FA = 1` -RUN --mount=type=cache,target=${CCACHE_DIR} \ - if [ "$BUILD_FA" = "1" ]; then \ - mkdir -p libs \ - && cd libs \ - && git clone https://github.com/ROCm/flash-attention.git \ - && cd flash-attention \ - && git checkout "${FA_BRANCH}" \ - && git submodule update --init \ - && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ - # Create an empty directory otherwise as later build stages expect one - else mkdir -p /install; \ - fi - - -### Triton wheel build stage -FROM base AS build_triton -ARG BUILD_TRITON -ARG TRITON_BRANCH -# Build triton wheel if `BUILD_TRITON = 1` -RUN --mount=type=cache,target=${CCACHE_DIR} \ - if [ "$BUILD_TRITON" = "1" ]; then \ - mkdir -p libs \ - && cd libs \ - && python3 -m pip install ninja cmake wheel pybind11 \ - && git clone https://github.com/OpenAI/triton.git \ - && cd triton \ - && git checkout "${TRITON_BRANCH}" \ - && cd python \ - && python3 setup.py bdist_wheel --dist-dir=/install; \ - # Create an empty directory otherwise as later build stages expect one - else mkdir -p /install; \ - fi - - -### Final vLLM build stage +# ----------------------- +# Final vLLM image FROM base AS final -# Import the vLLM development directory from the build context -COPY . . -ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi -RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* +# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
+# Manually remove it so that later steps of numpy upgrade can continue +RUN case "$(which python3)" in \ + *"/opt/conda/envs/py_3.9"*) \ + rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ + *) ;; esac -# Package upgrades for useful functionality or to avoid dependency issues -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard +RUN python3 -m pip install --upgrade huggingface-hub[cli] +ARG BUILD_RPD +RUN if [ ${BUILD_RPD} -eq "1" ]; then \ + git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \ + && cd rocmProfileData/rpd_tracer \ + && pip install -r requirements.txt && cd ../ \ + && make && make install \ + && cd hipMarker && python3 setup.py install ; fi +# Install vLLM +RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ + cd /install \ + && pip install -U -r requirements-rocm.txt \ + && pip uninstall -y vllm \ + && pip install *.whl + +ARG COMMON_WORKDIR + +# Copy over the benchmark scripts as well +COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks +COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -# Workaround for ray >= 2.10.0 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 -# Silences the HF Tokenizers warning ENV TOKENIZERS_PARALLELISM=false -RUN --mount=type=cache,target=${CCACHE_DIR} \ - --mount=type=bind,source=.git,target=.git \ - --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -Ur requirements-rocm.txt \ - && python3 setup.py clean --all \ - && python3 setup.py develop - -# Copy amdsmi wheel into final image -RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \ - mkdir -p libs \ - && cp /install/*.whl libs \ - # Preemptively uninstall to avoid same-version no-installs - && python3 -m pip uninstall -y amdsmi; - -# Copy triton wheel(s) into final image if they were built -RUN --mount=type=bind,from=build_triton,src=/install,target=/install \ - mkdir -p libs \ - && if ls /install/*.whl; then \ - cp /install/*.whl libs \ - # Preemptively uninstall to avoid same-version no-installs - && python3 -m pip uninstall -y triton; fi - -# Copy flash-attn wheel(s) into final image if they were built -RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ - mkdir -p libs \ - && if ls /install/*.whl; then \ - cp /install/*.whl libs \ - # Preemptively uninstall to avoid same-version no-installs - && python3 -m pip uninstall -y flash-attn; fi - -# Install wheels that were built to the final image -RUN --mount=type=cache,target=/root/.cache/pip \ - if ls libs/*.whl; then \ - python3 -m pip install libs/*.whl; fi - -# install development dependencies (for testing) -RUN python3 -m pip install -e tests/vllm_test_utils +# Performance environment variable. 
+ENV HIP_FORCE_DEV_KERNARG=1 CMD ["/bin/bash"] + diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base new file mode 100644 index 0000000000000..5bbe98b0c2204 --- /dev/null +++ b/Dockerfile.rocm_base @@ -0,0 +1,158 @@ +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete +ARG HIPBLASLT_BRANCH="4d40e36" +ARG HIPBLAS_COMMON_BRANCH="7c1566b" +ARG LEGACY_HIPBLASLT_OPTION= +ARG RCCL_BRANCH="648a58d" +ARG RCCL_REPO="https://github.com/ROCm/rccl" +ARG TRITON_BRANCH="e5be006" +ARG TRITON_REPO="https://github.com/triton-lang/triton.git" +ARG PYTORCH_BRANCH="8d4926e" +ARG PYTORCH_VISION_BRANCH="v0.19.1" +ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" +ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" +ARG FA_BRANCH="b7d29fb" +ARG FA_REPO="https://github.com/ROCm/flash-attention.git" + +FROM ${BASE_IMAGE} AS base + +ENV PATH=/opt/rocm/llvm/bin:$PATH +ENV ROCM_PATH=/opt/rocm +ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: +ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942 +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} + +ARG PYTHON_VERSION=3.12 + +RUN mkdir -p /app +WORKDIR /app +ENV DEBIAN_FRONTEND=noninteractive + +# Install Python and other dependencies +RUN apt-get update -y \ + && apt-get install -y software-properties-common git curl sudo vim less \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-lib2to3 python-is-python3 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython + +FROM base AS build_hipblaslt +ARG HIPBLASLT_BRANCH +ARG HIPBLAS_COMMON_BRANCH +# Set to "--legacy_hipblas_direct" for ROCm<=6.2 +ARG LEGACY_HIPBLASLT_OPTION +RUN git clone https://github.com/ROCm/hipBLAS-common.git +RUN cd hipBLAS-common \ + && git checkout ${HIPBLAS_COMMON_BRANCH} \ + && mkdir build \ + && cd build \ + && cmake .. \ + && make package \ + && dpkg -i ./*.deb +RUN git clone https://github.com/ROCm/hipBLASLt +RUN cd hipBLASLt \ + && git checkout ${HIPBLASLT_BRANCH} \ + && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ + && cd build/release \ + && make package +RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install + +FROM base AS build_rccl +ARG RCCL_BRANCH +ARG RCCL_REPO +RUN git clone ${RCCL_REPO} +RUN cd rccl \ + && git checkout ${RCCL_BRANCH} \ + && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH} +RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install + +FROM base AS build_triton +ARG TRITON_BRANCH +ARG TRITON_REPO +RUN git clone ${TRITON_REPO} +RUN cd triton \ + && git checkout ${TRITON_BRANCH} \ + && cd python \ + && python3 setup.py bdist_wheel --dist-dir=dist +RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install + +FROM base AS build_amdsmi +RUN cd /opt/rocm/share/amd_smi \ + && pip wheel . 
--wheel-dir=dist +RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install + +FROM base AS build_pytorch +ARG PYTORCH_BRANCH +ARG PYTORCH_VISION_BRANCH +ARG PYTORCH_REPO +ARG PYTORCH_VISION_REPO +ARG FA_BRANCH +ARG FA_REPO +RUN git clone ${PYTORCH_REPO} pytorch +RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \ + pip install -r requirements.txt && git submodule update --init --recursive \ + && python3 tools/amd_build/build_amd.py \ + && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${PYTORCH_VISION_REPO} vision +RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ + && python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${FA_REPO} +RUN cd flash-attention \ + && git checkout ${FA_BRANCH} \ + && git submodule update --init \ + && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist +RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ + && cp /app/vision/dist/*.whl /app/install \ + && cp /app/flash-attention/dist/*.whl /app/install + +FROM base AS final +RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ + && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \ + && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ + pip install /install/*.whl + +ARG BASE_IMAGE +ARG HIPBLASLT_BRANCH +ARG LEGACY_HIPBLASLT_OPTION +ARG RCCL_BRANCH +ARG RCCL_REPO +ARG TRITON_BRANCH +ARG TRITON_REPO +ARG PYTORCH_BRANCH +ARG PYTORCH_VISION_BRANCH +ARG PYTORCH_REPO +ARG PYTORCH_VISION_REPO +ARG FA_BRANCH +ARG FA_REPO +RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ + && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ + && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ + && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \ + && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \ + && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \ + && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \ + && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \ + && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \ + && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \ + && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ + && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ + && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ + && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md index 4256027e6c40e..8ef1bc95fd522 100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ 
b/docs/source/getting_started/installation/gpu/rocm.inc.md
@@ -123,11 +123,10 @@ It is important that the user kicks off the docker build using buildkit. Either
 <gh-file:Dockerfile.rocm> uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
 It provides flexibility to customize the build of docker image using the following arguments:
 
-- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image.
-- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target.
-- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
-- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c`
-- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
+- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is built using <gh-file:Dockerfile.rocm_base>.
+- `USE_CYTHON`: specifies whether to run Cython compilation on a subset of Python files during the docker build.
+- `BUILD_RPD`: specifies whether to include the RocmProfileData profiling tool in the image.
+- `ARG_PYTORCH_ROCM_ARCH`: allows overriding the gfx architecture values from the base docker image.
 
 Their values can be passed in when running `docker build` with `--build-arg` options.
 
@@ -137,10 +136,10 @@ To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
 DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
 ```
 
-To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below:
+To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should pick the alternative base image:
 
 ```console
-DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
+DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm .
``` To run the above docker image `vllm-rocm`, use the below command: diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json index 6a976788f9b10..4d4b752fa5d64 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 1, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 1, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, 
"matrix_instr_nonkdim": 16, "kpack": 1 diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json index 0a46390b2e31b..a218fc40642c1 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff 
--git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json index 91011e64c7de4..3682cc548f352 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 diff --git 
a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json index f807d4a5abaed..21742854c613f 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json @@ -5,7 +5,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,7 +16,7 @@ "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -27,7 +27,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -38,7 +38,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -49,7 +49,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -60,7 +60,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -71,7 +71,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -82,7 +82,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -104,7 +104,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -115,7 +115,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -126,7 +126,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -148,7 +148,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -192,7 +192,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 From 5fe6bf29d657518eb4251981ada9f8c4f34dbbde Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com> Date: Tue, 21 Jan 2025 05:23:14 +0100 Subject: [PATCH 45/50] [BugFix] Fix GGUF tp>1 when vocab_size is not divisible by 64 (#12230) Signed-off-by: NickLucche <nlucches@redhat.com> --- tests/models/decoder_only/language/test_gguf.py | 10 ++++++++++ vllm/model_executor/layers/vocab_parallel_embedding.py | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 81b93ebdf0fc0..38cea2462b440 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -66,12 +66,20 @@ def gguf_model(self): gguf_filename="starcoder2-3b.Q6_K.gguf", ) +DOLPHIN_CONFIG = GGUFTestConfig( + # Test VocabParallelEmbedding sharding issue. + original_model="cognitivecomputations/TinyDolphin-2.8-1.1b", + gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF", + gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf", +) + MODELS = [ LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG, + DOLPHIN_CONFIG # STARCODER_CONFIG, # broken ] @@ -107,6 +115,7 @@ def test_models( # Run unquantized model. with vllm_runner(model_name=model.original_model, + enforce_eager=True, # faster tests dtype=dtype, max_model_len=MAX_MODEL_LEN, tensor_parallel_size=tp_size) as original_model: @@ -115,6 +124,7 @@ def test_models( # Run gguf model. with vllm_runner(model_name=model.gguf_model, + enforce_eager=True, tokenizer_name=model.original_model, dtype=dtype, max_model_len=MAX_MODEL_LEN, diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 65920aa61ba15..3eb5c39ccf580 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -355,7 +355,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): elif isinstance(param, UninitializedParameter): shape = list(loaded_weight.shape) if output_dim is not None: - shape[output_dim] = shape[output_dim] // self.tp_size + shape[output_dim] = self.num_embeddings_per_partition param.materialize(tuple(shape), dtype=loaded_weight.dtype) # If parameter does not have output dim, then it should @@ -381,7 +381,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): else: assert loaded_weight.shape[output_dim] == self.org_vocab_size - # Copy the data. + # Copy the data. Select chunk corresponding to current shard. 
loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) if current_platform.is_hpu(): From 2fc6944c5e69d5d0ce15d09a855452c795d75c3c Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Tue, 21 Jan 2025 13:25:03 +0800 Subject: [PATCH 46/50] [ci/build] disable failed and flaky tests (#12240) Signed-off-by: youkaichao <youkaichao@gmail.com> --- .buildkite/test-pipeline.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d2b140e718501..ed8c15358830c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -477,7 +477,9 @@ steps: - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py + # this test fails consistently. + # TODO: investigate and fix + # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py @@ -515,7 +517,9 @@ steps: - vllm/engine - tests/multi_step commands: - - pytest -v -s multi_step/test_correctness_async_llm.py + # this test is quite flaky + # TODO: investigate and fix. + # - pytest -v -s multi_step/test_correctness_async_llm.py - pytest -v -s multi_step/test_correctness_llm.py - label: Pipeline Parallelism Test # 45min From 96912550c8399af2632f3f6830f7c3fa9e10a75a Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Tue, 21 Jan 2025 15:31:19 +0800 Subject: [PATCH 47/50] [Misc] Rename `MultiModalInputsV2 -> MultiModalInputs` (#12244) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- docs/source/api/multimodal/inputs.md | 2 +- vllm/inputs/data.py | 12 ++++++------ vllm/inputs/preprocess.py | 6 +++--- vllm/model_executor/models/blip2.py | 4 ++-- vllm/model_executor/models/chameleon.py | 4 ++-- vllm/model_executor/models/fuyu.py | 4 ++-- vllm/model_executor/models/llava.py | 6 +++--- vllm/model_executor/models/phi3v.py | 4 ++-- vllm/model_executor/models/qwen2_audio.py | 4 ++-- vllm/multimodal/inputs.py | 2 +- vllm/multimodal/processing.py | 10 +++++----- vllm/multimodal/profiling.py | 4 ++-- 12 files changed, 31 insertions(+), 31 deletions(-) diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md index 76b2fb95a5009..21bd938be9e89 100644 --- a/docs/source/api/multimodal/inputs.md +++ b/docs/source/api/multimodal/inputs.md @@ -43,7 +43,7 @@ ``` ```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2 +.. 
autoclass:: vllm.multimodal.inputs.MultiModalInputs :members: :show-inheritance: ``` diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index b8163a7acde1d..57e85779dd587 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs, MultiModalPlaceholderDict) - from vllm.multimodal.inputs import MultiModalInputsV2 + from vllm.multimodal.inputs import MultiModalInputs class TextPrompt(TypedDict): @@ -207,7 +207,7 @@ def token_inputs( return inputs -DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputsV2"] +DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputs"] """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. @@ -222,14 +222,14 @@ class EncoderDecoderInputs(TypedDict): This specifies the required data for encoder-decoder models. """ - encoder: Union[TokenInputs, "MultiModalInputsV2"] + encoder: Union[TokenInputs, "MultiModalInputs"] """The inputs for the encoder portion.""" - decoder: Union[TokenInputs, "MultiModalInputsV2"] + decoder: Union[TokenInputs, "MultiModalInputs"] """The inputs for the decoder portion.""" -SingletonInputs = Union[TokenInputs, "MultiModalInputsV2"] +SingletonInputs = Union[TokenInputs, "MultiModalInputs"] """ A processed :class:`SingletonPrompt` which can be passed to :class:`vllm.sequence.Sequence`. @@ -311,7 +311,7 @@ def multi_modal_hashes(self) -> List[str]: return inputs.get("multi_modal_hashes", []) if inputs["type"] == "multimodal": - # only the case when we use MultiModalInputsV2 + # only the case when we use MultiModalInputs return inputs.get("mm_hashes", []) # type: ignore[return-value] assert_never(inputs) # type: ignore[arg-type] diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 0890883cc984f..70372e0cad22d 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -7,7 +7,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2 +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup @@ -247,7 +247,7 @@ def _process_multimodal( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], lora_request: Optional[LoRARequest], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: """ Apply the model's multi-modal processor to a multi-modal prompt, returning the corresponding token IDs and metadata. 
@@ -271,7 +271,7 @@ async def _process_multimodal_async( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], lora_request: Optional[LoRARequest], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: """Async version of :meth:`_process_multimodal`.""" tokenizer_group = self.get_tokenizer_group() tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 917b88e802071..f5c796b1acae6 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -15,7 +15,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, + MultiModalInputs, MultiModalKwargs, NestedTensors, PlaceholderRange) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -490,7 +490,7 @@ def apply( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) # Only <image> tokens should be considered as placeholders, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a6634204699c9..e2207865a693d 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -29,7 +29,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, + MultiModalInputs, MultiModalKwargs, NestedTensors, PlaceholderRange) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -159,7 +159,7 @@ def apply( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) # Only <image> tokens should be considered as placeholders, diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 63e7147f84e03..3f16d3ccbd061 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -31,7 +31,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, + MultiModalInputs, MultiModalKwargs, NestedTensors, PlaceholderRange) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) @@ -232,7 +232,7 @@ def apply( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) # Only |SPEAKER| (image) tokens should be considered as placeholders, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 6cceded43a79d..a355ae494afd0 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -24,7 +24,7 @@ from vllm.model_executor.sampling_metadata 
import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, + MultiModalInputs, MultiModalKwargs, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) @@ -746,7 +746,7 @@ def apply( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index @@ -805,7 +805,7 @@ def get_replacement_mantis(item_idx: int): for modality, placeholders in mm_placeholders.items() } - return MultiModalInputsV2( + return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 7a230e5beb367..dd3b0b35c9294 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -31,7 +31,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, + MultiModalInputs, MultiModalKwargs, NestedTensors, PlaceholderRange) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) @@ -484,7 +484,7 @@ def apply( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) # Only <|image|> tokens should be considered as placeholders, diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 47d56175261e4..9cb8f83ad7873 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -37,7 +37,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, + MultiModalInputs, MultiModalKwargs, NestedTensors, PlaceholderRange) from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) @@ -245,7 +245,7 @@ def apply( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) # Only <|AUDIO|> tokens should be considered as placeholders, diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 4b63703585214..b35184f6855ab 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -491,7 +491,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: """ -class MultiModalInputsV2(TypedDict): +class MultiModalInputs(TypedDict): """ Represents the outputs of :class:`vllm.multimodal.processing.BaseMultiModalProcessor`, diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index fa199a07b4cf8..ff02bcc8e1f2d 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -18,8 +18,8 @@ from .hasher import MultiModalHasher from .inputs import (MultiModalDataDict, MultiModalFieldConfig, - 
MultiModalInputsV2, MultiModalKwargs, - MultiModalKwargsItem, PlaceholderRange) + MultiModalInputs, MultiModalKwargs, MultiModalKwargsItem, + PlaceholderRange) from .parse import MultiModalDataItems, MultiModalDataParser if TYPE_CHECKING: @@ -609,7 +609,7 @@ def __call__( prompt: str, mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: return self.apply(prompt, mm_data, hf_processor_mm_kwargs) def _get_data_parser(self) -> MultiModalDataParser: @@ -1067,7 +1067,7 @@ def apply( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1169,7 +1169,7 @@ def apply( for modality, placeholders in mm_placeholders.items() } - return MultiModalInputsV2( + return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index ec580cd6ecddd..20da0f1d8316f 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -11,7 +11,7 @@ from vllm.inputs import DummyData from vllm.logger import init_logger -from .inputs import MultiModalDataDict, MultiModalInputsV2 +from .inputs import MultiModalDataDict, MultiModalInputs from .processing import BaseMultiModalProcessor, BaseProcessingInfo logger = init_logger(__name__) @@ -131,7 +131,7 @@ def _get_dummy_mm_inputs( self, seq_len: int, mm_counts: Mapping[str, int], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: factory = self.dummy_inputs processor_inputs = factory.get_dummy_processor_inputs( seq_len, mm_counts) From 1f1542afa915e0975d2b63559424403e5e8aae2c Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Tue, 21 Jan 2025 15:49:08 +0800 Subject: [PATCH 48/50] [Misc]Add BNB quantization for PaliGemmaForConditionalGeneration (#12237) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- vllm/model_executor/models/paligemma.py | 13 ++++++++++++- vllm/model_executor/models/siglip.py | 14 ++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index f9ad0c67adaba..ed9ae1887259e 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -136,7 +136,18 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: @INPUT_REGISTRY.register_input_processor(input_processor_for_paligemma) class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index cca42842bc06e..211e5dc80066e 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -344,10 +344,16 @@ def __init__( self.config = config self.activation_fn = get_act_fn(config.hidden_act) - - # For quantization, we require the hidden size to be a multiple of 64 - quantizable = (config.hidden_size % 64 == 0 - and config.intermediate_size % 64 == 0) + # Special handling for BNB quantization + if quant_config and quant_config.get_name() == "bitsandbytes": + quantizable = True + else: 
+ # For other quantization, we require the hidden size to be a + # multiple of 64 + quantizable = ( + config.hidden_size % 64 == 0 + and config.intermediate_size % 64 == 0 + ) self.fc1 = ColumnParallelLinear( config.hidden_size, config.intermediate_size, From f2e9f2a3be6f1dce5a6f01b2263488c6533862ac Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Tue, 21 Jan 2025 16:40:39 +0800 Subject: [PATCH 49/50] [Misc] Remove redundant TypeVar from base model (#12248) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/model_executor/models/interfaces_base.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 4c353ae6ffc13..37b91a803d71e 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -3,7 +3,6 @@ import torch import torch.nn as nn -from transformers import PretrainedConfig from typing_extensions import TypeIs, TypeVar from vllm.logger import init_logger @@ -19,9 +18,6 @@ logger = init_logger(__name__) -# The type of HF config -C_co = TypeVar("C_co", bound=PretrainedConfig, covariant=True) - # The type of hidden states # Currently, T = torch.Tensor for all models except for Medusa # which has T = List[torch.Tensor] @@ -34,7 +30,7 @@ @runtime_checkable -class VllmModel(Protocol[C_co, T_co]): +class VllmModel(Protocol[T_co]): """The interface required for all models in vLLM.""" def __init__( @@ -97,7 +93,7 @@ def is_vllm_model( @runtime_checkable -class VllmModelForTextGeneration(VllmModel[C_co, T], Protocol[C_co, T]): +class VllmModelForTextGeneration(VllmModel[T], Protocol[T]): """The interface required for all generative models in vLLM.""" def compute_logits( @@ -143,7 +139,7 @@ def is_text_generation_model( @runtime_checkable -class VllmModelForPooling(VllmModel[C_co, T], Protocol[C_co, T]): +class VllmModelForPooling(VllmModel[T], Protocol[T]): """The interface required for all pooling models in vLLM.""" def pooler( From a94eee4456b05458bafacc17377de4701ac598a0 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Tue, 21 Jan 2025 18:09:39 +0800 Subject: [PATCH 50/50] [Bugfix] Fix mm_limits access for merged multi-modal processor (#12252) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/multimodal/profiling.py | 4 ++-- vllm/multimodal/registry.py | 19 ++++++++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 20da0f1d8316f..c68edaff80167 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -106,7 +106,7 @@ def processing_info(self) -> BaseProcessingInfo: def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]: return self.processor.dummy_inputs - def _get_mm_limits(self) -> Mapping[str, int]: + def get_mm_limits(self) -> Mapping[str, int]: mm_config = self.processing_info.ctx.get_mm_config() mm_limit_per_prompt = mm_config.limit_per_prompt @@ -146,7 +146,7 @@ def get_dummy_data(self, seq_len: int) -> DummyData: # Avoid circular import from vllm.sequence import SequenceData - mm_counts = self._get_mm_limits() + mm_counts = self.get_mm_limits() info = self.processing_info mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(seq_len) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index aaf7ff34ca573..7a4b85385cac9 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -17,7 
+17,7 @@ from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, ProcessingCache) -from .profiling import BaseDummyInputsBuilder +from .profiling import BaseDummyInputsBuilder, MultiModalProfiler from .utils import cached_get_tokenizer from .video import VideoPlugin @@ -282,13 +282,13 @@ def get_max_tokens_per_item_by_nonzero_modality( This is currently directly used only in V1 for profiling the memory usage of a model. """ - limits_per_plugin = self._limits_by_model[model_config] + mm_limits = self.get_mm_limits_per_prompt(model_config) return { key: max_tokens_per_mm_item for key, max_tokens_per_mm_item in self.get_max_tokens_per_item_by_modality(model_config).items() - if limits_per_plugin[key] > 0 + if mm_limits[key] > 0 } def get_max_tokens_by_modality( @@ -304,10 +304,10 @@ def get_max_tokens_by_modality( Note: This should be called after :meth:`init_mm_limits_per_prompt`. """ - limits_per_plugin = self._limits_by_model[model_config] + mm_limits = self.get_mm_limits_per_prompt(model_config) return { - key: limits_per_plugin[key] * max_tokens_per_mm_item + key: mm_limits[key] * max_tokens_per_mm_item for key, max_tokens_per_mm_item in self.get_max_tokens_per_item_by_modality(model_config).items() } @@ -371,6 +371,15 @@ def get_mm_limits_per_prompt( Note: This should be called after :meth:`init_mm_limits_per_prompt`. """ + if self.has_processor(model_config): + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) + processor = self.create_processor(model_config, tokenizer) + profiler = MultiModalProfiler(processor) + return profiler.get_mm_limits() + return self._limits_by_model[model_config] def register_processor(
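
Illustrative sketches for the patches above follow. First, the MultiModalInputsV2 -> MultiModalInputs rename: the V2 suffix is dropped but the payload is unchanged, a TypedDict built with type="multimodal", the prompt, and its token ids, as the llava.py and processing.py hunks show. A minimal sketch; only the constructor keywords visible in the hunks are taken from the patch, and the full field set (mm_kwargs, mm_placeholders, and friends imported alongside it) lives in vllm/multimodal/inputs.py and is omitted here:

    from typing import Literal, TypedDict

    class MultiModalInputs(TypedDict):
        # Fields taken from the constructor calls visible in the hunks;
        # the real class carries additional mm_* fields not shown here.
        type: Literal["multimodal"]
        prompt: str
        prompt_token_ids: list[int]

    inputs = MultiModalInputs(
        type="multimodal",
        prompt="<image> What is in the picture?",
        prompt_token_ids=[1, 32000, 1724, 338],
    )
    assert inputs["type"] == "multimodal"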
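
On #12237: the packed_modules_mapping added to PaliGemmaForConditionalGeneration is what lets the BitsAndBytes loader locate quantized checkpoint shards for modules that vLLM fuses at load time. A toy illustration of the lookup; the fused_target helper and the module names are hypothetical stand-ins, not vLLM's loader API:

    from typing import Optional, Tuple

    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    def fused_target(ckpt_name: str) -> Optional[Tuple[str, int]]:
        """Map a checkpoint module name to (fused module name, shard id)."""
        for fused, shards in packed_modules_mapping.items():
            for shard_id, shard in enumerate(shards):
                if ckpt_name.endswith(shard):
                    return ckpt_name.replace(shard, fused), shard_id
        return None

    # The three attention shards all land in the single fused projection:
    assert fused_target("layers.0.self_attn.q_proj") == \
        ("layers.0.self_attn.qkv_proj", 0)
    assert fused_target("layers.0.self_attn.v_proj") == \
        ("layers.0.self_attn.qkv_proj", 2)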
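
The companion siglip.py change exists because PaliGemma's SigLIP tower has an intermediate size that is not a multiple of 64, which the old check rejected for every quantization scheme, while BitsAndBytes has no such shape constraint. The decision, extracted as a pure function (a sketch; 1152 and 4304 are the published SigLIP-so400m dimensions, used here only for illustration):

    def mlp_is_quantizable(quant_name: str, hidden: int, inter: int) -> bool:
        if quant_name == "bitsandbytes":
            # BNB does not require multiple-of-64 shapes.
            return True
        # Other schemes keep the original divisibility requirement.
        return hidden % 64 == 0 and inter % 64 == 0

    assert mlp_is_quantizable("bitsandbytes", 1152, 4304)
    assert not mlp_is_quantizable("gptq", 1152, 4304)  # 4304 % 64 == 16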
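
On #12248: none of the Protocol methods ever referenced the config TypeVar, so VllmModel is now generic over the hidden-state type alone. A self-contained sketch of the resulting pattern; the real is_vllm_model also inspects the __init__ signature, which runtime_checkable cannot express and is omitted here:

    from typing import Protocol, TypeVar, runtime_checkable

    T_co = TypeVar("T_co", covariant=True)

    @runtime_checkable
    class VllmModel(Protocol[T_co]):
        """Generic only over the hidden-state type; no config parameter."""

        def forward(self, input_ids) -> T_co:
            ...

    class ToyModel:
        def forward(self, input_ids) -> list:
            return [0.0] * len(input_ids)

    # Structural check, analogous to is_vllm_model (method presence only):
    assert isinstance(ToyModel(), VllmModel)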
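
On #12252: the old get_max_tokens_* helpers read self._limits_by_model directly, which is only populated by init_mm_limits_per_prompt and is therefore stale when a merged multi-modal processor is in use; routing every access through get_mm_limits_per_prompt, which now consults MultiModalProfiler.get_mm_limits, is the fix. A reduced model of the dispatch; every name below is a stand-in, not the registry API:

    class MiniRegistry:
        def __init__(self, legacy_limits, profiler_limits=None):
            self._limits_by_model = legacy_limits    # legacy plugin path
            self._profiler_limits = profiler_limits  # merged-processor path

        def has_processor(self) -> bool:
            return self._profiler_limits is not None

        def get_mm_limits_per_prompt(self) -> dict:
            if self.has_processor():
                # The real code builds a MultiModalProfiler here and
                # returns profiler.get_mm_limits().
                return self._profiler_limits
            return self._limits_by_model

        def max_tokens_by_modality(self, per_item_tokens: dict) -> dict:
            limits = self.get_mm_limits_per_prompt()  # the fixed access
            return {k: limits[k] * v for k, v in per_item_tokens.items()}

    reg = MiniRegistry(legacy_limits={}, profiler_limits={"image": 5})
    assert reg.max_tokens_by_modality({"image": 576}) == {"image": 2880}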