diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
new file mode 100644
index 0000000000000..8350e2705141e
--- /dev/null
+++ b/.buildkite/generate_index.py
@@ -0,0 +1,31 @@
+"""Generate a minimal index.html page that links to a single wheel.
+
+Writes ``index.html`` into the current working directory; the link target is
+the wheel's file name, addressed relative to the parent of the index page.
+"""
+import argparse
+import os
+
+# Page skeleton: {wheel} is the link text, {wheel_html_escaped} the href.
+template = """<!DOCTYPE html>
+<html>
+    <body>
+    <h1>Links for vLLM</h1>
+        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+    </body>
+</html>
+"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--wheel", help="The wheel path.", required=True)
+args = parser.parse_args()
+
+# Only the file name is linked; any directory part of --wheel is dropped.
+filename = os.path.basename(args.wheel)
+
+with open("index.html", "w") as f:
+    # cloudfront requires escaping the '+' character
+    f.write(
+        template.format(wheel=filename,
+                        wheel_html_escaped=filename.replace("+", "%2B")))
+print(f"Generated index.html for {args.wheel}")
diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index 64ba1b32fb074..708e548727cf5 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -65,9 +65,9 @@ steps:
         - VLLM_USAGE_SOURCE
         - HF_TOKEN
 
-  - block: "Run H100 Benchmark"
-    key: block-h100
-    depends_on: ~
+  #- block: "Run H100 Benchmark"
+    #key: block-h100
+    #depends_on: ~
 
   - label: "H100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 2de6fceb0c3fe..51618a2955fb1 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -55,3 +55,18 @@ steps:
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"
+
+  - block: "Build CPU release image"
+    key: block-cpu-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish CPU release image"
+    depends_on: block-cpu-release-image-build
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
+    env:
+      DOCKER_BUILDKIT: "1"
diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
index d06604f96f2b8..4fc6d089cc666 100644
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -4,6 +4,9 @@
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 
+# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
+python3 use_existing_torch.py
+
 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
   --target vllm-openai \
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 44f47fac1c1b3..b563c96343f92 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -224,8 +224,12 @@ steps:
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/model_executor/layers
+  - vllm/model_executor/guided_decoding
   - tests/test_logits_processor
-  command: pytest -v -s test_logits_processor.py
+  - tests/model_executor/test_guided_processors
+  commands: 
+    - pytest -v -s test_logits_processor.py
+    - pytest -v -s model_executor/test_guided_processors.py
 
 - label: Speculative decoding tests # 30min
   source_file_dependencies:
diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh
index 7345dd4e66b29..3c756659a715a 100644
--- a/.buildkite/upload-wheels.sh
+++ b/.buildkite/upload-wheels.sh
@@ -23,6 +23,8 @@ wheel="$new_wheel"
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
 echo "Version: $version"
 
+normal_wheel="$wheel" # Save the original wheel filename
+
 # If the version contains "dev", rename it to v1.0.0.dev for consistency
 if [[ $version == *dev* ]]; then
     suffix="${version##*.}"
@@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then
         new_version="1.0.0.dev"
     fi
     new_wheel="${wheel/$version/$new_version}"
-    mv -- "$wheel" "$new_wheel"
+    # use cp to keep both files in the artifacts directory
+    cp -- "$wheel" "$new_wheel"
     wheel="$new_wheel"
     version="$new_version"
 fi
 
 # Upload the wheel to S3
+python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+
+# upload the wheels and index for this commit
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+else
+    # only upload index.html for cu12 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
+    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+fi
+
+# upload the wheels and index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+else
+    # only upload index.html for cu12 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+fi
+
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
\ No newline at end of file
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index c1051d10a4860..e40ceaaa8b037 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -39,67 +39,68 @@ jobs:
             const script = require('.github/workflows/scripts/create_release.js')
             await script(github, context, core)
 
-  wheel:
-    name: Build Wheel
-    runs-on: ${{ matrix.os }}
-    needs: release
-
-    strategy:
-      fail-fast: false
-      matrix:
-          os: ['ubuntu-20.04']
-          python-version: ['3.9', '3.10', '3.11', '3.12']
-          pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
-          cuda-version: ['11.8', '12.1']
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Setup ccache
-        uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
-        with:
-          create-symlink: true
-          key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
-
-      - name: Set up Linux Env
-        if: ${{ runner.os == 'Linux' }}
-        run: |
-          bash -x .github/workflows/scripts/env.sh
-
-      - name: Set up Python
-        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-        with:
-            python-version: ${{ matrix.python-version }}
-
-      - name: Install CUDA ${{ matrix.cuda-version }}
-        run: |
-          bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
-
-      - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
-        run: |
-          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
-
-      - name: Build wheel
-        shell: bash
-        env:
-          CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
-        run: |
-          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
-          wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
-          asset_name=${wheel_name//"linux"/"manylinux1"}
-          echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
-          echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
-
-      - name: Upload Release Asset
-        uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          upload_url: ${{ needs.release.outputs.upload_url }}
-          asset_path: ./dist/${{ env.wheel_name }}
-          asset_name: ${{ env.asset_name }}
-          asset_content_type: application/*
+  # NOTE(simon): Wheels are no longer built with GitHub Actions. See Buildkite's release workflow.
+  # wheel:
+  #   name: Build Wheel
+  #   runs-on: ${{ matrix.os }}
+  #   needs: release
+
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #         os: ['ubuntu-20.04']
+  #         python-version: ['3.9', '3.10', '3.11', '3.12']
+  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
+  #         cuda-version: ['11.8', '12.1']
+
+  #   steps:
+  #     - name: Checkout
+  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+  #     - name: Setup ccache
+  #       uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
+  #       with:
+  #         create-symlink: true
+  #         key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
+
+  #     - name: Set up Linux Env
+  #       if: ${{ runner.os == 'Linux' }}
+  #       run: |
+  #         bash -x .github/workflows/scripts/env.sh
+
+  #     - name: Set up Python
+  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+  #       with:
+  #           python-version: ${{ matrix.python-version }}
+
+  #     - name: Install CUDA ${{ matrix.cuda-version }}
+  #       run: |
+  #         bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
+
+  #     - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
+  #       run: |
+  #         bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
+
+  #     - name: Build wheel
+  #       shell: bash
+  #       env:
+  #         CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
+  #       run: |
+  #         bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+  #         wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
+  #         asset_name=${wheel_name//"linux"/"manylinux1"}
+  #         echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
+  #         echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
+
+  #     - name: Upload Release Asset
+  #       uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
+  #       env:
+  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  #       with:
+  #         upload_url: ${{ needs.release.outputs.upload_url }}
+  #         asset_path: ./dist/${{ env.wheel_name }}
+  #         asset_name: ${{ env.asset_name }}
+  #         asset_content_type: application/*
 
       # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
       # - name: Publish package
diff --git a/.gitignore b/.gitignore
index ceef6a5fba456..bb7e4d5b244a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -81,6 +81,8 @@ instance/
 docs/_build/
 docs/source/getting_started/examples/*.rst
 !**/*.template.rst
+docs/source/getting_started/examples/*.md
+!**/*.template.md
 
 # PyBuilder
 .pybuilder/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bf19b3d227171..83c8033434f3b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -206,7 +206,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 
   # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
-  set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")
 
   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -223,13 +223,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        GIT_TAG v3.5.1
+        GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227
         GIT_PROGRESS TRUE
 
         # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
         # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
         # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
-        GIT_SHALLOW TRUE
+        GIT_SHALLOW FALSE
     )
   endif()
   FetchContent_MakeAvailable(cutlass)
@@ -241,7 +241,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/custom_all_reduce.cu"
     "csrc/permute_cols.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
+    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
+    "csrc/sparse/cutlass/sparse_compressor_entry.cu"
+    "csrc/cutlass_extensions/common.cpp")
 
   set_gencode_flags_for_srcs(
     SRCS "${VLLM_EXT_SRC}"
@@ -270,7 +273,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                    " in CUDA target architectures")
   endif()
 
-  #
   # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
   cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
@@ -323,6 +325,31 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
+  #
+  # 2:4 Sparse Kernels
+
+  # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
+  # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_3X_ARCHS)
+    set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
+             "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
+    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_3X_ARCHS)
+      message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
+                     "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
+                     "if you intend on running FP8 sparse quantized models on Hopper.")
+    else()
+      message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
+  endif()
+
 
   #
   # Machete kernels
@@ -404,7 +431,7 @@ define_gpu_extension_target(
   SOURCES ${VLLM_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
-  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   USE_SABI 3
   WITH_SOABI)
 
diff --git a/Dockerfile b/Dockerfile
index 123703848749c..153bff9cf565f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@
 # to run the OpenAI compatible server.
 
 # Please update any changes made here to
-# docs/source/dev/dockerfile/dockerfile.rst and
+# docs/source/dev/dockerfile/dockerfile.md and
 # docs/source/assets/dev/dockerfile-stages-dependency.png
 
 ARG CUDA_VERSION=12.4.1
@@ -45,17 +45,21 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 WORKDIR /workspace
 
 # install build and runtime dependencies
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -r requirements-cuda.txt
 
+# The arm64 (GH200) build follows the "use existing pytorch" practice:
+# torch and torchvision are installed from the nightly builds first, so
+# that pytorch does not appear as a vLLM dependency in any of the steps
+# that follow.
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install -r requirements-cuda-arm64.txt; \
+        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
     fi
 
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-cuda.txt
+
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
@@ -77,11 +81,6 @@ COPY requirements-build.txt requirements-build.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-build.txt
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install -r requirements-cuda-arm64.txt; \
-    fi
-
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
@@ -157,8 +156,6 @@ WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM
 
-COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
-
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
     echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
 
@@ -166,7 +163,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
+    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
     && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \
@@ -183,17 +180,20 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 
+# The arm64 (GH200) build follows the "use existing pytorch" practice:
+# torch and torchvision are installed from the nightly builds first, so
+# that pytorch does not appear as a vLLM dependency in any of the steps
+# that follow.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
+    fi
+
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install dist/*.whl --verbose
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        pip uninstall -y torch && \
-        python3 -m pip install -r requirements-cuda-arm64.txt; \
-    fi
-
 RUN --mount=type=cache,target=/root/.cache/pip \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
@@ -240,10 +240,11 @@ FROM vllm-base AS vllm-openai
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \
+        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer 'runai-model-streamer[s3]'; \
     else \
-        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \
+        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer 'runai-model-streamer[s3]'; \
     fi
+
 ENV VLLM_USAGE_SOURCE production-docker-image
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index ebe226cf6d148..f163edc27cba8 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -26,10 +26,10 @@ RUN pip install intel_extension_for_pytorch==2.5.0
 
 WORKDIR /workspace
 
+COPY requirements-build.txt requirements-build.txt
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
     pip install --upgrade pip && \
     pip install -r requirements-build.txt
 
@@ -37,9 +37,9 @@ FROM cpu-test-1 AS build
 
 WORKDIR /workspace/vllm
 
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cpu.txt requirements-cpu.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
     pip install -v -r requirements-cpu.txt
 
 COPY . .
diff --git a/Dockerfile.rocm.ubi b/Dockerfile.rocm.ubi
index 8766b995bb555..cc4b81396ef86 100644
--- a/Dockerfile.rocm.ubi
+++ b/Dockerfile.rocm.ubi
@@ -49,8 +49,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     export version="$(awk -F. '{print $1"."$2}' <<< $ROCM_VERSION)" && \
     uv pip install --pre \
         --index-url "https://download.pytorch.org/whl/nightly/rocm${version}" \
-        torch==2.6.0.dev20241107+rocm${version}\
-        torchvision==0.20.0.dev20241107+rocm${version} && \
+        torch==2.6.0.dev20241122+rocm${version}\
+        torchvision==0.20.0.dev20241122+rocm${version} && \
     # Install libdrm-amdgpu to avoid errors when retrieving device information (amdgpu.ids: No such file or directory)
     microdnf install -y libdrm-amdgpu && \
     microdnf clean all
diff --git a/README.md b/README.md
index 93b71ddaccc61..f83c9d759b359 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ vLLM is flexible and easy to use with:
 
 vLLM seamlessly supports most popular open-source models on HuggingFace, including:
 - Transformer-like LLMs (e.g., Llama)
-- Mixture-of-Expert LLMs (e.g., Mixtral)
+- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
 - Embedding Models (e.g. E5-Mistral)
 - Multi-modal LLMs (e.g., LLaVA)
 
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 1e5967bd9bf8b..c1b10b3cf8f58 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -4,7 +4,8 @@
 import json
 import random
 import time
-from typing import List, Optional
+from functools import cache
+from typing import Dict, List, Optional, Tuple
 
 import torch
 import uvloop
@@ -17,8 +18,11 @@
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)
 from vllm.inputs import TextPrompt
+from vllm.lora.request import LoRARequest
+from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
 from vllm.sampling_params import BeamSearchParams
+from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
 
 
@@ -28,15 +32,17 @@ class SampleRequest:
 
     Attributes:
         prompt: The input text prompt for the model.
-        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
-            images).
         prompt_len: The length of the prompt in tokens.
         expected_output_len: The expected length of the output in tokens.
+        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+            images).
+        lora_request: Optional LoRARequest specifying the LoRA to use. 
     """
     prompt: str
     prompt_len: int
     expected_output_len: int
     multi_modal_data: Optional[MultiModalDataDict] = None
+    lora_request: Optional[LoRARequest] = None
 
 
 def _get_prompt_for_image_model(question: str, *, model: str) -> str:
@@ -60,8 +66,30 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str:
     raise ValueError(f"Unsupported model {model}")
 
 
+@cache  # each distinct lora_path is resolved only once
+def lora_path_on_disk(lora_path: str) -> str:
+    return get_adapter_absolute_path(lora_path)
+
+
+lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}
+
+
+def get_random_lora_request(
+        args: argparse.Namespace
+) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
+    """Return a random LoRA request plus its (cached) tokenizer."""
+    adapter_id = random.randint(1, args.max_loras)
+    request = LoRARequest(lora_name=str(adapter_id),
+                          lora_int_id=adapter_id,
+                          lora_path=lora_path_on_disk(args.lora_path))
+    if adapter_id not in lora_tokenizer_cache:
+        lora_tokenizer_cache[adapter_id] = get_lora_tokenizer(request)
+    return request, lora_tokenizer_cache[adapter_id]
+
+
 def sample_requests(tokenizer: PreTrainedTokenizerBase,
                     args: argparse.Namespace) -> List[SampleRequest]:
+
     dataset_path: str = args.dataset
     num_requests: int = args.num_prompts
     fixed_output_len: Optional[int] = args.output_len
@@ -79,7 +107,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
 
     # Filter out sequences that are too long or too short
     filtered_dataset: List[SampleRequest] = []
-    for data in dataset:
+    for data in tqdm(
+            dataset,  # no explicit total: the result list is still empty here
+            desc="sampling requests"):
         if len(filtered_dataset) == num_requests:
             break
 
@@ -102,9 +132,16 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
                 continue
             prompt = _get_prompt_for_image_model(question=prompt, model=model)
 
+        request_tokenizer = tokenizer
+        lora_request: Optional[LoRARequest] = None
+        if args.enable_lora:
+            lora_request, lora_tokenizer = get_random_lora_request(args)
+            if lora_tokenizer:
+                request_tokenizer = lora_tokenizer
+
         # Tokenize the prompts and completions.
-        prompt_token_ids = tokenizer(prompt).input_ids
-        completion_token_ids = tokenizer(completion).input_ids
+        prompt_token_ids = request_tokenizer(prompt).input_ids
+        completion_token_ids = request_tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
                          ) if fixed_output_len is None else fixed_output_len
@@ -118,7 +155,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
             SampleRequest(prompt=prompt,
                           prompt_len=prompt_len,
                           expected_output_len=output_len,
-                          multi_modal_data=multi_modal_data))
+                          multi_modal_data=multi_modal_data,
+                          lora_request=lora_request))
 
     return filtered_dataset
 
@@ -146,14 +184,21 @@ def run_vllm(
                 ignore_eos=True,
                 max_tokens=request.expected_output_len,
             ))
+    lora_requests: Optional[List[LoRARequest]] = None
+    if engine_args.enable_lora:
+        lora_requests = [request.lora_request for request in requests]
 
     use_beam_search = False
 
     if not use_beam_search:
         start = time.perf_counter()
-        llm.generate(prompts, sampling_params, use_tqdm=True)
+        llm.generate(prompts,
+                     sampling_params,
+                     lora_request=lora_requests,
+                     use_tqdm=True)
         end = time.perf_counter()
     else:
+        assert lora_requests is None, "BeamSearch API does not support LoRA"
         prompts = [request.prompt for request in requests]
         # output_len should be the same for all requests.
         output_len = requests[0][2]
@@ -185,6 +230,7 @@ async def run_vllm_async(
         # Add the requests to the engine.
         prompts: List[TextPrompt] = []
         sampling_params: List[SamplingParams] = []
+        lora_requests: List[Optional[LoRARequest]] = []
         for request in requests:
             prompts.append(
                 TextPrompt(prompt=request.prompt,
@@ -197,11 +243,16 @@ async def run_vllm_async(
                     ignore_eos=True,
                     max_tokens=request.expected_output_len,
                 ))
+            lora_requests.append(request.lora_request)
 
         generators = []
         start = time.perf_counter()
-        for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
-            generator = llm.generate(prompt, sp, request_id=f"test{i}")
+        for i, (prompt, sp,
+                lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
+            generator = llm.generate(prompt,
+                                     sp,
+                                     lora_request=lr,
+                                     request_id=f"test{i}")
             generators.append(generator)
         all_gens = merge_async_iterators(*generators)
         async for i, res in all_gens:
@@ -297,6 +348,14 @@ def main(args: argparse.Namespace):
         vocab_size = tokenizer.vocab_size
         requests = []
         for _ in range(args.num_prompts):
+
+            request_tokenizer = tokenizer
+            lora_request: Optional[LoRARequest] = None
+            if args.enable_lora:
+                lora_request, lora_tokenizer = get_random_lora_request(args)
+                if lora_tokenizer:
+                    request_tokenizer = lora_tokenizer
+
             # Synthesize a prompt with the given input length.
             candidate_ids = [
                 random.randint(0, vocab_size - 1)
@@ -305,8 +364,8 @@ def main(args: argparse.Namespace):
             # As tokenizer may add additional tokens like BOS, we need to try
             # different lengths to get the desired input length.
             for _ in range(5):  # Max attempts to correct
-                candidate_prompt = tokenizer.decode(candidate_ids)
-                tokenized_len = len(tokenizer.encode(candidate_prompt))
+                candidate_prompt = request_tokenizer.decode(candidate_ids)
+                tokenized_len = len(request_tokenizer.encode(candidate_prompt))
 
                 if tokenized_len == args.input_len:
                     break
@@ -323,7 +382,8 @@ def main(args: argparse.Namespace):
             requests.append(
                 SampleRequest(prompt=candidate_prompt,
                               prompt_len=args.input_len,
-                              expected_output_len=args.output_len))
+                              expected_output_len=args.output_len,
+                              lora_request=lora_request))
     else:
         requests = sample_requests(tokenizer, args)
 
@@ -422,6 +482,14 @@ def main(args: argparse.Namespace):
                         action='store_true',
                         default=False,
                         help="Disable decoupled async engine frontend.")
+    # LoRA
+    parser.add_argument(
+        "--lora-path",
+        type=str,
+        default=None,
+        help="Path to the lora adapters to use. This can be an absolute path, "
+        "a relative path, or a Hugging Face model identifier.")
+
     parser = AsyncEngineArgs.add_cli_args(parser)
     args = parser.parse_args()
     if args.tokenizer is None:
@@ -431,6 +499,8 @@ def main(args: argparse.Namespace):
         assert args.output_len is not None
     else:
         assert args.input_len is None
+    if args.enable_lora:
+        assert args.lora_path is not None
 
     if args.backend == "vllm":
         if args.hf_max_batch_size is not None:
@@ -440,6 +510,9 @@ def main(args: argparse.Namespace):
             raise ValueError("HF max batch size is required for HF backend.")
         if args.quantization is not None:
             raise ValueError("Quantization is only for vLLM backend.")
+        if args.enable_lora is not None:
+            raise ValueError("LoRA benchmarking is only supported for vLLM"
+                             " backend")
     elif args.backend == "mii":
         if args.dtype != "auto":
             raise ValueError("dtype must be auto for MII backend.")
@@ -452,4 +525,7 @@ def main(args: argparse.Namespace):
         if args.tokenizer != args.model:
             raise ValueError("Tokenizer must be the same as the model for MII "
                              "backend.")
+        if args.enable_lora is not None:
+            raise ValueError("LoRA benchmarking is only supported for vLLM"
+                             " backend")
     main(args)
diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
new file mode 100644
index 0000000000000..3d1c5e392f9e2
--- /dev/null
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -0,0 +1,384 @@
+import argparse
+import copy
+import itertools
+import pickle as pkl
+import time
+from typing import Callable, Iterable, List, Tuple
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from utils import make_rand_sparse_tensors
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+from vllm.utils import FlexibleArgumentParser
+
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+DEFAULT_TP_SIZES = [1]
+
+
+# bench
+def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
+             **kwargs) -> TMeasurement:
+    min_run_time = 1
+
+    globals = {
+        "args": args,
+        "kwargs": kwargs,
+        "fn": fn,
+    }
+    return TBenchmark.Timer(
+        stmt="fn(*args, **kwargs)",
+        globals=globals,
+        label=label,
+        sub_label=sub_label,
+        description=description,
+    ).blocked_autorange(min_run_time=min_run_time)
+
+
+def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+               sub_label: str) -> Iterable[TMeasurement]:
+    assert dtype == torch.int8
+    b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+
+    out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
+                                       torch.bfloat16)
+    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
+
+    if not torch.allclose(out, out_ref):
+        print("Incorrect results")
+        print(out)
+        print(out_ref)
+    else:
+        print("Correct results")
+
+    timers = []
+    # pytorch impl - bfloat16
+    timers.append(
+        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
+                 torch.mm, a.to(dtype=torch.bfloat16),
+                 b.to(dtype=torch.bfloat16)))
+
+    # pytorch impl - float16
+    timers.append(
+        bench_fn(label, sub_label,
+                 "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
+                 a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
+
+    # cutlass impl
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
+                 torch.bfloat16))
+
+    # cutlass with bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
+                 bias))
+
+    # cutlass sparse impl
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm",
+                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+                 scale_b, torch.bfloat16))
+
+    # cutlass sparse with bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
+                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+                 scale_b, torch.bfloat16, bias))
+
+    return timers
+
+
+def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+              sub_label: str) -> Iterable[TMeasurement]:
+    assert dtype == torch.float8_e4m3fn
+    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n,
+                                                     k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+
+    out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
+                                       torch.bfloat16)
+    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
+
+    if not torch.allclose(out, out_ref):
+        print("Incorrect results")
+        print(out)
+        print(out_ref)
+    else:
+        print("Correct results")
+
+    timers = []
+
+    # pytorch impl w. bf16
+    timers.append(
+        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
+                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
+                 b.to(dtype=torch.bfloat16, device="cuda")))
+
+    # pytorch impl: bf16 output, without fp8 fast accum
+    timers.append(
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_bf16_scaled_mm",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.bfloat16))
+
+    # pytorch impl: bf16 output, with fp8 fast accum
+    timers.append(
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.bfloat16,
+                 use_fast_accum=True))
+
+    # pytorch impl: fp16 output, without fp8 fast accum
+    timers.append(
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_fp16_scaled_mm",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.float16))
+
+    # pytorch impl: fp16 output, with fp8 fast accum
+    timers.append(
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.float16,
+                 use_fast_accum=True))
+
+    # cutlass impl: bf16 output
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
+                 torch.bfloat16))
+
+    # cutlass sparse impl: bf16 output
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
+                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+                 scale_b, torch.bfloat16))
+
+    # cutlass sparse impl: fp16 output
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
+                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+                 scale_b, torch.float16))
+
+    # cutlass sparse impl: bf16 output, with bias
+    timers.append(
+        bench_fn(label, sub_label,
+                 "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias",
+                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+                 scale_b, torch.bfloat16, bias))
+
+    # cutlass sparse impl: fp16 output, with bias
+    timers.append(
+        bench_fn(label, sub_label,
+                 "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias",
+                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
+                 scale_b, torch.float16, bias.to(dtype=torch.float16)))
+
+    return timers
+
+
+def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+          sub_label: str) -> Iterable[TMeasurement]:
+    if dtype == torch.int8:
+        return bench_int8(dtype, m, k, n, label, sub_label)
+    if dtype == torch.float8_e4m3fn:
+        return bench_fp8(dtype, m, k, n, label, sub_label)
+    raise ValueError("unsupported type")
+
+
+# runner
+def print_timers(timers: Iterable[TMeasurement]):
+    compare = TBenchmark.Compare(timers)
+    compare.print()
+
+
+def run(dtype: torch.dtype,
+        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+    results = []
+    for m, k, n in MKNs:
+        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
+                       f"MKN=({m}x{k}x{n})")
+        print_timers(timers)
+        results.extend(timers)
+
+    return results
+
+
+# output makers
+def make_output(data: Iterable[TMeasurement],
+                MKNs: Iterable[Tuple[int, int, int]],
+                base_description: str,
+                timestamp=None):
+    print(f"== All Results {base_description} ====")
+    print_timers(data)
+
+    # pickle all the results
+    timestamp = int(time.time()) if timestamp is None else timestamp
+    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
+        pkl.dump(data, f)
+
+
+# argparse runners
+
+
+def run_square_bench(args):
+    dim_sizes = list(
+        range(args.dim_start, args.dim_end + 1, args.dim_increment))
+    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
+    data = run(args.dtype, MKNs)
+
+    make_output(data, MKNs, f"square_bench-{args.dtype}")
+
+
+def run_range_bench(args):
+    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
+    n = len(dim_sizes)
+    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
+    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
+    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
+    MKNs = list(zip(Ms, Ks, Ns))
+    data = run(args.dtype, MKNs)
+
+    make_output(data, MKNs, f"range_bench-{args.dtype}")
+
+
+def run_model_bench(args):
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}]  {model}")
+
+    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+        KNs = []
+        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            KNs.append(KN)
+        return KNs
+
+    model_bench_data = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        Ms = args.batch_sizes
+        KNs = model_shapes(model, tp_size)
+        MKNs = []
+        for m in Ms:
+            for k, n in KNs:
+                MKNs.append((m, k, n))
+
+        data = run(args.dtype, MKNs)
+        model_bench_data.append(data)
+
+    # Print all results
+    for data, model_tp in zip(model_bench_data, models_tps):
+        model, tp_size = model_tp
+        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
+        print_timers(data)
+
+    timestamp = int(time.time())
+
+    all_data = []
+    for d in model_bench_data:
+        all_data.extend(d)
+    # pickle all data
+    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
+        pkl.dump(all_data, f)
+
+
+if __name__ == '__main__':
+
+    def to_torch_dtype(dt):
+        if dt == "int8":
+            return torch.int8
+        if dt == "fp8":
+            return torch.float8_e4m3fn
+        raise ValueError("unsupported dtype")
+
+    parser = FlexibleArgumentParser(
+        description="""
+Benchmark Cutlass GEMM.
+
+    To run square GEMMs:
+        python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
+    
+    To run constant N and K and sweep M:
+        python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
+    
+    To run dimensions from a model:
+        python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
+    
+    Output:
+        - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
+            """,  # noqa: E501
+        formatter_class=argparse.RawTextHelpFormatter)
+
+    parser.add_argument("--dtype",
+                        type=to_torch_dtype,
+                        required=True,
+                        help="Available options are ['int8', 'fp8']")
+    subparsers = parser.add_subparsers(dest="cmd")
+
+    square_parser = subparsers.add_parser("square_bench")
+    square_parser.add_argument("--dim-start", type=int, required=True)
+    square_parser.add_argument("--dim-end", type=int, required=True)
+    square_parser.add_argument("--dim-increment", type=int, required=True)
+    square_parser.set_defaults(func=run_square_bench)
+
+    range_parser = subparsers.add_parser("range_bench")
+    range_parser.add_argument("--dim-start", type=int, required=True)
+    range_parser.add_argument("--dim-end", type=int, required=True)
+    range_parser.add_argument("--dim-increment", type=int, required=True)
+    range_parser.add_argument("--m-constant", type=int, default=None)
+    range_parser.add_argument("--n-constant", type=int, default=None)
+    range_parser.add_argument("--k-constant", type=int, default=None)
+    range_parser.set_defaults(func=run_range_bench)
+
+    model_parser = subparsers.add_parser("model_bench")
+    model_parser.add_argument("--models",
+                              nargs="+",
+                              type=str,
+                              default=DEFAULT_MODELS,
+                              choices=WEIGHT_SHAPES.keys())
+    model_parser.add_argument("--tp-sizes",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_TP_SIZES)
+    model_parser.add_argument("--batch-sizes",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_BATCH_SIZES)
+    model_parser.set_defaults(func=run_model_bench)
+
+    args = parser.parse_args()
+    args.func(args)
diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py
new file mode 100644
index 0000000000000..ef06fcd6604dd
--- /dev/null
+++ b/benchmarks/cutlass_benchmarks/utils.py
@@ -0,0 +1,96 @@
+# Cutlass bench utils
+from typing import Iterable, Tuple
+
+import torch
+
+import vllm._custom_ops as ops
+
+
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    return torch.round(tensor.clamp(
+        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
+
+
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:
+    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
+
+
+def to_bf16(tensor: torch.Tensor) -> torch.Tensor:
+    return tensor.to(dtype=torch.bfloat16)
+
+
+def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
+    return tensor.to(dtype=torch.float16)
+
+
+def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
+                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
+    a = torch.randn((m, k), device='cuda') * 5
+    b = torch.randn((n, k), device='cuda').t() * 5
+
+    if dtype == torch.int8:
+        return to_int8(a), to_int8(b)
+    if dtype == torch.float8_e4m3fn:
+        return to_fp8(a), to_fp8(b)
+
+    raise ValueError("unsupported dtype")
+
+
+def prune_to_2_4(tensor):
+    # Reshape tensor to [N, 4] where N is number of groups of 4
+    original_shape = tensor.shape
+    reshaped = tensor.reshape(-1, 4)
+
+    # Get indices of top 2 absolute values in each group of 4
+    _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1)
+
+    # Create binary mask
+    mask = torch.zeros_like(reshaped)
+    mask.scatter_(dim=1,
+                  index=indices,
+                  src=torch.ones_like(indices, dtype=mask.dtype))
+
+    # Apply mask and reshape back
+    pruned = reshaped * mask
+
+    # Turn all -0.0 to 0.0
+    pruned[pruned == -0.0] = 0.0
+
+    return pruned.reshape(original_shape)
+
+
+def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
+                             k: int) -> Tuple[torch.Tensor, torch.Tensor]:
+    a = torch.randn((m, k), device='cuda') * 5
+    b = torch.randn((n, k), device='cuda').t() * 5
+
+    b = prune_to_2_4(b.t()).t()
+
+    if dtype == torch.int8:
+        a, b = to_int8(a), to_int8(b)
+    elif dtype == torch.float8_e4m3fn:
+        a, b = to_fp8(a), to_fp8(b)
+    elif dtype == torch.float16:
+        a, b = to_fp16(a), to_fp16(b)
+    elif dtype == torch.bfloat16:
+        a, b = to_bf16(a), to_bf16(b)
+    else:
+        raise ValueError("unsupported dtype")
+
+    b_compressed, e = ops.cutlass_sparse_compress(b.t())
+
+    # Compressed B, Metadata, Original A, B
+    return b_compressed, e, a, b
+
+
+def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype,
+                        m: int, n: int, k: int) -> \
+                        Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
+    ABs = []
+    for _ in range(num_tensors):
+        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
+        if b_comp is not None:
+            ABs.append(make_rand_sparse_tensors(dtype, m, n, k))
+    BComps, Es, As, Bs = zip(*ABs)
+    return list(BComps), list(Es), list(As), list(Bs)
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index 63cf5d50cac75..d0353bc8cb42a 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -8,6 +8,7 @@
 import torch
 import torch.utils.benchmark as TBenchmark
 from torch.utils.benchmark import Measurement as TMeasurement
+from utils import make_rand_tensors
 from weight_shapes import WEIGHT_SHAPES
 
 from vllm import _custom_ops as ops
@@ -17,31 +18,6 @@
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
 DEFAULT_TP_SIZES = [1]
 
-# helpers
-
-
-def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
-    finfo = torch.finfo(torch.float8_e4m3fn)
-    return torch.round(tensor.clamp(
-        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
-
-
-def to_int8(tensor: torch.Tensor) -> torch.Tensor:
-    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
-
-
-def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
-                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
-    a = torch.randn((m, k), device='cuda') * 5
-    b = torch.randn((n, k), device='cuda').t() * 5
-
-    if dtype == torch.int8:
-        return to_int8(a), to_int8(b)
-    if dtype == torch.float8_e4m3fn:
-        return to_fp8(a), to_fp8(b)
-
-    raise ValueError("unsupported dtype")
-
 
 # bench
 def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
@@ -386,4 +362,4 @@ def to_torch_dtype(dt):
     model_parser.set_defaults(func=run_model_bench)
 
     args = parser.parse_args()
-    args.func(args)
+    args.func(args)
\ No newline at end of file
diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py
index 25ec9d6028627..d58fb0bf86374 100644
--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@@ -40,4 +40,4 @@
         ([8192, 57344], 1),
         ([28672, 8192], 0),
     ],
-}
+}
\ No newline at end of file
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
index 2924ea4a49f54..94999630bae12 100644
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -10,7 +10,8 @@ set -ex
 
 kill_gpu_processes() {
   # kill all processes on GPU.
-  pkill -f pt_main_thread
+  pgrep pt_main_thread | xargs -r kill -9
+  pgrep python3 | xargs -r kill -9
   sleep 10
 
   # remove vllm config file
@@ -54,7 +55,7 @@ benchmark() {
 
   CUDA_VISIBLE_DEVICES=0 python3 \
     -m vllm.entrypoints.openai.api_server \
-    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model $model \
     --port 8100 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
@@ -64,7 +65,7 @@ benchmark() {
 
   CUDA_VISIBLE_DEVICES=1 python3 \
     -m vllm.entrypoints.openai.api_server \
-    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model $model \
     --port 8200 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
@@ -87,7 +88,7 @@ benchmark() {
           --port 8100 \
           --save-result \
           --result-dir $results_folder \
-          --result-filename disagg_prefill_2xtp4.json \
+          --result-filename disagg_prefill_tp1.json \
           --request-rate "inf"
 
 
@@ -105,7 +106,7 @@ benchmark() {
           --port 8200 \
           --save-result \
           --result-dir $results_folder \
-          --result-filename disagg_prefill_2xtp4.json \
+          --result-filename disagg_prefill_tp1_overhead.json \
           --request-rate "$qps"
   kill_gpu_processes
 
@@ -118,7 +119,7 @@ main() {
   (which jq) || (apt-get -y install jq)
   (which socat) || (apt-get -y install socat)
 
-  pip install quart httpx
+  pip install quart httpx datasets
 
   cd "$(dirname "$0")"
 
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
index d8d9e976dce76..eb5d891d0d4a5 100644
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -1,13 +1,12 @@
 #!/bin/bash
 
-# Requirement: 8x H100 GPUs.
+# Requirement: 2x GPUs.
 
 
-# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV 
-# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests
-# Resource: 8x H100
+# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
+# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests
+# Resource: 2x GPU
 # Approaches:
-# 1. Chunked prefill: 1 vllm instance with tp=8
 # 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
 # 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
 # Prefilling instance: max_output_token=1
@@ -114,7 +113,6 @@ benchmark() {
           --request-rate "$qps"
 
   sleep 2
-
 }
 
 
@@ -123,8 +121,9 @@ main() {
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get -y install jq)
   (which socat) || (apt-get -y install socat)
+  (which lsof) || (apt-get -y install lsof)
 
-  pip install quart httpx matplotlib aiohttp
+  pip install quart httpx matplotlib aiohttp datasets
 
   cd "$(dirname "$0")"
 
diff --git a/csrc/core/math.hpp b/csrc/core/math.hpp
new file mode 100644
index 0000000000000..ba9f40a230c8e
--- /dev/null
+++ b/csrc/core/math.hpp
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <climits>
+#include <cstdint>
+#include <iostream>
+
+// Round num up to the nearest power of two; 0 and 1 are returned unchanged.
+// Precondition: num <= 2^31 (larger values have no 32-bit result).
+inline uint32_t next_pow_2(uint32_t const num) {
+  if (num <= 1) return num;
+  // Shift an unsigned one: a signed 1 << 31 would overflow int.
+  return uint32_t{1} << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
+}
\ No newline at end of file
diff --git a/csrc/cutlass_extensions/common.cpp b/csrc/cutlass_extensions/common.cpp
new file mode 100644
index 0000000000000..3d2093ab94297
--- /dev/null
+++ b/csrc/cutlass_extensions/common.cpp
@@ -0,0 +1,15 @@
+#include "cutlass_extensions/common.hpp"
+
+// Returns the compute capability of CUDA device 0 encoded as
+// major * 10 + minor.
+int32_t get_sm_version_num() {
+  // Zero-initialize so a failed attribute query cannot lead to reading
+  // indeterminate values (the return codes are not checked here).
+  int32_t major_capability = 0, minor_capability = 0;
+  cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
+                         0);
+  cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
+                         0);
+  int32_t version_num = major_capability * 10 + minor_capability;
+  return version_num;
+}
\ No newline at end of file
diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp
new file mode 100644
index 0000000000000..85e359aa57113
--- /dev/null
+++ b/csrc/cutlass_extensions/common.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include <climits>
+#include "cuda_runtime.h"
+#include <iostream>
+
+/**
+ * Helper function for checking CUTLASS errors
+ */
+#define CUTLASS_CHECK(status)                       \
+  {                                                 \
+    cutlass::Status error = status;                 \
+    TORCH_CHECK(error == cutlass::Status::kSuccess, \
+                cutlassGetStatusString(error));     \
+  }
+
+/**
+ * Panic wrapper for unwinding CUDA runtime errors
+ */
+#define CUDA_CHECK(status)                                        \
+  {                                                               \
+    cudaError_t error = status;                                   \
+    TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
+  }
+
+inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
+  int max_shared_mem_per_block_opt_in = 0;
+  cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
+                        cudaDevAttrMaxSharedMemoryPerBlockOptin,
+                        device);
+  return max_shared_mem_per_block_opt_in;
+}
+
+int32_t get_sm_version_num();
diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
index c69e87999ae71..26f7423fd7455 100644
--- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
+++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
@@ -1,3 +1,5 @@
+#pragma once
+
 #include "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp"
 
 /*
diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
index 95764ecddc79f..c723adf126422 100644
--- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
+++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
@@ -1,3 +1,5 @@
+#pragma once
+
 #include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp"
 
 /*
@@ -36,13 +38,13 @@ struct ScaledEpilogueBase {
   // Don't want to support nullptr by default
   template <typename T, bool EnableNullPtr = false>
   using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast<
-      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T,
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T,
       Stride<Int<1>, Int<0>, Int<0>>, 128 / sizeof_bits_v<T>, EnableNullPtr>;
 
   // Don't want to support nullptr by default
   template <typename T, bool EnableNullPtr = false>
   using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast<
-      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T,
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T,
       Stride<Int<0>, Int<1>, Int<0>>, 128 / sizeof_bits_v<T>, EnableNullPtr>;
 
   // This utility function constructs the arguments for the load descriptors
diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index fff7ce34c838a..24341d63fb1f8 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -113,6 +113,92 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
   }
 }
 
+// TODO(simon): this is temporarily adapted from
+// https://github.com/sgl-project/sglang/commit/31548116a8dc8c6df7e146e0587335a59fc5b9d7
+// we did this to unblock Deepseek V3 but there should be a better
+// implementation to manage shared memory.
+template <typename scalar_t>
+__global__ void moe_align_block_size_global_mem_kernel(
+    scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
+    int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
+    int32_t block_size, size_t numel, int32_t* tokens_cnts, int32_t* cumsum) {
+  const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
+  const size_t start_idx = threadIdx.x * tokens_per_thread;
+
+  for (int i = 0; i < num_experts; ++i) {
+    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
+  }
+
+  /**
+   * In the first step we compute token_cnts[thread_index + 1][expert_index],
+   * which counts how many tokens in the token shard of thread_index are
+   * assigned to expert expert_index.
+   */
+  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
+    ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
+  }
+
+  __syncthreads();
+
+  // For each expert we accumulate the token counts from the different threads.
+  if (threadIdx.x < num_experts) {
+    tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
+    for (int i = 1; i <= blockDim.x; ++i) {
+      tokens_cnts[index(num_experts, i, threadIdx.x)] +=
+          tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
+    }
+  }
+
+  __syncthreads();
+
+  // We accumulate the token counts of all experts in thread 0.
+  if (threadIdx.x == 0) {
+    cumsum[0] = 0;
+    for (int i = 1; i <= num_experts; ++i) {
+      cumsum[i] = cumsum[i - 1] +
+                  CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
+                          block_size) *
+                      block_size;
+    }
+    *total_tokens_post_pad = cumsum[num_experts];
+  }
+
+  __syncthreads();
+
+  /**
+   * For each expert, each thread processes the tokens of the corresponding
+   * blocks and stores the corresponding expert_id for each block.
+   */
+  if (threadIdx.x < num_experts) {
+    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
+         i += block_size) {
+      expert_ids[i / block_size] = threadIdx.x;
+    }
+  }
+
+  /**
+   * Each thread processes a token shard, calculating the index of each token
+   * after sorting by expert number. Given the example topk_ids =
+   * [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
+   * *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
+   * padding value(preset in python).
+   */
+  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
+    int32_t expert_id = topk_ids[i];
+    /** The cumsum[expert_id] stores the starting index of the tokens that the
+     * expert with expert_id needs to process, and
+     * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
+     * processed by the expert with expert_id within the current thread's token
+     * shard.
+     */
+    int32_t rank_post_pad =
+        tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
+        cumsum[expert_id];
+    sorted_token_ids[rank_post_pad] = i;
+    ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
+  }
+}
+
 template <typename scalar_t, int TOPK>
 __global__ void moe_sum_kernel(
     scalar_t* __restrict__ out,          // [..., d]
@@ -137,25 +223,61 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           torch::Tensor experts_ids,
                           torch::Tensor num_tokens_post_pad) {
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_INTEGRAL_TYPES(
-      topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
-        // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
-        // tensors
-        const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
-        const int32_t shared_mem =
-            ((num_thread + 1) * num_experts + (num_experts + 1)) *
-            sizeof(int32_t);
-
-        // set dynamic shared mem
-        auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
-        AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
-            (void*)kernel, shared_mem));
-        kernel<<<1, num_thread, shared_mem, stream>>>(
-            topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
-            experts_ids.data_ptr<int32_t>(),
-            num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-            topk_ids.numel());
-      });
+
+  // If we have very large number of experts, we can no longer use shared
+  // memory.
+  // TODO(simon): the right solution should be calculating the exact right
+  // amount of shared memory and use that. The num_experts >= 256 is just a
+  // temporary solution to unblock Deepseek V3.
+  if (num_experts >= 256) {
+    VLLM_DISPATCH_INTEGRAL_TYPES(
+        topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
+          const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
+
+          // The scratch buffers live in global memory. Allocate them as
+          // torch tensors instead of raw cudaMalloc/cudaFree so that failures
+          // are reported, no device-wide sync is forced, and the memory is
+          // released on every exit path. The kernel indexes row blockDim.x.
+          auto options_int = torch::TensorOptions()
+                                 .dtype(torch::kInt)
+                                 .device(topk_ids.device());
+          torch::Tensor token_cnts_buffer =
+              torch::empty({(num_thread + 1) * num_experts}, options_int);
+          torch::Tensor cumsum_buffer =
+              torch::empty({num_experts + 1}, options_int);
+
+          auto kernel =
+              vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
+          kernel<<<1, num_thread, 0, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              sorted_token_ids.data_ptr<int32_t>(),
+              experts_ids.data_ptr<int32_t>(),
+              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
+              topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(),
+              cumsum_buffer.data_ptr<int32_t>());
+        });
+  } else {
+    VLLM_DISPATCH_INTEGRAL_TYPES(
+        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
+          // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
+          // tensors
+          const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
+          const int32_t shared_mem =
+              ((num_thread + 1) * num_experts + (num_experts + 1)) *
+              sizeof(int32_t);
+
+          // set dynamic shared mem
+          auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
+          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+              (void*)kernel, shared_mem));
+          kernel<<<1, num_thread, shared_mem, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              sorted_token_ids.data_ptr<int32_t>(),
+              experts_ids.data_ptr<int32_t>(),
+              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
+              topk_ids.numel());
+        });
+  }
 }
 
 void moe_sum(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
diff --git a/csrc/ops.h b/csrc/ops.h
index 816b471d062d2..347c502845d8f 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -162,6 +162,17 @@ void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                            torch::Tensor const& azp_adj,
                            c10::optional<torch::Tensor> const& azp,
                            c10::optional<torch::Tensor> const& bias);
+
+bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability);
+
+void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
+                              torch::Tensor const& b, torch::Tensor const& e,
+                              torch::Tensor const& a_scales,
+                              torch::Tensor const& b_scales,
+                              c10::optional<torch::Tensor> const& bias);
+
+bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed,
+                                   torch::Tensor& e, torch::Tensor const& a);
 #endif
 
 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
diff --git a/csrc/quantization/cutlass_w8a8/common.hpp b/csrc/quantization/cutlass_w8a8/common.hpp
deleted file mode 100644
index bf04bb400790f..0000000000000
--- a/csrc/quantization/cutlass_w8a8/common.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include <climits>
-
-/**
- * Helper function for checking CUTLASS errors
- */
-#define CUTLASS_CHECK(status)                        \
-  {                                                  \
-    TORCH_CHECK(status == cutlass::Status::kSuccess, \
-                cutlassGetStatusString(status))      \
-  }
-
-inline uint32_t next_pow_2(uint32_t const num) {
-  if (num <= 1) return num;
-  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
-}
-
-inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
-  int max_shared_mem_per_block_opt_in = 0;
-  cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
-                        cudaDevAttrMaxSharedMemoryPerBlockOptin,
-                        device);
-  return max_shared_mem_per_block_opt_in;
-}
-
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh
index d03242f44ab1d..f2fae4b66d651 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh
@@ -21,15 +21,16 @@
 #include "cutlass/epilogue/threadblock/fusion/visitors.hpp"
 #include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h"
 
-#include "common.hpp"
+#include "core/math.hpp"
+#include "cutlass_extensions/common.hpp"
 // clang-format on
 
 using namespace cute;
 
 /*
-   Epilogue functions can be defined to post-process the output before it is
-   written to GPU memory.
-   Epilogues must contain a public type named EVTCompute of type Sm80EVT,
+   Epilogues defined in
+   csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
+   must contain a public type named EVTCompute of type Sm80EVT,
    as well as a static prepare_args function that constructs an
    EVTCompute::Arguments struct.
 */
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
index 33581a63d4c3d..123f4359c0d1a 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
@@ -1,384 +1,18 @@
-// clang-format will break include orders
-// clang-format off
 #include <cudaTypedefs.h>
 
 #if defined CUDA_VERSION && CUDA_VERSION >= 12000
 
-#include <torch/all.h>
+  #include "scaled_mm_c3x_sm90_fp8_dispatch.cuh"
+  #include "scaled_mm_c3x_sm90_int8_dispatch.cuh"
 
-#include <ATen/cuda/CUDAContext.h>
-
-#include <iostream>
-#include <sstream>
-#include <vector>
-
-#include "cutlass/cutlass.h"
-
-#include "cute/tensor.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/gemm/device/gemm_universal_adapter.h"
-#include "cutlass/gemm/kernel/gemm_universal.hpp"
-#include "cutlass/epilogue/collective/collective_builder.hpp"
-#include "cutlass/gemm/collective/collective_builder.hpp"
-
-#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
-#include "common.hpp"
-// clang-format on
-
-using namespace cute;
+  #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
 using namespace vllm;
 
 /*
    This file defines quantized GEMM operations using the CUTLASS 3.x API, for
    NVIDIA GPUs with sm90a (Hopper) or later.
-
-   Epilogue functions can be defined to post-process the output before it is
-   written to GPU memory.
-   Epilogues must contain a public type named EVTCompute of type Sm90EVT,
-   as well as a static prepare_args function that constructs an
-   EVTCompute::Arguments struct.
 */
 
-namespace {
-
-// A wrapper for the GEMM kernel that is used to guard against compilation on
-// architectures that will never use the kernel. The purpose of this is to
-// reduce the size of the compiled binary.
-// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
-// into code that will be executed on the device where it is defined.
-template <typename Kernel>
-struct enable_sm90_or_later : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE void operator()(Args&&... args) {
-  #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
-    Kernel::operator()(std::forward<Args>(args)...);
-  #endif
-  }
-};
-template <typename ElementAB_, typename ElementD_,
-          template <typename, typename, typename> typename Epilogue_,
-          typename TileShape, typename ClusterShape, typename KernelSchedule,
-          typename EpilogueSchedule>
-struct cutlass_3x_gemm {
-  using ElementAB = ElementAB_;
-  using ElementD = ElementD_;
-  using ElementAcc =
-      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
-                                float>::type;
-
-  using EpilogueDescriptor =
-      cutlass::epilogue::collective::detail::EpilogueDescriptor<
-          TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD,
-          ElementD, EpilogueSchedule>;
-
-  using Epilogue = Epilogue_<ElementAcc, ElementD, EpilogueDescriptor>;
-
-  using StrideD = Stride<int64_t, Int<1>, Int<0>>;
-  using ElementC = void;
-  using StrideC = StrideD;
-
-  using EVTCompute = typename Epilogue::EVTCompute;
-
-  using CollectiveEpilogue =
-      typename cutlass::epilogue::collective::CollectiveBuilder<
-          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
-          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
-          ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4,
-          EpilogueSchedule, EVTCompute>::CollectiveOp;
-
-  static constexpr size_t CEStorageSize =
-      sizeof(typename CollectiveEpilogue::SharedStorage);
-  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
-      static_cast<int>(CEStorageSize)>;
-
-  // clang-format off
-  using CollectiveMainloop =
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, 
-          ElementAB, cutlass::layout::RowMajor, 16, 
-          ElementAB, cutlass::layout::ColumnMajor, 16, 
-          ElementAcc, TileShape, ClusterShape,
-          Stages,
-          KernelSchedule>::CollectiveOp;
-  // clang-format on
-
-  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
-      cute::Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
-      cutlass::gemm::PersistentScheduler>>;
-
-  struct GemmKernel : public KernelType {};
-};
-
-template <typename Gemm, typename... EpilogueArgs>
-void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
-                         torch::Tensor const& b,
-                         EpilogueArgs&&... epilogue_params) {
-  using ElementAB = typename Gemm::ElementAB;
-  using ElementD = typename Gemm::ElementD;
-
-  int32_t m = a.size(0);
-  int32_t n = b.size(1);
-  int32_t k = a.size(1);
-
-  int64_t lda = a.stride(0);
-  int64_t ldb = b.stride(1);
-  int64_t ldc = out.stride(0);
-
-  using StrideA = Stride<int64_t, Int<1>, int64_t>;
-  using StrideB = Stride<int64_t, Int<1>, int64_t>;
-  using StrideC = typename Gemm::StrideC;
-
-  StrideA a_stride{lda, Int<1>{}, 0};
-  StrideB b_stride{ldb, Int<1>{}, 0};
-  StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
-
-  using GemmKernel = typename Gemm::GemmKernel;
-  typename GemmKernel::ProblemShape prob_shape{m, n, k, 1};
-
-  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
-  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
-  typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr,
-                                                       b_stride};
-
-  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
-  typename GemmKernel::EpilogueArguments epilogue_args{
-      Gemm::Epilogue::prepare_args(
-          std::forward<EpilogueArgs>(epilogue_params)...),
-      c_ptr, c_stride, c_ptr, c_stride};
-
-  typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
-                                      prob_shape, mainloop_args, epilogue_args};
-
-  // Launch the CUTLASS GEMM kernel.
-  using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
-  GemmOp gemm_op;
-  CUTLASS_CHECK(gemm_op.can_implement(args));
-
-  size_t workspace_size = gemm_op.get_workspace_size(args);
-  auto const workspace_options =
-      torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
-  auto workspace = torch::empty(workspace_size, workspace_options);
-
-  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
-
-  cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream);
-  CUTLASS_CHECK(status);
-}
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_default {
-  // M in (128, inf)
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_128, _128, _128>;
-  using ClusterShape = Shape<_2, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_M128 {
-  // M in (64, 128]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _128, _128>;
-  using ClusterShape = Shape<_2, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_M64 {
-  // M in [1, 64]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _64, _128>;
-  using ClusterShape = Shape<_1, _8, _1>;
-
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_int8_config_default {
-  // For M > 128 and any N
-  static_assert(std::is_same<InType, int8_t>());
-  using KernelSchedule =
-      typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_128, _128, _128>;
-  using ClusterShape = Shape<_2, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_int8_config_M128 {
-  // For M in (64, 128] and any N
-  static_assert(std::is_same<InType, int8_t>());
-  using KernelSchedule =
-      typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _128, _128>;
-  using ClusterShape = Shape<_2, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_int8_config_M64 {
-  // For M in (32, 64] and any N
-  static_assert(std::is_same<InType, int8_t>());
-  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _64, _256>;
-  using ClusterShape = Shape<_1, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_int8_config_M32_NBig {
-  // For M in [1, 32] and N >= 8192
-  static_assert(std::is_same<InType, int8_t>());
-  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _128, _256>;
-  using ClusterShape = Shape<_1, _4, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_int8_config_M32_NSmall {
-  // For M in [1, 32] and N < 8192
-  static_assert(std::is_same<InType, int8_t>());
-  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _64, _256>;
-  using ClusterShape = Shape<_1, _8, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-}  // namespace
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
-void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a,
-                                    torch::Tensor const& b,
-                                    EpilogueArgs&&... args) {
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
-  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
-
-  using Cutlass3xGemmDefault =
-      typename sm90_fp8_config_default<InType, OutType,
-                                       Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 =
-      typename sm90_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM128 =
-      typename sm90_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
-
-  uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
-
-  if (mp2 <= 64) {
-    // m in [1, 64]
-    return cutlass_gemm_caller<Cutlass3xGemmM64>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 128) {
-    // m in (64, 128]
-    return cutlass_gemm_caller<Cutlass3xGemmM128>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else {
-    // m in (128, inf)
-    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  }
-}
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
-void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a,
-                                     torch::Tensor const& b,
-                                     EpilogueArgs&&... args) {
-  static_assert(std::is_same<InType, int8_t>());
-  TORCH_CHECK(a.dtype() == torch::kInt8);
-  TORCH_CHECK(b.dtype() == torch::kInt8);
-
-  using Cutlass3xGemmDefault =
-      typename sm90_int8_config_default<InType, OutType,
-                                        Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM128 =
-      typename sm90_int8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 =
-      typename sm90_int8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM32NBig =
-      typename sm90_int8_config_M32_NBig<InType, OutType,
-                                         Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM32NSmall =
-      typename sm90_int8_config_M32_NSmall<InType, OutType,
-                                           Epilogue>::Cutlass3xGemm;
-
-  uint32_t const n = out.size(1);
-  bool const is_small_n = n < 8192;
-
-  uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(32), next_pow_2(m));  // next power of 2
-
-  if (mp2 <= 32) {
-    // m in [1, 32]
-    if (is_small_n) {
-      return cutlass_gemm_caller<Cutlass3xGemmM32NSmall>(
-          out, a, b, std::forward<EpilogueArgs>(args)...);
-    } else {
-      return cutlass_gemm_caller<Cutlass3xGemmM32NBig>(
-          out, a, b, std::forward<EpilogueArgs>(args)...);
-    }
-  } else if (mp2 <= 64) {
-    // m in (32, 64]
-    return cutlass_gemm_caller<Cutlass3xGemmM64>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 128) {
-    // m in (64, 128]
-    return cutlass_gemm_caller<Cutlass3xGemmM128>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else {
-    // m in (128, inf)
-    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  }
-}
-
 template <template <typename, typename, typename> typename Epilogue,
           typename... EpilogueArgs>
 void cutlass_scaled_mm_sm90_epilogue(torch::Tensor& out, torch::Tensor const& a,
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh
new file mode 100644
index 0000000000000..d4bc2f0ade50d
--- /dev/null
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh
@@ -0,0 +1,160 @@
+#pragma once
+
+// clang-format will break include orders
+// clang-format off
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+
+#include "core/math.hpp"
+#include "cutlass_extensions/common.hpp"
+// clang-format on
+
+/*
+  Epilogues defined in
+  csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
+  must contain a public type named EVTCompute of type Sm90EVT, as well as a
+  static prepare_args function that constructs an EVTCompute::Arguments struct.
+*/
+
+using namespace cute;
+
+namespace vllm {
+
+// Wraps a GEMM kernel so that its device code is only emitted for sm90 and
+// newer targets; on every other architecture operator() compiles to an empty
+// body. The purpose is to keep kernels that can never run out of the binary.
+// __CUDA_ARCH__ is not defined in host code, so the check is placed inside
+// the device-side operator(), where the macro is defined.
+template <typename Kernel>
+struct enable_sm90_or_later : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
+    Kernel::operator()(std::forward<Args>(args)...);
+#endif
+  }
+};
+
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule>
+struct cutlass_3x_gemm {
+  using ElementAB = ElementAB_;  // element type shared by operands A and B
+  using ElementD = ElementD_;    // element type the caller casts `out` to
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;  // int32_t for int8, else float
+
+  using EpilogueDescriptor =
+      cutlass::epilogue::collective::detail::EpilogueDescriptor<
+          TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD,
+          ElementD, EpilogueSchedule>;
+
+  using Epilogue = Epilogue_<ElementAcc, ElementD, EpilogueDescriptor>;
+
+  using StrideD = Stride<int64_t, Int<1>, Int<0>>;
+  using ElementC = void;
+  using StrideC = StrideD;  // C reuses D's stride type
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4,
+          EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+  static constexpr size_t CEStorageSize =
+      sizeof(typename CollectiveEpilogue::SharedStorage);
+  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
+      static_cast<int>(CEStorageSize)>;
+
+  // clang-format off
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, 
+          ElementAB, cutlass::layout::RowMajor, 16, 
+          ElementAB, cutlass::layout::ColumnMajor, 16, 
+          ElementAcc, TileShape, ClusterShape,
+          Stages,
+          KernelSchedule>::CollectiveOp;
+  // clang-format on
+
+  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
+      cute::Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
+      cutlass::gemm::PersistentScheduler>>;
+
+  struct GemmKernel : public KernelType {};
+};
+
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
+                         torch::Tensor const& b,
+                         EpilogueArgs&&... epilogue_params) {
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+
+  int32_t m = a.size(0);  // rows of a
+  int32_t n = b.size(1);  // columns of b
+  int32_t k = a.size(1);  // columns of a
+
+  int64_t lda = a.stride(0);    // stride between rows of a
+  int64_t ldb = b.stride(1);    // stride between columns of b
+  int64_t ldc = out.stride(0);  // stride between rows of out
+
+  using StrideA = Stride<int64_t, Int<1>, int64_t>;
+  using StrideB = Stride<int64_t, Int<1>, int64_t>;
+  using StrideC = typename Gemm::StrideC;
+
+  StrideA a_stride{lda, Int<1>{}, 0};
+  StrideB b_stride{ldb, Int<1>{}, 0};
+  StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
+
+  using GemmKernel = typename Gemm::GemmKernel;
+  typename GemmKernel::ProblemShape prob_shape{m, n, k, 1};
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr,
+                                                       b_stride};
+
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      Gemm::Epilogue::prepare_args(
+          std::forward<EpilogueArgs>(epilogue_params)...),
+      c_ptr, c_stride, c_ptr, c_stride};
+
+  typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
+                                      prob_shape, mainloop_args, epilogue_args};
+
+  // Validate the arguments, allocate the workspace, then run the kernel.
+  using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  GemmOp gemm_op;
+  CUTLASS_CHECK(gemm_op.can_implement(args));
+
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
+
+  cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream);
+  CUTLASS_CHECK(status);
+}
+
+}  // namespace vllm
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh
new file mode 100644
index 0000000000000..f08419b3122b2
--- /dev/null
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh
@@ -0,0 +1,96 @@
+#pragma once
+
+#include "scaled_mm_c3x.cuh"
+
+/**
+ * This file defines Gemm kernel configurations for SM90 (fp8) based on the Gemm
+ * shape.
+ */
+
+namespace vllm {
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_default {
+  // fp8 kernel configuration used when M is in (128, inf).
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_M128 {
+  // fp8 kernel configuration used when M is in (64, 128].
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+  using TileShape = Shape<_64, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_M64 {
+  // fp8 kernel configuration used when M is in [1, 64].
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+
+  using TileShape = Shape<_64, _64, _128>;
+  using ClusterShape = Shape<_1, _8, _1>;
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
+                                           torch::Tensor const& a,
+                                           torch::Tensor const& b,
+                                           EpilogueArgs&&... args) {
+  // Picks the fp8 kernel config from M = a.size(0) and launches the GEMM.
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+  using GemmSmallM =
+      typename sm90_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using GemmMediumM =
+      typename sm90_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using GemmLargeM =
+      typename sm90_fp8_config_default<InType, OutType,
+                                       Epilogue>::Cutlass3xGemm;
+
+  // Round M up to a power of two, clamped below at 64, to select a bucket.
+  uint32_t const rows = a.size(0);
+  uint32_t const bucket = std::max<uint32_t>(64, next_pow_2(rows));
+
+  if (bucket <= 64) {  // m in [1, 64]
+    return cutlass_gemm_caller<GemmSmallM>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+  if (bucket <= 128) {  // m in (64, 128]
+    return cutlass_gemm_caller<GemmMediumM>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+
+  // m in (128, inf)
+  return cutlass_gemm_caller<GemmLargeM>(
+      out, a, b, std::forward<EpilogueArgs>(args)...);
+}
+
+}  // namespace vllm
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_int8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_int8_dispatch.cuh
new file mode 100644
index 0000000000000..34e5fd90ba26a
--- /dev/null
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_int8_dispatch.cuh
@@ -0,0 +1,140 @@
+#pragma once
+
+#include "scaled_mm_c3x.cuh"
+
+/**
+ * This file defines Gemm kernel configurations for SM90 (int8) based on the
+ * Gemm shape.
+ */
+
+namespace vllm {
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_default {
+  // For M > 128 and any N
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule =
+      typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M128 {
+  // For M in (64, 128] and any N
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule =
+      typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M64 {
+  // For M in (32, 64] and any N
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M32_NBig {
+  // For M in [1, 32] and N >= 8192
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _128, _256>;
+  using ClusterShape = Shape<_1, _4, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M32_NSmall {
+  // For M in [1, 32] and N < 8192
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _8, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out,
+                                            torch::Tensor const& a,
+                                            torch::Tensor const& b,
+                                            EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, int8_t>());
+  TORCH_CHECK(a.dtype() == torch::kInt8);
+  TORCH_CHECK(b.dtype() == torch::kInt8);
+
+  using Cutlass3xGemmDefault =
+      typename sm90_int8_config_default<InType, OutType,
+                                        Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM128 =
+      typename sm90_int8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM64 =
+      typename sm90_int8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM32NBig =
+      typename sm90_int8_config_M32_NBig<InType, OutType,
+                                         Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM32NSmall =
+      typename sm90_int8_config_M32_NSmall<InType, OutType,
+                                           Epilogue>::Cutlass3xGemm;
+
+  uint32_t const n = out.size(1);
+  bool const is_small_n = n < 8192;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(32), next_pow_2(m));  // next power of 2
+
+  if (mp2 <= 32) {
+    // m in [1, 32]
+    if (is_small_n) {
+      return cutlass_gemm_caller<Cutlass3xGemmM32NSmall>(
+          out, a, b, std::forward<EpilogueArgs>(args)...);
+    } else {
+      return cutlass_gemm_caller<Cutlass3xGemmM32NBig>(
+          out, a, b, std::forward<EpilogueArgs>(args)...);
+    }
+  } else if (mp2 <= 64) {
+    // m in (32, 64]
+    return cutlass_gemm_caller<Cutlass3xGemmM64>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // m in (64, 128]
+    return cutlass_gemm_caller<Cutlass3xGemmM128>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // m in (128, inf)
+    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
+}  // namespace vllm
\ No newline at end of file
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index 97a969cf5e3e0..4f7b6588ef3f7 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -3,6 +3,8 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/all.h>
 
+#include "cutlass_extensions/common.hpp"
+
 void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
@@ -79,16 +81,6 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
   return false;
 }
 
-int32_t get_sm_version_num() {
-  int32_t major_capability, minor_capability;
-  cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
-                         0);
-  cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
-                         0);
-  int32_t version_num = major_capability * 10 + minor_capability;
-  return version_num;
-}
-
 void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
                        torch::Tensor const& b, torch::Tensor const& a_scales,
                        torch::Tensor const& b_scales,
diff --git a/csrc/sparse/cutlass/sparse_compressor_c3x.cu b/csrc/sparse/cutlass/sparse_compressor_c3x.cu
new file mode 100644
index 0000000000000..bd53695503241
--- /dev/null
+++ b/csrc/sparse/cutlass/sparse_compressor_c3x.cu
@@ -0,0 +1,165 @@
+// clang-format will break include orders
+// clang-format off
+#include <cudaTypedefs.h>
+
+#if defined CUDA_VERSION && CUDA_VERSION >= 12020
+#include "sparse_scaled_mm_c3x.cuh"
+
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/transform/device/transform_universal_adapter.hpp"
+#include "cutlass/transform/kernel/sparse_gemm_compressor.hpp"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+// clang-format on
+
+using namespace cute;
+using namespace vllm;
+
+/// Make A structured sparse by replacing elements with 0 and compress it
+template <typename ElementA_, typename ElementAcc_>
+bool cutlass_sparse_compress(torch::Tensor& a_nzs, torch::Tensor& a_meta,
+                             torch::Tensor const& a) {
+  // Validate dtype and rank of the dense input `a`
+  TORCH_CHECK(a.dtype() == torch::kInt8 || a.dtype() == torch::kFloat8_e4m3fn ||
+              a.dtype() == torch::kFloat16 || a.dtype() == torch::kBFloat16);
+  TORCH_CHECK(a.dim() == 2);
+  // Check for strides and alignment
+  TORCH_CHECK(a.stride(0) % 4 == 0);  // Required for semi-structured sparsity
+  TORCH_CHECK(a.stride(1) == 1);
+
+  int m = a.size(0);
+  int k = a.size(1);
+
+  // Sparse kernel setup; this kernel is not used for matmul,
+  // but just for setting up the compressor utility
+  // A matrix configuration
+  using ElementA = ElementA_;
+  using LayoutTagA = cutlass::layout::RowMajor;
+  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
+  // B matrix configuration
+  using ElementB = ElementA;
+  using LayoutTagB = cutlass::layout::ColumnMajor;
+  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
+  // C/D matrix configuration
+  using ElementC = float;
+  using LayoutTagC = cutlass::layout::ColumnMajor;
+  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+  // Core kernel configurations
+  using ElementAccumulator = ElementAcc_;
+  using TileShape = Shape<_128, _128, _128>;
+  using TileShapeRef = Shape<_128, _128, _64>;
+  using ClusterShape = Shape<_1, _2, _1>;
+  using KernelSchedule = typename std::conditional<
+      std::is_same_v<ElementA, cutlass::float_e4m3_t>,
+      cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum,
+      cutlass::gemm::KernelTmaWarpSpecialized>::type;
+
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using ProblemShape = Shape<int, int, int, int>;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAccumulator, ElementAccumulator, ElementC, LayoutTagC,
+          AlignmentC, ElementC, LayoutTagC, AlignmentC,
+          EpilogueSchedule>::CollectiveOp;
+
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, ElementA,
+          LayoutTagA, AlignmentA, ElementB, LayoutTagB, AlignmentB,
+          ElementAccumulator, TileShape, ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          KernelSchedule>::CollectiveOp;
+
+  using GemmKernel =
+      cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
+                                           CollectiveEpilogue>;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  using StrideA = cutlass::gemm::TagToStrideA_t<LayoutTagA>;
+  using StrideE = StrideA;
+
+  using StrideA = Stride<int64_t, Int<1>, int64_t>;
+
+  // The n (=1) dimension does not matter for the compressor
+  typename GemmKernel::ProblemShape prob_shape{m, 1, k, 1};
+
+  using LayoutA = typename GemmKernel::CollectiveMainloop::LayoutA;
+  using LayoutE = typename GemmKernel::CollectiveMainloop::LayoutE;
+
+  using ElementE = typename GemmKernel::CollectiveMainloop::ElementE;
+  using SparseConfig = typename GemmKernel::CollectiveMainloop::SparseConfig;
+
+  // Offline compressor kernel
+  using CompressorUtility =
+      cutlass::transform::kernel::StructuredSparseCompressorUtility<
+          ProblemShape, ElementA, LayoutTagA, SparseConfig>;
+
+  using CompressorKernel =
+      cutlass::transform::kernel::StructuredSparseCompressor<
+          ProblemShape, ElementA, LayoutTagA, SparseConfig,
+          cutlass::arch::Sm90>;
+
+  using Compressor =
+      cutlass::transform::device::TransformUniversalAdapter<CompressorKernel>;
+
+  auto [M, N, K, L] = prob_shape;
+
+  StrideA stride_A;
+  stride_A =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
+
+  CompressorUtility compressor_utility(prob_shape, stride_A);
+
+  int ME = compressor_utility.get_metadata_m_physical();
+  int KE = compressor_utility.get_metadata_k_physical();
+  int KC = compressor_utility.get_tensorA_k_physical();
+
+  auto a_ptr = static_cast<ElementA*>(a.data_ptr());
+
+  auto a_nzs_ptr = static_cast<ElementA*>(a_nzs.data_ptr());
+  auto a_meta_ptr = static_cast<typename Gemm::CollectiveMainloop::ElementE*>(
+      a_meta.data_ptr());
+
+  cutlass::KernelHardwareInfo hw_info;
+  hw_info.device_id = a.get_device();  // device that holds `a`, not device 0
+  hw_info.sm_count =
+      cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+          hw_info.device_id);
+  typename Compressor::Arguments arguments{
+      prob_shape, {a_ptr, stride_A, a_nzs_ptr, a_meta_ptr}, {hw_info}};
+
+  Compressor compressor_op;
+  size_t workspace_size = Compressor::get_workspace_size(arguments);
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+  CUTLASS_CHECK(compressor_op.can_implement(arguments));
+  CUTLASS_CHECK(compressor_op.initialize(arguments, workspace.get()));
+  CUTLASS_CHECK(compressor_op.run());
+  CUDA_CHECK(cudaDeviceSynchronize());
+
+  return true;
+}
+
+bool cutlass_sparse_compress_sm90(torch::Tensor& a_nzs, torch::Tensor& a_meta,
+                                  torch::Tensor const& a) {
+  if (a.dtype() == torch::kBFloat16) {
+    return cutlass_sparse_compress<cutlass::bfloat16_t, float>(a_nzs, a_meta,
+                                                               a);
+  } else if (a.dtype() == torch::kFloat16) {
+    return cutlass_sparse_compress<cutlass::half_t, float>(a_nzs, a_meta, a);
+  } else if (a.dtype() == torch::kFloat8_e4m3fn) {
+    return cutlass_sparse_compress<cutlass::float_e4m3_t, float>(a_nzs, a_meta,
+                                                                 a);
+  } else if (a.dtype() == torch::kInt8) {
+    return cutlass_sparse_compress<int8_t, int32_t>(a_nzs, a_meta, a);
+  }
+  return false;
+}
+#endif
diff --git a/csrc/sparse/cutlass/sparse_compressor_entry.cu b/csrc/sparse/cutlass/sparse_compressor_entry.cu
new file mode 100644
index 0000000000000..3401761c1b703
--- /dev/null
+++ b/csrc/sparse/cutlass/sparse_compressor_entry.cu
@@ -0,0 +1,42 @@
+#include <cudaTypedefs.h>
+
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+#include "cutlass_extensions/common.hpp"
+
+#if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X
+bool cutlass_sparse_compress_sm90(torch::Tensor& a_nzs, torch::Tensor& a_meta,
+                                  torch::Tensor const& a);
+#endif
+
+bool cutlass_sparse_compress_entry(torch::Tensor& a_nzs, torch::Tensor& a_meta,
+                                   torch::Tensor const& a) {
+  // Shape checks: all 2-D; a_nzs has k/2 columns and a_meta has k/8 columns
+  TORCH_CHECK(a.dim() == 2 && a_meta.dim() == 2 && a_nzs.dim() == 2);
+  TORCH_CHECK(a.size(0) == a_nzs.size(0) && a.size(0) == a_meta.size(0) &&
+              a_nzs.size(1) * 2 == a.size(1) &&
+              a_meta.size(1) * 2 * 4 == a.size(1));
+  // Considering elemsPerMetaElem = 8b / 2b_per_nz = 4
+
+  // Check for strides and alignment
+  TORCH_CHECK(a.stride(1) == 1 && a_nzs.stride(1) == 1 &&
+              a_meta.stride(1) == 1);  // Row-major
+  TORCH_CHECK(a.stride(0) % 8 == 0);   // 8 Byte Alignment for Compression
+
+  at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
+  int32_t version_num = get_sm_version_num();
+
+  // Guard against compilation issues for sm90 kernels
+#if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X
+  if (version_num >= 90) {
+    return cutlass_sparse_compress_sm90(a_nzs, a_meta, a);
+  }
+#endif
+
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false,
+      "No compiled cutlass_sparse_compress for a compute capability less than "
+      "CUDA device capability: ",
+      version_num);
+}
diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
new file mode 100644
index 0000000000000..6223dc8cca704
--- /dev/null
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
@@ -0,0 +1,303 @@
+// clang-format will break include orders
+// clang-format off
+#include <cudaTypedefs.h>
+
+#if defined CUDA_VERSION && CUDA_VERSION >= 12020
+#include "sparse_scaled_mm_c3x.cuh"
+// clang-format on
+
+using namespace cute;
+using namespace vllm;
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a,
+                                    torch::Tensor const& bt_nzs,
+                                    torch::Tensor const& bt_meta,
+                                    EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(bt_meta.dtype() == torch::kUInt8);
+  TORCH_CHECK(bt_nzs.dtype() == torch::kFloat8_e4m3fn);
+
+  using Cutlass3xGemmDefault =
+      typename sm90_config_default<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM64 =
+      typename sm90_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM128 =
+      typename sm90_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM256 =
+      typename sm90_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM512 =
+      typename sm90_fp8_config_M512<InType, OutType, Epilogue>::Cutlass3xGemm;
+
+  using Cutlass3xGemm1 =
+      typename sm90_fp8_config_1<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemm2 =
+      typename sm90_fp8_config_2<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemm3 =
+      typename sm90_fp8_config_3<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemm4 =
+      typename sm90_fp8_config_4<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemm5 =
+      typename sm90_fp8_config_5<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemm6 =
+      typename sm90_fp8_config_6<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemm7 =
+      typename sm90_fp8_config_7<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemm8 =
+      typename sm90_fp8_config_8<InType, OutType, Epilogue>::Cutlass3xGemm;
+
+  uint32_t const n = bt_nzs.size(0);
+  uint32_t const m = a.size(0);  // Batch size
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
+
+  if (mp2 <= 64) {
+    if (n == 28672) {
+      return cutlass_sparse_gemm_caller<Cutlass3xGemm2>(
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    } else if (n == 4096 || n == 6144) {
+      return cutlass_sparse_gemm_caller<Cutlass3xGemm1>(
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    }
+  } else if (mp2 <= 128) {
+    if (n == 4096) {
+      return cutlass_sparse_gemm_caller<Cutlass3xGemm3>(
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    } else if (n == 28672) {
+      return cutlass_sparse_gemm_caller<Cutlass3xGemm5>(
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    } else if (n == 6144) {
+      return cutlass_sparse_gemm_caller<Cutlass3xGemm4>(
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    }
+  } else if (mp2 <= 256) {
+    if (n == 4096) {
+      return cutlass_sparse_gemm_caller<Cutlass3xGemm6>(
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    } else if (n == 28672) {
+      return cutlass_sparse_gemm_caller<Cutlass3xGemm8>(
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    } else if (n == 6144) {
+      return cutlass_sparse_gemm_caller<Cutlass3xGemm7>(
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    }
+  } else {
+    if (n == 6144 || n == 28672) {
+      return cutlass_sparse_gemm_caller<Cutlass3xGemm8>(
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    } else if (n == 4096) {
+      return cutlass_sparse_gemm_caller<Cutlass3xGemm7>(
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    }
+  }
+
+  // No config matched this (m, n) above: fall back to the m-only heuristic
+  if (mp2 <= 64) {
+    // m in [1, 64]
+    return cutlass_sparse_gemm_caller<Cutlass3xGemmM64>(
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // m in (64, 128]
+    return cutlass_sparse_gemm_caller<Cutlass3xGemmM128>(
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 256) {
+    // m in (128, 256]
+    return cutlass_sparse_gemm_caller<Cutlass3xGemmM256>(
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // m in (256, inf)
+    return cutlass_sparse_gemm_caller<Cutlass3xGemmM512>(
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_gemm_sm90_fp16_dispatch(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& bt_nzs,
+                                     torch::Tensor const& bt_meta,
+                                     EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, cutlass::half_t>());
+  TORCH_CHECK(a.dtype() == torch::kFloat16);
+  TORCH_CHECK(bt_meta.dtype() == torch::kUInt8);
+  TORCH_CHECK(bt_nzs.dtype() == torch::kFloat16);
+
+  using Cutlass3xGemmDefault =
+      typename sm90_config_default<InType, OutType, Epilogue>::Cutlass3xGemm;
+
+  // Default config is used for every m (no shape-based dispatch here)
+  return cutlass_sparse_gemm_caller<Cutlass3xGemmDefault>(
+      out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+}
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_gemm_sm90_bf16_dispatch(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& bt_nzs,
+                                     torch::Tensor const& bt_meta,
+                                     EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, cutlass::bfloat16_t>());
+  TORCH_CHECK(a.dtype() == torch::kBFloat16);
+  TORCH_CHECK(bt_meta.dtype() == torch::kUInt8);
+  TORCH_CHECK(bt_nzs.dtype() == torch::kBFloat16);
+
+  using Cutlass3xGemmDefault =
+      typename sm90_config_default<InType, OutType, Epilogue>::Cutlass3xGemm;
+
+  // Default config is used for every m (no shape-based dispatch here)
+  return cutlass_sparse_gemm_caller<Cutlass3xGemmDefault>(
+      out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+}
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& bt_nzs,
+                                     torch::Tensor const& bt_meta,
+                                     EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, int8_t>());
+  TORCH_CHECK(a.dtype() == torch::kInt8);
+  TORCH_CHECK(bt_meta.dtype() == torch::kUInt8);
+  TORCH_CHECK(bt_nzs.dtype() == torch::kInt8);
+
+  using Cutlass3xGemmDefault =
+      typename sm90_config_default<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM128 =
+      typename sm90_int8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM64 =
+      typename sm90_int8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM32NBig =
+      typename sm90_int8_config_M32_NBig<InType, OutType,
+                                         Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM32NSmall =
+      typename sm90_int8_config_M32_NSmall<InType, OutType,
+                                           Epilogue>::Cutlass3xGemm;
+
+  uint32_t const n = out.size(1);
+  bool const is_small_n = n < 8192;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(32), next_pow_2(m));  // next power of 2
+
+  if (mp2 <= 32) {
+    // m in [1, 32]
+    if (is_small_n) {
+      return cutlass_sparse_gemm_caller<Cutlass3xGemmM32NSmall>(
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    } else {
+      return cutlass_sparse_gemm_caller<Cutlass3xGemmM32NBig>(
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    }
+  } else if (mp2 <= 64) {
+    // m in (32, 64]
+    return cutlass_sparse_gemm_caller<Cutlass3xGemmM64>(
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // m in (64, 128]
+    return cutlass_sparse_gemm_caller<Cutlass3xGemmM128>(
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // m in (128, inf)
+    return cutlass_sparse_gemm_caller<Cutlass3xGemmDefault>(
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
+template <template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_sparse_mm_sm90_epilogue(torch::Tensor& out,
+                                            torch::Tensor const& a,
+                                            torch::Tensor const& bt_nzs,
+                                            torch::Tensor const& bt_meta,
+                                            EpilogueArgs&&... epilogue_args) {
+  TORCH_CHECK(bt_meta.dtype() == torch::kUInt8);
+  if (a.dtype() == torch::kInt8) {
+    TORCH_CHECK(bt_nzs.dtype() == torch::kInt8);
+
+    if (out.dtype() == torch::kBFloat16) {
+      return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::bfloat16_t,
+                                             Epilogue>(
+          out, a, bt_nzs, bt_meta,
+          std::forward<EpilogueArgs>(epilogue_args)...);
+    } else {
+      TORCH_CHECK(out.dtype() == torch::kFloat16);
+      return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::half_t, Epilogue>(
+          out, a, bt_nzs, bt_meta,
+          std::forward<EpilogueArgs>(epilogue_args)...);
+    }
+  } else if (a.dtype() == torch::kFloat8_e4m3fn) {
+    TORCH_CHECK(bt_nzs.dtype() == torch::kFloat8_e4m3fn);
+
+    if (out.dtype() == torch::kBFloat16) {
+      return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
+                                            cutlass::bfloat16_t, Epilogue>(
+          out, a, bt_nzs, bt_meta,
+          std::forward<EpilogueArgs>(epilogue_args)...);
+    } else {
+      TORCH_CHECK(out.dtype() == torch::kFloat16);
+      return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
+                                            cutlass::half_t, Epilogue>(
+          out, a, bt_nzs, bt_meta,
+          std::forward<EpilogueArgs>(epilogue_args)...);
+    }
+  } else if (a.dtype() == torch::kFloat16) {
+    TORCH_CHECK(bt_nzs.dtype() == torch::kFloat16);
+
+    if (out.dtype() == torch::kBFloat16) {
+      return cutlass_gemm_sm90_fp16_dispatch<cutlass::half_t,
+                                             cutlass::bfloat16_t, Epilogue>(
+          out, a, bt_nzs, bt_meta,
+          std::forward<EpilogueArgs>(epilogue_args)...);
+    } else {
+      TORCH_CHECK(out.dtype() == torch::kFloat16);
+      return cutlass_gemm_sm90_fp16_dispatch<cutlass::half_t, cutlass::half_t,
+                                             Epilogue>(
+          out, a, bt_nzs, bt_meta,
+          std::forward<EpilogueArgs>(epilogue_args)...);
+    }
+  } else {  // a.dtype() == torch::kBFloat16
+    TORCH_CHECK(a.dtype() == torch::kBFloat16);
+    TORCH_CHECK(bt_nzs.dtype() == torch::kBFloat16);
+
+    if (out.dtype() == torch::kBFloat16) {
+      return cutlass_gemm_sm90_bf16_dispatch<cutlass::bfloat16_t,
+                                             cutlass::bfloat16_t, Epilogue>(
+          out, a, bt_nzs, bt_meta,
+          std::forward<EpilogueArgs>(epilogue_args)...);
+    } else {
+      TORCH_CHECK(out.dtype() == torch::kFloat16);
+      return cutlass_gemm_sm90_bf16_dispatch<cutlass::bfloat16_t,
+                                             cutlass::half_t, Epilogue>(
+          out, a, bt_nzs, bt_meta,
+          std::forward<EpilogueArgs>(epilogue_args)...);
+    }
+  }
+}
+
+void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
+                                   torch::Tensor const& bt_nzs,
+                                   torch::Tensor const& bt_meta,
+                                   torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales,
+                                   c10::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == out.dtype(),
+                "currently bias dtype must match output dtype ", out.dtype());
+    return cutlass_scaled_sparse_mm_sm90_epilogue<c3x::ScaledEpilogueBias>(
+        out, a, bt_nzs, bt_meta, b_scales, a_scales, *bias);
+  } else {
+    return cutlass_scaled_sparse_mm_sm90_epilogue<c3x::ScaledEpilogue>(
+        out, a, bt_nzs, bt_meta, b_scales, a_scales);
+  }
+}
+
+#endif
diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
new file mode 100644
index 0000000000000..10178b53f4af0
--- /dev/null
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
@@ -0,0 +1,496 @@
+// clang-format will break include orders
+// clang-format off
+#include <cudaTypedefs.h>
+
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+
+#include "core/math.hpp"
+#include "cutlass_extensions/cute_utils.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+#include "cutlass_extensions/common.hpp"
+#include "cutlass_extensions/torch_utils.hpp"
+// clang-format on
+
+using namespace cute;
+
+/*
+   This file defines sparse quantized GEMM operations using the CUTLASS 3.x API,
+   for NVIDIA GPUs with sm90a (Hopper) or later.
+*/
+
+namespace {
+
+// Thin shim around a GEMM kernel whose device-side call operator compiles to a
+// no-op unless the target architecture is sm90 or newer. This keeps the kernel
+// body out of binaries built for architectures that will never run it.
+// The guard has to sit inside the device function: __CUDA_ARCH__ is not
+// defined in host code, only in code compiled for the device.
+template <typename Kernel>
+struct enable_sm90_or_later : Kernel {
+  template <typename... KernelArgs>
+  CUTLASS_DEVICE void operator()(KernelArgs&&... kernel_args) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    Kernel::operator()(std::forward<KernelArgs>(kernel_args)...);
+#endif
+  }
+};
+
+using GemmUniversalMode = cutlass::gemm::GemmUniversalMode;
+
+// Type bundle describing one CUTLASS 3.x sparse GEMM configuration: it derives
+// the collective epilogue, the collective (sparse) mainloop and the final
+// kernel type from the element types, shapes and schedules it is given.
+//
+// Template parameters:
+//   ElementAB_       element type shared by both input operands
+//   ElementD_        element type of the output
+//   Epilogue_        epilogue template, instantiated here with
+//                    <accumulator type, output type, EpilogueDescriptor>
+//   TileShape        tile shape handed to both collective builders
+//   ClusterShape     cluster shape handed to both collective builders
+//   KernelSchedule   mainloop schedule tag
+//   EpilogueSchedule epilogue schedule tag
+//   AccType          accumulator element type
+//   TileSchedule     tile scheduler tag (defaults to PersistentScheduler)
+//   Mode_            GEMM mode, re-exported as Mode (defaults to kGemm)
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule, typename AccType,
+          typename TileSchedule = cutlass::gemm::PersistentScheduler,
+          GemmUniversalMode Mode_ = GemmUniversalMode::kGemm>
+struct cutlass_sparse_3x_gemm {
+  // Re-export the template arguments so callers can query them from the type.
+  static const GemmUniversalMode Mode = Mode_;
+  using ElementAB = ElementAB_;
+  using ElementD = ElementD_;
+  using ElementAcc = AccType;
+
+  using EpilogueDescriptor =
+      cutlass::epilogue::collective::detail::EpilogueDescriptor<
+          TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD,
+          ElementD, EpilogueSchedule>;
+
+  using Epilogue = Epilogue_<ElementAcc, ElementD, EpilogueDescriptor>;
+
+  // ElementC is void: no element type is supplied for a source C operand.
+  using ElementC = void;
+  using LayoutC = cutlass::layout::RowMajor;
+  using LayoutD = LayoutC;
+  using StrideC = cutlass::detail::TagToStrideA_t<LayoutC>;
+  using StrideD = cutlass::detail::TagToStrideA_t<LayoutD>;
+
+  // The epilogue builder below is given the transposed C/D layouts rather
+  // than the row-major ones declared above.
+  using LayoutC_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutC>::type;
+  using LayoutD_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  // Alignments are counted in elements: 128 bits divided by the element width.
+  static constexpr int AlignmentA =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+  static constexpr int AlignmentB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+  static constexpr int AlignmentCD =
+      128 / cutlass::sizeof_bits<ElementD>::value;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAcc, ElementAcc, ElementC, LayoutC_Transpose, AlignmentCD,
+          ElementD, LayoutD_Transpose, AlignmentCD, EpilogueSchedule,
+          EVTCompute>::CollectiveOp;
+
+  // The mainloop stage count is chosen automatically, with the size of the
+  // epilogue's shared storage passed in as the carve-out.
+  static constexpr size_t CEStorageSize =
+      sizeof(typename CollectiveEpilogue::SharedStorage);
+  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
+      static_cast<int>(CEStorageSize)>;
+
+  // Sparse tensor-op mainloop: first operand row-major, second operand
+  // column-major, both of element type ElementAB.
+  // clang-format off
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, 
+          ElementAB, cutlass::layout::RowMajor, AlignmentA, 
+          ElementAB, cutlass::layout::ColumnMajor, AlignmentB, 
+          ElementAcc, TileShape, ClusterShape,
+          Stages,
+          KernelSchedule>::CollectiveOp;
+  // clang-format on
+
+  // The kernel is wrapped in enable_sm90_or_later so its device-side body is
+  // only compiled for sm90 and newer targets.
+  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
+      cute::Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
+      TileSchedule>>;
+
+  // Named struct deriving from KernelType.
+  // NOTE(review): presumably this exists to give the kernel a short, distinct
+  // type name -- confirm.
+  struct GemmKernel : public KernelType {};
+};
+
+// Builds the CUTLASS argument structures for Gemm::GemmKernel from the given
+// torch tensors and launches the kernel on the current CUDA stream of a's
+// device.
+//
+//   out             output tensor (written through its data pointer)
+//   a               dense operand
+//   bt_nzs          compressed (non-zero) values of the sparse operand
+//   bt_meta         sparsity metadata of the sparse operand
+//   epilogue_params forwarded unchanged to Gemm::Epilogue::prepare_args
+//
+// Fails through CUTLASS_CHECK if the kernel reports that it cannot implement
+// the problem, or if the launch returns a non-success status.
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_sparse_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
+                                torch::Tensor const& bt_nzs,
+                                torch::Tensor const& bt_meta,
+                                EpilogueArgs&&... epilogue_params) {
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+
+  // Interface stride expected from the argument a (will get transposed)
+  // We compute C^T = B^T * A^T, but we assume B is transposed before
+  // compression and hence the bt_* naming
+  using LayoutA = cutlass::layout::RowMajor;
+  // The sparse operand takes the mainloop's "A" slot, so its layout (and the
+  // metadata layout) are the ones the collective mainloop defines.
+  using LayoutB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutA;
+  using LayoutE = typename Gemm::GemmKernel::CollectiveMainloop::LayoutE;
+  using LayoutD = cutlass::layout::RowMajor;
+
+  using StrideA = cutlass::detail::TagToStrideA_t<LayoutA>;
+  using StrideD = cutlass::detail::TagToStrideA_t<LayoutD>;
+
+  auto layout_A = make_cute_layout<StrideA>(a, "A");
+  auto layout_D = make_cute_layout<StrideD>(out, "D");
+
+  // Transpose A and D
+  // A doesn't need to be transposed since cutlass expects a NxK matrix
+  // for B (which is At)
+  auto stride_At = layout_A.stride();
+  auto stride_Dt = permute_layout<1, 0, 2>(layout_D).stride();
+
+  // Problem shape: {rows of bt_nzs, rows of a, columns of a, 1}.
+  using GemmKernel = typename Gemm::GemmKernel;
+  typename GemmKernel::ProblemShape prob_shape{
+      static_cast<int>(bt_nzs.size(0)), static_cast<int>(size<0>(layout_A)),
+      static_cast<int>(size<1>(layout_A)), 1};
+
+  using ElementE = typename GemmKernel::CollectiveMainloop::ElementE;
+  using SparseConfig = typename GemmKernel::CollectiveMainloop::SparseConfig;
+
+  // Layouts of the compressed values and of the metadata are derived from the
+  // problem shape by the mainloop's sparse configuration.
+  LayoutB b_layout = SparseConfig::fill_layoutA(prob_shape);
+  LayoutE e_layout = SparseConfig::fill_layoutE(prob_shape);
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(bt_nzs.data_ptr());
+  auto e_ptr = static_cast<ElementE*>(bt_meta.data_ptr());
+  // Argument order: compressed values + layout first, then the dense operand
+  // + stride, then the metadata + layout.
+  typename GemmKernel::MainloopArguments mainloop_args{
+      b_ptr, b_layout, a_ptr, stride_At, e_ptr, e_layout};
+
+  // The output pointer and transposed stride are passed twice.
+  // NOTE(review): presumably these fill the C and D slots of the epilogue
+  // arguments -- confirm against the CUTLASS epilogue argument struct.
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      Gemm::Epilogue::prepare_args(
+          std::forward<EpilogueArgs>(epilogue_params)...),
+      c_ptr, stride_Dt, c_ptr, stride_Dt};
+
+  typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
+                                      prob_shape, mainloop_args, epilogue_args};
+
+  // Launch the CUTLASS GEMM kernel.
+  using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  GemmOp gemm_op;
+  CUTLASS_CHECK(gemm_op.can_implement(args));
+
+  // Scratch workspace: a uint8 tensor of the requested size on a's device.
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
+
+  cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream);
+  CUTLASS_CHECK(status);
+}
+
+// Primary template of the per-input-type default configuration. It is
+// intentionally empty: the usable configurations are the partial
+// specializations on InType defined in this file.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_config_default {};
+
+// Default configuration for half_t inputs: 128x128x128 tile, 2x1x1 cluster,
+// pingpong mainloop schedule, float accumulation.
+// M in (128, inf)
+template <typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_config_default<half_t, OutType, Epilogue> {
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<half_t, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float>;
+};
+
+// Default configuration for bfloat16 inputs: 128x128x128 tile, 2x1x1 cluster,
+// pingpong mainloop schedule, float accumulation.
+// M in (128, inf)
+template <typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_config_default<cutlass::bfloat16_t, OutType, Epilogue> {
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<cutlass::bfloat16_t, OutType, Epilogue, TileShape,
+                             ClusterShape, KernelSchedule, EpilogueSchedule,
+                             float>;
+};
+
+//////////////////////// Cherry-Picking Kernels ////////////////////////
+// FP8 configuration 1: 64x64x256 tile, 8x1x1 cluster, warp-specialized
+// fast-accum mainloop, warp-specialized epilogue. Only valid for e4m3 inputs.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_1 {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_8, _1, _1>;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float>;
+};
+
+// FP8 configuration 2: 128x64x256 tile, 8x1x1 cluster, cooperative fast-accum
+// mainloop with the cooperative epilogue. Only valid for e4m3 inputs.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_2 {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+  using TileShape = Shape<_128, _64, _256>;
+  using ClusterShape = Shape<_8, _1, _1>;
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float>;
+};
+
+// FP8 configuration 3: 64x64x256 tile, 1x2x1 cluster, warp-specialized
+// fast-accum mainloop, warp-specialized epilogue. Only valid for e4m3 inputs.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_3 {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _2, _1>;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float>;
+};
+
+// FP8 configuration 4: 64x128x256 tile, 8x1x1 cluster, warp-specialized
+// fast-accum mainloop paired with the cooperative epilogue. Only valid for
+// e4m3 inputs.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_4 {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+  using TileShape = Shape<_64, _128, _256>;
+  using ClusterShape = Shape<_8, _1, _1>;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float>;
+};
+
+// FP8 configuration 5: 128x128x256 tile, 8x1x1 cluster, pingpong fast-accum
+// mainloop, warp-specialized epilogue. Only valid for e4m3 inputs.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_5 {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+  using TileShape = Shape<_128, _128, _256>;
+  using ClusterShape = Shape<_8, _1, _1>;
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float>;
+};
+
+// FP8 configuration 6: 64x128x256 tile, 1x2x1 cluster, warp-specialized
+// fast-accum mainloop, warp-specialized epilogue. Only valid for e4m3 inputs.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_6 {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+  using TileShape = Shape<_64, _128, _256>;
+  using ClusterShape = Shape<_1, _2, _1>;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float>;
+};
+
+// FP8 configuration 7: 128x128x256 tile, 1x1x1 cluster, cooperative fast-accum
+// mainloop with the cooperative epilogue. Only valid for e4m3 inputs.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_7 {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+  using TileShape = Shape<_128, _128, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float>;
+};
+
+// FP8 configuration 8: 128x256x128 tile, 8x1x1 cluster, cooperative fast-accum
+// mainloop with the cooperative epilogue. Only valid for e4m3 inputs.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_8 {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+  using TileShape = Shape<_128, _256, _128>;
+  using ClusterShape = Shape<_8, _1, _1>;
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float>;
+};
+////////////////////////////////////////////////////////////////////////
+
+// Default configuration for e4m3 (FP8) inputs: 128x128x128 tile, 1x2x1
+// cluster, warp-specialized fast-accum mainloop, float accumulation.
+// M in (128, inf)
+template <typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_config_default<cutlass::float_e4m3_t, OutType, Epilogue> {
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_1, _2, _1>;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<cutlass::float_e4m3_t, OutType, Epilogue,
+                             TileShape, ClusterShape, KernelSchedule,
+                             EpilogueSchedule, float>;
+};
+
+// FP8 configuration for M in [1, 64]: 64x64x256 tile, 1x1x1 cluster,
+// warp-specialized fast-accum mainloop, cooperative epilogue, persistent tile
+// scheduler. Only valid for e4m3 inputs.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_M64 {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  using TileSchedule = cutlass::gemm::PersistentScheduler;
+
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
+
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float,
+                             TileSchedule>;
+};
+
+// FP8 configuration for M in (64, 128]: 64x128x256 tile, 1x1x1 cluster,
+// pingpong fast-accum mainloop, warp-specialized epilogue, persistent tile
+// scheduler. Only valid for e4m3 inputs.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_M128 {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+
+  using TileShape = Shape<_64, _128, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  using TileSchedule = cutlass::gemm::PersistentScheduler;
+
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float,
+                             TileSchedule>;
+};
+
+// FP8 configuration for M in (128, 256]: 128x128x256 tile, 1x1x1 cluster,
+// cooperative fast-accum mainloop, cooperative epilogue, persistent tile
+// scheduler. Only valid for e4m3 inputs.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_M256 {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+
+  using TileShape = Shape<_128, _128, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  using TileSchedule = cutlass::gemm::PersistentScheduler;
+
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
+
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float,
+                             TileSchedule>;
+};
+
+// FP8 configuration for M greater than 256: 128x128x256 tile, 1x1x1 cluster,
+// cooperative fast-accum mainloop, cooperative epilogue, persistent tile
+// scheduler. Only valid for e4m3 inputs.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_M512 {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+
+  using TileShape = Shape<_128, _128, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  using TileSchedule = cutlass::gemm::PersistentScheduler;
+
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
+
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float,
+                             TileSchedule>;
+};
+
+// Default configuration for int8 inputs: 128x128x128 tile, 2x1x1 cluster,
+// pingpong mainloop schedule, int32 accumulation.
+// For M > 128 and any N
+template <typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_config_default<int8_t, OutType, Epilogue> {
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<int8_t, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, int32_t>;
+};
+
+// int8 configuration for M in (64, 128] and any N: 64x128x128 tile, 2x1x1
+// cluster, pingpong mainloop schedule, int32 accumulation.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M128 {
+  static_assert(std::is_same_v<InType, int8_t>);
+  using TileShape = Shape<_64, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, int32_t>;
+};
+
+// int8 configuration for M in (32, 64] and any N: 64x64x256 tile, 1x1x1
+// cluster, warp-specialized mainloop schedule, int32 accumulation.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M64 {
+  static_assert(std::is_same_v<InType, int8_t>);
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecialized;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, int32_t>;
+};
+
+// int8 configuration for M in [1, 32] and N >= 8192: 64x128x256 tile, 1x4x1
+// cluster, warp-specialized mainloop schedule, int32 accumulation.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M32_NBig {
+  static_assert(std::is_same_v<InType, int8_t>);
+  using TileShape = Shape<_64, _128, _256>;
+  using ClusterShape = Shape<_1, _4, _1>;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecialized;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, int32_t>;
+};
+
+// int8 configuration for M in [1, 32] and N < 8192: 64x64x256 tile, 1x8x1
+// cluster, warp-specialized mainloop schedule, int32 accumulation.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M32_NSmall {
+  static_assert(std::is_same_v<InType, int8_t>);
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _8, _1>;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecialized;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
+  using Cutlass3xGemm =
+      cutlass_sparse_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, int32_t>;
+};
+
+}  // namespace
\ No newline at end of file
diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
new file mode 100644
index 0000000000000..d464b045b895f
--- /dev/null
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
@@ -0,0 +1,70 @@
+#include <cudaTypedefs.h>
+
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+#include "cutlass_extensions/common.hpp"
+
+// Reports whether the sparse CUTLASS scaled-mm kernels can be used on a device
+// with the given compute capability. They need at least CUDA 12.2 and SM90
+// (Hopper); a build in which CUDA_VERSION is not defined always answers false.
+bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability) {
+#if defined(CUDA_VERSION)
+  bool const toolkit_ok = CUDA_VERSION >= 12020;
+  bool const device_ok = cuda_device_capability >= 90;
+  return toolkit_ok && device_ok;
+#else
+  return false;
+#endif
+}
+
+#if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X
+void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
+                                   torch::Tensor const& b,
+                                   torch::Tensor const& e,
+                                   torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales,
+                                   c10::optional<torch::Tensor> const& bias);
+#endif
+
+// Public entry point for the sparse scaled matmul. Validates shapes, strides
+// and the scale/bias tensors, then dispatches to the SM90 kernel when it was
+// compiled in and the current device reports an SM version of 90 or higher.
+//
+//   c         output; checked to be [a.size(0), bt_nzs.size(0)]
+//   a         dense operand; 2-D with unit stride in dim 1
+//   bt_nzs    compressed operand; 2-D, dim 1 must be half of a's dim 1
+//   bt_meta   forwarded to the kernel; not validated in this function
+//   a_scales  one element, or one per row of a; contiguous
+//   b_scales  one element, or one per row of bt_nzs; contiguous
+//   bias      optional; 1-D, contiguous, bt_nzs.size(0) elements
+//
+// Raises through TORCH_CHECK if a validation fails, and through
+// TORCH_CHECK_NOT_IMPLEMENTED if no kernel is available for the device.
+void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
+                              torch::Tensor const& bt_nzs,
+                              torch::Tensor const& bt_meta,
+                              torch::Tensor const& a_scales,
+                              torch::Tensor const& b_scales,
+                              c10::optional<torch::Tensor> const& bias) {
+  // Shape checks: all operands are 2-D and their dimensions are mutually
+  // consistent (bt_nzs holds half as many columns as a).
+  TORCH_CHECK(a.dim() == 2 && bt_nzs.dim() == 2 && c.dim() == 2);
+  TORCH_CHECK(c.size(1) == bt_nzs.size(0) && bt_nzs.size(1) * 2 == a.size(1) &&
+              a.size(0) == c.size(0));
+  // Scales are either a single value or one value per row.
+  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
+  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == bt_nzs.size(0));
+
+  // Check for strides and alignment
+  // NOTE(review): the two checks labelled "16 Byte Alignment" test that the
+  // row stride is a multiple of 16; confirm the unit (elements vs. bytes).
+  TORCH_CHECK(a.stride(1) == 1 && bt_nzs.stride(1) == 1 &&
+              c.stride(1) == 1);            // Row-major
+  TORCH_CHECK(c.stride(0) % 16 == 0);       // 16 Byte Alignment
+  TORCH_CHECK(bt_nzs.stride(0) % 16 == 0);  // 16 Byte Alignment
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+
+  if (bias) {
+    TORCH_CHECK(bias->numel() == bt_nzs.size(0) && bias->is_contiguous() &&
+                bias->dim() == 1);
+  }
+
+  // Make a's device current before querying its SM version and dispatching.
+  at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
+  int32_t version_num = get_sm_version_num();
+
+  // Guard against compilation issues for sm90 kernels
+#if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X
+  if (version_num >= 90) {
+    cutlass_scaled_sparse_mm_sm90(c, a, bt_nzs, bt_meta, a_scales, b_scales,
+                                  bias);
+    return;
+  }
+#endif
+
+  // Reached when the SM90 path is not compiled in or the device is older.
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false,
+      "No compiled cutlass_scaled_sparse_mm for a compute capability less than "
+      "CUDA device capability: ",
+      version_num);
+}
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 1ffab14862fed..956258c1001d3 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -321,6 +321,28 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
   ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
 
+  // Check if cutlass sparse scaled_mm is supported for CUDA devices of the
+  // given capability
+  ops.def(
+      "cutlass_sparse_scaled_mm_supported(int cuda_device_capability) -> bool");
+  ops.impl("cutlass_sparse_scaled_mm_supported",
+           &cutlass_sparse_scaled_mm_supported);
+
+  // CUTLASS sparse GEMM, supporting symmetric per-tensor or per-row/column
+  // quantization, as well as bias
+  ops.def(
+      "cutlass_scaled_sparse_mm(Tensor! out, Tensor a,"
+      "                         Tensor bt_nzs,"
+      "                         Tensor bt_meta, Tensor a_scales,"
+      "                         Tensor b_scales, Tensor? bias) -> ()");
+  ops.impl("cutlass_scaled_sparse_mm", torch::kCUDA, &cutlass_scaled_sparse_mm);
+
+  // CUTLASS sparse matrix compressor
+  ops.def(
+      "cutlass_sparse_compress_entry(Tensor! a_nzs, Tensor! a_meta,"
+      "                              Tensor a) -> bool");
+  ops.impl("cutlass_sparse_compress_entry", &cutlass_sparse_compress_entry);
+
   // Mamba selective scan kernel
   ops.def(
       "selective_scan_fwd(Tensor! u, Tensor! delta,"
diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index ca2da4cd66d2d..4859c8ac08bea 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -1,7 +1,7 @@
 sphinx==6.2.1
 sphinx-book-theme==1.0.1
 sphinx-copybutton==0.5.2
-myst-parser==2.0.0
+myst-parser==3.0.1
 sphinx-argparse==0.4.0
 msgspec
 cloudpickle
diff --git a/docs/source/automatic_prefix_caching/apc.md b/docs/source/automatic_prefix_caching/apc.md
new file mode 100644
index 0000000000000..c0c141c5fb7ef
--- /dev/null
+++ b/docs/source/automatic_prefix_caching/apc.md
@@ -0,0 +1,102 @@
+(apc)=
+
+# Introduction
+
+## What is Automatic Prefix Caching
+
+Automatic Prefix Caching (APC for short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
+
+```{note}
+Technical details on how vLLM implements APC are on the next page.
+```
+
+## Enabling APC in vLLM
+
+Set `enable_prefix_caching=True` when creating the vLLM engine to enable APC. Here is an example:
+
+```python
+import time
+from vllm import LLM, SamplingParams
+
+
+# A prompt containing a large markdown table. The table is randomly generated by GPT-4.
+LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
+| ID  | Name          | Age | Occupation    | Country       | Email                  | Phone Number   | Address                       |
+|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
+| 1   | John Doe      | 29  | Engineer      | USA           | john.doe@example.com   | 555-1234       | 123 Elm St, Springfield, IL  |
+| 2   | Jane Smith    | 34  | Doctor        | Canada        | jane.smith@example.com | 555-5678       | 456 Oak St, Toronto, ON      |
+| 3   | Alice Johnson | 27  | Teacher       | UK            | alice.j@example.com    | 555-8765       | 789 Pine St, London, UK      |
+| 4   | Bob Brown     | 45  | Artist        | Australia     | bob.b@example.com      | 555-4321       | 321 Maple St, Sydney, NSW    |
+| 5   | Carol White   | 31  | Scientist     | New Zealand   | carol.w@example.com    | 555-6789       | 654 Birch St, Wellington, NZ |
+| 6   | Dave Green    | 28  | Lawyer        | Ireland       | dave.g@example.com     | 555-3456       | 987 Cedar St, Dublin, IE     |
+| 7   | Emma Black    | 40  | Musician      | USA           | emma.b@example.com     | 555-1111       | 246 Ash St, New York, NY     |
+| 8   | Frank Blue    | 37  | Chef          | Canada        | frank.b@example.com    | 555-2222       | 135 Spruce St, Vancouver, BC |
+| 9   | Grace Yellow  | 50  | Engineer      | UK            | grace.y@example.com    | 555-3333       | 864 Fir St, Manchester, UK   |
+| 10  | Henry Violet  | 32  | Artist        | Australia     | henry.v@example.com    | 555-4444       | 753 Willow St, Melbourne, VIC|
+| 11  | Irene Orange  | 26  | Scientist     | New Zealand   | irene.o@example.com    | 555-5555       | 912 Poplar St, Auckland, NZ  |
+| 12  | Jack Indigo   | 38  | Teacher       | Ireland       | jack.i@example.com     | 555-6666       | 159 Elm St, Cork, IE         |
+| 13  | Karen Red     | 41  | Lawyer        | USA           | karen.r@example.com    | 555-7777       | 357 Cedar St, Boston, MA     |
+| 14  | Leo Brown     | 30  | Chef          | Canada        | leo.b@example.com      | 555-8888       | 246 Oak St, Calgary, AB      |
+| 15  | Mia Green     | 33  | Musician      | UK            | mia.g@example.com      | 555-9999       | 975 Pine St, Edinburgh, UK   |
+| 16  | Noah Yellow   | 29  | Doctor        | Australia     | noah.y@example.com     | 555-0000       | 864 Birch St, Brisbane, QLD  |
+| 17  | Olivia Blue   | 35  | Engineer      | New Zealand   | olivia.b@example.com   | 555-1212       | 753 Maple St, Hamilton, NZ   |
+| 18  | Peter Black   | 42  | Artist        | Ireland       | peter.b@example.com    | 555-3434       | 912 Fir St, Limerick, IE     |
+| 19  | Quinn White   | 28  | Scientist     | USA           | quinn.w@example.com    | 555-5656       | 159 Willow St, Seattle, WA   |
+| 20  | Rachel Red    | 31  | Teacher       | Canada        | rachel.r@example.com   | 555-7878       | 357 Poplar St, Ottawa, ON    |
+| 21  | Steve Green   | 44  | Lawyer        | UK            | steve.g@example.com    | 555-9090       | 753 Elm St, Birmingham, UK   |
+| 22  | Tina Blue     | 36  | Musician      | Australia     | tina.b@example.com     | 555-1213       | 864 Cedar St, Perth, WA      |
+| 23  | Umar Black    | 39  | Chef          | New Zealand   | umar.b@example.com     | 555-3435       | 975 Spruce St, Christchurch, NZ|
+| 24  | Victor Yellow | 43  | Engineer      | Ireland       | victor.y@example.com   | 555-5657       | 246 Willow St, Galway, IE    |
+| 25  | Wendy Orange  | 27  | Artist        | USA           | wendy.o@example.com    | 555-7879       | 135 Elm St, Denver, CO       |
+| 26  | Xavier Green  | 34  | Scientist     | Canada        | xavier.g@example.com   | 555-9091       | 357 Oak St, Montreal, QC     |
+| 27  | Yara Red      | 41  | Teacher       | UK            | yara.r@example.com     | 555-1214       | 975 Pine St, Leeds, UK       |
+| 28  | Zack Blue     | 30  | Lawyer        | Australia     | zack.b@example.com     | 555-3436       | 135 Birch St, Adelaide, SA   |
+| 29  | Amy White     | 33  | Musician      | New Zealand   | amy.w@example.com      | 555-5658       | 159 Maple St, Wellington, NZ |
+| 30  | Ben Black     | 38  | Chef          | Ireland       | ben.b@example.com      | 555-7870       | 246 Fir St, Waterford, IE    |
+"""
+
+
+def get_generation_time(llm, sampling_params, prompts):
+    # time the generation
+    start_time = time.time()
+    output = llm.generate(prompts, sampling_params=sampling_params)
+    end_time = time.time()
+    # print the output and generation time
+    print(f"Output: {output[0].outputs[0].text}")
+    print(f"Generation time: {end_time - start_time} seconds.")
+
+
+# set enable_prefix_caching=True to enable APC
+llm = LLM(
+    model='lmsys/longchat-13b-16k',
+    enable_prefix_caching=True
+)
+
+sampling_params = SamplingParams(temperature=0, max_tokens=100)
+
+# Querying the age of John Doe
+get_generation_time(
+    llm,
+    sampling_params,
+    LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
+)
+
+# Querying the age of Zack Blue
+# This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again.
+get_generation_time(
+    llm,
+    sampling_params,
+    LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
+)
+```
+
+## Example workloads
+
+We describe two example workloads where APC can provide a huge performance benefit:
+
+- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency.
+- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency.
+
+## Limits
+
+APC in general does not reduce the performance of vLLM. That said, APC only reduces the time spent processing the queries (the prefilling phase) and does not reduce the time spent generating new tokens (the decoding phase). So APC does not bring a performance gain when vLLM spends most of its time generating answers to the queries (e.g. when the answer is long), or when new queries do not share a prefix with any of the existing queries (so that the computation cannot be reused).
diff --git a/docs/source/automatic_prefix_caching/apc.rst b/docs/source/automatic_prefix_caching/apc.rst
deleted file mode 100644
index 0d70c74689bf9..0000000000000
--- a/docs/source/automatic_prefix_caching/apc.rst
+++ /dev/null
@@ -1,110 +0,0 @@
-.. _apc:
-
-Introduction
-============
-
-What is Automatic Prefix Caching
---------------------------------
-
-Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
-
-
-.. note::
-
-   Technical details on how vLLM implements APC are in the next page.
-
-
-
-Enabling APC in vLLM
---------------------
-
-Set ``enable_prefix_caching=True`` in vLLM engine to enable APC. Here is an example:
-
-.. code-block:: python
-
-    import time
-    from vllm import LLM, SamplingParams
-
-
-    # A prompt containing a large markdown table. The table is randomly generated by GPT-4.
-    LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
-    | ID  | Name          | Age | Occupation    | Country       | Email                  | Phone Number   | Address                       |
-    |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
-    | 1   | John Doe      | 29  | Engineer      | USA           | john.doe@example.com   | 555-1234       | 123 Elm St, Springfield, IL  |
-    | 2   | Jane Smith    | 34  | Doctor        | Canada        | jane.smith@example.com | 555-5678       | 456 Oak St, Toronto, ON      |
-    | 3   | Alice Johnson | 27  | Teacher       | UK            | alice.j@example.com    | 555-8765       | 789 Pine St, London, UK      |
-    | 4   | Bob Brown     | 45  | Artist        | Australia     | bob.b@example.com      | 555-4321       | 321 Maple St, Sydney, NSW    |
-    | 5   | Carol White   | 31  | Scientist     | New Zealand   | carol.w@example.com    | 555-6789       | 654 Birch St, Wellington, NZ |
-    | 6   | Dave Green    | 28  | Lawyer        | Ireland       | dave.g@example.com     | 555-3456       | 987 Cedar St, Dublin, IE     |
-    | 7   | Emma Black    | 40  | Musician      | USA           | emma.b@example.com     | 555-1111       | 246 Ash St, New York, NY     |
-    | 8   | Frank Blue    | 37  | Chef          | Canada        | frank.b@example.com    | 555-2222       | 135 Spruce St, Vancouver, BC |
-    | 9   | Grace Yellow  | 50  | Engineer      | UK            | grace.y@example.com    | 555-3333       | 864 Fir St, Manchester, UK   |
-    | 10  | Henry Violet  | 32  | Artist        | Australia     | henry.v@example.com    | 555-4444       | 753 Willow St, Melbourne, VIC|
-    | 11  | Irene Orange  | 26  | Scientist     | New Zealand   | irene.o@example.com    | 555-5555       | 912 Poplar St, Auckland, NZ  |
-    | 12  | Jack Indigo   | 38  | Teacher       | Ireland       | jack.i@example.com     | 555-6666       | 159 Elm St, Cork, IE         |
-    | 13  | Karen Red     | 41  | Lawyer        | USA           | karen.r@example.com    | 555-7777       | 357 Cedar St, Boston, MA     |
-    | 14  | Leo Brown     | 30  | Chef          | Canada        | leo.b@example.com      | 555-8888       | 246 Oak St, Calgary, AB      |
-    | 15  | Mia Green     | 33  | Musician      | UK            | mia.g@example.com      | 555-9999       | 975 Pine St, Edinburgh, UK   |
-    | 16  | Noah Yellow   | 29  | Doctor        | Australia     | noah.y@example.com     | 555-0000       | 864 Birch St, Brisbane, QLD  |
-    | 17  | Olivia Blue   | 35  | Engineer      | New Zealand   | olivia.b@example.com   | 555-1212       | 753 Maple St, Hamilton, NZ   |
-    | 18  | Peter Black   | 42  | Artist        | Ireland       | peter.b@example.com    | 555-3434       | 912 Fir St, Limerick, IE     |
-    | 19  | Quinn White   | 28  | Scientist     | USA           | quinn.w@example.com    | 555-5656       | 159 Willow St, Seattle, WA   |
-    | 20  | Rachel Red    | 31  | Teacher       | Canada        | rachel.r@example.com   | 555-7878       | 357 Poplar St, Ottawa, ON    |
-    | 21  | Steve Green   | 44  | Lawyer        | UK            | steve.g@example.com    | 555-9090       | 753 Elm St, Birmingham, UK   |
-    | 22  | Tina Blue     | 36  | Musician      | Australia     | tina.b@example.com     | 555-1213       | 864 Cedar St, Perth, WA      |
-    | 23  | Umar Black    | 39  | Chef          | New Zealand   | umar.b@example.com     | 555-3435       | 975 Spruce St, Christchurch, NZ|
-    | 24  | Victor Yellow | 43  | Engineer      | Ireland       | victor.y@example.com   | 555-5657       | 246 Willow St, Galway, IE    |
-    | 25  | Wendy Orange  | 27  | Artist        | USA           | wendy.o@example.com    | 555-7879       | 135 Elm St, Denver, CO       |
-    | 26  | Xavier Green  | 34  | Scientist     | Canada        | xavier.g@example.com   | 555-9091       | 357 Oak St, Montreal, QC     |
-    | 27  | Yara Red      | 41  | Teacher       | UK            | yara.r@example.com     | 555-1214       | 975 Pine St, Leeds, UK       |
-    | 28  | Zack Blue     | 30  | Lawyer        | Australia     | zack.b@example.com     | 555-3436       | 135 Birch St, Adelaide, SA   |
-    | 29  | Amy White     | 33  | Musician      | New Zealand   | amy.w@example.com      | 555-5658       | 159 Maple St, Wellington, NZ |
-    | 30  | Ben Black     | 38  | Chef          | Ireland       | ben.b@example.com      | 555-7870       | 246 Fir St, Waterford, IE    |
-    """
-
-
-    def get_generation_time(llm, sampling_params, prompts):
-        # time the generation
-        start_time = time.time()
-        output = llm.generate(prompts, sampling_params=sampling_params)
-        end_time = time.time()
-        # print the output and generation time
-        print(f"Output: {output[0].outputs[0].text}")
-        print(f"Generation time: {end_time - start_time} seconds.")
-
-
-    # set enable_prefix_caching=True to enable APC
-    llm = LLM(
-        model='lmsys/longchat-13b-16k',
-        enable_prefix_caching=True
-    )
-
-    sampling_params = SamplingParams(temperature=0, max_tokens=100)
-
-    # Querying the age of John Doe
-    get_generation_time(
-        llm,
-        sampling_params,
-        LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
-    )
-
-    # Querying the age of Zack Blue
-    # This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again.
-    get_generation_time(
-        llm,
-        sampling_params,
-        LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
-    )
-
-Example workloads
------------------
-
-We describe two example workloads, where APC can provide huge performance benefit:
-
-- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency.
-- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency.
-
-
-Limits
-------
-APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused).
diff --git a/docs/source/community/meetups.md b/docs/source/community/meetups.md
new file mode 100644
index 0000000000000..43fa9ee616096
--- /dev/null
+++ b/docs/source/community/meetups.md
@@ -0,0 +1,15 @@
+(meetups)=
+
+# vLLM Meetups
+
+We host regular meetups in the San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
+
+- [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing)
+- [The sixth vLLM meetup](https://lu.ma/87q3nvnh), with NVIDIA, September 9th 2024. [[Slides]](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing)
+- [The fifth vLLM meetup](https://lu.ma/lp0gyjqr), with AWS, July 24th 2024. [[Slides]](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing)
+- [The fourth vLLM meetup](https://lu.ma/agivllm), with Cloudflare and BentoML, June 11th 2024. [[Slides]](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing)
+- [The third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/), with Roblox, April 2nd 2024. [[Slides]](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing)
+- [The second vLLM meetup](https://lu.ma/ygxbpzhl), with IBM Research, January 31st 2024. [[Slides]](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing) [[Video (vLLM Update)]](https://youtu.be/Y0C-DUvEnZQ) [[Video (IBM Research & torch.compile)]](https://youtu.be/m0dMtFLI-dg)
+- [The first vLLM meetup](https://lu.ma/first-vllm-meetup), with a16z, October 5th 2023. [[Slides]](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing)
+
+We are always looking for speakers and sponsors in the San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu).
diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst
deleted file mode 100644
index c87f01aa263b3..0000000000000
--- a/docs/source/community/meetups.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-.. _meetups:
-
-vLLM Meetups
-============
-
-We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
-
-- `The seventh vLLM meetup <https://lu.ma/h0qvrajz>`__, with Snowflake, November 14th 2024. `[Slides] <https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing>`__
-- `The sixth vLLM meetup <https://lu.ma/87q3nvnh>`__, with NVIDIA, September 9th 2024. `[Slides] <https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing>`__
-- `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
-- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
-- `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
-- `The second vLLM meetup <https://lu.ma/ygxbpzhl>`__, with IBM Research, January 31st 2024. `[Slides] <https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing>`__ `[Video (vLLM Update)] <https://youtu.be/Y0C-DUvEnZQ>`__ `[Video (IBM Research & torch.compile)] <https://youtu.be/m0dMtFLI-dg>`__
-- `The first vLLM meetup <https://lu.ma/first-vllm-meetup>`__, with a16z, October 5th 2023. `[Slides] <https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing>`__
-
-We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at `vllm-questions@lists.berkeley.edu <mailto:vllm-questions@lists.berkeley.edu>`__.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index e9d9ac68c9560..1fe0474631140 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -51,7 +51,7 @@
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns: List[str] = ["**/*.template.rst"]
+exclude_patterns: List[str] = ["**/*.template.md"]
 
 # Exclude the prompt "$" when copying code
 copybutton_prompt_text = r"\$ "
@@ -74,6 +74,35 @@
 html_static_path = ["_static"]
 html_js_files = ["custom.js"]
 
+myst_url_schemes = {
+    'http': None,
+    'https': None,
+    'mailto': None,
+    'ftp': None,
+    "gh-issue": {
+        "url":
+        "https://github.com/vllm-project/vllm/issues/{{path}}#{{fragment}}",
+        "title": "Issue #{{path}}",
+        "classes": ["github"],
+    },
+    "gh-pr": {
+        "url":
+        "https://github.com/vllm-project/vllm/pull/{{path}}#{{fragment}}",
+        "title": "Pull Request #{{path}}",
+        "classes": ["github"],
+    },
+    "gh-dir": {
+        "url": "https://github.com/vllm-project/vllm/tree/main/{{path}}",
+        "title": "{{path}}",
+        "classes": ["github"],
+    },
+    "gh-file": {
+        "url": "https://github.com/vllm-project/vllm/blob/main/{{path}}",
+        "title": "{{path}}",
+        "classes": ["github"],
+    },
+}
+
 # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
 READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE')
 if READTHEDOCS_VERSION_TYPE == "tag":
diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md
new file mode 100644
index 0000000000000..6535414a7dca4
--- /dev/null
+++ b/docs/source/contributing/dockerfile/dockerfile.md
@@ -0,0 +1,50 @@
+# Dockerfile
+
+We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
+More information about deploying with Docker can be found [here](../../serving/deploying_with_docker.md).
+
+Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
+
+- All build stages
+- The default build target (highlighted in grey)
+- External images (with dashed borders)
+
+The edges of the build graph represent:
+
+- FROM ... dependencies (with a solid line and a full arrow head)
+
+- COPY --from=... dependencies (with a dashed line and an empty arrow head)
+
+- RUN --mount=(.\*)from=... dependencies (with a dotted line and an empty diamond arrow head)
+
+  > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png
+  > :align: center
+  > :alt: query
+  > :width: 100%
+  > ```
+  >
+  > Made using: <https://github.com/patrickhoefler/dockerfilegraph>
+  >
+  > Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the Dockerfile is present):
+  >
+  > ```bash
+  > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
+  > ```
+  >
+  > or in case you want to run it directly with the docker image:
+  >
+  > ```bash
+  > docker run \
+  >    --rm \
+  >    --user "$(id -u):$(id -g)" \
+  >    --workdir /workspace \
+  >    --volume "$(pwd)":/workspace \
+  >    ghcr.io/patrickhoefler/dockerfilegraph:alpine \
+  >    --output png \
+  >    --dpi 200 \
+  >    --max-label-length 50 \
+  >    --filename Dockerfile \
+  >    --legend
+  > ```
+  >
+  > (To run it for a different file, you can pass in a different argument to the flag `--filename`.)
diff --git a/docs/source/contributing/dockerfile/dockerfile.rst b/docs/source/contributing/dockerfile/dockerfile.rst
deleted file mode 100644
index 9c17c27aa61bf..0000000000000
--- a/docs/source/contributing/dockerfile/dockerfile.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-Dockerfile
-====================
-
-See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`__ for the main Dockerfile to construct 
-the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here <https://docs.vllm.ai/en/stable/serving/deploying_with_docker.html>`__.
-
-Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
-
-- All build stages
-- The default build target (highlighted in grey)
-- External images (with dashed borders)
-   
-The edges of the build graph represent:
-
-- FROM ... dependencies (with a solid line and a full arrow head)
-- COPY --from=... dependencies (with a dashed line and an empty arrow head)
-- RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head)
-
-   .. figure:: ../../assets/dev/dockerfile-stages-dependency.png
-      :alt: query
-      :width: 100%
-      :align: center
-
-   Made using: https://github.com/patrickhoefler/dockerfilegraph
-
-   Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the dockerfile is present):
-
-   .. code:: bash
-
-      dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
-
-   or in case you want to run it directly with the docker image:
-   
-   .. code:: bash
-
-      docker run \
-         --rm \
-         --user "$(id -u):$(id -g)" \
-         --workdir /workspace \
-         --volume "$(pwd)":/workspace \
-         ghcr.io/patrickhoefler/dockerfilegraph:alpine \
-         --output png \
-         --dpi 200 \
-         --max-label-length 50 \
-         --filename Dockerfile \
-         --legend
-
-   (To run it for a different file, you can pass in a different argument to the flag `--filename`.)
-
-   
\ No newline at end of file
diff --git a/docs/source/contributing/overview.rst b/docs/source/contributing/overview.md
similarity index 51%
rename from docs/source/contributing/overview.rst
rename to docs/source/contributing/overview.md
index 4cea0afdaea74..9dac41cff0bcb 100644
--- a/docs/source/contributing/overview.rst
+++ b/docs/source/contributing/overview.md
@@ -1,5 +1,4 @@
-Contributing to vLLM
-=====================
+# Contributing to vLLM
 
 Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project:
 
@@ -12,132 +11,121 @@ We also believe in the power of community support; thus, answering queries, offe
 
 Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
 
-License
--------
+## License
 
-See `LICENSE <https://github.com/vllm-project/vllm/tree/main/LICENSE>`_.
+See <gh-file:LICENSE>.
 
-Developing
-----------
+## Developing
 
-Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the `building from source <https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source>`_ documentation for details.
+Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
+Check out the [building from source](#build-from-source) documentation for details.
 
-Testing
--------
+## Testing
 
-.. code-block:: bash
+```bash
+pip install -r requirements-dev.txt
 
-    pip install -r requirements-dev.txt
+# linting and formatting
+bash format.sh
+# Static type checking
+mypy
+# Unit tests
+pytest tests/
+```
 
-    # linting and formatting
-    bash format.sh
-    # Static type checking
-    mypy
-    # Unit tests
-    pytest tests/
+```{note}
+Currently, the repository does not pass the `mypy` tests.
+```
 
-.. note:: Currently, the repository does not pass the ``mypy`` tests.
+# Contribution Guidelines
 
-Contribution Guidelines
-=======================
+## Issues
 
-Issues
-------
+If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
 
-If you encounter a bug or have a feature request, please `search existing issues <https://github.com/vllm-project/vllm/issues?q=is%3Aissue>`_ first to see if it has already been reported. If not, please `file a new issue <https://github.com/vllm-project/vllm/issues/new/choose>`_, providing as much relevant information as possible.
+```{important}
+If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
+```
 
-.. important::
-   If you discover a security vulnerability, please follow the instructions `here <https://github.com/vllm-project/vllm/tree/main/SECURITY.md#reporting-a-vulnerability>`_.
-
-Pull Requests & Code Reviews
-----------------------------
+## Pull Requests & Code Reviews
 
 Thank you for your contribution to vLLM! Before submitting the pull request,
 please ensure the PR meets the following criteria. This helps vLLM maintain the
 code quality and improve the efficiency of the review process.
 
-DCO and Signed-off-by
-^^^^^^^^^^^^^^^^^^^^^
+### DCO and Signed-off-by
 
-When contributing changes to this project, you must agree to the `DCO <https://github.com/vllm-project/vllm/tree/main/DCO>`_.
-Commits must include a ``Signed-off-by:`` header which certifies agreement with
-the terms of the `DCO <https://github.com/vllm-project/vllm/tree/main/DCO>`_.
+When contributing changes to this project, you must agree to the <gh-file:DCO>.
+Commits must include a `Signed-off-by:` header which certifies agreement with
+the terms of the DCO.
 
-Using ``-s`` with ``git commit`` will automatically add this header.
+Using `-s` with `git commit` will automatically add this header.
 
-PR Title and Classification
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
+### PR Title and Classification
 
 Only specific types of PRs will be reviewed. The PR title is prefixed
 appropriately to indicate the type of change. Please use one of the following:
 
-- ``[Bugfix]`` for bug fixes.
-- ``[CI/Build]`` for build or continuous integration improvements.
-- ``[Doc]`` for documentation fixes and improvements.
-- ``[Model]`` for adding a new model or improving an existing model. Model name
+- `[Bugfix]` for bug fixes.
+- `[CI/Build]` for build or continuous integration improvements.
+- `[Doc]` for documentation fixes and improvements.
+- `[Model]` for adding a new model or improving an existing model. Model name
   should appear in the title.
-- ``[Frontend]`` For changes on the vLLM frontend (e.g., OpenAI API server,
-  ``LLM`` class, etc.)
-- ``[Kernel]`` for changes affecting CUDA kernels or other compute kernels.
-- ``[Core]`` for changes in the core vLLM logic (e.g., ``LLMEngine``,
-  ``AsyncLLMEngine``, ``Scheduler``, etc.)
-- ``[Hardware][Vendor]`` for hardware-specific changes. Vendor name should
-  appear in the prefix (e.g., ``[Hardware][AMD]``).
-- ``[Misc]`` for PRs that do not fit the above categories. Please use this
+- `[Frontend]` for changes on the vLLM frontend (e.g., OpenAI API server,
+  `LLM` class, etc.)
+- `[Kernel]` for changes affecting CUDA kernels or other compute kernels.
+- `[Core]` for changes in the core vLLM logic (e.g., `LLMEngine`,
+  `AsyncLLMEngine`, `Scheduler`, etc.)
+- `[Hardware][Vendor]` for hardware-specific changes. Vendor name should
+  appear in the prefix (e.g., `[Hardware][AMD]`).
+- `[Misc]` for PRs that do not fit the above categories. Please use this
   sparingly.
 
-.. note::
-   If the PR spans more than one category, please include all relevant prefixes.
+```{note}
+If the PR spans more than one category, please include all relevant prefixes.
+```
 
-Code Quality
-^^^^^^^^^^^^
+### Code Quality
 
 The PR needs to meet the following code quality standards:
 
-- We adhere to `Google Python style guide
-  <https://google.github.io/styleguide/pyguide.html>`_ and `Google C++ style guide
-  <https://google.github.io/styleguide/cppguide.html>`_.
-- Pass all linter checks. Please use `format.sh
-  <https://github.com/vllm-project/vllm/blob/main/format.sh>`_ to format your
-  code.
+- We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
+- Pass all linter checks. Please use <gh-file:format.sh> to format your code.
 - The code needs to be well-documented to ensure future contributors can easily
   understand the code.
 - Include sufficient tests to ensure the project stays correct and robust. This
   includes both unit tests and integration tests.
-- Please add documentation to ``docs/source/`` if the PR modifies the
+- Please add documentation to `docs/source/` if the PR modifies the
   user-facing behaviors of vLLM. It helps vLLM users understand and utilize the
   new features or changes.
 
-Adding or Changing Kernels
-^^^^^^^^^^^^^^^^^^^^^^^^^^
+### Adding or Changing Kernels
 
 Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.
 
 - Make sure custom ops are registered following PyTorch guidelines:
-  `Custom C++ and CUDA Operators <https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial>`_
-  and `The Custom Operators Manual <https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU>`_.
-- Custom operations that return ``Tensors`` require meta-functions.
+  [Custom C++ and CUDA Operators](https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial)
+  and [The Custom Operators Manual](https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU).
+- Custom operations that return `Tensors` require meta-functions.
   Meta-functions should be implemented and registered in Python so that dynamic
   dims can be handled automatically. See above documents for a description of
   meta-functions.
-- Use `torch.library.opcheck() <https://pytorch.org/docs/stable/library.html#torch.library.opcheck>`_
+- Use [torch.library.opcheck()](https://pytorch.org/docs/stable/library.html#torch.library.opcheck)
   to test the function registration and meta-function for any registered ops.
-  See ``tests/kernels`` for examples.
+  See `tests/kernels` for examples.
 - When changing the C++ signature of an existing op, the schema must be updated
   to reflect the changes.
 - If a new custom type is needed, see the following document:
-  `Custom Class Support in PT2 <https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA>`_.
+  [Custom Class Support in PT2](https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA).
 
-Notes for Large Changes
-^^^^^^^^^^^^^^^^^^^^^^^
+### Notes for Large Changes
 
 Please keep the changes as concise as possible. For major architectural changes
 (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue
 (RFC) discussing the technical design and justification. Otherwise, we will tag
-it with ``rfc-required`` and might not go through the PR.
+it with `rfc-required` and might not review the PR.
 
-What to Expect for the Reviews
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+### What to Expect for the Reviews
 
 The goal of the vLLM team is to be a *transparent reviewing machine*. We would
 like to make the review process transparent and efficient and make sure no
@@ -150,15 +138,14 @@ review process:
 - After the PR is assigned, the reviewer will provide status updates every 2-3
   days. If the PR is not reviewed within 7 days, please feel free to ping the
   reviewer or the vLLM team.
-- After the review, the reviewer will put an ``action-required`` label on the PR
+- After the review, the reviewer will put an `action-required` label on the PR
   if there are changes required. The contributor should address the comments and
   ping the reviewer to re-review the PR.
 - Please respond to all comments within a reasonable time frame. If a comment
   isn't clear or you disagree with a suggestion, feel free to ask for
   clarification or discuss the suggestion.
 
-Thank You
----------
+## Thank You
 
 Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
 All of your contributions help make vLLM a great tool and community for everyone!
diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md
new file mode 100644
index 0000000000000..46210957c19ec
--- /dev/null
+++ b/docs/source/contributing/profiling/profiling_index.md
@@ -0,0 +1,41 @@
+# Profiling vLLM
+
+We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`
+
+The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
+
+When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag.
+
+```{warning}
+Only enable profiling in a development environment.
+```
+
+Traces can be visualized using <https://ui.perfetto.dev/>.
+
+```{tip}
+Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, there is no need to untar the traces; they can be viewed directly.
+```
+
+```{tip}
+When the profiler is stopped, it flushes all the profile trace files to the directory. This takes time: for example, for about 100 requests' worth of data for a Llama 70B model, it takes about 10 minutes to flush on an H100.
+Set the environment variable `VLLM_RPC_TIMEOUT` to a large value before you start the server, for example 30 minutes:
+`export VLLM_RPC_TIMEOUT=1800000`
+```
+
+## Example commands and usage
+
+### Offline Inference
+
+Refer to <gh-file:examples/offline_inference_with_profiler.py> for an example.
+
+### OpenAI Server
+
+```bash
+VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
+```
+
+benchmark_serving.py:
+
+```bash
+python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2
+```
diff --git a/docs/source/contributing/profiling/profiling_index.rst b/docs/source/contributing/profiling/profiling_index.rst
deleted file mode 100644
index a422b1fcda521..0000000000000
--- a/docs/source/contributing/profiling/profiling_index.rst
+++ /dev/null
@@ -1,48 +0,0 @@
-==============
-Profiling vLLM
-==============
-
-We support tracing vLLM workers using the ``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/``
-
-The OpenAI server also needs to be started with the ``VLLM_TORCH_PROFILER_DIR`` environment variable set.
-
-When using ``benchmarks/benchmark_serving.py``, you can enable profiling by passing the ``--profile`` flag.
-
-.. warning::
-
-   Only enable profiling in a development environment. 
-
-
-Traces can be visualized using https://ui.perfetto.dev/.
-
-.. tip::
-
-   Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
-
-.. tip::
-
-   To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
-   Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
-   ``export VLLM_RPC_TIMEOUT=1800000``
-  
-Example commands and usage:
-===========================
-
-Offline Inference:
-------------------
-
-Refer to `examples/offline_inference_with_profiler.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py>`_ for an example.
-
-
-OpenAI Server:
---------------
-
-.. code-block:: bash
-
-    VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B 
-
-benchmark_serving.py:
-
-.. code-block:: bash
-
-    python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2 
\ No newline at end of file
diff --git a/docs/source/design/arch_overview.rst b/docs/source/design/arch_overview.md
similarity index 54%
rename from docs/source/design/arch_overview.rst
rename to docs/source/design/arch_overview.md
index bc3f509f0a66e..475a3e5fa9ddc 100644
--- a/docs/source/design/arch_overview.rst
+++ b/docs/source/design/arch_overview.md
@@ -1,25 +1,24 @@
-.. _arch_overview:
+(arch-overview)=
 
-Architecture Overview
-======================
+# Architecture Overview
 
 This document provides an overview of the vLLM architecture.
 
-.. contents:: Table of Contents
-    :local:
-    :depth: 2
+```{contents} Table of Contents
+:depth: 2
+:local: true
+```
 
-Entrypoints
------------
+## Entrypoints
 
 vLLM provides a number of entrypoints for interacting with the system. The
 following diagram shows the relationship between them.
 
-.. image:: /assets/design/arch_overview/entrypoints.excalidraw.png
-    :alt: Entrypoints Diagram
+```{image} /assets/design/arch_overview/entrypoints.excalidraw.png
+:alt: Entrypoints Diagram
+```
 
-LLM Class
-^^^^^^^^^
+### LLM Class
 
 The LLM class provides the primary Python interface for doing offline inference,
 which is interacting with a model without using a separate model inference
@@ -27,75 +26,70 @@ server.
 
 Here is a sample of `LLM` class usage:
 
-.. code-block:: python
+```python
+from vllm import LLM, SamplingParams
 
-    from vllm import LLM, SamplingParams
+# Define a list of input prompts
+prompts = [
+    "Hello, my name is",
+    "The capital of France is",
+    "The largest ocean is",
+]
 
-    # Define a list of input prompts
-    prompts = [
-        "Hello, my name is",
-        "The capital of France is",
-        "The largest ocean is",
-    ]
+# Define sampling parameters
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-    # Define sampling parameters
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+# Initialize the LLM engine with the OPT-125M model
+llm = LLM(model="facebook/opt-125m")
 
-    # Initialize the LLM engine with the OPT-125M model
-    llm = LLM(model="facebook/opt-125m")
+# Generate outputs for the input prompts
+outputs = llm.generate(prompts, sampling_params)
 
-    # Generate outputs for the input prompts
-    outputs = llm.generate(prompts, sampling_params)
+# Print the generated outputs
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
 
-    # Print the generated outputs
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-More API details can be found in the :doc:`Offline Inference
+More API details can be found in the {doc}`Offline Inference
 </dev/offline_inference/offline_index>` section of the API docs.
 
-The code for the `LLM` class can be found in `vllm/entrypoints/llm.py
-<https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py>`_.
+The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
 
-OpenAI-compatible API server
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+### OpenAI-compatible API server
 
 The second primary interface to vLLM is via its OpenAI-compatible API server.
 This server can be started using the `vllm serve` command.
 
-.. code-block:: bash
-
-    vllm serve <model>
+```bash
+vllm serve <model>
+```
 
-The code for the `vllm` CLI can be found in `vllm/scripts.py
-<https://github.com/vllm-project/vllm/blob/main/vllm/scripts.py>`_.
+The code for the `vllm` CLI can be found in <gh-file:vllm/scripts.py>.
 
 Sometimes you may see the API server entrypoint used directly instead of via the
 `vllm` CLI command. For example:
 
-.. code-block:: bash
-
-    python -m vllm.entrypoints.openai.api_server --model <model>
+```bash
+python -m vllm.entrypoints.openai.api_server --model <model>
+```
 
-That code can be found in `vllm/entrypoints/openai/api_server.py
-<https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py>`_.
+That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
 
-More details on the API server can be found in the :doc:`OpenAI Compatible
+More details on the API server can be found in the {doc}`OpenAI Compatible
 Server </serving/openai_compatible_server>` document.
 
-LLM Engine
-----------
+## LLM Engine
 
 The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of
 the vLLM system, handling model inference and asynchronous request processing.
 
-.. image:: /assets/design/arch_overview/llm_engine.excalidraw.png
-    :alt: LLMEngine Diagram
+```{image} /assets/design/arch_overview/llm_engine.excalidraw.png
+:alt: LLMEngine Diagram
+```
 
-LLMEngine
-^^^^^^^^^
+### LLMEngine
 
 The `LLMEngine` class is the core component of the vLLM engine. It is
 responsible for receiving requests from clients and generating outputs from the
@@ -105,21 +99,15 @@ processing.
 
 - **Input Processing**: Handles tokenization of input text using the specified
   tokenizer.
-
 - **Scheduling**: Chooses which requests are processed in each step.
-
 - **Model Execution**: Manages the execution of the language model, including
   distributed execution across multiple GPUs.
-
 - **Output Processing**: Processes the outputs generated by the model, decoding the
   token IDs from a language model into human-readable text.
 
-The code for `LLMEngine` can be found in `vllm/engine/llm_engine.py`_.
-
-.. _vllm/engine/llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/llm_engine.py
+The code for `LLMEngine` can be found in <gh-file:vllm/engine/llm_engine.py>.
 
-AsyncLLMEngine
-^^^^^^^^^^^^^^
+### AsyncLLMEngine
 
 The `AsyncLLMEngine` class is an asynchronous wrapper for the `LLMEngine` class.
 It uses `asyncio` to create a background loop that continuously processes
@@ -127,55 +115,46 @@ incoming requests. The `AsyncLLMEngine` is designed for online serving, where it
 can handle multiple concurrent requests and stream outputs to clients.
 
 The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
-API server that serves as a simpler example in
-`vllm/entrypoints/api_server.py`_.
-
-.. _vllm/entrypoints/api_server.py: https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/api_server.py
+API server that serves as a simpler example in <gh-file:vllm/entrypoints/api_server.py>.
 
-The code for `AsyncLLMEngine` can be found in `vllm/engine/async_llm_engine.py`_.
+The code for `AsyncLLMEngine` can be found in <gh-file:vllm/engine/async_llm_engine.py>.
 
-.. _vllm/engine/async_llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/async_llm_engine.py
-
-Worker
-------
+## Worker
 
 A worker is a process that runs the model inference. vLLM follows the common
 practice of using one process to control one accelerator device, such as GPUs.
 For example, if we use tensor parallelism of size 2 and pipeline parallelism of
 size 2, we will have 4 workers in total. Workers are identified by their
-``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while
-``local_rank`` is mainly used for assigning the accelerator device and accessing
+`rank` and `local_rank`. `rank` is used for global orchestration, while
+`local_rank` is mainly used for assigning the accelerator device and accessing
 local resources such as the file system and shared memory.
 
-Model Runner
-------------
+## Model Runner
 
 Every worker has one model runner object, responsible for loading and running
 the model. Much of the model execution logic resides here, such as preparing
 input tensors and capturing cudagraphs.
 
-Model
------
+## Model
 
 Every model runner object has one model object, which is the actual
-``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various
+`torch.nn.Module` instance. See [Integration with HuggingFace](#huggingface-integration) for how various
 configurations affect the class we ultimately get.
 
-Class Hierarchy
----------------
+## Class Hierarchy
 
 The following figure shows the class hierarchy of vLLM:
 
-    .. figure:: /assets/design/hierarchy.png
-        :alt: query
-        :width: 100%
-        :align: center
+> ```{figure} /assets/design/hierarchy.png
+> :align: center
+> :alt: query
+> :width: 100%
+> ```
 
 There are several important design choices behind this class hierarchy:
 
-1. **Extensibility**: All classes in the hierarchy accept a configuration object
-containing all the necessary information. The `VllmConfig
-<https://github.com/vllm-project/vllm/blob/d1c6799b8870e513bf4f2305cbf6cda9fc3d773b/vllm/config.py#L2036>`__
+1\. **Extensibility**: All classes in the hierarchy accept a configuration object
+containing all the necessary information. The [VllmConfig](https://github.com/vllm-project/vllm/blob/d1c6799b8870e513bf4f2305cbf6cda9fc3d773b/vllm/config.py#L2036)
 class is the main configuration object that is passed around. The class
 hierarchy is quite deep, and every class needs to read the configuration it is
 interested in. By encapsulating all configurations in one object, we can easily
@@ -188,7 +167,7 @@ the `VllmConfig` class, and the model runner can access it directly. We don't
 need to change the constructor of the engine, worker, or model class to pass the
 new configuration option.
 
-2. **Uniformity**: The model runner needs a unified interface to create and
+2\. **Uniformity**: The model runner needs a unified interface to create and
 initialize the model. vLLM supports more than 50 types of popular open-source
 models. Each model has its own initialization logic. If the constructor
 signature varies with models, the model runner does not know how to call the
@@ -200,46 +179,46 @@ of a vision model and a language model. By making the constructor uniform, we
 can easily create a vision model and a language model and compose them into a
 vision-language model.
 
-.. note::
-
-    To support this change, all vLLM models' signatures have been updated to:
-
-    .. code-block:: python
-
-        def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-
-    To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
-
-    .. code-block:: python
-
-        class MyOldModel(nn.Module):
-            def __init__(
-                self,
-                config,
-                cache_config: Optional[CacheConfig] = None,
-                quant_config: Optional[QuantizationConfig] = None,
-                lora_config: Optional[LoRAConfig] = None,
-                prefix: str = "",
-            ) -> None:
-                ...
-
-        from vllm.config import VllmConfig
-        class MyNewModel(MyOldModel):
-            def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-                config = vllm_config.model_config.hf_config
-                cache_config = vllm_config.cache_config
-                quant_config = vllm_config.quant_config
-                lora_config = vllm_config.lora_config
-                super().__init__(config, cache_config, quant_config, lora_config, prefix)
-
-        if __version__ >= "0.6.4":
-            MyModel = MyNewModel
-        else:
-            MyModel = MyOldModel
-
-    This way, the model can work with both old and new versions of vLLM.
-
-3. **Sharding and Quantization at Initialization**: Certain features require
+````{note}
+To support this change, all vLLM models' signatures have been updated to:
+
+```python
+def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+```
+
+To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
+
+```python
+class MyOldModel(nn.Module):
+    def __init__(
+        self,
+        config,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        lora_config: Optional[LoRAConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        ...
+
+from vllm.config import VllmConfig
+class MyNewModel(MyOldModel):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+        super().__init__(config, cache_config, quant_config, lora_config, prefix)
+
+if __version__ >= "0.6.4":
+    MyModel = MyNewModel
+else:
+    MyModel = MyOldModel
+```
+
+This way, the model can work with both old and new versions of vLLM.
+````
+
+3\. **Sharding and Quantization at Initialization**: Certain features require
 changing the model weights. For example, tensor parallelism needs to shard the
 model weights, and quantization needs to quantize the model weights. There are
 two possible ways to implement this feature. One way is to change the model
@@ -252,23 +231,23 @@ initialized, we need to load the full 810GB weights to every GPU and then shard
 the weights, leading to a huge memory overhead. Instead, if we shard the weights
 during the model initialization, every layer will only create a shard of the
 weights it needs, leading to a much smaller memory overhead. The same idea
-applies to quantization. Note that we also add an additional argument ``prefix``
+applies to quantization. Note that we also add an additional argument `prefix`
 to the model's constructor so that the model can initialize itself differently
 based on the prefix. This is useful for non-uniform quantization, where
-different parts of the model are quantized differently. The ``prefix`` is
-usually an empty string for the top-level model and a string like ``"vision"``
-or ``"language"`` for the sub-models. In general, it matches the name of the
+different parts of the model are quantized differently. The `prefix` is
+usually an empty string for the top-level model and a string like `"vision"`
+or `"language"` for the sub-models. In general, it matches the name of the
 module's state dict in the checkpoint file.
 
 One disadvantage of this design is that it is hard to write unit tests for
 individual components in vLLM because every component needs to be initialized by
 a complete config object. We solve this problem by providing a default
 initialization function that creates a default config object with all fields set
-to ``None``. If the component we want to test only cares about a few fields in
+to `None`. If the component we want to test only cares about a few fields in
 the config object, we can create a default config object and set the fields we
 care about. This way, we can test the component in isolation. Note that many
 tests in vLLM are end-to-end tests that test the whole system, so this is not a
 big problem.
 
-In summary, the complete config object ``VllmConfig`` can be treated as an
+In summary, the complete config object `VllmConfig` can be treated as an
 engine-level global state that is shared among all vLLM classes.
diff --git a/docs/source/design/huggingface_integration.md b/docs/source/design/huggingface_integration.md
new file mode 100644
index 0000000000000..99b4cb56424c6
--- /dev/null
+++ b/docs/source/design/huggingface_integration.md
@@ -0,0 +1,36 @@
+(huggingface-integration)=
+
+# Integration with HuggingFace
+
+This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`.
+
+Let's say we want to serve the popular Qwen model by running `vllm serve Qwen/Qwen2-7B`.
+
+1. The `model` argument is `Qwen/Qwen2-7B`. vLLM determines whether this model exists by checking for the corresponding config file `config.json`. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182) for the implementation. Within this process:
+
+   - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path.
+   - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works.
+   - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file.
+
+2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186) for the implementation.
+
+3. Next, vLLM [inspects](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189) the `model_type` field in the config dictionary to [generate](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L190-L216) the config object to use. There are some `model_type` values that vLLM directly supports; see [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48) for the list. If the `model_type` is not in the list, vLLM will use [AutoConfig.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained) to load the config class, with `model`, `--revision`, and `--trust_remote_code` as the arguments. Please note that:
+
+   - HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example.
+   - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled.
+
+4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see [here](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244) for the implementation.
+
+5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the `architectures` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in [its registry](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/registry.py#L80). If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For `Qwen/Qwen2-7B`, the `architectures` field is `["Qwen2ForCausalLM"]`, which corresponds to the `Qwen2ForCausalLM` class in [vLLM's code](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/qwen2.py#L364). This class will initialize itself depending on various configs.
+
+Beyond that, there are two more things vLLM depends on HuggingFace for.
+
+1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24).
+
+2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights.
+
+   - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385).
+
+This completes the integration between vLLM and HuggingFace.
+
+In summary, vLLM reads the config file `config.json`, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM or HuggingFace transformers, or loads the config class from the model's repository.
diff --git a/docs/source/design/huggingface_integration.rst b/docs/source/design/huggingface_integration.rst
deleted file mode 100644
index e6c1cea6001ea..0000000000000
--- a/docs/source/design/huggingface_integration.rst
+++ /dev/null
@@ -1,40 +0,0 @@
-.. _huggingface_integration:
-
-Integration with HuggingFace
-===================================
-
-This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run ``vllm serve``.
-
-Let's say we want to serve the popular QWen model by running ``vllm serve Qwen/Qwen2-7B``.
-
-1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM determines whether this model exists by checking for the corresponding config file ``config.json``. See this `code snippet <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182>`__ for the implementation. Within this process:
-
-   - If the ``model`` argument corresponds to an existing local path, vLLM will load the config file directly from this path.
-   
-   - If the ``model`` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the ``model`` argument as the model name and the ``--revision`` argument as the revision. See `their website <https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome>`__ for more information on how the HuggingFace cache works.
-
-   - If the ``model`` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to `this function <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91>`__ for the implementation. The input arguments include the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json <https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json>`__ file.
-
-2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this `code snippet <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186>`__ for the implementation.
-
-3. Next, vLLM `inspects <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189>`__ the ``model_type`` field in the config dictionary to `generate <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#190-L216>`__ the config object to use. There are some ``model_type`` values that vLLM directly supports; see `here <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48>`__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained <https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained>`__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. Please note that:
-
-   - HuggingFace also has its own logic to determine the config class to use. It will again use the ``model_type`` field to search for the class name in the transformers library; see `here <https://github.com/huggingface/transformers/tree/main/src/transformers/models>`__ for the list of supported models. If the ``model_type`` is not found, HuggingFace will use the ``auto_map`` field from the config JSON file to determine the class name. Specifically, it is the ``AutoConfig`` field under ``auto_map``. See `DeepSeek <https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json>`__ for an example.
-
-   - The ``AutoConfig`` field under ``auto_map`` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the ``from_pretrained`` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when ``--trust_remote_code`` is enabled.
-
-4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244>`__ for the implementation.
-
-5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the ``architectures`` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in `its registry <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/registry.py#L80>`__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``, which corresponds to the ``Qwen2ForCausalLM`` class in `vLLM's code <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/qwen2.py#L364>`__. This class will initialize itself depending on various configs.
-
-Beyond that, there are two more things vLLM depends on HuggingFace for.
-
-1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using `AutoTokenizer.from_pretrained <https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained>`__ with the ``model`` argument as the model name and the ``--revision`` argument as the revision. It is also possible to use a tokenizer from another model by specifying the ``--tokenizer`` argument in the ``vllm serve`` command. Other relevant arguments are ``--tokenizer-revision`` and ``--tokenizer-mode``. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the `get_tokenizer <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87>`__ function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in `get_cached_tokenizer <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24>`__.
-
-2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the ``model`` argument as the model name and the ``--revision`` argument as the revision. vLLM provides the argument ``--load-format`` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass ``--load-format dummy`` to skip downloading the weights.
-
-   - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation <https://huggingface.co/docs/safetensors/en/index>`__ for more information on the safetensors format. This part of the logic can be found `here <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385>`__. Please note that:
-
-This completes the integration between vLLM and HuggingFace.
-
-In summary, vLLM reads the config file ``config.json``, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository.
diff --git a/docs/source/design/input_processing/input_processing_pipeline.md b/docs/source/design/input_processing/input_processing_pipeline.md
new file mode 100644
index 0000000000000..bb16920e3d0c0
--- /dev/null
+++ b/docs/source/design/input_processing/input_processing_pipeline.md
@@ -0,0 +1,19 @@
+(input-processing-pipeline)=
+
+# Input Processing Pipeline
+
+1. Input data is passed to {class}`~vllm.LLMEngine` (or {class}`~vllm.AsyncLLMEngine`).
+
+2. Tokenize the data if necessary.
+
+3. Process the inputs using {meth}`INPUT_REGISTRY.process_input <vllm.inputs.registry.InputRegistry.process_input>`.
+
+   - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.
+
+4. Send the processed inputs to {class}`~vllm.executor.executor_base.ExecutorBase`.
+
+5. Distribute the inputs via {class}`~vllm.worker.worker_base.WorkerBase` to {class}`~vllm.worker.model_runner_base.ModelRunnerBase`.
+
+6. If the data contains multi-modal data, convert it into keyword arguments using {meth}`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.
+
+   - For example, convert a {class}`PIL.Image.Image` input to its pixel values for a vision model.
diff --git a/docs/source/design/input_processing/input_processing_pipeline.rst b/docs/source/design/input_processing/input_processing_pipeline.rst
deleted file mode 100644
index 48abec8f75286..0000000000000
--- a/docs/source/design/input_processing/input_processing_pipeline.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-.. _input_processing_pipeline:
-
-Input Processing Pipeline
-=========================
-
-1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`).
-
-2. Tokenize the data if necessary.
-
-3. Process the inputs using :meth:`INPUT_REGISTRY.process_input <vllm.inputs.registry.InputRegistry.process_input>`.
-
-   - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.
-
-4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`.
-
-5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`.
-
-6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.
-
-   - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision model.
diff --git a/docs/source/design/input_processing/model_inputs_index.md b/docs/source/design/input_processing/model_inputs_index.md
new file mode 100644
index 0000000000000..cb415366e5a66
--- /dev/null
+++ b/docs/source/design/input_processing/model_inputs_index.md
@@ -0,0 +1,43 @@
+(input-processing)=
+
+# Input Processing
+
+```{eval-rst}
+.. currentmodule:: vllm.inputs
+```
+
+Each model can override parts of vLLM's [input processing pipeline](#input-processing-pipeline) via
+{data}`~vllm.inputs.INPUT_REGISTRY` and {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+
+Currently, this mechanism is only utilized in [multi-modal](#multi-modality) models for preprocessing multi-modal input
+data in addition to input prompt, but it can be extended to text-only language models when needed.
+
+## Guides
+
+```{toctree}
+:maxdepth: 1
+
+input_processing_pipeline
+```
+
+## Module Contents
+
+### LLM Engine Inputs
+
+```{eval-rst}
+.. autoclass:: vllm.inputs.DecoderOnlyInputs
+    :members:
+    :show-inheritance:
+```
+
+### Registry
+
+```{eval-rst}
+.. autodata:: vllm.inputs.INPUT_REGISTRY
+```
+
+```{eval-rst}
+.. automodule:: vllm.inputs.registry
+    :members:
+    :show-inheritance:
+```
diff --git a/docs/source/design/input_processing/model_inputs_index.rst b/docs/source/design/input_processing/model_inputs_index.rst
deleted file mode 100644
index f0ec1fea15ddb..0000000000000
--- a/docs/source/design/input_processing/model_inputs_index.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-.. _input_processing:
-
-Input Processing
-================
-
-.. currentmodule:: vllm.inputs
-
-Each model can override parts of vLLM's :ref:`input processing pipeline <input_processing_pipeline>` via
-:data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-
-Currently, this mechanism is only utilized in :ref:`multi-modal <multi_modality>` models for preprocessing multi-modal input 
-data in addition to input prompt, but it can be extended to text-only language models when needed.
-
-Guides
-++++++
-
-.. toctree::
-   :maxdepth: 1
-
-   input_processing_pipeline
-
-Module Contents
-+++++++++++++++
-
-LLM Engine Inputs
------------------
-
-.. autoclass:: vllm.inputs.DecoderOnlyInputs
-    :members:
-    :show-inheritance:
-
-Registry
---------
-
-.. autodata:: vllm.inputs.INPUT_REGISTRY
-
-.. automodule:: vllm.inputs.registry
-    :members:
-    :show-inheritance:
diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md
new file mode 100644
index 0000000000000..c21985b36eb3a
--- /dev/null
+++ b/docs/source/design/kernel/paged_attention.md
@@ -0,0 +1,527 @@
+# vLLM Paged Attention
+
+- Currently, vLLM utilizes its own implementation of a multi-head query
+  attention kernel (`csrc/attention/attention_kernels.cu`).
+  This kernel is designed to be compatible with
+  vLLM's paged KV caches, where the key and value cache are stored in
+  separate blocks (note that this block concept differs from the GPU
+  thread block. So in the rest of this document, I will refer to a vLLM
+  paged attention block as "block", and to a GPU thread block as
+  "thread block").
+- To achieve high performance, this kernel relies on a specially
+  designed memory layout and access method, specifically when threads
+  read data from global memory to shared memory. The purpose of this
+  document is to provide a high-level explanation of the kernel
+  implementation step by step, aiding those who wish to learn about the
+  vLLM multi-head query attention kernel. After going through this
+  document, users will likely have a better understanding and find it easier
+  to follow the actual implementation.
+- Please note that this document may not cover all details, such as how
+  to calculate the correct index for the corresponding data or the dot
+  multiplication implementation. However, after reading this document
+  and becoming familiar with the high-level logic flow, it should be
+  easier for you to read the actual code and understand the details.
+
+## Inputs
+
+- The kernel function takes a list of arguments for the current thread
+  to perform its assigned work. The three most important arguments are
+  the input pointers `q`, `k_cache`, and `v_cache`, which point
+  to query, key, and value data on global memory that need to be read
+  and processed. The output pointer `out` points to global memory
+  where the result should be written. These four pointers actually
+  refer to multi-dimensional arrays, but each thread only accesses the
+  portion of data assigned to it. I have omitted all other runtime
+  parameters here for simplicity.
+
+  ```cpp
+  template<
+  typename scalar_t,
+  int HEAD_SIZE,
+  int BLOCK_SIZE,
+  int NUM_THREADS,
+  int PARTITION_SIZE = 0>
+  __device__ void paged_attention_kernel(
+  ... // Other side args.
+  const scalar_t* __restrict__ out,       // [num_seqs, num_heads, max_num_partitions, head_size]
+  const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
+  const scalar_t* __restrict__ k_cache,   // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+  const scalar_t* __restrict__ v_cache,   // [num_blocks, num_kv_heads, head_size, block_size]
+  ... // Other side args.
+  )
+  ```
+
+- There is also a list of template arguments above the function
+  signature that are determined during compilation time. `scalar_t`
+  represents the data type of the query, key, and value data elements,
+  such as FP16. `HEAD_SIZE` indicates the number of elements in each
+  head. `BLOCK_SIZE` refers to the number of tokens in each block.
+  `NUM_THREADS` denotes the number of threads in each thread block.
+  `PARTITION_SIZE` represents the number of tensor parallel GPUs (For
+  simplicity, we assume this is 0 and tensor parallel is disabled).
+
+- With these arguments, we need to perform a sequence of preparations.
+  This includes calculating the current head index, block index, and
+  other necessary variables. However, for now, we can ignore these
+  preparations and proceed directly to the actual calculations. It will
+  be easier to understand them once we grasp the entire flow.
+
+## Concepts
+
+- Just before we dive into the calculation flow, I want to describe a
+  few concepts that are needed for later sections. However, you may
+  skip this section and return later if you encounter any confusing
+  terminologies.
+- **Sequence**: A sequence represents a client request. For example,
+  the data pointed to by `q` has a shape of
+  `[num_seqs, num_heads, head_size]`. This means that `q` points to a
+  total of `num_seqs` query sequences. Since this
+  kernel is a single query attention kernel, each sequence only has one
+  query token. Hence, the `num_seqs` equals the total number of tokens
+  that are processed in the batch.
+- **Context**: The context consists of the generated tokens from the
+  sequence. For instance, `["What", "is", "your"]` are the context
+  tokens, and the input query token is `"name"`. The model might
+  generate the token `"?"`.
+- **Vec**: The vec is a list of elements that are fetched and
+  calculated together. For query and key data, the vec size
+  (`VEC_SIZE`) is determined so that each thread group can fetch and
+  calculate 16 bytes of data at a time. For value data, the vec size
+  (`V_VEC_SIZE`) is determined so that each thread can fetch and
+  calculate 16 bytes of data at a time. For example, if the
+  `scalar_t` is FP16 (2 bytes) and `THREAD_GROUP_SIZE` is 2, the
+  `VEC_SIZE` will be 4, while the `V_VEC_SIZE` will be 8.
+- **Thread group**: The thread group is a small group of
+  threads(`THREAD_GROUP_SIZE`) that fetches and calculates one
+  query token and one key token at a time. Each thread handles only a
+  portion of the token data. The total number of elements processed by
+  one thread group is referred to as `x`. For example, if the thread
+  group contains 2 threads and the head size is 8, then thread 0
+  handles the query and key elements at index 0, 2, 4, 6, while thread
+  1 handles the elements at index 1, 3, 5, 7.
+- **Block**: The key and value cache data in vLLM are split into
+  blocks. Each block stores data for a fixed number(`BLOCK_SIZE`)
+  of tokens at one head. Each block may contain only a portion of the
+  whole context tokens. For example, if the block size is 16 and the
+  head size is 128, then for one head, one block can store 16 * 128 =
+  2048 elements.
+- **Warp**: A warp is a group of 32 threads(`WARP_SIZE`) that
+  execute simultaneously on a streaming multiprocessor (SM). In this
+  kernel, each warp processes the calculation between one query token
+  and key tokens of one entire block at a time (it may process multiple
+  blocks in multiple iterations). For example, if there are 4 warps and
+  6 blocks for one context, the assignment would be like warp 0 handles
+  the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2
+  handles the 2nd block and warp 3 handles the 3rd block.
+- **Thread block**: A thread block is a group of
+  threads(`NUM_THREADS`) that can access the same shared memory.
+  Each thread block contains multiple warps(`NUM_WARPS`), and in
+  this kernel, each thread block processes the calculation between one
+  query token and key tokens of a whole context.
+- **Grid**: A grid is a collection of thread blocks and defines the
+  shape of the collection. In this kernel, the shape is
+  `(num_heads, num_seqs, max_num_partitions)`. Therefore, each thread
+  block only handles the calculation for one head, one sequence, and
+  one partition.
+
+## Query
+
+- This section will introduce how query data is stored in memory and
+  fetched by each thread. As mentioned above, each thread group fetches
+  one query token data, while each thread itself only handles a part of
+  one query token data. Within each warp, every thread group will fetch
+  the same query token data, but will multiply it with different key
+  token data.
+
+  ```cpp
+  const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
+  ```
+
+  ```{figure} ../../assets/kernel/query.png
+  :align: center
+  :alt: query
+  :width: 70%
+
+  Query data of one token at one head
+  ```
+
+- Each thread defines its own `q_ptr` which points to the assigned
+  query token data on global memory. For example, if `VEC_SIZE` is 4
+  and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
+  total of 128 elements divided into 128 / 4 = 32 vecs.
+
+  ```{figure} ../../assets/kernel/q_vecs.png
+  :align: center
+  :alt: q_vecs
+  :width: 70%
+
+  `q_vecs` for one thread group
+  ```
+
+  ```cpp
+  __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
+  ```
+
+- Next, we need to read the global memory data pointed to by `q_ptr`
+  into shared memory as `q_vecs`. It is important to note that each
+  thread's vecs are assigned to a different row. For example, if the
+  `THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs,
+  while thread 1 handles the 1st row vecs. By reading the query data in
+  this way, neighboring threads like thread 0 and thread 1 can read
+  neighbor memory, achieving the memory coalescing to improve
+  performance.
+
+## Key
+
+- Similar to the "Query" section, this section introduces memory layout
+  and assignment for keys. While each thread group only handles one
+  query token per kernel run, it may handle multiple key tokens across
+  multiple iterations. Meanwhile, each warp will process multiple blocks
+  of key tokens in multiple iterations, ensuring that all context
+  tokens are processed by the entire thread block after the kernel run.
+  In this context, "handle" refers to performing the dot multiplication
+  between query data and key data.
+
+  ```cpp
+  const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
+                      + kv_head_idx * kv_head_stride
+                      + physical_block_offset * x;
+  ```
+
+- Unlike `q_ptr`, `k_ptr` in each thread will point to a different
+  key token at different iterations. As shown above, `k_ptr`
+  points to key token data based on `k_cache` at assigned block,
+  assigned head and assigned token.
+
+  ```{figure} ../../assets/kernel/key.png
+  :align: center
+  :alt: key
+  :width: 70%
+
+  Key data of all context tokens at one head
+  ```
+
+- The diagram above illustrates the memory layout for key data. It
+  assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
+  8, `THREAD_GROUP_SIZE` is 2, and there are a total of 4 warps. Each
+  rectangle represents all the elements for one key token at one head,
+  which will be processed by one thread group. The left half shows the
+  total 16 blocks of key token data for warp 0, while the right half
+  represents the remaining key token data for other warps or
+  iterations. Inside each rectangle, there are a total of 32 vecs (128
+  elements for one token) that will be processed by 2 threads (one
+  thread group) separately.
+
+  ```{figure} ../../assets/kernel/k_vecs.png
+  :align: center
+  :alt: k_vecs
+  :width: 70%
+
+  `k_vecs` for one thread
+  ```
+
+  ```cpp
+  K_vec k_vecs[NUM_VECS_PER_THREAD]
+  ```
+
+- Next, we need to read the key token data from `k_ptr` and store
+  them on register memory as `k_vecs`. We use register memory for
+  `k_vecs` because it will only be accessed by one thread once,
+  whereas `q_vecs` will be accessed by multiple threads multiple
+  times. Each `k_vecs` will contain multiple vectors for later
+  calculation. Each vec will be set at each inner iteration. The
+  assignment of vecs allows neighboring threads in a warp to read
+  neighboring memory together, which again promotes the memory
+  coalescing. For instance, thread 0 will read vec 0, while thread 1
+  will read vec 1. In the next inner loop, thread 0 will read vec 2,
+  while thread 1 will read vec 3, and so on.
+
+- You may still be a little confused about the overall flow. Don't
+  worry, please keep reading the next "QK" section. It will illustrate
+  the query and key calculation flow in a clearer and higher-level
+  manner.
+
+## QK
+
+- As shown in the pseudo code below, before the entire for loop block, we
+  fetch the query data for one token and store it in `q_vecs`. Then,
+  in the outer for loop, we iterate through different `k_ptrs` that
+  point to different tokens and prepare the `k_vecs` in the inner for
+  loop. Finally, we perform the dot multiplication between the
+  `q_vecs` and each `k_vecs`.
+
+  ```cpp
+  q_vecs = ...
+  for ... {
+     k_ptr = ...
+     for ... {
+        k_vecs[i] = ...
+     }
+     ...
+     float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
+  }
+  ```
+
+- As mentioned before, for each thread, it only fetches part of the
+  query and key token data at a time. However, a reduction across the
+  thread group happens inside `Qk_dot<>::dot`. So the `qk`
+  returned here is not just between part of the query and key token dot
+  multiplication, but actually a full result between entire query and
+  key token data.
+
+- For example, if the value of `HEAD_SIZE` is 128 and
+  `THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain
+  total 64 elements. However, the returned `qk` is actually the
+  result of dot multiplication between 128 query elements and 128 key
+  elements. If you want to learn more about the details of the dot
+  multiplication and reduction, you may refer to the implementation of
+  `Qk_dot<>::dot`. However, for the sake of simplicity, I will not
+  cover it in this document.
+
+## Softmax
+
+- Next, we need to calculate the normalized softmax for all `qk`s,
+  as shown below, where each $x$ represents a `qk`. To do this,
+  we must obtain the reduced value of `qk_max`($m(x)$) and
+  the `exp_sum`($\ell(x)$) of all `qk`s. The reduction
+  should be performed across the entire thread block, encompassing
+  results between the query token and all context key tokens.
+
+  ```{math}
+  :nowrap: true
+
+  \begin{gather*}
+  m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\
+  \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
+  \end{gather*}
+  ```
+
+### `qk_max` and `logits`
+
+- Just right after we get the `qk` result, we can set the temporary
+  `logits` result with `qk` (In the end, the `logits` should
+  store the normalized softmax result). Also we can compare and collect
+  the `qk_max` for all `qk`s that are calculated by current
+  thread group.
+
+  ```cpp
+  if (thread_group_offset == 0) {
+     const bool mask = token_idx >= context_len;
+     logits[token_idx - start_token_idx] = mask ? 0.f : qk;
+     qk_max = mask ? qk_max : fmaxf(qk_max, qk);
+  }
+  ```
+
+- Please note that the `logits` here is on shared memory, so each
+  thread group will set the fields for its own assigned context tokens.
+  Overall, the size of logits should be number of context tokens.
+
+  ```cpp
+  for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
+      qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
+  }
+
+  if (lane == 0) {
+     red_smem[warp_idx] = qk_max;
+  }
+  ```
+
+- Then we need to get the reduced `qk_max` across each warp. The main
+  idea is to make the threads in a warp communicate with each other and
+  get the final max `qk`.
+
+  ```cpp
+  for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+      qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
+  }
+  qk_max = VLLM_SHFL_SYNC(qk_max, 0);
+  ```
+
+- Finally, we can get the reduced `qk_max` from the whole thread block by
+  comparing the `qk_max` from all warps in this thread block. Then we
+  need to broadcast the final result to each thread.
+
+### `exp_sum`
+
+- Similar to `qk_max`, we need to get the reduced sum value from the
+  entire thread block too.
+
+  ```cpp
+  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+      float val = __expf(logits[i] - qk_max);
+      logits[i] = val;
+      exp_sum += val;
+  }
+  ...
+  exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum);
+  ```
+
+- Firstly, sum all exp values from each thread group, and meanwhile,
+  convert each entry of `logits` from `qk` to `exp(qk - qk_max)`.
+  Please note, the `qk_max` here is already the max `qk` across the
+  whole thread block. And then we can do reduction for `exp_sum`
+  across whole thread block just like the `qk_max`.
+
+  ```cpp
+  const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
+  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+     logits[i] *= inv_sum;
+  }
+  ```
+
+- Finally, with the reduced `qk_max` and `exp_sum`, we can obtain
+  the final normalized softmax result as `logits`. This `logits`
+  variable will be used for dot multiplication with the value data in
+  later steps. Now, it should store the normalized softmax result of
+  `qk` for all assigned context tokens.
+
+## Value
+
+```{figure} ../../assets/kernel/value.png
+:align: center
+:alt: value
+:width: 70%
+
+Value data of all context tokens at one head
+```
+
+```{figure} ../../assets/kernel/logits_vec.png
+:align: center
+:alt: logits_vec
+:width: 50%
+
+`logits_vec` for one thread
+```
+
+```{figure} ../../assets/kernel/v_vec.png
+:align: center
+:alt: v_vec
+:width: 70%
+
+List of `v_vec` for one thread
+```
+
+- Now we need to retrieve the value data and perform dot multiplication
+  with `logits`. Unlike query and key, there is no thread group
+  concept for value data. As shown in the diagram, different from key token
+  memory layout, elements from the same column correspond to the same
+  value token. For one block of value data, there are `HEAD_SIZE` of
+  rows and `BLOCK_SIZE` of columns that are split into multiple
+  `v_vecs`.
+
+- Each thread always fetches `V_VEC_SIZE` elements from the same
+  `V_VEC_SIZE` of tokens at a time. As a result, a single thread
+  retrieves multiple `v_vec`s from different rows and the same
+  columns through multiple inner iterations. For each `v_vec`, it
+  needs to be dot multiplied with the corresponding `logits_vec`,
+  which is also `V_VEC_SIZE` elements from `logits`. Overall, with
+  multiple inner iterations, each warp will process one block of value
+  tokens. And with multiple outer iterations, the whole context value
+  tokens are processed.
+
+  ```cpp
+  float accs[NUM_ROWS_PER_THREAD];
+  for ... { // Iteration over different blocks.
+      logits_vec = ...
+      for ... { // Iteration over different rows.
+          v_vec = ...
+          ...
+          accs[i] += dot(logits_vec, v_vec);
+      }
+  }
+  ```
+
+- As shown in the above pseudo code, in the outer loop, similar to
+  `k_ptr`, `logits_vec` iterates over different blocks and reads
+  `V_VEC_SIZE` elements from `logits`. In the inner loop, each
+  thread reads `V_VEC_SIZE` elements from the same tokens as a
+  `v_vec` and performs dot multiplication. It is important to note
+  that in each inner iteration, the thread fetches different head
+  position elements for the same tokens. The dot result is then
+  accumulated in `accs`. Therefore, each entry of `accs` is mapped
+  to a head position assigned to the current thread.
+
+- For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each
+  thread fetches 8 value elements for 8 tokens at a time. Each element
+  is from different tokens at the same head position. If `HEAD_SIZE`
+  is 128 and `WARP_SIZE` is 32, for each inner loop, a warp needs to
+  fetch `WARP_SIZE * V_VEC_SIZE = 256` elements. This means there are
+  a total of 128 * 16 / 256 = 8 inner iterations for a warp to handle
+  a whole block of value tokens. And each `accs` in each thread
+  contains 8 elements that accumulated at 8 different head positions.
+  For the thread 0, the `accs` variable will have 8 elements, which
+  are 0th, 32nd … 224th elements of a value head that are accumulated
+  from all assigned 8 tokens.
+
+## LV
+
+- Now, we need to perform reduction for `accs` within each warp. This
+  process allows each thread to accumulate the `accs` for the
+  assigned head positions of all tokens in one block.
+
+  ```cpp
+  for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+     float acc = accs[i];
+     for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
+        acc += VLLM_SHFL_XOR_SYNC(acc, mask);
+     }
+     accs[i] = acc;
+  }
+  ```
+
+- Next, we perform reduction for `accs` across all warps, allowing
+  each thread to have the accumulation of `accs` for the assigned
+  head positions of all context tokens. Please note that each `accs`
+  in every thread only stores the accumulation for a portion of
+  elements of the entire head for all context tokens. However, overall,
+  all results for output have been calculated but are just stored in
+  different thread register memory.
+
+  ```cpp
+  float* out_smem = reinterpret_cast<float*>(shared_mem);
+  for (int i = NUM_WARPS; i > 1; i /= 2) {
+      // Upper warps write to shared memory.
+      ...
+          float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
+          for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+                  ...
+          dst[row_idx] = accs[i];
+      }
+
+      // Lower warps update the output.
+          const float* src = &out_smem[warp_idx * HEAD_SIZE];
+      for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+                  ...
+          accs[i] += src[row_idx];
+      }
+
+          // Write out the accs.
+  }
+  ```
+
+## Output
+
+- Now we can write all of the calculated results from local register memory
+  to final output global memory.
+
+  ```cpp
+  scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+                  + head_idx * max_num_partitions * HEAD_SIZE
+                  + partition_idx * HEAD_SIZE;
+  ```
+
+- First, we need to define the `out_ptr` variable, which points to
+  the start address of the assigned sequence and assigned head.
+
+  ```cpp
+  for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+  const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+  if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+      from_float(*(out_ptr + row_idx), accs[i]);
+  }
+  }
+  ```
+
+- Finally, we need to iterate over different assigned head positions
+  and write out the corresponding accumulated result based on the
+  `out_ptr`.
diff --git a/docs/source/design/kernel/paged_attention.rst b/docs/source/design/kernel/paged_attention.rst
deleted file mode 100644
index ba4f7a2718158..0000000000000
--- a/docs/source/design/kernel/paged_attention.rst
+++ /dev/null
@@ -1,525 +0,0 @@
-vLLM Paged Attention
-====================
-
--  Currently, vLLM utilizes its own implementation of a multi-head query
-   attention kernel (``csrc/attention/attention_kernels.cu``). 
-   This kernel is designed to be compatible with
-   vLLM's paged KV caches, where the key and value cache are stored in
-   separate blocks (note that this block concept differs from the GPU
-   thread block. So in a later document, I will refer to vLLM paged
-   attention block as "block", while refer to GPU thread block as
-   "thread block").
--  To achieve high performance, this kernel relies on a specially
-   designed memory layout and access method, specifically when threads
-   read data from global memory to shared memory. The purpose of this
-   document is to provide a high-level explanation of the kernel
-   implementation step by step, aiding those who wish to learn about the
-   vLLM multi-head query attention kernel. After going through this 
-   document, users will likely have a better understanding and feel easier
-   to follow the actual implementation.
--  Please note that this document may not cover all details, such as how
-   to calculate the correct index for the corresponding data or the dot
-   multiplication implementation. However, after reading this document
-   and becoming familiar with the high-level logic flow, it should be
-   easier for you to read the actual code and understand the details.
-
-Inputs
-------
-
--  The kernel function takes a list of arguments for the current thread
-   to perform its assigned work. The three most important arguments are
-   the input pointers ``q``, ``k_cache``, and ``v_cache``, which point
-   to query, key, and value data on global memory that need to be read
-   and processed. The output pointer ``out`` points to global memory
-   where the result should be written. These four pointers actually
-   refer to multi-dimensional arrays, but each thread only accesses the
-   portion of data assigned to it. I have omitted all other runtime
-   parameters here for simplicity.
-
-   .. code:: cpp
-
-      template<
-      typename scalar_t,
-      int HEAD_SIZE,
-      int BLOCK_SIZE,
-      int NUM_THREADS,
-      int PARTITION_SIZE = 0>
-      __device__ void paged_attention_kernel(
-      ... // Other side args.
-      const scalar_t* __restrict__ out,       // [num_seqs, num_heads, max_num_partitions, head_size]
-      const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
-      const scalar_t* __restrict__ k_cache,   // [num_blocks, num_kv_heads, head_size/x, block_size, x]
-      const scalar_t* __restrict__ v_cache,   // [num_blocks, num_kv_heads, head_size, block_size]
-      ... // Other side args.
-      )
-
--  There are also a list of template arguments above the function
-   signature that are determined during compilation time. ``scalar_t``
-   represents the data type of the query, key, and value data elements,
-   such as FP16. ``HEAD_SIZE`` indicates the number of elements in each
-   head. ``BLOCK_SIZE`` refers to the number of tokens in each block.
-   ``NUM_THREADS`` denotes the number of threads in each thread block.
-   ``PARTITION_SIZE`` represents the number of tensor parallel GPUs (For
-   simplicity, we assume this is 0 and tensor parallel is disabled).
--  With these arguments, we need to perform a sequence of preparations.
-   This includes calculating the current head index, block index, and
-   other necessary variables. However, for now, we can ignore these
-   preparations and proceed directly to the actual calculations. It will
-   be easier to understand them once we grasp the entire flow.
-
-Concepts
---------
-
--  Just before we dive into the calculation flow, I want to describe a
-   few concepts that are needed for later sections. However, you may
-   skip this section and return later if you encounter any confusing
-   terminologies.
--  **Sequence**: A sequence represents a client request. For example,
-   the data pointed to by ``q`` has a shape of
-   ``[num_seqs, num_heads, head_size]``. That represents there are total
-   ``num_seqs`` of query sequence data are pointed by ``q``. Since this 
-   kernel is a single query attention kernel, each sequence only has one
-   query token. Hence, the ``num_seqs`` equals the total number of tokens 
-   that are processed in the batch.
--  **Context**: The context consists of the generated tokens from the
-   sequence. For instance, ``["What", "is", "your"]`` are the context
-   tokens, and the input query token is ``"name"``. The model might
-   generate the token ``"?"``.
--  **Vec**: The vec is a list of elements that are fetched and
-   calculated together. For query and key data, the vec size
-   (``VEC_SIZE``) is determined so that each thread group can fetch and
-   calculate 16 bytes of data at a time. For value data, the vec size
-   (``V_VEC_SIZE``) is determined so that each thread can fetch and
-   calculate 16 bytes of data at a time. For example, if the
-   ``scalar_t`` is FP16 (2 bytes) and ``THREAD_GROUP_SIZE`` is 2, the 
-   ``VEC_SIZE`` will be 4, while the ``V_VEC_SIZE`` will be 8.
--  **Thread group**: The thread group is a small group of
-   threads(\ ``THREAD_GROUP_SIZE``) that fetches and calculates one
-   query token and one key token at a time. Each thread handles only a
-   portion of the token data. The total number of elements processed by
-   one thread group is referred as ``x``. For example, if the thread
-   group contains 2 threads and the head size is 8, then thread 0
-   handles the query and key elements at index 0, 2, 4, 6, while thread
-   1 handles the elements at index 1, 3, 5, 7.
--  **Block**: The key and value cache data in vLLM are split into
-   blocks. Each block stores data for a fixed number(\ ``BLOCK_SIZE``)
-   of tokens at one head. Each block may contain only a portion of the
-   whole context tokens. For example, if the block size is 16 and the
-   head size is 128, then for one head, one block can store 16 \* 128 =
-   2048 elements.
--  **Warp**: A warp is a group of 32 threads(\ ``WARP_SIZE``) that
-   execute simultaneously on a stream multiprocessor (SM). In this
-   kernel, each warp processes the calculation between one query token
-   and key tokens of one entire block at a time (it may process multiple
-   blocks in multiple iterations). For example, if there are 4 warps and
-   6 blocks for one context, the assignment would be like warp 0 handles
-   the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2
-   handles the 2nd block and warp 3 handles the 3rd block.
--  **Thread block**: A thread block is a group of
-   threads(\ ``NUM_THREADS``) that can access the same shared memory.
-   Each thread block contains multiple warps(\ ``NUM_WARPS``), and in
-   this kernel, each thread block processes the calculation between one
-   query token and key tokens of a whole context.
--  **Grid**: A grid is a collection of thread blocks and defines the
-   shape of the collection. In this kernel, the shape is
-   ``(num_heads, num_seqs, max_num_partitions)``. Therefore, each thread
-   block only handles the calculation for one head, one sequence, and
-   one partition.
-
-Query
------
-
--  This section will introduce how query data is stored in memory and
-   fetched by each thread. As mentioned above, each thread group fetches
-   one query token data, while each thread itself only handles a part of
-   one query token data. Within each warp, every thread group will fetch
-   the same query token data, but will multiply it with different key
-   token data.
-
-   .. code:: cpp
-
-      const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
-
-   .. figure:: ../../assets/kernel/query.png
-      :alt: query
-      :width: 70%
-      :align: center
-
-      Query data of one token at one head
-
--  Each thread defines its own ``q_ptr`` which points to the assigned
-   query token data on global memory. For example, if ``VEC_SIZE`` is 4
-   and ``HEAD_SIZE`` is 128, the ``q_ptr`` points to data that contains
-   total of 128 elements divided into 128 / 4 = 32 vecs.
-
-   .. figure:: ../../assets/kernel/q_vecs.png
-      :alt: q_vecs
-      :width: 70%
-      :align: center
-
-      ``q_vecs`` for one thread group
-
-   .. code:: cpp
-
-      __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
-
--  Next, we need to read the global memory data pointed to by ``q_ptr``
-   into shared memory as ``q_vecs``. It is important to note that each
-   vecs is assigned to a different row. For example, if the
-   ``THREAD_GROUP_SIZE`` is 2, thread 0 will handle the 0th row vecs,
-   while thread 1 handles the 1st row vecs. By reading the query data in
-   this way, neighboring threads like thread 0 and thread 1 can read
-   neighbor memory, achieving the memory coalescing to improve
-   performance.
-
-Key
----
-
--  Similar to the "Query" section, this section introduces memory layout
-   and assignment for keys. While each thread group only handle one
-   query token one kernel run, it may handle multiple key tokens across
-   multiple iterations. Meanwhile, each warp will process multiple blocks
-   of key tokens in multiple iterations, ensuring that all context
-   tokens are processed by the entire thread group after the kernel run.
-   In this context, "handle" refers to performing the dot multiplication
-   between query data and key data.
-
-   .. code:: cpp
-
-      const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
-                          + kv_head_idx * kv_head_stride
-                          + physical_block_offset * x;
-
--  Unlike to ``q_ptr``, ``k_ptr`` in each thread will point to different
-   key token at different iterations. As shown above, that ``k_ptr``
-   points to key token data based on ``k_cache`` at assigned block,
-   assigned head and assigned token.
-
-   .. figure:: ../../assets/kernel/key.png
-      :alt: key
-      :width: 70%
-      :align: center
-
-      Key data of all context tokens at one head
-
--  The diagram above illustrates the memory layout for key data. It
-   assumes that the ``BLOCK_SIZE`` is 16, ``HEAD_SIZE`` is 128, ``x`` is
-   8, ``THREAD_GROUP_SIZE`` is 2, and there are a total of 4 warps. Each
-   rectangle represents all the elements for one key token at one head,
-   which will be processed by one thread group. The left half shows the
-   total 16 blocks of key token data for warp 0, while the right half
-   represents the remaining key token data for other warps or
-   iterations. Inside each rectangle, there are a total 32 vecs (128
-   elements for one token) that will be processed by 2 threads (one
-   thread group) separately.
-
-   .. figure:: ../../assets/kernel/k_vecs.png
-      :alt: k_vecs
-      :width: 70%
-      :align: center
-
-      ``k_vecs`` for one thread
-
-   .. code:: cpp
-
-      K_vec k_vecs[NUM_VECS_PER_THREAD]
-
--  Next, we need to read the key token data from ``k_ptr`` and store
-   them on register memory as ``k_vecs``. We use register memory for
-   ``k_vecs`` because it will only be accessed by one thread once,
-   whereas ``q_vecs`` will be accessed by multiple threads multiple
-   times. Each ``k_vecs`` will contain multiple vectors for later
-   calculation. Each vec will be set at each inner iteration. The
-   assignment of vecs allows neighboring threads in a warp to read
-   neighboring memory together, which again promotes the memory
-   coalescing. For instance, thread 0 will read vec 0, while thread 1
-   will read vec 1. In the next inner loop, thread 0 will read vec 2,
-   while thread 1 will read vec 3, and so on.
--  You may still be a little confused about the overall flow. Don't
-   worry, please keep reading the next "QK" section. It will illustrate
-   the query and key calculation flow in a clearer and higher-level
-   manner.
-
-QK
----
-
--  As shown the pseudo code below, before the entire for loop block, we
-   fetch the query data for one token and store it in ``q_vecs``. Then,
-   in the outer for loop, we iterate through different ``k_ptrs`` that
-   point to different tokens and prepare the ``k_vecs`` in the inner for
-   loop. Finally, we perform the dot multiplication between the
-   ``q_vecs`` and each ``k_vecs``.
-
-   .. code:: cpp
-
-      q_vecs = ...
-      for ... {
-         k_ptr = ...
-         for ... {
-            k_vecs[i] = ...
-         }
-         ...
-         float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
-      }
-
--  As mentioned before, for each thread, it only fetches part of the
-   query and key token data at a time. However, there will be a cross
-   thread group reduction happen in the ``Qk_dot<>::dot`` . So ``qk``
-   returned here is not just between part of the query and key token dot
-   multiplication, but actually a full result between entire query and
-   key token data.
--  For example, if the value of ``HEAD_SIZE`` is 128 and
-   ``THREAD_GROUP_SIZE`` is 2, each thread's ``k_vecs`` will contain
-   total 64 elements. However, the returned ``qk`` is actually the
-   result of dot multiplication between 128 query elements and 128 key
-   elements. If you want to learn more about the details of the dot
-   multiplication and reduction, you may refer to the implementation of
-   ``Qk_dot<>::dot``. However, for the sake of simplicity, I will not
-   cover it in this document.
-
-Softmax
--------
-
--  Next, we need to calculate the normalized softmax for all ``qk``\ s,
-   as shown above, where each :math:`x` represents a ``qk``. To do this,
-   we must obtain the reduced value of ``qk_max``\ (:math:`m(x)`) and
-   the ``exp_sum``\ (:math:`\ell(x)`) of all ``qk``\ s. The reduction
-   should be performed across the entire thread block, encompassing
-   results between the query token and all context key tokens.
-
-   .. math::
-      :nowrap:
-
-      \begin{gather*}
-      m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\
-      \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
-      \end{gather*}
-
-``qk_max`` and ``logits``
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
--  Just right after we get the ``qk`` result, we can set the temporary
-   ``logits`` result with ``qk`` (In the end, the ``logits`` should
-   store the normalized softmax result). Also we can compare and collect
-   the ``qk_max`` for all ``qk``\ s that are calculated by current
-   thread group.
-
-   .. code:: cpp
-
-      if (thread_group_offset == 0) {
-         const bool mask = token_idx >= context_len;
-         logits[token_idx - start_token_idx] = mask ? 0.f : qk;
-         qk_max = mask ? qk_max : fmaxf(qk_max, qk);
-      }
-
--  Please note that the ``logits`` here is on shared memory, so each
-   thread group will set the fields for its own assigned context tokens.
-   Overall, the size of logits should be number of context tokens.
-
-   .. code:: cpp
-
-      for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
-          qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
-      }
-
-      if (lane == 0) {
-         red_smem[warp_idx] = qk_max;
-      }
-
--  Then we need to get the reduced ``qk_max`` across each warp. The main
-   idea is to make threads in warp to communicate with each other and
-   get the final max ``qk`` .
-
-   .. code:: cpp
-
-      for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
-          qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
-      }
-      qk_max = VLLM_SHFL_SYNC(qk_max, 0);
-
--  Finally, we can get the reduced ``qk_max`` from whole thread block by
-   compare the ``qk_max`` from all warps in this thread block. Then we
-   need to broadcast the final result to each thread.
-
-``exp_sum``
-~~~~~~~~~~~
-
--  Similar to ``qk_max``, we need to get the reduced sum value from the
-   entire thread block too.
-
-   .. code:: cpp
-
-      for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
-          float val = __expf(logits[i] - qk_max);
-          logits[i] = val;
-          exp_sum += val;
-      }
-      ...
-      exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum);
-
--  Firstly, sum all exp values from each thread group, and meanwhile,
-   convert each entry of ``logits`` from ``qk`` to ``exp(qk - qk_max)``.
-   Please note, the ``qk_max`` here is already the max ``qk`` across the
-   whole thread block. And then we can do reduction for ``exp_sum``
-   across whole thread block just like the ``qk_max``.
-
-   .. code:: cpp
-
-      const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
-      for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
-         logits[i] *= inv_sum;
-      }
-
--  Finally, with the reduced ``qk_max`` and ``exp_sum``, we can obtain
-   the final normalized softmax result as ``logits``. This ``logits``
-   variable will be used for dot multiplication with the value data in
-   later steps. Now, it should store the normalized softmax result of
-   ``qk`` for all assigned context tokens.
-
-Value
------
-
-.. figure:: ../../assets/kernel/value.png
-   :alt: value
-   :width: 70%
-   :align: center
-
-   Value data of all context tokens at one head
-
-.. figure:: ../../assets/kernel/logits_vec.png
-   :alt: logits_vec
-   :width: 50%
-   :align: center
-
-   ``logits_vec`` for one thread
-
-.. figure:: ../../assets/kernel/v_vec.png
-   :alt: v_vec
-   :width: 70%
-   :align: center
-
-   List of ``v_vec`` for one thread
-
--  Now we need to retrieve the value data and perform dot multiplication
-   with ``logits``. Unlike query and key, there is no thread group
-   concept for value data. As shown in diagram, different from key token
-   memory layout, elements from the same column correspond to the same
-   value token. For one block of value data, there are ``HEAD_SIZE`` of
-   rows and ``BLOCK_SIZE`` of columns that are split into multiple
-   ``v_vecs``.
--  Each thread always fetches ``V_VEC_SIZE`` elements from the same
-   ``V_VEC_SIZE`` of tokens at a time. As a result, a single thread
-   retrieves multiple ``v_vec``\ s from different rows and the same
-   columns through multiple inner iterations. For each ``v_vec``, it
-   needs to be dot multiplied with the corresponding ``logits_vec``,
-   which is also ``V_VEC_SIZE`` elements from ``logits``. Overall, with
-   multiple inner iterations, each warp will process one block of value
-   tokens. And with multiple outer iterations, the whole context value
-   tokens are processd
-
-   .. code:: cpp
-
-      float accs[NUM_ROWS_PER_THREAD];
-      for ... { // Iteration over different blocks.
-          logits_vec = ...
-          for ... { // Iteration over different rows.
-              v_vec = ...
-              ...
-              accs[i] += dot(logits_vec, v_vec);
-          }
-      }
-
--  As shown in the above pseudo code, in the outer loop, similar to
-   ``k_ptr``, ``logits_vec`` iterates over different blocks and reads
-   ``V_VEC_SIZE`` elements from ``logits``. In the inner loop, each
-   thread reads ``V_VEC_SIZE`` elements from the same tokens as a
-   ``v_vec`` and performs dot multiplication. It is important to note
-   that in each inner iteration, the thread fetches different head
-   position elements for the same tokens. The dot result is then
-   accumulated in ``accs``. Therefore, each entry of ``accs`` is mapped
-   to a head position assigned to the current thread.
--  For example, if ``BLOCK_SIZE`` is 16 and ``V_VEC_SIZE`` is 8, each
-   thread fetches 8 value elements for 8 tokens at a time. Each element
-   is from different tokens at the same head position. If ``HEAD_SIZE``
-   is 128 and ``WARP_SIZE`` is 32, for each inner loop, a warp needs to
-   fetch ``WARP_SIZE * V_VEC_SIZE = 256`` elements. This means there are
-   a total of 128 \* 16 / 256 = 8 inner iterations for a warp to handle
-   a whole block of value tokens. And each ``accs`` in each thread
-   contains 8 elements that accumulated at 8 different head positions.
-   For the thread 0, the ``accs`` variable will have 8 elements, which
-   are 0th, 32th … 224th elements of a value head that are accumulated
-   from all assigned 8 tokens.
-
-LV
----
--  Now, we need to perform reduction for ``accs`` within each warp. This
-   process allows each thread to accumulate the ``accs`` for the
-   assigned head positions of all tokens in one block.
-
-   .. code:: cpp
-
-      for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-         float acc = accs[i];
-         for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
-            acc += VLLM_SHFL_XOR_SYNC(acc, mask);
-         }
-         accs[i] = acc;
-      }
-
--  Next, we perform reduction for ``accs`` across all warps, allowing
-   each thread to have the accumulation of ``accs`` for the assigned
-   head positions of all context tokens. Please note that each ``accs``
-   in every thread only stores the accumulation for a portion of
-   elements of the entire head for all context tokens. However, overall,
-   all results for output have been calculated but are just stored in
-   different thread register memory.
-
-   .. code:: cpp
-
-      float* out_smem = reinterpret_cast<float*>(shared_mem);
-      for (int i = NUM_WARPS; i > 1; i /= 2) {
-          // Upper warps write to shared memory.
-          ...
-              float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
-              for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-                      ...
-              dst[row_idx] = accs[i];
-          }
-
-          // Lower warps update the output.
-              const float* src = &out_smem[warp_idx * HEAD_SIZE];
-          for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-                      ...
-              accs[i] += src[row_idx];
-          }
-
-              // Write out the accs.
-      }
-
-Output
-------
-
--  Now we can write all of calculated result from local register memory
-   to final output global memory.
-
-   .. code:: cpp
-
-      scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
-                      + head_idx * max_num_partitions * HEAD_SIZE
-                      + partition_idx * HEAD_SIZE;
-
--  First, we need to define the ``out_ptr`` variable, which points to
-   the start address of the assigned sequence and assigned head.
-
-   .. code:: cpp
-
-      for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-      const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
-      if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
-          from_float(*(out_ptr + row_idx), accs[i]);
-      }
-      }
-
--  Finally, we need to iterate over different assigned head positions
-   and write out the corresponding accumulated result based on the
-   ``out_ptr``.
diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.md b/docs/source/design/multimodal/adding_multimodal_plugin.md
new file mode 100644
index 0000000000000..bcccd284879bb
--- /dev/null
+++ b/docs/source/design/multimodal/adding_multimodal_plugin.md
@@ -0,0 +1,16 @@
+(adding-multimodal-plugin)=
+
+# Adding a Multimodal Plugin
+
+This document teaches you how to add a new modality to vLLM.
+
+Each modality in vLLM is represented by a {class}`~vllm.multimodal.MultiModalPlugin` and registered to {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to {meth}`~vllm.multimodal.MultiModalRegistry.register_plugin`.
+
+The remainder of this document details how to define custom {class}`~vllm.multimodal.MultiModalPlugin`s.
+
+```{note}
+This article is a work in progress.
+```
+
+% TODO: Add more instructions on how to add new plugins once embeddings is in.
diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.rst b/docs/source/design/multimodal/adding_multimodal_plugin.rst
deleted file mode 100644
index b726138f840a3..0000000000000
--- a/docs/source/design/multimodal/adding_multimodal_plugin.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-.. _adding_multimodal_plugin:
-
-Adding a Multimodal Plugin
-==========================
-
-This document teaches you how to add a new modality to vLLM.
-
-Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`.
-
-The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s.
-
-.. note::
-  This article is a work in progress.
-
-..
-  TODO: Add more instructions on how to add new plugins once embeddings is in.
diff --git a/docs/source/design/multimodal/multimodal_index.rst b/docs/source/design/multimodal/multimodal_index.md
similarity index 61%
rename from docs/source/design/multimodal/multimodal_index.rst
rename to docs/source/design/multimodal/multimodal_index.md
index c6d47f90b62d5..88af07afc7018 100644
--- a/docs/source/design/multimodal/multimodal_index.rst
+++ b/docs/source/design/multimodal/multimodal_index.md
@@ -1,66 +1,83 @@
-.. _multi_modality:
+(multi-modality)=
 
-Multi-Modality
-==============
+# Multi-Modality
 
+```{eval-rst}
 .. currentmodule:: vllm.multimodal
-    
-vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
+```
 
-Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_mm_models>`
-via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`.
+vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
+
+Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
+via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
 
 Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
-by following :ref:`this guide <adding_multimodal_plugin>`.
+by following [this guide](#adding-multimodal-plugin).
 
-Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here <enabling_multimodal_inputs>`.
+Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
 
-Guides
-++++++
+## Guides
 
-.. toctree::
-   :maxdepth: 1
+```{toctree}
+:maxdepth: 1
 
-   adding_multimodal_plugin
+adding_multimodal_plugin
+```
 
-Module Contents
-+++++++++++++++
+## Module Contents
 
+```{eval-rst}
 .. automodule:: vllm.multimodal
+```
 
-Registry
---------
+### Registry
 
+```{eval-rst}
 .. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
+```
 
+```{eval-rst}
 .. autoclass:: vllm.multimodal.MultiModalRegistry
     :members:
     :show-inheritance:
+```
 
-Base Classes
-------------
+### Base Classes
 
+```{eval-rst}
 .. autodata:: vllm.multimodal.NestedTensors
+```
 
+```{eval-rst}
 .. autodata:: vllm.multimodal.BatchedTensorInputs
+```
 
+```{eval-rst}
 .. autoclass:: vllm.multimodal.MultiModalDataBuiltins
     :members:
     :show-inheritance:
+```
 
+```{eval-rst}
 .. autodata:: vllm.multimodal.MultiModalDataDict
+```
 
+```{eval-rst}
 .. autoclass:: vllm.multimodal.MultiModalKwargs
     :members:
     :show-inheritance:
+```
 
+```{eval-rst}
 .. autoclass:: vllm.multimodal.MultiModalPlugin
     :members:
     :show-inheritance:
+```
 
-Image Classes
--------------
+### Image Classes
 
+```{eval-rst}
 .. automodule:: vllm.multimodal.image
     :members:
     :show-inheritance:
+```
diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md
index b58456ecc6da8..34564413b34f6 100644
--- a/docs/source/design/multiprocessing.md
+++ b/docs/source/design/multiprocessing.md
@@ -2,13 +2,14 @@
 
 ## Debugging
 
-Please see the [Debugging
-Tips](https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing)
+Please see the [Debugging Tips](#debugging-python-multiprocessing)
 page for information on known issues and how to solve them.
 
 ## Introduction
 
-*Note that source code references are to the state of the code at the time of writing in December, 2024.*
+```{important}
+The source code references are to the state of the code at the time of writing in December, 2024.
+```
 
 The use of Python multiprocessing in vLLM is complicated by:
 
@@ -20,7 +21,7 @@ This document describes how vLLM deals with these challenges.
 
 ## Multiprocessing Methods
 
-[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
+[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
 
 - `spawn` - spawn a new Python process. This will be the default as of Python
   3.14.
@@ -82,7 +83,7 @@ There are other miscellaneous places hard-coding the use of `spawn`:
 
 Related PRs:
 
-- <https://github.com/vllm-project/vllm/pull/8823>
+- <gh-pr:8823>
 
 ## Prior State in v1
 
@@ -96,7 +97,7 @@ engine core.
 
 - <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/v1/engine/llm_engine.py#L93-L95>
 - <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/v1/engine/llm_engine.py#L70-L77>
-- https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/v1/engine/core_client.py#L44-L45
+- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/v1/engine/core_client.py#L44-L45>
 
 It was off by default for all the reasons mentioned above - compatibility with
 dependencies and code using vLLM as a library.
@@ -119,17 +120,17 @@ instruct users to either add a `__main__` guard or to disable multiprocessing.
 If that known-failure case occurs, the user will see two messages that explain
 what is happening. First, a log message from vLLM:
 
-```
-    WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
-      initialized. We must use the `spawn` multiprocessing start method. Setting
-      VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
-      https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
-      for more information.
+```console
+WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
+    initialized. We must use the `spawn` multiprocessing start method. Setting
+    VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
+    https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
+    for more information.
 ```
 
 Second, Python itself will raise an exception with a nice explanation:
 
-```
+```console
 RuntimeError:
         An attempt has been made to start a new process before the
         current process has finished its bootstrapping phase.
diff --git a/docs/source/design/plugin_system.md b/docs/source/design/plugin_system.md
new file mode 100644
index 0000000000000..79aff757518f2
--- /dev/null
+++ b/docs/source/design/plugin_system.md
@@ -0,0 +1,54 @@
+(plugin-system)=
+
+# vLLM's Plugin System
+
+The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM.
+
+## How Plugins Work in vLLM
+
+Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [](#arch-overview)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work.
+
+## How vLLM Discovers Plugins
+
+vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
+
+```python
+# inside `setup.py` file
+from setuptools import setup
+
+setup(name='vllm_add_dummy_model',
+      version='0.1',
+      packages=['vllm_add_dummy_model'],
+      entry_points={
+          'vllm.general_plugins':
+          ["register_dummy_model = vllm_add_dummy_model:register"]
+      })
+
+# inside `vllm_add_dummy_model.py` file
+def register():
+    from vllm import ModelRegistry
+
+    if "MyLlava" not in ModelRegistry.get_supported_archs():
+        ModelRegistry.register_model("MyLlava",
+                                        "vllm_add_dummy_model.my_llava:MyLlava")
+```
+
+For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
+
+Every plugin has three parts:
+
+1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group `vllm.general_plugins` to register general plugins. This is the key of `entry_points` in the `setup.py` file. Always use `vllm.general_plugins` for vLLM's general plugins.
+2. **Plugin name**: The name of the plugin. This is the part before the `=` in each entry listed under the plugin group in the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
+3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
+
+## What Can Plugins Do?
+
+Currently, the primary use case for plugins is to register custom, out-of-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM.
+
+## Guidelines for Writing Plugins
+
+- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
+
+## Compatibility Guarantee
+
+vLLM guarantees the interface of documented plugins, such as `ModelRegistry.register_model`, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, `"vllm_add_dummy_model.my_llava:MyLlava"` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development.
diff --git a/docs/source/design/plugin_system.rst b/docs/source/design/plugin_system.rst
deleted file mode 100644
index 5a96cc8b3a464..0000000000000
--- a/docs/source/design/plugin_system.rst
+++ /dev/null
@@ -1,62 +0,0 @@
-.. _plugin_system:
-
-vLLM's Plugin System
-====================
-
-The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM.
-
-How Plugins Work in vLLM
-------------------------
-
-Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see :ref:`arch_overview`), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the `load_general_plugins <https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16>`__ function in the ``vllm.plugins`` module. This function is called for every process created by vLLM before it starts any work.
-
-How vLLM Discovers Plugins
---------------------------
-
-vLLM's plugin system uses the standard Python ``entry_points`` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
-
-.. code-block:: python
-
-    # inside `setup.py` file
-    from setuptools import setup
-
-    setup(name='vllm_add_dummy_model',
-          version='0.1',
-          packages=['vllm_add_dummy_model'],
-          entry_points={
-              'vllm.general_plugins':
-              ["register_dummy_model = vllm_add_dummy_model:register"]
-          })
-    
-    # inside `vllm_add_dummy_model.py` file
-    def register():
-        from vllm import ModelRegistry
-
-        if "MyLlava" not in ModelRegistry.get_supported_archs():
-            ModelRegistry.register_model("MyLlava",
-                                            "vllm_add_dummy_model.my_llava:MyLlava")
-
-For more information on adding entry points to your package, please check the `official documentation <https://setuptools.pypa.io/en/latest/userguide/entry_point.html>`__.
-
-Every plugin has three parts:
-
-1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group ``vllm.general_plugins`` to register general plugins. This is the key of ``entry_points`` in the ``setup.py`` file. Always use ``vllm.general_plugins`` for vLLM's general plugins.
-
-2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the ``entry_points`` dictionary. In the example above, the plugin name is ``register_dummy_model``. Plugins can be filtered by their names using the ``VLLM_PLUGINS`` environment variable. To load only a specific plugin, set ``VLLM_PLUGINS`` to the plugin name.
-
-3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is ``vllm_add_dummy_model:register``, which refers to a function named ``register`` in the ``vllm_add_dummy_model`` module.
-
-What Can Plugins Do?
---------------------
-
-Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling ``ModelRegistry.register_model`` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM.
-
-Guidelines for Writing Plugins
-------------------------------
-
-- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
-
-Compatibility Guarantee
------------------------
-
-vLLM guarantees the interface of documented plugins, such as ``ModelRegistry.register_model``, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, ``"vllm_add_dummy_model.my_llava:MyLlava"`` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development.
diff --git a/docs/source/dev/engine/async_llm_engine.rst b/docs/source/dev/engine/async_llm_engine.md
similarity index 59%
rename from docs/source/dev/engine/async_llm_engine.rst
rename to docs/source/dev/engine/async_llm_engine.md
index 93fc310cb543b..904feaa505164 100644
--- a/docs/source/dev/engine/async_llm_engine.rst
+++ b/docs/source/dev/engine/async_llm_engine.md
@@ -1,6 +1,7 @@
-AsyncLLMEngine
-=================================
+# AsyncLLMEngine
 
+```{eval-rst}
 .. autoclass:: vllm.AsyncLLMEngine
     :members:
     :show-inheritance:
+```
diff --git a/docs/source/dev/engine/engine_index.md b/docs/source/dev/engine/engine_index.md
new file mode 100644
index 0000000000000..701cb95d3be33
--- /dev/null
+++ b/docs/source/dev/engine/engine_index.md
@@ -0,0 +1,17 @@
+# vLLM Engine
+
+```{eval-rst}
+.. automodule:: vllm.engine
+```
+
+```{eval-rst}
+.. currentmodule:: vllm.engine
+```
+
+```{toctree}
+:caption: Engines
+:maxdepth: 2
+
+llm_engine
+async_llm_engine
+```
diff --git a/docs/source/dev/engine/engine_index.rst b/docs/source/dev/engine/engine_index.rst
deleted file mode 100644
index ba9ae55ddea46..0000000000000
--- a/docs/source/dev/engine/engine_index.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-vLLM Engine
-=================================
-
-.. automodule:: vllm.engine
-.. currentmodule:: vllm.engine
-
-.. toctree::
-   :maxdepth: 2
-   :caption: Engines
-
-   llm_engine
-   async_llm_engine
-
diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.md
similarity index 60%
rename from docs/source/dev/engine/llm_engine.rst
rename to docs/source/dev/engine/llm_engine.md
index 0b8c1e219d7c9..d6613ef5562dc 100644
--- a/docs/source/dev/engine/llm_engine.rst
+++ b/docs/source/dev/engine/llm_engine.md
@@ -1,6 +1,7 @@
-LLMEngine
-=================================
+# LLMEngine
 
+```{eval-rst}
 .. autoclass:: vllm.LLMEngine
     :members:
     :show-inheritance:
+```
diff --git a/docs/source/dev/offline_inference/llm.rst b/docs/source/dev/offline_inference/llm.md
similarity index 67%
rename from docs/source/dev/offline_inference/llm.rst
rename to docs/source/dev/offline_inference/llm.md
index 83ba1b6987c6d..9f129d5e41686 100644
--- a/docs/source/dev/offline_inference/llm.rst
+++ b/docs/source/dev/offline_inference/llm.md
@@ -1,6 +1,7 @@
-LLM Class
-=========
+# LLM Class
 
+```{eval-rst}
 .. autoclass:: vllm.LLM
     :members:
     :show-inheritance:
+```
diff --git a/docs/source/dev/offline_inference/llm_inputs.rst b/docs/source/dev/offline_inference/llm_inputs.md
similarity index 78%
rename from docs/source/dev/offline_inference/llm_inputs.rst
rename to docs/source/dev/offline_inference/llm_inputs.md
index 0d47281db485e..21f688a12c536 100644
--- a/docs/source/dev/offline_inference/llm_inputs.rst
+++ b/docs/source/dev/offline_inference/llm_inputs.md
@@ -1,14 +1,19 @@
-LLM Inputs
-==========
+# LLM Inputs
 
+```{eval-rst}
 .. autodata:: vllm.inputs.PromptType
+```
 
+```{eval-rst}
 .. autoclass:: vllm.inputs.TextPrompt
     :show-inheritance:
     :members:
     :member-order: bysource
+```
 
+```{eval-rst}
 .. autoclass:: vllm.inputs.TokensPrompt
     :show-inheritance:
     :members:
     :member-order: bysource
+```
diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/dev/offline_inference/offline_index.md
new file mode 100644
index 0000000000000..318a02d8c78df
--- /dev/null
+++ b/docs/source/dev/offline_inference/offline_index.md
@@ -0,0 +1,8 @@
+# Offline Inference
+
+```{toctree}
+:maxdepth: 1
+
+llm
+llm_inputs
+```
diff --git a/docs/source/dev/offline_inference/offline_index.rst b/docs/source/dev/offline_inference/offline_index.rst
deleted file mode 100644
index 27dfb0e9df90e..0000000000000
--- a/docs/source/dev/offline_inference/offline_index.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-Offline Inference
-=================================
-
-.. toctree::
-   :maxdepth: 1
-
-   llm
-   llm_inputs
diff --git a/docs/source/dev/pooling_params.rst b/docs/source/dev/pooling_params.md
similarity index 55%
rename from docs/source/dev/pooling_params.rst
rename to docs/source/dev/pooling_params.md
index 334e0287aff09..74b2c57443e4b 100644
--- a/docs/source/dev/pooling_params.rst
+++ b/docs/source/dev/pooling_params.md
@@ -1,5 +1,6 @@
-Pooling Parameters
-==================
+# Pooling Parameters
 
+```{eval-rst}
 .. autoclass:: vllm.PoolingParams
     :members:
+```
diff --git a/docs/source/dev/sampling_params.rst b/docs/source/dev/sampling_params.md
similarity index 55%
rename from docs/source/dev/sampling_params.rst
rename to docs/source/dev/sampling_params.md
index f645941a6c022..bdc36af5153db 100644
--- a/docs/source/dev/sampling_params.rst
+++ b/docs/source/dev/sampling_params.md
@@ -1,5 +1,6 @@
-Sampling Parameters
-===================
+# Sampling Parameters
 
+```{eval-rst}
 .. autoclass:: vllm.SamplingParams
     :members:
+```
diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py
index 79b49a186236a..aef32f7559f74 100644
--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
@@ -15,18 +15,12 @@ def fix_case(text: str) -> str:
     return text
 
 
-def underline(title: str, character: str = "=") -> str:
-    return f"{title}\n{character * len(title)}"
-
-
 def generate_title(filename: str) -> str:
     # Turn filename into a title
     title = filename.replace("_", " ").title()
     # Handle acronyms and names
     title = fix_case(title)
-    # Underline title
-    title = underline(title)
-    return title
+    return f"# {title}"
 
 
 def generate_examples():
@@ -38,24 +32,23 @@ def generate_examples():
 
     # Destination paths
     doc_dir = root_dir / "docs/source/getting_started/examples"
-    doc_paths = [doc_dir / f"{path.stem}.rst" for path in script_paths]
+    doc_paths = [doc_dir / f"{path.stem}.md" for path in script_paths]
 
     # Generate the example docs for each example script
     for script_path, doc_path in zip(script_paths, doc_paths):
-        script_url = f"https://github.com/vllm-project/vllm/blob/main/examples/{script_path.name}"
         # Make script_path relative to doc_path and call it include_path
         include_path = '../../../..' / script_path.relative_to(root_dir)
         content = (f"{generate_title(doc_path.stem)}\n\n"
-                   f"Source {script_url}.\n\n"
-                   f".. literalinclude:: {include_path}\n"
-                   "    :language: python\n"
-                   "    :linenos:\n")
+                   f"Source: <gh-file:examples/{script_path.name}>.\n\n"
+                   f"```{{literalinclude}} {include_path}\n"
+                   ":language: python\n"
+                   ":linenos:\n```")
         with open(doc_path, "w+") as f:
             f.write(content)
 
     # Generate the toctree for the example scripts
-    with open(doc_dir / "examples_index.template.rst") as f:
+    with open(doc_dir / "examples_index.template.md") as f:
         examples_index = f.read()
-    with open(doc_dir / "examples_index.rst", "w+") as f:
-        example_docs = "\n   ".join(path.stem for path in script_paths)
+    with open(doc_dir / "examples_index.md", "w+") as f:
+        example_docs = "\n".join(path.stem + ".md" for path in script_paths)
         f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs))
diff --git a/docs/source/getting_started/amd-installation.md b/docs/source/getting_started/amd-installation.md
new file mode 100644
index 0000000000000..6d01efbbf8828
--- /dev/null
+++ b/docs/source/getting_started/amd-installation.md
@@ -0,0 +1,163 @@
+(installation-rocm)=
+
+# Installation with ROCm
+
+vLLM supports AMD GPUs with ROCm 6.2.
+
+## Requirements
+
+- OS: Linux
+- Python: 3.9 -- 3.12
+- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
+- ROCm 6.2
+
+Installation options:
+
+1. [Build from source with docker](#build-from-source-docker-rocm)
+2. [Build from source](#build-from-source-rocm)
+
+(build-from-source-docker-rocm)=
+
+## Option 1: Build from source with docker (recommended)
+
+You can build and install vLLM from source.
+
+First, build a docker image from <gh-file:Dockerfile.rocm> and launch a docker container from the image.
+It is important to kick off the docker build using buildkit. Either set `DOCKER_BUILDKIT=1` as an environment variable when calling the `docker build` command, or set up buildkit in the docker daemon configuration `/etc/docker/daemon.json` as follows and restart the daemon:
+
+```json
+{
+    "features": {
+        "buildkit": true
+    }
+}
+```
+
+<gh-file:Dockerfile.rocm> uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
+It provides flexibility to customize the build of docker image using the following arguments:
+
+- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image.
+- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target.
+- `FA_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`.
+- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c`
+- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
+
+Their values can be passed in when running `docker build` with `--build-arg` options.
+
+To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
+
+```console
+$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
+```
+
+To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below:
+
+```console
+$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
+```
+
+To run the above docker image `vllm-rocm`, use the below command:
+
+```console
+$ docker run -it \
+   --network=host \
+   --group-add=video \
+   --ipc=host \
+   --cap-add=SYS_PTRACE \
+   --security-opt seccomp=unconfined \
+   --device /dev/kfd \
+   --device /dev/dri \
+   -v <path/to/model>:/app/model \
+   vllm-rocm \
+   bash
+```
+
+Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
+
+(build-from-source-rocm)=
+
+## Option 2: Build from source
+
+0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
+
+- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html)
+- [PyTorch](https://pytorch.org/)
+
+For installing PyTorch, you can start from a fresh docker image, e.g., `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0` or `rocm/pytorch-nightly`.
+
+Alternatively, you can install PyTorch using PyTorch wheels. See the installation guide on PyTorch's [Getting Started](https://pytorch.org/get-started/locally/) page.
+
+1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton)
+
+Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md)
+
+```console
+$ python3 -m pip install ninja cmake wheel pybind11
+$ pip uninstall -y triton
+$ git clone https://github.com/OpenAI/triton.git
+$ cd triton
+$ git checkout e192dba
+$ cd python
+$ pip3 install .
+$ cd ../..
+```
+
+```{note}
+- If you see an HTTP issue related to downloading packages while building triton, please try again as the HTTP error is intermittent.
+```
+
+2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile)
+
+Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support)
+Alternatively, wheels intended for vLLM use can be accessed under the releases.
+
+For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`.
+
+```console
+$ git clone https://github.com/ROCm/flash-attention.git
+$ cd flash-attention
+$ git checkout 3cea2fb
+$ git submodule update --init
+$ GPU_ARCHS="gfx90a" python3 setup.py install
+$ cd ..
+```
+
+```{note}
+- You might need to downgrade the "ninja" version to 1.10 if it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`).
+```
+
+3. Build vLLM. For example, vLLM on ROCm 6.2 can be built with the following steps:
+
+```bash
+$ pip install --upgrade pip
+
+# Install PyTorch
+$ pip uninstall torch -y
+$ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
+
+# Build & install AMD SMI
+$ pip install /opt/rocm/share/amd_smi
+
+# Install dependencies
+$ pip install --upgrade numba scipy huggingface-hub[cli]
+$ pip install "numpy<2"
+$ pip install -r requirements-rocm.txt
+
+# Build vLLM for MI210/MI250/MI300.
+$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+$ python3 setup.py develop
+```
+
+This may take 5-10 minutes. Currently, {code}`pip install .` does not work for ROCm installation.
+
+```{tip}
+- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
+- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
+- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
+- The ROCm version of PyTorch, ideally, should match the ROCm driver version.
+```
+
+```{tip}
+- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
+  For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
+```
diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst
deleted file mode 100644
index ece5d785e0c65..0000000000000
--- a/docs/source/getting_started/amd-installation.rst
+++ /dev/null
@@ -1,178 +0,0 @@
-.. _installation_rocm:
-
-Installation with ROCm
-======================
-
-vLLM supports AMD GPUs with ROCm 6.2.
-
-Requirements
-------------
-
-* OS: Linux
-* Python: 3.9 -- 3.12
-* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
-* ROCm 6.2
-
-Installation options:
-
-#. :ref:`Build from source with docker <build_from_source_docker_rocm>`
-#. :ref:`Build from source <build_from_source_rocm>`
-
-.. _build_from_source_docker_rocm:
-
-Option 1: Build from source with docker (recommended)
------------------------------------------------------
-
-You can build and install vLLM from source.
-
-First, build a docker image from `Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ and launch a docker container from the image.
-It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
-
-.. code-block:: console
-    
-    {
-        "features": {
-            "buildkit": true
-        }
-    }
-
-
-`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
-It provides flexibility to customize the build of docker image using the following arguments:
-
-* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image.
-* `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For `Radeon RX 7900 series (gfx1100) <https://rocm.docs.amd.com/projects/radeon/en/latest/index.html>`_, this should be set to 0 before flash-attention supports this target.
-* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
-* `FA_BRANCH`: specifies the branch used to build the CK flash-attention in `ROCm's flash-attention repo <https://github.com/ROCmSoftwarePlatform/flash-attention>`_. The default is `ae7928c`
-* `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. 
-
-Their values can be passed in when running ``docker build`` with ``--build-arg`` options.
-
-
-To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
-
-.. code-block:: console
-
-    $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
-
-To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
-
-.. code-block:: console
-
-    $ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
-
-To run the above docker image ``vllm-rocm``, use the below command:
-
-.. code-block:: console
-
-    $ docker run -it \
-       --network=host \
-       --group-add=video \
-       --ipc=host \
-       --cap-add=SYS_PTRACE \
-       --security-opt seccomp=unconfined \
-       --device /dev/kfd \
-       --device /dev/dri \
-       -v <path/to/model>:/app/model \
-       vllm-rocm \
-       bash
-
-Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
-
-
-.. _build_from_source_rocm:
-
-Option 2: Build from source
----------------------------
-
-0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
-
-- `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_
-- `PyTorch <https://pytorch.org/>`_
-
-For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`.
-
-Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch `Getting Started <https://pytorch.org/get-started/locally/>`_
-
-
-1. Install `Triton flash attention for ROCm <https://github.com/ROCm/triton>`_
-
-Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton <https://github.com/ROCm/triton/blob/triton-mlir/README.md>`_
-
-    .. code-block:: console
-
-        $ python3 -m pip install ninja cmake wheel pybind11
-        $ pip uninstall -y triton 
-        $ git clone https://github.com/OpenAI/triton.git
-        $ cd triton
-        $ git checkout e192dba
-        $ cd python
-        $ pip3 install .
-        $ cd ../..
-
-.. note::
-    - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
-
-
-2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm <https://github.com/ROCm/flash-attention/tree/ck_tile>`_
-
-
-Install ROCm's flash attention (v2.5.9.post1) following the instructions from `ROCm/flash-attention <https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support>`_
-Alternatively, wheels intended for vLLM use can be accessed under the releases.
-
-For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`.
-Note to get your gfx architecture, run `rocminfo |grep gfx`.
-
-    .. code-block:: console
-
-        $ git clone https://github.com/ROCm/flash-attention.git
-        $ cd flash-attention
-        $ git checkout 3cea2fb
-        $ git submodule update --init
-        $ GPU_ARCHS="gfx90a" python3 setup.py install
-        $ cd ..
-
-.. note::
-    - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
-
-3. Build vLLM.
-
-    For example, vLLM on ROCM 6.2 can be built with the following steps:
-
-    .. code-block:: console
-
-        $ pip install --upgrade pip
-
-        $ # Install PyTorch
-        $ pip uninstall torch -y
-        $ pip install --no-cache-dir --pre torch==2.6.0.dev20240918 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
-
-        $ # Build & install AMD SMI
-        $ pip install /opt/rocm/share/amd_smi
-
-        $ # Install dependencies
-        $ pip install --upgrade numba scipy huggingface-hub[cli]
-        $ pip install "numpy<2"
-        $ pip install -r requirements-rocm.txt
-
-        $ # Build vLLM for MI210/MI250/MI300.
-        $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
-        $ python3 setup.py develop
-
-
-    This may take 5-10 minutes. Currently, :code:`pip install .` does not work for ROCm installation.
-
-
-.. tip::
-
-    - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
-    - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
-    - To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. 
-    - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
-
-
-.. tip::
-    - For MI300x (gfx942) users, to achieve optimal performance, please refer to `MI300x tuning guide <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html>`_ for performance optimization and tuning tips on system and workflow level.
-      For vLLM, please refer to `vLLM performance optimization <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization>`_.
-
-
diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/arm-installation.md
new file mode 100644
index 0000000000000..de807e198b4f6
--- /dev/null
+++ b/docs/source/getting_started/arm-installation.md
@@ -0,0 +1,46 @@
+(installation-arm)=
+
+# Installation for ARM CPUs
+
+vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering:
+
+- CPU backend inference capabilities
+- Relevant runtime environment variables
+- Performance optimization tips
+
+ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
+Contents:
+
+1. [Requirements](#arm-backend-requirements)
+2. [Quick Start with Dockerfile](#arm-backend-quick-start-dockerfile)
+3. [Building from Source](#build-arm-backend-from-source)
+
+(arm-backend-requirements)=
+
+## Requirements
+
+- **Operating System**: Linux or macOS
+- **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended)
+- **Instruction Set Architecture (ISA)**: NEON support is required
+
+(arm-backend-quick-start-dockerfile)=
+
+## Quick Start with Dockerfile
+
+You can quickly set up vLLM on ARM using Docker:
+
+```console
+$ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g .
+$ docker run -it \
+             --rm \
+             --network=host \
+             --cpuset-cpus=<cpu-id-list, optional> \
+             --cpuset-mems=<memory-node, optional> \
+             vllm-cpu-env
+```
+
+(build-arm-backend-from-source)=
+
+## Building from Source
+
+To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility.
diff --git a/docs/source/getting_started/arm-installation.rst b/docs/source/getting_started/arm-installation.rst
deleted file mode 100644
index 7b457df92c11d..0000000000000
--- a/docs/source/getting_started/arm-installation.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-.. _installation_arm:
-
-Installation for ARM CPUs
-=========================
-
-vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering:
-
-* CPU backend inference capabilities
-* Relevant runtime environment variables
-* Performance optimization tips
-
-ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
-Contents:
-
-1. :ref:`Requirements <arm_backend_requirements>`
-2. :ref:`Quick Start with Dockerfile <arm_backend_quick_start_dockerfile>`
-3. :ref:`Building from Source <build_arm_backend_from_source>`
-
-.. _arm_backend_requirements:
-
-Requirements
-------------
-
-* **Operating System**: Linux or macOS
-* **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended)
-* **Instruction Set Architecture (ISA)**: NEON support is required
-
-.. _arm_backend_quick_start_dockerfile:
-
-Quick Start with Dockerfile
----------------------------
-
-You can quickly set up vLLM on ARM using Docker:
-
-.. code-block:: console
-
-    $ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g .
-    $ docker run -it \
-                 --rm \
-                 --network=host \
-                 --cpuset-cpus=<cpu-id-list, optional> \
-                 --cpuset-mems=<memory-node, optional> \
-                 vllm-cpu-env
-
-.. _build_arm_backend_from_source:
-
-Building from Source
---------------------
-
-To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility.
diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/cpu-installation.md
new file mode 100644
index 0000000000000..b6f181ace6274
--- /dev/null
+++ b/docs/source/getting_started/cpu-installation.md
@@ -0,0 +1,154 @@
+(installation-cpu)=
+
+# Installation with CPU
+
+vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
+
+- Tensor Parallel
+- Model Quantization (`INT8 W8A8, AWQ`)
+- Chunked-prefill
+- Prefix-caching
+- FP8-E5M2 KV-Caching (TODO)
+
+Table of contents:
+
+1. [Requirements](#cpu-backend-requirements)
+2. [Quick start using Dockerfile](#cpu-backend-quick-start-dockerfile)
+3. [Build from source](#build-cpu-backend-from-source)
+4. [Related runtime environment variables](#env-intro)
+5. [Intel Extension for PyTorch](#ipex-guidance)
+6. [Performance tips](#cpu-backend-performance-tips)
+
+(cpu-backend-requirements)=
+
+## Requirements
+
+- OS: Linux
+- Compiler: gcc/g++>=12.3.0 (optional, recommended)
+- Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)
+
+(cpu-backend-quick-start-dockerfile)=
+
+## Quick start using Dockerfile
+
+```console
+$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
+$ docker run -it \
+             --rm \
+             --network=host \
+             --cpuset-cpus=<cpu-id-list, optional> \
+             --cpuset-mems=<memory-node, optional> \
+             vllm-cpu-env
+```
+
+(build-cpu-backend-from-source)=
+
+## Build from source
+
+- First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
+
+```console
+$ sudo apt-get update  -y
+$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev
+$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+```
+
+- Second, install Python packages for vLLM CPU backend building:
+
+```console
+$ pip install --upgrade pip
+$ pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy
+$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+```
+
+- Finally, build and install vLLM CPU backend:
+
+```console
+$ VLLM_TARGET_DEVICE=cpu python setup.py install
+```
+
+```{note}
+- AVX512_BF16 is an ISA extension that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
+- If you want to force-enable AVX512_BF16 for cross-compilation, please set the environment variable VLLM_CPU_AVX512BF16=1 before building.
+```
+
+(env-intro)=
+
+## Related runtime environment variables
+
+- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g., `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache). A larger setting will allow vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
+- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
+
+(ipex-guidance)=
+
+## Intel Extension for PyTorch
+
+- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date feature optimizations for an extra performance boost on Intel hardware.
+
+(cpu-backend-performance-tips)=
+
+## Performance tips
+
+- We highly recommend using TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.04, you can run:
+
+```console
+$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
+$ find / -name "*libtcmalloc*" # find the dynamic link library path
+$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
+$ python examples/offline_inference.py # run vLLM
+```
+
+- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
+
+```console
+$ export VLLM_CPU_KVCACHE_SPACE=40
+$ export VLLM_CPU_OMP_THREADS_BIND=0-29
+$ vllm serve facebook/opt-125m
+```
+
+- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
+
+```console
+$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
+
+# The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
+CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE    MAXMHZ   MINMHZ      MHZ
+0    0      0    0 0:0:0:0          yes 2401.0000 800.0000  800.000
+1    0      0    1 1:1:1:0          yes 2401.0000 800.0000  800.000
+2    0      0    2 2:2:2:0          yes 2401.0000 800.0000  800.000
+3    0      0    3 3:3:3:0          yes 2401.0000 800.0000  800.000
+4    0      0    4 4:4:4:0          yes 2401.0000 800.0000  800.000
+5    0      0    5 5:5:5:0          yes 2401.0000 800.0000  800.000
+6    0      0    6 6:6:6:0          yes 2401.0000 800.0000  800.000
+7    0      0    7 7:7:7:0          yes 2401.0000 800.0000  800.000
+8    0      0    0 0:0:0:0          yes 2401.0000 800.0000  800.000
+9    0      0    1 1:1:1:0          yes 2401.0000 800.0000  800.000
+10   0      0    2 2:2:2:0          yes 2401.0000 800.0000  800.000
+11   0      0    3 3:3:3:0          yes 2401.0000 800.0000  800.000
+12   0      0    4 4:4:4:0          yes 2401.0000 800.0000  800.000
+13   0      0    5 5:5:5:0          yes 2401.0000 800.0000  800.000
+14   0      0    6 6:6:6:0          yes 2401.0000 800.0000  800.000
+15   0      0    7 7:7:7:0          yes 2401.0000 800.0000  800.000
+
+# On this platform, it is recommended to only bind OpenMP threads on logical CPU cores 0-7 or 8-15
+$ export VLLM_CPU_OMP_THREADS_BIND=0-7
+$ python examples/offline_inference.py
+```
+
+- If using vLLM CPU backend on a multi-socket machine with NUMA, be sure to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
+
+## CPU Backend Considerations
+
+- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance.
+
+- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance.
+
+- On a CPU-based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are recommended: Tensor Parallel or Data Parallel.
+
+  - Using Tensor Parallel for a latency-constrained deployment: following the GPU backend design, Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
+
+    ```console
+    $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
+    ```
+
+  - Using Data Parallel for maximum throughput: launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.md) or HAProxy are recommended. The Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is an example of how to set up scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst
deleted file mode 100644
index 649de1cd9b53c..0000000000000
--- a/docs/source/getting_started/cpu-installation.rst
+++ /dev/null
@@ -1,164 +0,0 @@
-.. _installation_cpu:
-
-Installation with CPU
-========================
-
-vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
-
-- Tensor Parallel 
-- Model Quantization (``INT8 W8A8, AWQ``)
-- Chunked-prefill
-- Prefix-caching
-- FP8-E5M2 KV-Caching (TODO)
-
-Table of contents:
-
-#. :ref:`Requirements <cpu_backend_requirements>`
-#. :ref:`Quick start using Dockerfile <cpu_backend_quick_start_dockerfile>`
-#. :ref:`Build from source <build_cpu_backend_from_source>`
-#. :ref:`Related runtime environment variables <env_intro>`
-#. :ref:`Intel Extension for PyTorch <ipex_guidance>`
-#. :ref:`Performance tips <cpu_backend_performance_tips>`
-
-.. _cpu_backend_requirements:
-
-Requirements
-------------
-
-* OS: Linux
-* Compiler: gcc/g++>=12.3.0 (optional, recommended)
-* Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)
-
-.. _cpu_backend_quick_start_dockerfile:
-
-Quick start using Dockerfile
-----------------------------
-
-.. code-block:: console
-
-    $ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
-    $ docker run -it \
-                 --rm \
-                 --network=host \
-                 --cpuset-cpus=<cpu-id-list, optional> \
-                 --cpuset-mems=<memory-node, optional> \
-                 vllm-cpu-env
-
-.. _build_cpu_backend_from_source:
-
-Build from source
------------------
-
-- First, install recommended compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
-
-.. code-block:: console
-
-    $ sudo apt-get update  -y
-    $ sudo apt-get install -y gcc-12 g++-12 libnuma-dev
-    $ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-
-- Second, install Python packages for vLLM CPU backend building:
-
-.. code-block:: console
-
-    $ pip install --upgrade pip
-    $ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
-    $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
-
-- Finally, build and install vLLM CPU backend: 
-
-.. code-block:: console
-
-    $ VLLM_TARGET_DEVICE=cpu python setup.py install
-
-.. note::
-    - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. 
-    
-    - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.    
-
-.. _env_intro:
-
-Related runtime environment variables
--------------------------------------
-
-- ``VLLM_CPU_KVCACHE_SPACE``: specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
-
-- ``VLLM_CPU_OMP_THREADS_BIND``: specify the CPU cores dedicated to the OpenMP threads. For example, ``VLLM_CPU_OMP_THREADS_BIND=0-31`` means there will be 32 OpenMP threads bound on 0-31 CPU cores. ``VLLM_CPU_OMP_THREADS_BIND=0-31|32-63`` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
-
-.. _ipex_guidance:
-
-Intel Extension for PyTorch
----------------------------
-
-- `Intel Extension for PyTorch (IPEX) <https://github.com/intel/intel-extension-for-pytorch>`_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
-
-.. _cpu_backend_performance_tips:
-
-Performance tips
------------------
-
-- We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run:
-
-.. code-block:: console
-
-    $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
-    $ find / -name *libtcmalloc* # find the dynamic link library path
-    $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-    $ python examples/offline_inference.py # run vLLM
-
-- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
-
-.. code-block:: console
-
-    $ export VLLM_CPU_KVCACHE_SPACE=40
-    $ export VLLM_CPU_OMP_THREADS_BIND=0-29 
-    $ vllm serve facebook/opt-125m
-
-- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using ``VLLM_CPU_OMP_THREADS_BIND``. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
-
-.. code-block:: console
-
-    $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
-
-    # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core. 
-    CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE    MAXMHZ   MINMHZ      MHZ
-    0    0      0    0 0:0:0:0          yes 2401.0000 800.0000  800.000
-    1    0      0    1 1:1:1:0          yes 2401.0000 800.0000  800.000
-    2    0      0    2 2:2:2:0          yes 2401.0000 800.0000  800.000
-    3    0      0    3 3:3:3:0          yes 2401.0000 800.0000  800.000
-    4    0      0    4 4:4:4:0          yes 2401.0000 800.0000  800.000
-    5    0      0    5 5:5:5:0          yes 2401.0000 800.0000  800.000
-    6    0      0    6 6:6:6:0          yes 2401.0000 800.0000  800.000
-    7    0      0    7 7:7:7:0          yes 2401.0000 800.0000  800.000
-    8    0      0    0 0:0:0:0          yes 2401.0000 800.0000  800.000
-    9    0      0    1 1:1:1:0          yes 2401.0000 800.0000  800.000
-    10   0      0    2 2:2:2:0          yes 2401.0000 800.0000  800.000
-    11   0      0    3 3:3:3:0          yes 2401.0000 800.0000  800.000
-    12   0      0    4 4:4:4:0          yes 2401.0000 800.0000  800.000
-    13   0      0    5 5:5:5:0          yes 2401.0000 800.0000  800.000
-    14   0      0    6 6:6:6:0          yes 2401.0000 800.0000  800.000
-    15   0      0    7 7:7:7:0          yes 2401.0000 800.0000  800.000
-
-    # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
-    $ export VLLM_CPU_OMP_THREADS_BIND=0-7 
-    $ python examples/offline_inference.py
-
-- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross NUMA node memory access.
-
-CPU Backend Considerations
---------------------------
-
-- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance.
-
-- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance.
-
-- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the `topology <https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa>`_. For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel.  
-
-  * Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With `TP feature on CPU <https://github.com/vllm-project/vllm/pull/6125>`_ merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
-
-    .. code-block:: console
-
-         $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
-
-
-  * Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like `Nginx <../serving/deploying_with_nginx.html>`_ or HAProxy are recommended. Anyscale Ray project provides the feature on LLM `serving <https://docs.ray.io/en/latest/serve/index.html>`_. Here is the example to setup a scalable LLM serving with `Ray Serve <https://github.com/intel/llm-on-ray/blob/main/docs/setup.md>`_.
\ No newline at end of file
diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/debugging.md
new file mode 100644
index 0000000000000..3b0029f2e88ce
--- /dev/null
+++ b/docs/source/getting_started/debugging.md
@@ -0,0 +1,200 @@
+(debugging)=
+
+# Debugging Tips
+
+This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
+
+```{note}
+Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
+```
+
+## Hangs downloading a model
+
+If the model isn't already downloaded to disk, vLLM will download it from the internet, which can take time depending on your internet connection.
+It's recommended to download the model first using the [huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli) and passing the local path to the model to vLLM. This way, you can isolate the issue.
+
+## Hangs loading a model from disk
+
+If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow.
+It'd be better to store the model on a local disk. Additionally, have a look at the CPU memory usage: when the model is too large, it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory.
+
+```{note}
+To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck.
+```
+
+## Model is too large
+
+If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
+
+## Enable more logging
+
+If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue:
+
+- `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging.
+- `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem.
+- `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL.
+- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs.
+
+## Incorrect network setup
+
+The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as `DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl` and the IP address should be the correct one.
+If it's not, override the IP address using the environment variable `export VLLM_HOST_IP=<your_ip_address>`.
+
+You might also need to set `export NCCL_SOCKET_IFNAME=<your_network_interface>` and `export GLOO_SOCKET_IFNAME=<your_network_interface>` to specify the network interface for the IP address.
+
+## Error near `self.graph.replay()`
+
+If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph.
+To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
+
+## Incorrect hardware/driver
+
+If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
+
+```python
+# Test PyTorch NCCL
+import torch
+import torch.distributed as dist
+dist.init_process_group(backend="nccl")
+local_rank = dist.get_rank() % torch.cuda.device_count()
+torch.cuda.set_device(local_rank)
+data = torch.FloatTensor([1,] * 128).to("cuda")
+dist.all_reduce(data, op=dist.ReduceOp.SUM)
+torch.cuda.synchronize()
+value = data.mean().item()
+world_size = dist.get_world_size()
+assert value == world_size, f"Expected {world_size}, got {value}"
+
+print("PyTorch NCCL is successful!")
+
+# Test PyTorch GLOO
+gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
+cpu_data = torch.FloatTensor([1,] * 128)
+dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
+value = cpu_data.mean().item()
+assert value == world_size, f"Expected {world_size}, got {value}"
+
+print("PyTorch GLOO is successful!")
+
+if world_size <= 1:
+    exit()
+
+# Test vLLM NCCL, with cuda graph
+from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+
+pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank)
+# pynccl is enabled by default for 0.6.5+,
+# but for 0.6.4 and below, we need to enable it manually.
+# keep the code for backward compatibility because people
+# prefer to read the latest documentation.
+pynccl.disabled = False
+
+s = torch.cuda.Stream()
+with torch.cuda.stream(s):
+    data.fill_(1)
+    pynccl.all_reduce(data, stream=s)
+    value = data.mean().item()
+    assert value == world_size, f"Expected {world_size}, got {value}"
+
+print("vLLM NCCL is successful!")
+
+g = torch.cuda.CUDAGraph()
+with torch.cuda.graph(cuda_graph=g, stream=s):
+    pynccl.all_reduce(data, stream=torch.cuda.current_stream())
+
+data.fill_(1)
+g.replay()
+torch.cuda.current_stream().synchronize()
+value = data.mean().item()
+assert value == world_size, f"Expected {world_size}, got {value}"
+
+print("vLLM NCCL with cuda graph is successful!")
+
+dist.destroy_process_group(gloo_group)
+dist.destroy_process_group()
+```
+
+If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use:
+
+```console
+$ NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
+```
+
+If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run:
+
+```console
+$ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
+```
+
+If the script runs successfully, you should see a `... is successful!` message for each test, ending with `vLLM NCCL with cuda graph is successful!` when more than one process is used.
+
+If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully.
+
+```{note}
+A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
+
+- In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`.
+- In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`.
+
+Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
+```
+
+(debugging-python-multiprocessing)=
+## Python multiprocessing
+
+### `RuntimeError` Exception
+
+If you have seen a warning in your logs like this:
+
+```console
+WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
+    initialized. We must use the `spawn` multiprocessing start method. Setting
+    VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
+    https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
+    for more information.
+```
+
+or an error from Python that looks like this:
+
+```console
+RuntimeError:
+        An attempt has been made to start a new process before the
+        current process has finished its bootstrapping phase.
+
+        This probably means that you are not using fork to start your
+        child processes and you have forgotten to use the proper idiom
+        in the main module:
+
+            if __name__ == '__main__':
+                freeze_support()
+                ...
+
+        The "freeze_support()" line can be omitted if the program
+        is not going to be frozen to produce an executable.
+
+        To fix this issue, refer to the "Safe importing of main module"
+        section in https://docs.python.org/3/library/multiprocessing.html
+```
+
+then you must update your Python code to guard usage of `vllm` behind an `if
+__name__ == '__main__':` block. For example, instead of this:
+
+```python
+import vllm
+
+llm = vllm.LLM(...)
+```
+
+try this instead:
+
+```python
+if __name__ == '__main__':
+    import vllm
+
+    llm = vllm.LLM(...)
+```
+
+## Known Issues
+
+- In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000), which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759).
+- To circumvent an NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234), all vLLM processes will set an environment variable `NCCL_CUMEM_ENABLE=0` to disable NCCL's `cuMem` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up an NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656).
diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst
deleted file mode 100644
index d6c83014dc69f..0000000000000
--- a/docs/source/getting_started/debugging.rst
+++ /dev/null
@@ -1,197 +0,0 @@
-.. _debugging:
-
-===============
-Debugging Tips
-===============
-
-This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please `search existing issues <https://github.com/vllm-project/vllm/issues?q=is%3Aissue>`_ first to see if it has already been reported. If not, please `file a new issue <https://github.com/vllm-project/vllm/issues/new/choose>`_, providing as much relevant information as possible.
-
-.. note::
-
-    Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
-
-Hangs downloading a model 
-----------------------------------------
-If the model isn't already downloaded to disk, vLLM will download it from the internet which can take time and depend on your internet connection. 
-It's recommended to download the model first using the `huggingface-cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and passing the local path to the model to vLLM. This way, you can isolate the issue.
-
-Hangs loading a model from disk
-----------------------------------------
-If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. 
-It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory.
-
-.. note::
-
-    To isolate the model downloading and loading issue, you can use the ``--load-format dummy`` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck.
-
-Model is too large
-----------------------------------------
-If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism <https://docs.vllm.ai/en/latest/serving/distributed_serving.html#distributed-inference-and-serving>`_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
-
-Enable more logging 
-----------------------------------------
-If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue:
-
-- ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging.
-- ``export CUDA_LAUNCH_BLOCKING=1`` to identify which CUDA kernel is causing the problem.
-- ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL.
-- ``export VLLM_TRACE_FUNCTION=1`` to record all function calls for inspection in the log files to tell which function crashes or hangs.
-
-Incorrect network setup
-----------------------------------------
-The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl`` and the IP address should be the correct one. 
-If it's not, override the IP address using the environment variable ``export VLLM_HOST_IP=<your_ip_address>``. 
-
-You might also need to set ``export NCCL_SOCKET_IFNAME=<your_network_interface>`` and ``export GLOO_SOCKET_IFNAME=<your_network_interface>`` to specify the network interface for the IP address.
-
-Error near ``self.graph.replay()`` 
-----------------------------------------
-If vLLM crashes and the error trace captures it somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a CUDA error inside CUDAGraph. 
-To identify the particular CUDA operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
-
-Incorrect hardware/driver
-----------------------------------------
-If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
-
-.. code-block:: python
-
-    # Test PyTorch NCCL
-    import torch
-    import torch.distributed as dist
-    dist.init_process_group(backend="nccl")
-    local_rank = dist.get_rank() % torch.cuda.device_count()
-    torch.cuda.set_device(local_rank)
-    data = torch.FloatTensor([1,] * 128).to("cuda")
-    dist.all_reduce(data, op=dist.ReduceOp.SUM)
-    torch.cuda.synchronize()
-    value = data.mean().item()
-    world_size = dist.get_world_size()
-    assert value == world_size, f"Expected {world_size}, got {value}"
-
-    print("PyTorch NCCL is successful!")
-
-    # Test PyTorch GLOO
-    gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
-    cpu_data = torch.FloatTensor([1,] * 128)
-    dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
-    value = cpu_data.mean().item()
-    assert value == world_size, f"Expected {world_size}, got {value}"
-
-    print("PyTorch GLOO is successful!")
-
-    if world_size <= 1:
-        exit()
-
-    # Test vLLM NCCL, with cuda graph
-    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
-
-    pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank)
-
-    s = torch.cuda.Stream()
-    with torch.cuda.stream(s):
-        data.fill_(1)
-        pynccl.all_reduce(data, stream=s)
-        value = data.mean().item()
-        assert value == world_size, f"Expected {world_size}, got {value}"
-
-    print("vLLM NCCL is successful!")
-
-    g = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(cuda_graph=g, stream=s):
-        pynccl.all_reduce(data, stream=torch.cuda.current_stream())
-
-    data.fill_(1)
-    g.replay()
-    torch.cuda.current_stream().synchronize()
-    value = data.mean().item()
-    assert value == world_size, f"Expected {world_size}, got {value}"
-
-    print("vLLM NCCL with cuda graph is successful!")
-
-    dist.destroy_process_group(gloo_group)
-    dist.destroy_process_group()
-
-If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use:
-
-.. code-block:: console
-
-    $ NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
-
-If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run:
-
-.. code-block:: console
-
-    $ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
-
-If the script runs successfully, you should see the message ``sanity check is successful!``.
-
-If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as ``export NCCL_P2P_DISABLE=1`` to see if it helps. Please check `their documentation <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`__ for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully.
-
-.. note::
-
-    A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
-
-    - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``.
-    - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``.
-
-    Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup, being sure to execute different commands (with different ``--node-rank``) on different nodes.
-
-Python multiprocessing
-----------------------
-
-`RuntimeError` Exception
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-If you have seen a warning in your logs like this:
-
-.. code-block:: console
-
-    WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
-        initialized. We must use the `spawn` multiprocessing start method. Setting
-        VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
-        https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
-        for more information.
-
-or an error from Python that looks like this:
-
-.. code-block:: console
-
-    RuntimeError:
-            An attempt has been made to start a new process before the
-            current process has finished its bootstrapping phase.
-
-            This probably means that you are not using fork to start your
-            child processes and you have forgotten to use the proper idiom
-            in the main module:
-
-                if __name__ == '__main__':
-                    freeze_support()
-                    ...
-
-            The "freeze_support()" line can be omitted if the program
-            is not going to be frozen to produce an executable.
-
-            To fix this issue, refer to the "Safe importing of main module"
-            section in https://docs.python.org/3/library/multiprocessing.html
-
-then you must update your Python code to guard usage of ``vllm`` behind a ``if
-__name__ == '__main__':`` block. For example, instead of this:
-
-.. code-block:: python
-
-    import vllm
-
-    llm = vllm.LLM(...)
-
-try this instead:
-
-.. code-block:: python
-
-    if __name__ == '__main__':
-        import vllm
-
-        llm = vllm.LLM(...)
-
-Known Issues
-----------------------------------------
-- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq <https://github.com/zeromq/pyzmq/issues/2000>`_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix <https://github.com/vllm-project/vllm/pull/6759>`_.
diff --git a/docs/source/getting_started/examples/examples_index.template.md b/docs/source/getting_started/examples/examples_index.template.md
new file mode 100644
index 0000000000000..de7a91c0ffa48
--- /dev/null
+++ b/docs/source/getting_started/examples/examples_index.template.md
@@ -0,0 +1,8 @@
+# Examples
+
+```{toctree}
+:maxdepth: 1
+:caption: Scripts
+
+%EXAMPLE_DOCS%
+```
\ No newline at end of file
diff --git a/docs/source/getting_started/examples/examples_index.template.rst b/docs/source/getting_started/examples/examples_index.template.rst
deleted file mode 100644
index 1b34cccbae15a..0000000000000
--- a/docs/source/getting_started/examples/examples_index.template.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-Examples
-=================================
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Scripts
-
-   %EXAMPLE_DOCS%
diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/gaudi-installation.md
new file mode 100644
index 0000000000000..acf42f210dffb
--- /dev/null
+++ b/docs/source/getting_started/gaudi-installation.md
@@ -0,0 +1,386 @@
+# Installation with Intel® Gaudi® AI Accelerators
+
+This README provides instructions on running vLLM with Intel Gaudi devices.
+
+## Requirements and Installation
+
+Please follow the instructions provided in the [Gaudi Installation
+Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html)
+to set up the execution environment. To achieve the best performance,
+please follow the methods outlined in the [Optimizing Training Platform
+Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).
+
+### Requirements
+
+- OS: Ubuntu 22.04 LTS
+- Python: 3.10
+- Intel Gaudi accelerator
+- Intel Gaudi software version 1.18.0
+
+### Quick start using Dockerfile
+
+```console
+$ docker build -f Dockerfile.hpu -t vllm-hpu-env  .
+$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
+```
+
+```{tip}
+If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered.
+```
+
+### Build from source
+
+#### Environment verification
+
+To verify that the Intel Gaudi software was correctly installed, run:
+
+```console
+$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
+$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
+$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
+$ pip list | grep neural # verify that neural_compressor is installed
+```
+
+Refer to [Intel Gaudi Software Stack
+Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade)
+for more details.
+
+#### Run Docker Image
+
+It is highly recommended to use the latest Docker image from Intel Gaudi
+vault. Refer to the [Intel Gaudi
+documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers)
+for more details.
+
+Use the following commands to run a Docker image:
+
+```console
+$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+```
+
+#### Build and Install vLLM
+
+To build and install vLLM from source, run:
+
+```console
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ python setup.py develop
+```
+
+Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to the vLLM main repo. To install the latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:
+
+```console
+$ git clone https://github.com/HabanaAI/vllm-fork.git
+$ cd vllm-fork
+$ git checkout habana_main
+$ python setup.py develop
+```
+
+## Supported Features
+
+- [Offline batched inference](#offline-batched-inference)
+- Online inference via [OpenAI-Compatible Server](#openai-compatible-server)
+- HPU autodetection - no need to manually select device within vLLM
+- Paged KV cache with algorithms enabled for Intel Gaudi accelerators
+- Custom Intel Gaudi implementations of Paged Attention, KV cache ops,
+  prefill attention, Root Mean Square Layer Normalization, Rotary
+  Positional Encoding
+- Tensor parallelism support for multi-card inference
+- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
+  for accelerating low-batch latency and throughput
+- Attention with Linear Biases (ALiBi)
+
+## Unsupported Features
+
+- Beam search
+- LoRA adapters
+- Quantization
+- Prefill chunking (mixed-batch inferencing)
+
+## Supported Configurations
+
+The following configurations have been validated to function with
+Gaudi2 devices. Configurations that are not listed may or may not work.
+
+- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b)
+  on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+  datatype with random or greedy sampling
+- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
+  on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+  datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
+  on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+  datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
+  on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+  datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B)
+  on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+  datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
+  on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+  datatype with random or greedy sampling
+- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b)
+  with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)
+  with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)
+  with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)
+  with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B)
+  with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct)
+  with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+
+## Performance Tuning
+
+### Execution modes
+
+Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag.
+
+```{eval-rst}
+.. list-table:: vLLM execution modes
+   :widths: 25 25 50
+   :header-rows: 1
+
+   * - ``PT_HPU_LAZY_MODE``
+     - ``enforce_eager``
+     - execution mode
+   * - 0
+     - 0
+     - torch.compile
+   * - 0
+     - 1
+     - PyTorch eager mode
+   * - 1
+     - 0
+     - HPU Graphs
+   * - 1
+     - 1
+     - PyTorch lazy mode
+```
+
+```{warning}
+In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
+```
+
+### Bucketing mechanism
+
+Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
+In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`.
+
+```{note}
+Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
+```
+
+Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
+
+```
+INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
+INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
+INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
+INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
+```
+
+`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
+
+Example (with ramp-up)
+
+```
+min = 2, step = 32, max = 64
+=> ramp_up = (2, 4, 8, 16)
+=> stable = (32, 64)
+=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
+```
+
+Example (without ramp-up)
+
+```
+min = 128, step = 128, max = 512
+=> ramp_up = ()
+=> stable = (128, 256, 384, 512)
+=> buckets = ramp_up + stable => (128, 256, 384, 512)
+```
+
+In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket.
+
+```{warning}
+If a request exceeds the maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such a scenario.
+```
+
+As an example, if a request of 3 sequences, with a max sequence length of 412, comes in to an idle vLLM server, it will be padded and executed as a `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After the prefill stage, it will be executed as a `(4, 512)` decode bucket and will continue as that bucket until either the batch dimension changes (due to a request being finished) - in which case it will become a `(2, 512)` bucket, or the context length increases above 512 tokens, in which case it will become a `(4, 640)` bucket.
+
+```{note}
+Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
+```
+
+### Warmup
+
+Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
+
+```
+INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
+INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
+INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
+...
+INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
+INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
+INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
+INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
+...
+INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB
+INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
+```
+
+This example uses the same buckets as in the *Bucketing mechanism* section. Each output line corresponds to the execution of a single bucket. When a bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
+
+```{tip}
+Compiling all the buckets might take some time and can be turned off with the `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations when executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
+```
+
+### HPU Graph capture
+
+[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management.
+
+When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by `gpu_memory_utilization` flag (`0.9` by default).
+Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage.
+Only after that, the `gpu_memory_utilization` flag is utilized - at its default value, it will mark 90% of free device memory at that point as usable.
+Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured.
+Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture.
+With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache.
+Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), 30% of usable graph memory is reserved for prefill graphs, and 70% for decode graphs.
+Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.
+
+```{note}
+`gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory.
+```
+
+User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented:
+- `max_bs` - graph capture queue will be sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode
+- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt
+
+When there is a large number of requests pending, the vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, the decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, the decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by the `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in the `min_tokens` strategy.
+
+```{note}
+`VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up the entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs; next, it will attempt to do the same for decode graphs and the usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within the usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding the reserved memory pool. The behavior of that mechanism can be observed in the example below.
+```
+
+Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
+
+```
+INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
+INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
+INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
+INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
+INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
+INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used)
+INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
+INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used)
+INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache
+INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0
+INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used)
+INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
+...
+INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
+INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
+INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
+...
+INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
+INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB
+...
+INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB
+INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB
+INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB
+INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB
+INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB
+INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)]
+INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
+INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
+INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
+```
+
+### Recommended vLLM Parameters
+
+- We recommend running inference on Gaudi 2 with `block_size` of 128
+  for BF16 data type. Using default values (16, 32) might lead to
+  sub-optimal performance due to Matrix Multiplication Engine
+  under-utilization (see [Gaudi
+  Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)).
+- For max throughput on Llama 7B, we recommend running with batch size
+  of 128 or 256 and max context length of 2048 with HPU Graphs enabled.
+  If you encounter out-of-memory issues, see the troubleshooting section.
+
+### Environment variables
+
+**Diagnostic and profiling knobs:**
+
+- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be enabled. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default.
+- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, graph compilations will be logged for each vLLM engine step, but only for steps in which any occurred. It is highly recommended to use this alongside `PT_HPU_METRICS_GC_DETAILS=1`. Disabled by default.
+- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, graph compilations will be logged for every vLLM engine step, even if there were none. Disabled by default.
+- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, CPU fallbacks will be logged for each vLLM engine step, but only for steps in which any occurred. Disabled by default.
+- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, CPU fallbacks will be logged for every vLLM engine step, even if there were none. Disabled by default.
+
+**Performance tuning knobs:**
+
+- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default
+
+- `VLLM_GRAPH_RESERVED_MEM`: fraction of usable memory dedicated to HPUGraph capture, `0.1` by default
+
+- `VLLM_GRAPH_PROMPT_RATIO`: fraction of reserved graph memory dedicated to prompt graphs, `0.3` by default
+
+- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default
+
+- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default
+
+- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism
+
+  - `{phase}` is either `PROMPT` or `DECODE`
+
+  - `{dim}` is either `BS`, `SEQ` or `BLOCK`
+
+  - `{param}` is either `MIN`, `STEP` or `MAX`
+
+  - Default values:
+
+    - Prompt:
+      : - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1`
+        - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
+        - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)`
+        - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size`
+        - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size`
+        - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len`
+    - Decode:
+      : - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1`
+        - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
+        - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs`
+        - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size`
+        - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size`
+        - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)`
+
+Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:
+
+- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default
+- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs
+
+## Troubleshooting: Tweaking HPU Graphs
+
+If you experience device out-of-memory issues or want to attempt
+inference at higher batch sizes, try tweaking HPU Graphs as described
+below:
+
+- Lower the `gpu_memory_utilization` knob. Doing so decreases the
+  allocation of KV cache, leaving some headroom for capturing graphs
+  with larger batch sizes. By default, `gpu_memory_utilization` is set
+  to 0.9, which attempts to allocate ~90% of the HBM left after a
+  short profiling run for KV cache. Note that decreasing it reduces
+  the number of KV cache blocks you have available, and therefore
+  reduces the effective maximum number of tokens you can handle at a
+  given time.
+- If this method is not effective, you can disable `HPUGraph`
+  completely. With HPU Graphs disabled, you are trading latency and
+  throughput at lower batches for potentially higher throughput on
+  higher batches. You can do that by adding the `--enforce-eager` flag
+  to the server (for online inference), or by passing the
+  `enforce_eager=True` argument to the LLM constructor (for offline
+  inference).
diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst
deleted file mode 100644
index 249e08278ff8f..0000000000000
--- a/docs/source/getting_started/gaudi-installation.rst
+++ /dev/null
@@ -1,402 +0,0 @@
-Installation with Intel® Gaudi® AI Accelerators
-===============================================
-
-This README provides instructions on running vLLM with Intel Gaudi devices.
-
-Requirements and Installation
------------------------------
-
-Please follow the instructions provided in the `Gaudi Installation
-Guide <https://docs.habana.ai/en/latest/Installation_Guide/index.html>`__
-to set up the execution environment. To achieve the best performance,
-please follow the methods outlined in the `Optimizing Training Platform
-Guide <https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html>`__.
-
-Requirements
-~~~~~~~~~~~~
-
--  OS: Ubuntu 22.04 LTS
--  Python: 3.10
--  Intel Gaudi accelerator
--  Intel Gaudi software version 1.18.0
-
-
-Quick start using Dockerfile
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. code:: console
-
-   $ docker build -f Dockerfile.hpu -t vllm-hpu-env  .
-   $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
-
-
-.. tip::
-   If you're observing the following error: ``docker: Error response from daemon: Unknown runtime specified habana.``, please refer to "Install Using Containers" section of `Intel Gaudi Software Stack and Driver Installation <https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html>`__. Make sure you have ``habana-container-runtime`` package installed and that ``habana`` container runtime is registered.
-
-
-Build from source
-~~~~~~~~~~~~~~~~~
-
-Environment verification
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-To verify that the Intel Gaudi software was correctly installed, run:
-
-.. code:: console
-
-   $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
-   $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
-   $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
-   $ pip list | grep neural # verify that neural_compressor is installed
-
-Refer to `Intel Gaudi Software Stack
-Verification <https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade>`__
-for more details.
-
-Run Docker Image
-^^^^^^^^^^^^^^^^
-
-It is highly recommended to use the latest Docker image from Intel Gaudi
-vault. Refer to the `Intel Gaudi
-documentation <https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers>`__
-for more details.
-
-Use the following commands to run a Docker image:
-
-.. code:: console
-
-   $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-   $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-
-Build and Install vLLM
-^^^^^^^^^^^^^^^^^^^^^^
-
-To build and install vLLM from source, run:
-
-.. code:: console
-
-   $ git clone https://github.com/vllm-project/vllm.git
-   $ cd vllm
-   $ python setup.py develop
-
-
-Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork <https://github.com/HabanaAI/vllm-fork>`__ and we periodically upstream them to vLLM main repo. To install latest `HabanaAI/vLLM-fork <https://github.com/HabanaAI/vllm-fork>`__, run the following:
-
-.. code:: console
-
-   $ git clone https://github.com/HabanaAI/vllm-fork.git
-   $ cd vllm-fork
-   $ git checkout habana_main
-   $ python setup.py develop
-
-
-Supported Features
-------------------
-
--  `Offline batched
-   inference <https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference>`__
--  Online inference via `OpenAI-Compatible
-   Server <https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server>`__
--  HPU autodetection - no need to manually select device within vLLM
--  Paged KV cache with algorithms enabled for Intel Gaudi accelerators
--  Custom Intel Gaudi implementations of Paged Attention, KV cache ops,
-   prefill attention, Root Mean Square Layer Normalization, Rotary
-   Positional Encoding
--  Tensor parallelism support for multi-card inference
--  Inference with `HPU Graphs <https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html>`__
-   for accelerating low-batch latency and throughput
--  Attention with Linear Biases (ALiBi)
-
-Unsupported Features
---------------------
-
--  Beam search
--  LoRA adapters
--  Quantization
--  Prefill chunking (mixed-batch inferencing)
-
-Supported Configurations
-------------------------
-
-The following configurations have been validated to be function with
-Gaudi2 devices. Configurations that are not listed may or may not work.
-
--  `meta-llama/Llama-2-7b <https://huggingface.co/meta-llama/Llama-2-7b>`__
-   on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
-   datatype with random or greedy sampling
--  `meta-llama/Llama-2-7b-chat-hf <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`__
-   on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
-   datatype with random or greedy sampling
--  `meta-llama/Meta-Llama-3-8B <https://huggingface.co/meta-llama/Meta-Llama-3-8B>`__
-   on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
-   datatype with random or greedy sampling
--  `meta-llama/Meta-Llama-3-8B-Instruct <https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct>`__
-   on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
-   datatype with random or greedy sampling
--  `meta-llama/Meta-Llama-3.1-8B <https://huggingface.co/meta-llama/Meta-Llama-3.1-8B>`__
-   on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
-   datatype with random or greedy sampling
--  `meta-llama/Meta-Llama-3.1-8B-Instruct <https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct>`__
-   on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
-   datatype with random or greedy sampling
--  `meta-llama/Llama-2-70b <https://huggingface.co/meta-llama/Llama-2-70b>`__
-   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
--  `meta-llama/Llama-2-70b-chat-hf <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`__
-   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
--  `meta-llama/Meta-Llama-3-70B <https://huggingface.co/meta-llama/Meta-Llama-3-70B>`__
-   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
--  `meta-llama/Meta-Llama-3-70B-Instruct <https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct>`__
-   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
--  `meta-llama/Meta-Llama-3.1-70B <https://huggingface.co/meta-llama/Meta-Llama-3.1-70B>`__
-   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
--  `meta-llama/Meta-Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct>`__
-   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
-
-Performance Tuning
-------------------
-
-Execution modes
-~~~~~~~~~~~~~~~
-
-Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag.  
-
-.. list-table:: vLLM execution modes
-   :widths: 25 25 50
-   :header-rows: 1
-
-   * - ``PT_HPU_LAZY_MODE``
-     - ``enforce_eager`` 
-     - execution mode
-   * - 0
-     - 0
-     - torch.compile
-   * - 0
-     - 1
-     - PyTorch eager mode
-   * - 1
-     - 0
-     - HPU Graphs
-   * - 1
-     - 1
-     - PyTorch lazy mode
-
-.. warning::
-   In 1.18.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
-
-
-Bucketing mechanism
-~~~~~~~~~~~~~~~~~~~
-
-Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler <https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime>`__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
-In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. 
-
-.. note::
-   Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
-
-Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max``. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
-
-.. code-block::
-
-      INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
-      INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
-      INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
-      INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
-
-``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, interval between ``min`` and ``step`` has special handling - ``min`` gets multiplied by consecutive powers of two, until ``step`` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
-
-Example (with ramp-up)
-
-.. code-block:: 
-   
-    min = 2, step = 32, max = 64
-    => ramp_up = (2, 4, 8, 16)
-    => stable = (32, 64)
-    => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
-
-Example (without ramp-up)
-
-.. code-block:: 
-   
-    min = 128, step = 128, max = 512
-    => ramp_up = ()
-    => stable = (128, 256, 384, 512)
-    => buckets = ramp_up + stable => (128, 256, 384, 512)
-
-
-In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. 
-
-.. warning::
-   If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario.
-
-As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as ``(4, 512)`` prefill bucket, as ``batch_size`` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as ``(4, 512)`` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a ``(2, 512)`` bucket, or context length increases above 512 tokens, in which case it will become ``(4, 640)`` bucket. 
-
-.. note::
-   Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
-
-Warmup
-~~~~~~
-
-Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
-
-.. code-block::
-
-   INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
-   INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
-   INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
-   ...
-   INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
-   INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
-   INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
-   INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
-   ...
-   INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB
-   INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
-
-This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. 
-
-.. tip::
-   Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
-
-HPU Graph capture
-~~~~~~~~~~~~~~~~~
-
-`HPU Graphs <https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html>`__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management.
-
-
-When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by ``gpu_memory_utilization`` flag (``0.9`` by default). 
-Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. 
-Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value,  will mark 90% of free device memory at that point as usable.
-Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. 
-Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. 
-With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. 
-Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.3``), both stages have equal memory constraints.
-Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. 
-
-.. note:: 
-   ``gpu_memory_utilization`` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, ``gpu_memory_utilization`` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory.   
-
-User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented:
--    ``max_bs`` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. ``(64, 128)``, ``(64, 256)``, ``(32, 128)``, ``(32, 256)``, ``(1, 128)``, ``(1,256)``), default strategy for decode
--    ``min_tokens`` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (``batch_size*sequence_length``), default strategy for prompt
-
-When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by ``max_bs`` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in ``min_tokens`` strategy.
-
-
-.. note::
-   ``VLLM_GRAPH_PROMPT_RATIO`` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * ``VLLM_GRAPH_PROMPT_RATIO``) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below.
-
-
-Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
-
-.. code-block::
-
-   INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
-   INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
-   INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
-   INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
-   INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
-   INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used)
-   INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
-   INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used)
-   INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache
-   INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0
-   INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used)
-   INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
-   ...
-   INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
-   INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
-   INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
-   ...
-   INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
-   INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB
-   ...
-   INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB
-   INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB
-   INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB
-   INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB
-   INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB
-   INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)]
-   INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
-   INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
-   INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
-
-
-Recommended vLLM Parameters
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
--  We recommend running inference on Gaudi 2 with ``block_size`` of 128
-   for BF16 data type. Using default values (16, 32) might lead to
-   sub-optimal performance due to Matrix Multiplication Engine
-   under-utilization (see `Gaudi
-   Architecture <https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html>`__).
--  For max throughput on Llama 7B, we recommend running with batch size
-   of 128 or 256 and max context length of 2048 with HPU Graphs enabled.
-   If you encounter out-of-memory issues, see troubleshooting section.
-
-Environment variables
-~~~~~~~~~~~~~~~~~~~~~
-
-**Diagnostic and profiling knobs:**
-
--   ``VLLM_PROFILER_ENABLED``: if ``true``, high level profiler will be enabled. Resulting JSON traces can be viewed in `perfetto.habana.ai <https://perfetto.habana.ai/#!/viewer>`__. Disabled by default.
--   ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION``: if ``true``, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside ``PT_HPU_METRICS_GC_DETAILS=1``. Disabled by default.
--   ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL``: if ``true``, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default.
--   ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS``: if ``true``, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default.
--   ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL``: if ``true``, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default.
-
-**Performance tuning knobs:**
-
--   ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default
--   ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default
--   ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.3`` by default
--   ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default
--   ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default
--   ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism
-
-    - ``{phase}`` is either ``PROMPT`` or ``DECODE``
-    - ``{dim}`` is either ``BS``, ``SEQ`` or ``BLOCK``
-    - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX``
-    - Default values:
-
-      - Prompt:
-         - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1``
-         - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)``
-         - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)``
-         - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size``
-         - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size``
-         - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len``
-
-      - Decode:
-         - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1``
-         - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)``
-         - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs``
-         - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size``
-         - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size``
-         - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)``
-
-
-Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:  
-
--   ``PT_HPU_LAZY_MODE``: if ``0``, PyTorch Eager backend for Gaudi will be used, if ``1`` PyTorch Lazy backend for Gaudi will be used, ``1`` is default 
--   ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs
-
-Troubleshooting: Tweaking HPU Graphs
-------------------------------------
-
-If you experience device out-of-memory issues or want to attempt
-inference at higher batch sizes, try tweaking HPU Graphs by following
-the below:
-
--  Tweak ``gpu_memory_utilization`` knob. It will decrease the
-   allocation of KV cache, leaving some headroom for capturing graphs
-   with larger batch size. By default ``gpu_memory_utilization`` is set
-   to 0.9. It attempts to allocate ~90% of HBM left for KV cache after
-   short profiling run. Note that decreasing reduces the number of KV
-   cache blocks you have available, and therefore reduces the effective
-   maximum number of tokens you can handle at a given time.
-
--  If this method is not efficient, you can disable ``HPUGraph``
-   completely. With HPU Graphs disabled, you are trading latency and
-   throughput at lower batches for potentially higher throughput on
-   higher batches. You can do that by adding ``--enforce-eager`` flag to
-   server (for online inference), or by passing ``enforce_eager=True``
-   argument to LLM constructor (for offline inference).
diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation.md
new file mode 100644
index 0000000000000..996fb346f43d4
--- /dev/null
+++ b/docs/source/getting_started/installation.md
@@ -0,0 +1,199 @@
+(installation)=
+
+# Installation
+
+vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
+
+## Requirements
+
+- OS: Linux
+- Python: 3.9 -- 3.12
+- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
+
+## Install released versions
+
+You can install vLLM using pip:
+
+```console
+$ # (Recommended) Create a new conda environment.
+$ conda create -n myenv python=3.12 -y
+$ conda activate myenv
+
+$ # Install vLLM with CUDA 12.1.
+$ pip install vllm
+```
+
+```{note}
+Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
+```
+
+````{note}
+As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.
+We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
+
+```console
+$ # Install vLLM with CUDA 11.8.
+$ export VLLM_VERSION=0.6.1.post1
+$ export PYTHON_VERSION=310
+$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
+```
+
+In order to be performant, vLLM has to compile many CUDA kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
+
+Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If you either have a different CUDA version or want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
+````
+
+(install-the-latest-code)=
+
+## Install the latest code
+
+LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`. You can download and install the latest wheel with the following command:
+
+```console
+$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+```
+
+If you want to access the wheels for previous commits, you can specify the commit hash in the URL:
+
+```console
+$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+```
+
+Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
+
+Another way to access the latest code is to use the docker images:
+
+```console
+$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
+```
+
+These docker images are used for CI and testing only, and they are not intended for production use. They expire after several days.
+
+The latest code can contain bugs and may not be stable. Please use it with caution.
+
+(build-from-source)=
+
+## Build from source
+
+(python-only-build)=
+
+### Python-only build (without compilation)
+
+If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM:
+
+```console
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ VLLM_USE_PRECOMPILED=1 pip install --editable .
+```
+
+This will download the latest nightly wheel and use the compiled libraries from there in the install.
+
+The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.3.post1 PyPI wheel](https://pypi.org/project/vllm/#files):
+
+```console
+$ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl
+$ pip install --editable .
+```
+
+You can find more information about vLLM's wheels [above](#install-the-latest-code).
+
+```{note}
+There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
+It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [the section above](#install-the-latest-code) for instructions on how to install a specified wheel.
+```
+
+### Full build (with compilation)
+
+If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
+
+```console
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ pip install -e .
+```
+
+```{tip}
+Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
+
+For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache`.
+As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
+
+[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
+The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
+```
+
+#### Use an existing PyTorch installation
+
+There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.:
+
+- Building vLLM with PyTorch nightly or a custom PyTorch build.
+- Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124` to [install PyTorch nightly](https://pytorch.org/get-started/locally/), and then build vLLM on top of it.
+
+To build vLLM using an existing PyTorch installation:
+
+```console
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ python use_existing_torch.py
+$ pip install -r requirements-build.txt
+$ pip install -e . --no-build-isolation
+```
+
+#### Use the local cutlass for compilation
+
+Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead.
+To achieve this, you can set the environment variable `VLLM_CUTLASS_SRC_DIR` to point to your local cutlass directory.
+
+```console
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
+```
+
+#### Troubleshooting
+
+To avoid your system being overloaded, you can limit the number of compilation jobs
+to be run simultaneously, via the environment variable `MAX_JOBS`. For example:
+
+```console
+$ export MAX_JOBS=6
+$ pip install -e .
+```
+
+This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory.
+A side effect is a much slower build process.
+
+Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
+
+```console
+$ # Use `--ipc=host` to make sure the shared memory is large enough.
+$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
+```
+
+If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.:
+
+```console
+$ export CUDA_HOME=/usr/local/cuda
+$ export PATH="${CUDA_HOME}/bin:$PATH"
+```
+
+Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
+
+```console
+$ nvcc --version # verify that nvcc is in your PATH
+$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
+```
+
+### Unsupported OS build
+
+vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
+
+Simply set the `VLLM_TARGET_DEVICE` environment variable to `empty` before installing:
+
+```console
+$ export VLLM_TARGET_DEVICE=empty
+$ pip install -e .
+```
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
deleted file mode 100644
index 9b6cb0e80d60e..0000000000000
--- a/docs/source/getting_started/installation.rst
+++ /dev/null
@@ -1,214 +0,0 @@
-.. _installation:
-
-============
-Installation
-============
-
-vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
-
-Requirements
-============
-
-* OS: Linux
-* Python: 3.9 -- 3.12
-* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
-
-Install released versions
-=========================
-
-You can install vLLM using pip:
-
-.. code-block:: console
-
-    $ # (Recommended) Create a new conda environment.
-    $ conda create -n myenv python=3.12 -y
-    $ conda activate myenv
-
-    $ # Install vLLM with CUDA 12.1.
-    $ pip install vllm
-
-.. note::
-
-    Although we recommend using ``conda`` to create and manage Python environments, it is highly recommended to use ``pip`` to install vLLM. This is because ``pip`` can install ``torch`` with separate library packages like ``NCCL``, while ``conda`` installs ``torch`` with statically linked ``NCCL``. This can cause issues when vLLM tries to use ``NCCL``. See `this issue <https://github.com/vllm-project/vllm/issues/8420>`_ for more details.
-
-.. note::
-
-    As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.
-    We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
-
-    .. code-block:: console
-
-        $ # Install vLLM with CUDA 11.8.
-        $ export VLLM_VERSION=0.6.1.post1
-        $ export PYTHON_VERSION=310
-        $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
-
-    In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
-
-    Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
-
-
-.. _install-the-latest-code:
-
-Install the latest code
-=======================
-
-LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since ``v0.5.3``. You can download and install it with the following command:
-
-.. code-block:: console
-
-    $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
-
-If you want to access the wheels for previous commits, you can specify the commit hash in the URL:
-
-.. code-block:: console
-
-    $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-    $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
-
-Note that the wheels are built with Python 3.8 ABI (see `PEP 425 <https://peps.python.org/pep-0425/>`_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
-
-Another way to access the latest code is to use the docker images:
-
-.. code-block:: console
-
-    $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-    $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
-
-These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days.
-
-The latest code can contain bugs and may not be stable. Please use it with caution.
-
-.. _build_from_source:
-
-Build from source
-=================
-
-.. _python-only-build:
-
-Python-only build (without compilation)
----------------------------------------
-
-If you only need to change Python code, you can build and install vLLM without compilation. Using `pip's ``--editable`` flag <https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs>`_, changes you make to the code will be reflected when you run vLLM:
-
-.. code-block:: console
-
-    $ git clone https://github.com/vllm-project/vllm.git
-    $ cd vllm
-    $ VLLM_USE_PRECOMPILED=1 pip install --editable .
-
-This will download the latest nightly wheel and use the compiled libraries from there in the install.
-
-The ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable can be used instead of ``VLLM_USE_PRECOMPILED`` to specify a custom path or URL to the wheel file. For example, to use the `0.6.1.post1 PyPi wheel <https://pypi.org/project/vllm/#files>`_:
-
-.. code-block:: console
-
-   $ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl
-   $ pip install --editable .
-
-You can find more information about vLLM's wheels `above <#install-the-latest-code>`_.
-
-.. note::
-
-    There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
-    It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the section above <#install-the-latest-code>`_ for instructions on how to install a specified wheel.
-
-Full build (with compilation)
------------------------------
-
-If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
-
-.. code-block:: console
-
-    $ git clone https://github.com/vllm-project/vllm.git
-    $ cd vllm
-    $ pip install -e .
-
-.. tip::
-
-    Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
-
-    For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` .
-    As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
-
-    `sccache <https://github.com/mozilla/sccache>`_ works similarly to ``ccache``, but has the capability to utilize caching in remote storage environments.
-    The following environment variables can be set to configure the vLLM ``sccache`` remote: ``SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1``. We also recommend setting ``SCCACHE_IDLE_TIMEOUT=0``.
-
-
-Use an existing PyTorch installation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.:
-
-* Building vLLM with PyTorch nightly or a custom PyTorch build.
-* Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run ``pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124`` to `install PyTorch nightly <https://pytorch.org/get-started/locally/>`_, and then build vLLM on top of it.
-
-To build vLLM using an existing PyTorch installation:
-
-.. code-block:: console
-
-    $ git clone https://github.com/vllm-project/vllm.git
-    $ cd vllm
-    $ python use_existing_torch.py
-    $ pip install -r requirements-build.txt
-    $ pip install -e . --no-build-isolation
-
-
-Use the local cutlass for compilation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead.
-To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory.
-
-.. code-block:: console
-
-    $ git clone https://github.com/vllm-project/vllm.git
-    $ cd vllm
-    $ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
-
-
-Troubleshooting
-~~~~~~~~~~~~~~~
-
-To avoid your system being overloaded, you can limit the number of compilation jobs
-to be run simultaneously, via the environment variable ``MAX_JOBS``. For example:
-
-.. code-block:: console
-
-    $ export MAX_JOBS=6
-    $ pip install -e .
-
-This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory.
-A side effect is a much slower build process.
-
-Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
-
-.. code-block:: console
-
-    $ # Use `--ipc=host` to make sure the shared memory is large enough.
-    $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
-
-If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website <https://developer.nvidia.com/cuda-toolkit-archive>`_. After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.:
-
-.. code-block:: console
-
-    $ export CUDA_HOME=/usr/local/cuda
-    $ export PATH="${CUDA_HOME}/bin:$PATH"
-
-Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
-
-.. code-block:: console
-
-    $ nvcc --version # verify that nvcc is in your PATH
-    $ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
-
-
-Unsupported OS build
---------------------
-
-vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
-
-Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing:
-
-.. code-block:: console
-
-    $ export VLLM_TARGET_DEVICE=empty
-    $ pip install -e .
diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/neuron-installation.md
new file mode 100644
index 0000000000000..d6de5760cc82c
--- /dev/null
+++ b/docs/source/getting_started/neuron-installation.md
@@ -0,0 +1,132 @@
+(installation-neuron)=
+
+# Installation with Neuron
+
+vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
+Paged Attention and Chunked Prefill are currently in development and will be available soon.
+Data types currently supported in Neuron SDK are FP16 and BF16.
+
+## Requirements
+
+- OS: Linux
+- Python: 3.9 -- 3.11
+- Accelerator: NeuronCore_v2 (in trn1/inf2 instances)
+- Pytorch 2.0.1/2.1.1
+- AWS Neuron SDK 2.16/2.17 (Verified on python 3.8)
+
+Installation steps:
+
+- [Build from source](#build-from-source-neuron)
+
+  - [Step 0. Launch Trn1/Inf2 instances](#launch-instances)
+  - [Step 1. Install drivers and tools](#install-drivers)
+  - [Step 2. Install transformers-neuronx and its dependencies](#install-tnx)
+  - [Step 3. Install vLLM from source](#install-vllm)
+
+(build-from-source-neuron)=
+
+```{note}
+The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
+```
+
+## Build from source
+
+The following instructions are applicable to Neuron SDK 2.16 and beyond.
+
+(launch-instances)=
+
+### Step 0. Launch Trn1/Inf2 instances
+
+Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html).
+
+- Please follow the instructions at [launch an Amazon EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance) to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type.
+- To get more information about instance sizes and pricing, see: [Trn1 web page](https://aws.amazon.com/ec2/instance-types/trn1/), [Inf2 web page](https://aws.amazon.com/ec2/instance-types/inf2/)
+- Select Ubuntu Server 22.04 LTS AMI
+- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB.
+- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
+
+(install-drivers)=
+
+### Step 1. Install drivers and tools
+
+The installation of drivers and tools wouldn't be necessary, if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below:
+
+```console
+# Configure Linux for Neuron repository updates
+. /etc/os-release
+sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
+deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
+EOF
+wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
+
+# Update OS packages
+sudo apt-get update -y
+
+# Install OS headers
+sudo apt-get install linux-headers-$(uname -r) -y
+
+# Install git
+sudo apt-get install git -y
+
+# install Neuron Driver
+sudo apt-get install aws-neuronx-dkms=2.* -y
+
+# Install Neuron Runtime
+sudo apt-get install aws-neuronx-collectives=2.* -y
+sudo apt-get install aws-neuronx-runtime-lib=2.* -y
+
+# Install Neuron Tools
+sudo apt-get install aws-neuronx-tools=2.* -y
+
+# Add PATH
+export PATH=/opt/aws/neuron/bin:$PATH
+```
+
+(install-tnx)=
+
+### Step 2. Install transformers-neuronx and its dependencies
+
+[transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) will be the backend to support inference on trn1/inf2 instances.
+Follow the steps below to install the transformers-neuronx package and its dependencies.
+
+```console
+# Install Python venv
+sudo apt-get install -y python3.10-venv g++
+
+# Create Python venv
+python3.10 -m venv aws_neuron_venv_pytorch
+
+# Activate Python venv
+source aws_neuron_venv_pytorch/bin/activate
+
+# Install Jupyter notebook kernel
+pip install ipykernel
+python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
+pip install jupyter notebook
+pip install environment_kernels
+
+# Set pip repository pointing to the Neuron repository
+python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+
+# Install wget, awscli
+python -m pip install wget
+python -m pip install awscli
+
+# Update Neuron Compiler and Framework
+python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
+```
+
+(install-vllm)=
+
+### Step 3. Install vLLM from source
+
+Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows:
+
+```console
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ pip install -U -r requirements-neuron.txt
+$ VLLM_TARGET_DEVICE="neuron" pip install .
+```
+
+If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed.
diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst
deleted file mode 100644
index 025ba6ef7ebd8..0000000000000
--- a/docs/source/getting_started/neuron-installation.rst
+++ /dev/null
@@ -1,140 +0,0 @@
-.. _installation_neuron:
-
-Installation with Neuron
-========================
-
-vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
-Paged Attention and Chunked Prefill are currently in development and will be available soon.
-Data types currently supported in Neuron SDK are FP16 and BF16.
-
-Requirements
-------------
-
-* OS: Linux
-* Python: 3.9 -- 3.11
-* Accelerator: NeuronCore_v2 (in trn1/inf2 instances)
-* Pytorch 2.0.1/2.1.1
-* AWS Neuron SDK 2.16/2.17 (Verified on python 3.8)
-
-Installation steps:
-
-- :ref:`Build from source <build_from_source_neuron>`
-
-  - :ref:`Step 0. Launch Trn1/Inf2 instances <launch_instances>`
-  - :ref:`Step 1. Install drivers and tools <install_drivers>`
-  - :ref:`Step 2. Install transformers-neuronx and its dependencies <install_tnx>`
-  - :ref:`Step 3. Install vLLM from source <install_vllm>`
-
-.. _build_from_source_neuron:
-
-.. note::
-
-    The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
-
-Build from source
------------------
-
-Following instructions are applicable to Neuron SDK 2.16 and beyond.
-
-.. _launch_instances:
-
-Step 0. Launch Trn1/Inf2 instances
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Here are the steps to launch trn1/inf2 instances, in order to install `PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS <https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html>`_.
-
-- Please follow the instructions at `launch an Amazon EC2 Instance <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance>`_ to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type.
-- To get more information about instances sizes and pricing see: `Trn1 web page <https://aws.amazon.com/ec2/instance-types/trn1/>`_, `Inf2 web page <https://aws.amazon.com/ec2/instance-types/inf2/>`_
-- Select Ubuntu Server 22.04 TLS AMI
-- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB.
-- After launching the instance, follow the instructions in `Connect to your instance <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html>`_ to connect to the instance
-
-.. _install_drivers:
-
-Step 1. Install drivers and tools
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The installation of drivers and tools wouldn't be necessary, if `Deep Learning AMI Neuron <https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html>`_ is installed. In case the drivers and tools are not installed on the operating system, follow the steps below:
-
-.. code-block:: console
-
-    # Configure Linux for Neuron repository updates
-    . /etc/os-release
-    sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
-    deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
-    EOF
-    wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
-
-    # Update OS packages
-    sudo apt-get update -y
-
-    # Install OS headers
-    sudo apt-get install linux-headers-$(uname -r) -y
-
-    # Install git
-    sudo apt-get install git -y
-
-    # install Neuron Driver
-    sudo apt-get install aws-neuronx-dkms=2.* -y
-
-    # Install Neuron Runtime
-    sudo apt-get install aws-neuronx-collectives=2.* -y
-    sudo apt-get install aws-neuronx-runtime-lib=2.* -y
-
-    # Install Neuron Tools
-    sudo apt-get install aws-neuronx-tools=2.* -y
-
-    # Add PATH
-    export PATH=/opt/aws/neuron/bin:$PATH
-
-
-.. _install_tnx:
-
-Step 2. Install transformers-neuronx and its dependencies
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-`transformers-neuronx <https://github.com/aws-neuron/transformers-neuronx>`_ will be the backend to support inference on trn1/inf2 instances.
-Follow the steps below to install transformer-neuronx package and its dependencies.
-
-.. code-block:: console
-
-    # Install Python venv
-    sudo apt-get install -y python3.10-venv g++
-
-    # Create Python venv
-    python3.10 -m venv aws_neuron_venv_pytorch
-
-    # Activate Python venv
-    source aws_neuron_venv_pytorch/bin/activate
-
-    # Install Jupyter notebook kernel
-    pip install ipykernel
-    python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
-    pip install jupyter notebook
-    pip install environment_kernels
-
-    # Set pip repository pointing to the Neuron repository
-    python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
-
-    # Install wget, awscli
-    python -m pip install wget
-    python -m pip install awscli
-
-    # Update Neuron Compiler and Framework
-    python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
-
-.. _install_vllm:
-
-Step 3. Install vLLM from source
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows:
-
-.. code-block:: console
-
-    $ git clone https://github.com/vllm-project/vllm.git
-    $ cd vllm
-    $ pip install -U -r requirements-neuron.txt
-    $ VLLM_TARGET_DEVICE="neuron" pip install .
-
-If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed.
diff --git a/docs/source/getting_started/openvino-installation.md b/docs/source/getting_started/openvino-installation.md
new file mode 100644
index 0000000000000..8b43c0a90447f
--- /dev/null
+++ b/docs/source/getting_started/openvino-installation.md
@@ -0,0 +1,104 @@
+(installation-openvino)=
+
+# Installation with OpenVINO
+
+vLLM powered by OpenVINO supports all LLM models from {doc}`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features:
+
+- Prefix caching (`--enable-prefix-caching`)
+- Chunked prefill (`--enable-chunked-prefill`)
+
+**Table of contents**:
+
+- [Requirements](#openvino-backend-requirements)
+- [Quick start using Dockerfile](#openvino-backend-quick-start-dockerfile)
+- [Install from source](#install-openvino-backend-from-source)
+- [Performance tips](#openvino-backend-performance-tips)
+- [Limitations](#openvino-backend-limitations)
+
+(openvino-backend-requirements)=
+
+## Requirements
+
+- OS: Linux
+- Instruction set architecture (ISA) requirement: at least AVX2.
+
+(openvino-backend-quick-start-dockerfile)=
+
+## Quick start using Dockerfile
+
+```console
+$ docker build -f Dockerfile.openvino -t vllm-openvino-env .
+$ docker run -it --rm vllm-openvino-env
+```
+
+(install-openvino-backend-from-source)=
+
+## Install from source
+
+- First, install Python. For example, on Ubuntu 22.04, you can run:
+
+  ```console
+  $ sudo apt-get update  -y
+  $ sudo apt-get install python3
+  ```
+
+- Second, install the prerequisites for the vLLM OpenVINO backend installation:
+
+  ```console
+  $ pip install --upgrade pip
+  $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+  ```
+
+- Finally, install vLLM with OpenVINO backend:
+
+  ```console
+  $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
+  ```
+
+- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
+
+(openvino-backend-performance-tips)=
+
+## Performance tips
+
+### vLLM OpenVINO backend environment variables
+
+- `VLLM_OPENVINO_DEVICE` to specify which device to utilize for inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g., `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, the CPU device is used by default.
+- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weight compression during the model loading stage. By default, compression is turned off. You can also export the model with different compression techniques using `optimum-cli` and pass the exported folder as `<model_id>`.
+
+### CPU performance tips
+
+CPU uses the following environment variables to control behavior:
+
+- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV cache size (e.g., `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB of space for the KV cache). A larger setting allows vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and the memory management pattern of users.
+- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
+
+To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on the experiments, the recommended batch size is `256` (`--max-num-batched-tokens`).
+
+OpenVINO best known configuration for CPU is:
+
+```console
+$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
+    python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
+```
+
+### GPU performance tips
+
+The GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account the `gpu_memory_utilization` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using the `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g., `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB of space for the KV cache).
+
+Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`.
+
+OpenVINO best known configuration for GPU is:
+
+```console
+$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
+    python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
+```
+
+(openvino-backend-limitations)=
+
+## Limitations
+
+- LoRA serving is not supported.
+- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration.
+- Tensor and pipeline parallelism are not currently enabled in vLLM integration.
diff --git a/docs/source/getting_started/openvino-installation.rst b/docs/source/getting_started/openvino-installation.rst
deleted file mode 100644
index 5eeb7c78f7e51..0000000000000
--- a/docs/source/getting_started/openvino-installation.rst
+++ /dev/null
@@ -1,116 +0,0 @@
-.. _installation_openvino:
-
-Installation with OpenVINO
-==========================
-
-vLLM powered by OpenVINO supports all LLM models from :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (`the list of supported GPUs <https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu>`_). OpenVINO vLLM backend supports the following advanced vLLM features:
-
-- Prefix caching (``--enable-prefix-caching``)
-- Chunked prefill (``--enable-chunked-prefill``)
-
-**Table of contents**:
-
-- :ref:`Requirements <openvino_backend_requirements>`
-- :ref:`Quick start using Dockerfile <openvino_backend_quick_start_dockerfile>`
-- :ref:`Build from source <install_openvino_backend_from_source>`
-- :ref:`Performance tips <openvino_backend_performance_tips>`
-- :ref:`Limitations <openvino_backend_limitations>`
-
-.. _openvino_backend_requirements:
-
-Requirements
-------------
-
-* OS: Linux
-* Instruction set architecture (ISA) requirement: at least AVX2.
-
-.. _openvino_backend_quick_start_dockerfile:
-
-Quick start using Dockerfile
-----------------------------
-
-.. code-block:: console
-
-    $ docker build -f Dockerfile.openvino -t vllm-openvino-env .
-    $ docker run -it --rm vllm-openvino-env
-
-.. _install_openvino_backend_from_source:
-
-Install from source
--------------------
-
-- First, install Python. For example, on Ubuntu 22.04, you can run:
-
-  .. code-block:: console
-
-      $ sudo apt-get update  -y
-      $ sudo apt-get install python3
-
-- Second, install prerequisites vLLM OpenVINO backend installation:
-
-  .. code-block:: console
-
-      $ pip install --upgrade pip
-      $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
-
-- Finally, install vLLM with OpenVINO backend:
-
-  .. code-block:: console
-
-      $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
-
-- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: `https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html <https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html>`_.
-
-.. _openvino_backend_performance_tips:
-
-Performance tips
-----------------
-
-vLLM OpenVINO backend environment variables
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-- ``VLLM_OPENVINO_DEVICE`` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, ``VLLM_OPENVINO_DEVICE=GPU.1``). If the value is not specified, CPU device is used by default.
-
-- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `<model_id>`
-
-CPU performance tips
-~~~~~~~~~~~~~~~~~~~~
-
-CPU uses the following environment variables to control behavior:
-
-- ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
-
-- ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
-
-To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). Based on the experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``)
-
-OpenVINO best known configuration for CPU is:
-
-.. code-block:: console
-
-    $ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
-        python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
-
-GPU performance tips
-~~~~~~~~~~~~~~~~~~~~
-GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account ``gpu_memory_utilization`` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using ``VLLM_OPENVINO_KVCACHE_SPACE`` environment variable (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=8`` means 8 GB space for KV cache).
-
-Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`.
-
-OpenVINO best known configuration for GPU is:
-
-.. code-block:: console
-
-    $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
-        python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
-
-.. _openvino_backend_limitations:
-
-Limitations
------------
-
-- LoRA serving is not supported.
-
-- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration.
-
-- Tensor and pipeline parallelism are not currently enabled in vLLM integration.
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
new file mode 100644
index 0000000000000..165e5df146dcd
--- /dev/null
+++ b/docs/source/getting_started/quickstart.md
@@ -0,0 +1,175 @@
+(quickstart)=
+
+# Quickstart
+
+This guide will help you quickly get started with vLLM to:
+
+- [Run offline batched inference](#offline-batched-inference)
+- [Run OpenAI-compatible inference](#openai-compatible-server)
+
+## Prerequisites
+
+- OS: Linux
+- Python: 3.9 -- 3.12
+- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
+
+## Installation
+
+You can install vLLM using pip. It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
+
+```console
+$ conda create -n myenv python=3.10 -y
+$ conda activate myenv
+$ pip install vllm
+```
+
+Please refer to the {ref}`installation documentation <installation>` for more details on installing vLLM.
+
+(offline-batched-inference)=
+
+## Offline Batched Inference
+
+With vLLM installed, you can start generating texts for a list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference.py>
+
+The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
+
+- {class}`~vllm.LLM` is the main class for running offline inference with vLLM engine.
+- {class}`~vllm.SamplingParams` specifies the parameters for the sampling process.
+
+```python
+from vllm import LLM, SamplingParams
+```
+
+The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](https://docs.vllm.ai/en/stable/dev/sampling_params.html).
+
+```python
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+```
+
+The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found [here](#supported-models).
+
+```python
+llm = LLM(model="facebook/opt-125m")
+```
+
+```{note}
+By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
+```
+
+Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens.
+
+```python
+outputs = llm.generate(prompts, sampling_params)
+
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+(openai-compatible-server)=
+
+## OpenAI-Compatible Server
+
+vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API.
+By default, it starts the server at `http://localhost:8000`. You can specify the address with `--host` and `--port` arguments. The server currently hosts one model at a time and implements endpoints such as [list models](https://platform.openai.com/docs/api-reference/models/list), [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create), and [create completion](https://platform.openai.com/docs/api-reference/completions/create) endpoints.
+
+Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model:
+
+```console
+$ vllm serve Qwen/Qwen2.5-1.5B-Instruct
+```
+
+```{note}
+By default, the server uses a predefined chat template stored in the tokenizer.
+You can learn about overriding it [here](#chat-template).
+```
+
+This server can be queried in the same format as OpenAI API. For example, to list the models:
+
+```console
+$ curl http://localhost:8000/v1/models
+```
+
+You can pass in the argument `--api-key` or the environment variable `VLLM_API_KEY` to enable the server to check for an API key in the header.
+
+### OpenAI Completions API with vLLM
+
+Once your server is started, you can query the model with input prompts:
+
+```console
+$ curl http://localhost:8000/v1/completions \
+$     -H "Content-Type: application/json" \
+$     -d '{
+$         "model": "Qwen/Qwen2.5-1.5B-Instruct",
+$         "prompt": "San Francisco is a",
+$         "max_tokens": 7,
+$         "temperature": 0
+$     }'
+```
+
+Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` python package:
+
+```python
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
+                                      prompt="San Francisco is a")
+print("Completion result:", completion)
+```
+
+A more detailed client example can be found here: <gh-file:examples/openai_completion_client.py>
+
+### OpenAI Chat Completions API with vLLM
+
+vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
+
+You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model:
+
+```console
+$ curl http://localhost:8000/v1/chat/completions \
+$     -H "Content-Type: application/json" \
+$     -d '{
+$         "model": "Qwen/Qwen2.5-1.5B-Instruct",
+$         "messages": [
+$             {"role": "system", "content": "You are a helpful assistant."},
+$             {"role": "user", "content": "Who won the world series in 2020?"}
+$         ]
+$     }'
+```
+
+Alternatively, you can use the `openai` python package:
+
+```python
+from openai import OpenAI
+# Set OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+chat_response = client.chat.completions.create(
+    model="Qwen/Qwen2.5-1.5B-Instruct",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Tell me a joke."},
+    ]
+)
+print("Chat response:", chat_response)
+```
diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst
deleted file mode 100644
index 0c0491c860563..0000000000000
--- a/docs/source/getting_started/quickstart.rst
+++ /dev/null
@@ -1,181 +0,0 @@
-.. _quickstart:
-
-==========
-Quickstart
-==========
-
-This guide will help you quickly get started with vLLM to:
-
-* :ref:`Run offline batched inference <offline_batched_inference>` 
-* :ref:`Run OpenAI-compatible inference <openai_compatible_server>`
-
-Prerequisites
---------------
-- OS: Linux
-- Python: 3.9 -- 3.12
-- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
-
-Installation
---------------
-
-You can install vLLM using pip. It's recommended to use `conda <https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html>`_ to create and manage Python environments.
-
-.. code-block:: console
-
-    $ conda create -n myenv python=3.10 -y
-    $ conda activate myenv
-    $ pip install vllm
-
-Please refer to the :ref:`installation documentation <installation>` for more details on installing vLLM.
-
-.. _offline_batched_inference:
-
-Offline Batched Inference
--------------------------
-
-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). The example script for this section can be found `here <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference.py>`__.
-
-The first line of this example imports the classes :class:`~vllm.LLM` and :class:`~vllm.SamplingParams`:
-
-- :class:`~vllm.LLM` is the main class for running offline inference with vLLM engine.
-- :class:`~vllm.SamplingParams` specifies the parameters for the sampling process.
-
-.. code-block:: python
-
-    from vllm import LLM, SamplingParams
-
-The next section defines a list of input prompts and sampling parameters for text generation. The `sampling temperature <https://arxiv.org/html/2402.05201v1>`_ is set to ``0.8`` and the `nucleus sampling probability <https://en.wikipedia.org/wiki/Top-p_sampling>`_ is set to ``0.95``. You can find more information about the sampling parameters `here <https://docs.vllm.ai/en/stable/dev/sampling_params.html>`__.
-
-.. code-block:: python
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-The :class:`~vllm.LLM` class initializes vLLM's engine and the `OPT-125M model <https://arxiv.org/abs/2205.01068>`_ for offline inference. The list of supported models can be found :ref:`here <supported_models>`.
-
-.. code-block:: python
-
-    llm = LLM(model="facebook/opt-125m")
-
-.. note::
-
-    By default, vLLM downloads models from `HuggingFace <https://huggingface.co/>`_. If you would like to use models from `ModelScope <https://www.modelscope.cn>`_, set the environment variable ``VLLM_USE_MODELSCOPE`` before initializing the engine.
-
-Now, the fun part! The outputs are generated using ``llm.generate``. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all of the output tokens.
-
-.. code-block:: python
-
-    outputs = llm.generate(prompts, sampling_params)
-
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-.. _openai_compatible_server:
-
-OpenAI-Compatible Server
-------------------------
-
-vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API.
-By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time and implements endpoints such as `list models <https://platform.openai.com/docs/api-reference/models/list>`_, `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_, and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. 
-
-Run the following command to start the vLLM server with the `Qwen2.5-1.5B-Instruct <https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct>`_ model:
-
-.. code-block:: console
-
-    $ vllm serve Qwen/Qwen2.5-1.5B-Instruct
-
-.. note::
-
-    By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it `here <https://github.com/vllm-project/vllm/blob/main/docs/source/serving/openai_compatible_server.md#chat-template>`__.
-
-This server can be queried in the same format as OpenAI API. For example, to list the models:
-
-.. code-block:: console
-
-    $ curl http://localhost:8000/v1/models
-
-You can pass in the argument ``--api-key`` or environment variable ``VLLM_API_KEY`` to enable the server to check for API key in the header.
-
-OpenAI Completions API with vLLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Once your server is started, you can query the model with input prompts:
-
-.. code-block:: console
-
-    $ curl http://localhost:8000/v1/completions \
-    $     -H "Content-Type: application/json" \
-    $     -d '{
-    $         "model": "Qwen/Qwen2.5-1.5B-Instruct",
-    $         "prompt": "San Francisco is a",
-    $         "max_tokens": 7,
-    $         "temperature": 0
-    $     }'
-
-Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the ``openai`` python package:
-
-.. code-block:: python
-
-    from openai import OpenAI
-
-    # Modify OpenAI's API key and API base to use vLLM's API server.
-    openai_api_key = "EMPTY"
-    openai_api_base = "http://localhost:8000/v1"
-    client = OpenAI(
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-    completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
-                                          prompt="San Francisco is a")
-    print("Completion result:", completion)
-
-A more detailed client example can be found `here <https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py>`__.
-
-OpenAI Chat Completions API with vLLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
-
-You can use the `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_ endpoint to interact with the model:
-
-.. code-block:: console
-
-    $ curl http://localhost:8000/v1/chat/completions \
-    $     -H "Content-Type: application/json" \
-    $     -d '{
-    $         "model": "Qwen/Qwen2.5-1.5B-Instruct",
-    $         "messages": [
-    $             {"role": "system", "content": "You are a helpful assistant."},
-    $             {"role": "user", "content": "Who won the world series in 2020?"}
-    $         ]
-    $     }'
-
-Alternatively, you can use the ``openai`` python package:
-
-.. code-block:: python
-
-    from openai import OpenAI
-    # Set OpenAI's API key and API base to use vLLM's API server.
-    openai_api_key = "EMPTY"
-    openai_api_base = "http://localhost:8000/v1"
-
-    client = OpenAI(
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-
-    chat_response = client.chat.completions.create(
-        model="Qwen/Qwen2.5-1.5B-Instruct",
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": "Tell me a joke."},
-        ]
-    )
-    print("Chat response:", chat_response)
diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md
new file mode 100644
index 0000000000000..f2a949e7247d8
--- /dev/null
+++ b/docs/source/getting_started/tpu-installation.md
@@ -0,0 +1,192 @@
+(installation-tpu)=
+
+# Installation with TPU
+
+Tensor Processing Units (TPUs) are Google's custom-developed application-specific
+integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
+are available in different versions each with different hardware specifications.
+For more information about TPUs, see [TPU System Architecture](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm).
+For more information on the TPU versions supported with vLLM, see:
+
+- [TPU v6e](https://cloud.google.com/tpu/docs/v6e)
+- [TPU v5e](https://cloud.google.com/tpu/docs/v5e)
+- [TPU v5p](https://cloud.google.com/tpu/docs/v5p)
+- [TPU v4](https://cloud.google.com/tpu/docs/v4)
+
+These TPU versions allow you to configure the physical arrangements of the TPU
+chips. This can improve throughput and networking performance. For more
+information see:
+
+- [TPU v6e topologies](https://cloud.google.com/tpu/docs/v6e#configurations)
+- [TPU v5e topologies](https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config)
+- [TPU v5p topologies](https://cloud.google.com/tpu/docs/v5p#tpu-v5p-config)
+- [TPU v4 topologies](https://cloud.google.com/tpu/docs/v4#tpu-v4-config)
+
+In order for you to use Cloud TPUs you need to have TPU quota granted to your
+Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a
+GCP project and are specified in terms of TPU version, the number of TPUs you
+want to use, and quota type. For more information, see [TPU quota](https://cloud.google.com/tpu/docs/quota#tpu_quota).
+
+For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tpu/pricing).
+
+You may need additional persistent storage for your TPU VMs. For more
+information, see [Storage options for Cloud TPU data](https://cloud.google.com/tpu/docs/storage-options).
+
+## Requirements
+
+- Google Cloud TPU VM
+- TPU versions: v6e, v5e, v5p, v4
+- Python: 3.10 or newer
+
+### Provision Cloud TPUs
+
+You can provision Cloud TPUs using the [Cloud TPU API](https://cloud.google.com/tpu/docs/reference/rest)
+or the [queued resources](https://cloud.google.com/tpu/docs/queued-resources)
+API. This section shows how to create TPUs using the queued resource API. For
+more information about using the Cloud TPU API, see [Create a Cloud TPU using the Create Node API](https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#create-node-api).
+Queued resources enable you to request Cloud TPU resources in a queued manner.
+When you request queued resources, the request is added to a queue maintained by
+the Cloud TPU service. When the requested resource becomes available, it's
+assigned to your Google Cloud project for your immediate exclusive use.
+
+```{note}
+In all of the following commands, replace the ALL CAPS parameter names with
+appropriate values. See the parameter descriptions table for more information.
+```
+
+## Provision a Cloud TPU with the queued resource API
+
+Create a TPU v5e with 4 TPU chips:
+
+```console
+gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
+--node-id TPU_NAME \
+--project PROJECT_ID \
+--zone ZONE \
+--accelerator-type ACCELERATOR_TYPE \
+--runtime-version RUNTIME_VERSION \
+--service-account SERVICE_ACCOUNT
+```
+
+```{eval-rst}
+.. list-table:: Parameter descriptions
+    :header-rows: 1
+
+    * - Parameter name
+      - Description
+    * - QUEUED_RESOURCE_ID
+      - The user-assigned ID of the queued resource request.
+    * - TPU_NAME
+      - The user-assigned name of the TPU which is created when the queued
+        resource request is allocated.
+    * - PROJECT_ID
+      - Your Google Cloud project
+    * - ZONE
+      - The GCP zone where you want to create your Cloud TPU. The value you use
+        depends on the version of TPUs you are using. For more information, see
+        `TPU regions and zones <https://cloud.google.com/tpu/docs/regions-zones>`_
+    * - ACCELERATOR_TYPE
+      - The TPU version you want to use. Specify the TPU version, for example
+        `v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
+        see `TPU versions <https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#versions>`_.
+    * - RUNTIME_VERSION
+      - The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_.
+    * - SERVICE_ACCOUNT
+      - The email address for your service account. You can find it in the IAM
+        Cloud Console under *Service Accounts*. For example:
+        `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`
+```
+
+Connect to your TPU using SSH:
+
+```bash
+gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE
+```
+
+Install Miniconda:
+
+```bash
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+bash Miniconda3-latest-Linux-x86_64.sh
+source ~/.bashrc
+```
+
+Create and activate a Conda environment for vLLM:
+
+```bash
+conda create -n vllm python=3.10 -y
+conda activate vllm
+```
+
+Clone the vLLM repository and go to the vLLM directory:
+
+```bash
+git clone https://github.com/vllm-project/vllm.git && cd vllm
+```
+
+Uninstall the existing `torch` and `torch_xla` packages:
+
+```bash
+pip uninstall torch torch-xla -y
+```
+
+Install build dependencies:
+
+```bash
+pip install -r requirements-tpu.txt
+sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
+```
+
+Run the setup script:
+
+```bash
+VLLM_TARGET_DEVICE="tpu" python setup.py develop
+```
+
+## Provision Cloud TPUs with GKE
+
+For more information about using TPUs with GKE, see:
+- <https://cloud.google.com/kubernetes-engine/docs/how-to/tpus>
+- <https://cloud.google.com/kubernetes-engine/docs/concepts/tpus>
+- <https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus>
+
+(build-docker-tpu)=
+
+## Build a docker image with {code}`Dockerfile.tpu`
+
+You can use <gh-file:Dockerfile.tpu> to build a Docker image with TPU support.
+
+```console
+$ docker build -f Dockerfile.tpu -t vllm-tpu .
+```
+
+Run the Docker image with the following command:
+
+```console
+$ # Make sure to add `--privileged --net host --shm-size=16G`.
+$ docker run --privileged --net host --shm-size=16G -it vllm-tpu
+```
+
+```{note}
+Since TPU relies on XLA which requires static shapes, vLLM bucketizes the
+possible input shapes and compiles an XLA graph for each shape. The
+compilation time may take 20~30 minutes in the first run. However, the
+compilation time reduces to ~5 minutes afterwards because the XLA graphs are
+cached on disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default).
+```
+
+````{tip}
+If you encounter the following error:
+
+```console
+from torch._C import *  # noqa: F403
+ImportError: libopenblas.so.0: cannot open shared object file: No such
+file or directory
+```
+
+Install OpenBLAS with the following command:
+
+```console
+$ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
+```
+````
diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst
deleted file mode 100644
index 22cc684a1c778..0000000000000
--- a/docs/source/getting_started/tpu-installation.rst
+++ /dev/null
@@ -1,200 +0,0 @@
-.. _installation_tpu:
-
-#####################
-Installation with TPU
-#####################
-
-Tensor Processing Units (TPUs) are Google's custom-developed application-specific 
-integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs 
-are available in different versions each with different hardware specifications.
-For more information about TPUs, see `TPU System Architecture <https://cloud.google.com/tpu/docs/system-architecture-tpu-vm>`_. 
-For more information on the TPU versions supported with vLLM, see:
-
-* `TPU v6e <https://cloud.google.com/tpu/docs/v6e>`_
-* `TPU v5e <https://cloud.google.com/tpu/docs/v5e>`_
-* `TPU v5p <https://cloud.google.com/tpu/docs/v5p>`_
-* `TPU v4 <https://cloud.google.com/tpu/docs/v4>`_
-
-These TPU versions allow you to configure the physical arrangements of the TPU 
-chips. This can improve throughput and networking performance. For more 
-information see: 
-
-* `TPU v6e topologies <https://cloud.google.com/tpu/docs/v6e#configurations>`_
-* `TPU v5e topologies <https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config>`_
-* `TPU v5p topologies <https://cloud.google.com/tpu/docs/v5p#tpu-v5p-config>`_
-* `TPU v4 topologies <https://cloud.google.com/tpu/docs/v4#tpu-v4-config>`_
-
-In order for you to use Cloud TPUs you need to have TPU quota granted to your 
-Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a
-GPC project and are specified in terms of TPU version, the number of TPU you 
-want to use, and quota type. For more information, see `TPU quota <https://cloud.google.com/tpu/docs/quota#tpu_quota>`_. 
-
-For TPU pricing information, see `Cloud TPU pricing <https://cloud.google.com/tpu/pricing>`_.
-
-You may need additional persistent storage for your TPU VMs. For more 
-information, see `Storage options for Cloud TPU data <https://cloud.devsite.corp.google.com/tpu/docs/storage-options>`_.
-
-Requirements
-------------
-
-* Google Cloud TPU VM 
-* TPU versions: v6e, v5e, v5p, v4
-* Python: 3.10 or newer
-
-Provision Cloud TPUs
-====================
-
-You can provision Cloud TPUs using the `Cloud TPU API <https://cloud.google.com/tpu/docs/reference/rest>`_ 
-or the `queued resources <https://cloud.google.com/tpu/docs/queued-resources>`_ 
-API. This section shows how to create TPUs using the queued resource API. For 
-more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API <https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#create-node-api>`_. 
-Queued resources enable you to request Cloud TPU resources in a queued manner. 
-When you request queued resources, the request is added to a queue maintained by 
-the Cloud TPU service. When the requested resource becomes available, it's 
-assigned to your Google Cloud project for your immediate exclusive use. 
-
-.. note::
-   In all of the following commands, replace the ALL CAPS parameter names with 
-   appropriate values. See the parameter descriptions table for more information.
-
-Provision a Cloud TPU with the queued resource API
---------------------------------------------------
-Create a TPU v5e with 4 TPU chips:
-
-.. code-block:: console
-
-    gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
-    --node-id TPU_NAME \
-    --project PROJECT_ID \
-    --zone ZONE \
-    --accelerator-type ACCELERATOR_TYPE \
-    --runtime-version RUNTIME_VERSION \
-    --service-account SERVICE_ACCOUNT
-
-   
-.. list-table:: Parameter descriptions
-    :header-rows: 1
-
-    * - Parameter name
-      - Description
-    * - QUEUED_RESOURCE_ID
-      - The user-assigned ID of the queued resource request.
-    * - TPU_NAME
-      - The user-assigned name of the TPU which is created when the queued 
-        resource request is allocated.
-    * - PROJECT_ID
-      - Your Google Cloud project
-    * - ZONE
-      - The GCP zone where you want to create your Cloud TPU. The value you use 
-        depends on the version of TPUs you are using. For more information, see 
-        `TPU regions and zones <https://cloud.google.com/tpu/docs/regions-zones>`_ 
-    * - ACCELERATOR_TYPE
-      - The TPU version you want to use. Specify the TPU version, for example 
-        `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, 
-        see `TPU versions <https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions>`_.
-    * - RUNTIME_VERSION
-      - The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_.
-    * - SERVICE_ACCOUNT
-      - The email address for your service account. You can find it in the IAM 
-        Cloud Console under *Service Accounts*. For example: 
-        `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`
-
-Connect to your TPU using SSH:
-
-.. code-block:: bash
-
-    gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE
-
-Install Miniconda
-
-.. code-block:: bash
-
-    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
-    bash Miniconda3-latest-Linux-x86_64.sh
-    source ~/.bashrc
-
-Create and activate a Conda environment for vLLM:
-
-.. code-block:: bash
-
-    conda create -n vllm python=3.10 -y
-    conda activate vllm
-
-Clone the vLLM repository and go to the vLLM directory:
-
-.. code-block:: bash
-
-    git clone https://github.com/vllm-project/vllm.git && cd vllm
-
-Uninstall the existing `torch` and `torch_xla` packages:
-
-.. code-block:: bash
-
-    pip uninstall torch torch-xla -y
-
-Install build dependencies:
-
-.. code-block:: bash
-
-    pip install -r requirements-tpu.txt
-    sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev 
-
-Run the setup script:
-
-.. code-block:: bash
-
-   VLLM_TARGET_DEVICE="tpu" python setup.py develop
-
-
-Provision Cloud TPUs with GKE 
------------------------------
-
-For more information about using TPUs with GKE, see 
-https://cloud.google.com/kubernetes-engine/docs/how-to/tpus
-https://cloud.google.com/kubernetes-engine/docs/concepts/tpus
-https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus
-
-.. _build_docker_tpu:
-
-Build a docker image with :code:`Dockerfile.tpu`
-------------------------------------------------
-
-You can use `Dockerfile.tpu <https://github.com/vllm-project/vllm/blob/main/Dockerfile.tpu>`_ 
-to build a Docker image with TPU support.
-
-.. code-block:: console
-
-    $ docker build -f Dockerfile.tpu -t vllm-tpu .
-
-Run the Docker image with the following command:
-
-.. code-block:: console
-
-    $ # Make sure to add `--privileged --net host --shm-size=16G`.
-    $ docker run --privileged --net host --shm-size=16G -it vllm-tpu
-
-.. note::
-
-    Since TPU relies on XLA which requires static shapes, vLLM bucketizes the 
-    possible input shapes and compiles an XLA graph for each shape. The 
-    compilation time may take 20~30 minutes in the first run. However, the 
-    compilation time reduces to ~5 minutes afterwards because the XLA graphs are 
-    cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default).
-
-.. tip::
-
-    If you encounter the following error:
-
-    .. code-block:: console
-
-        from torch._C import *  # noqa: F403
-        ImportError: libopenblas.so.0: cannot open shared object file: No such 
-        file or directory
-
-
-    Install OpenBLAS with the following command:
-
-    .. code-block:: console
-
-        $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
-
diff --git a/docs/source/getting_started/xpu-installation.md b/docs/source/getting_started/xpu-installation.md
new file mode 100644
index 0000000000000..9554ae4b7fb44
--- /dev/null
+++ b/docs/source/getting_started/xpu-installation.md
@@ -0,0 +1,74 @@
+(installation-xpu)=
+
+# Installation with XPU
+
+vLLM initially supports basic model inference and serving on the Intel GPU platform.
+
+Table of contents:
+
+1. [Requirements](#xpu-backend-requirements)
+2. [Quick start using Dockerfile](#xpu-backend-quick-start-dockerfile)
+3. [Build from source](#build-xpu-backend-from-source)
+
+(xpu-backend-requirements)=
+
+## Requirements
+
+- OS: Linux
+- Supported Hardware: Intel Data Center GPU, Intel ARC GPU
+- OneAPI requirements: oneAPI 2024.2
+
+(xpu-backend-quick-start-dockerfile)=
+
+## Quick start using Dockerfile
+
+```console
+$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
+$ docker run -it \
+             --rm \
+             --network=host \
+             --device /dev/dri \
+             -v /dev/dri/by-path:/dev/dri/by-path \
+             vllm-xpu-env
+```
+
+(build-xpu-backend-from-source)=
+
+## Build from source
+
+- First, install the required driver and Intel oneAPI 2024.2 or later.
+- Second, install Python packages for vLLM XPU backend building:
+
+```console
+$ source /opt/intel/oneapi/setvars.sh
+$ pip install --upgrade pip
+$ pip install -v -r requirements-xpu.txt
+```
+
+- Finally, build and install vLLM XPU backend:
+
+```console
+$ VLLM_TARGET_DEVICE=xpu python setup.py install
+```
+
+```{note}
+- FP16 is the default data type in the current XPU backend. The BF16 data
+  type will be supported in the future.
+```
+
+## Distributed inference and serving
+
+The XPU platform supports tensor-parallel inference/serving and also supports pipeline parallelism as a beta feature for online serving. Ray is required as the distributed runtime backend. For example, a reference execution looks like the following:
+
+```console
+$ python -m vllm.entrypoints.openai.api_server \
+$      --model=facebook/opt-13b \
+$      --dtype=bfloat16 \
+$      --device=xpu \
+$      --max_model_len=1024 \
+$      --distributed-executor-backend=ray \
+$      --pipeline-parallel-size=2 \
+$      -tp=8
+```
+
+By default, a Ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equal to `parallel_config.world_size`. We recommend properly starting a Ray cluster before execution, referring to the <gh-file:examples/run_cluster.sh> helper script.
diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst
deleted file mode 100644
index b1868acbc84b0..0000000000000
--- a/docs/source/getting_started/xpu-installation.rst
+++ /dev/null
@@ -1,80 +0,0 @@
-.. _installation_xpu:
-
-Installation with XPU
-========================
-
-vLLM initially supports basic model inferencing and serving on Intel GPU platform.
-
-Table of contents:
-
-#. :ref:`Requirements <xpu_backend_requirements>`
-#. :ref:`Quick start using Dockerfile <xpu_backend_quick_start_dockerfile>`
-#. :ref:`Build from source <build_xpu_backend_from_source>`
-
-.. _xpu_backend_requirements:
-
-Requirements
-------------
-
-* OS: Linux
-* Supported Hardware: Intel Data Center GPU, Intel ARC GPU
-* OneAPI requirements: oneAPI 2024.2 
-
-.. _xpu_backend_quick_start_dockerfile:
-
-Quick start using Dockerfile
-----------------------------
-
-.. code-block:: console
-
-    $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
-    $ docker run -it \
-                 --rm \
-                 --network=host \
-                 --device /dev/dri \
-                 -v /dev/dri/by-path:/dev/dri/by-path \
-                 vllm-xpu-env
-
-.. _build_xpu_backend_from_source:
-
-Build from source
------------------
-
-- First, install required driver and intel OneAPI 2024.2 or later.
-
-- Second, install Python packages for vLLM XPU backend building:
-
-.. code-block:: console
-
-    $ source /opt/intel/oneapi/setvars.sh
-    $ pip install --upgrade pip
-    $ pip install -v -r requirements-xpu.txt 
-
-- Finally, build and install vLLM XPU backend: 
-
-.. code-block:: console
-
-    $ VLLM_TARGET_DEVICE=xpu python setup.py install
-
-.. note::
-    - FP16 is the default data type in the current XPU backend. The BF16 data
-      type will be supported in the future.
-
-
-Distributed inference and serving
----------------------------------
-
-XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following:
-
-.. code-block:: console
-
-    $ python -m vllm.entrypoints.openai.api_server \
-    $      --model=facebook/opt-13b \
-    $      --dtype=bfloat16 \
-    $      --device=xpu \
-    $      --max_model_len=1024 \
-    $      --distributed-executor-backend=ray \
-    $      --pipeline-parallel-size=2 \
-    $      -tp=8
-
-By default, a ray instance will be launched automatically if no existing one is detected in system, with ``num-gpus`` equals to ``parallel_config.world_size``. We recommend properly starting a ray cluster before execution, referring helper `script <https://github.com/vllm-project/vllm/tree/main/examples/run_cluster.sh>`_.
diff --git a/docs/source/index.md b/docs/source/index.md
new file mode 100644
index 0000000000000..34f9c4caebe6f
--- /dev/null
+++ b/docs/source/index.md
@@ -0,0 +1,200 @@
+# Welcome to vLLM!
+
+```{figure} ./assets/logos/vllm-logo-text-light.png
+:align: center
+:alt: vLLM
+:class: no-scaled-link
+:width: 60%
+```
+
+```{raw} html
+<p style="text-align:center">
+<strong>Easy, fast, and cheap LLM serving for everyone
+</strong>
+</p>
+
+<p style="text-align:center">
+<script async defer src="https://buttons.github.io/buttons.js"></script>
+<a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a>
+<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
+<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
+</p>
+```
+
+vLLM is a fast and easy-to-use library for LLM inference and serving.
+
+vLLM is fast with:
+
+- State-of-the-art serving throughput
+- Efficient management of attention key and value memory with **PagedAttention**
+- Continuous batching of incoming requests
+- Fast model execution with CUDA/HIP graph
+- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8
+- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
+- Speculative decoding
+- Chunked prefill
+
+vLLM is flexible and easy to use with:
+
+- Seamless integration with popular HuggingFace models
+- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
+- Tensor parallelism and pipeline parallelism support for distributed inference
+- Streaming outputs
+- OpenAI-compatible API server
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
+- Prefix caching support
+- Multi-LoRA support
+
+For more information, check out the following:
+
+- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention)
+- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
+- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
+- {ref}`vLLM Meetups <meetups>`.
+
+## Documentation
+
+```{toctree}
+:caption: Getting Started
+:maxdepth: 1
+
+getting_started/installation
+getting_started/amd-installation
+getting_started/openvino-installation
+getting_started/cpu-installation
+getting_started/gaudi-installation
+getting_started/arm-installation
+getting_started/neuron-installation
+getting_started/tpu-installation
+getting_started/xpu-installation
+getting_started/quickstart
+getting_started/debugging
+getting_started/examples/examples_index
+```
+
+```{toctree}
+:caption: Serving
+:maxdepth: 1
+
+serving/openai_compatible_server
+serving/deploying_with_docker
+serving/deploying_with_k8s
+serving/deploying_with_helm
+serving/deploying_with_nginx
+serving/distributed_serving
+serving/metrics
+serving/integrations
+serving/tensorizer
+serving/runai_model_streamer
+```
+
+```{toctree}
+:caption: Models
+:maxdepth: 1
+
+models/supported_models
+models/generative_models
+models/pooling_models
+models/adding_model
+models/enabling_multimodal_inputs
+```
+
+```{toctree}
+:caption: Usage
+:maxdepth: 1
+
+usage/lora
+usage/multimodal_inputs
+usage/tool_calling
+usage/structured_outputs
+usage/spec_decode
+usage/compatibility_matrix
+usage/performance
+usage/faq
+usage/engine_args
+usage/env_vars
+usage/usage_stats
+usage/disagg_prefill
+```
+
+```{toctree}
+:caption: Quantization
+:maxdepth: 1
+
+quantization/supported_hardware
+quantization/auto_awq
+quantization/bnb
+quantization/gguf
+quantization/int8
+quantization/fp8
+quantization/fp8_e5m2_kvcache
+quantization/fp8_e4m3_kvcache
+```
+
+```{toctree}
+:caption: Automatic Prefix Caching
+:maxdepth: 1
+
+automatic_prefix_caching/apc
+automatic_prefix_caching/details
+```
+
+```{toctree}
+:caption: Performance
+:maxdepth: 1
+
+performance/benchmarks
+```
+
+% Community: User community resources
+
+```{toctree}
+:caption: Community
+:maxdepth: 1
+
+community/meetups
+community/sponsors
+```
+
+% API Documentation: API reference aimed at vllm library usage
+
+```{toctree}
+:caption: API Documentation
+:maxdepth: 2
+
+dev/sampling_params
+dev/pooling_params
+dev/offline_inference/offline_index
+dev/engine/engine_index
+```
+
+% Design: docs about vLLM internals
+
+```{toctree}
+:caption: Design
+:maxdepth: 2
+
+design/arch_overview
+design/huggingface_integration
+design/plugin_system
+design/input_processing/model_inputs_index
+design/kernel/paged_attention
+design/multimodal/multimodal_index
+design/multiprocessing
+```
+
+% For Developers: contributing to the vLLM project
+
+```{toctree}
+:caption: For Developers
+:maxdepth: 2
+
+contributing/overview
+contributing/profiling/profiling_index
+contributing/dockerfile/dockerfile
+```
+
+# Indices and tables
+
+- {ref}`genindex`
+- {ref}`modindex`
diff --git a/docs/source/index.rst b/docs/source/index.rst
deleted file mode 100644
index fd741ea5e9766..0000000000000
--- a/docs/source/index.rst
+++ /dev/null
@@ -1,193 +0,0 @@
-Welcome to vLLM!
-================
-
-.. figure:: ./assets/logos/vllm-logo-text-light.png
-  :width: 60%
-  :align: center
-  :alt: vLLM
-  :class: no-scaled-link
-
-.. raw:: html
-
-   <p style="text-align:center">
-   <strong>Easy, fast, and cheap LLM serving for everyone
-   </strong>
-   </p>
-
-   <p style="text-align:center">
-   <script async defer src="https://buttons.github.io/buttons.js"></script>
-   <a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a>
-   <a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
-   <a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
-   </p>
-
-
-
-vLLM is a fast and easy-to-use library for LLM inference and serving.
-
-vLLM is fast with:
-
-* State-of-the-art serving throughput
-* Efficient management of attention key and value memory with **PagedAttention**
-* Continuous batching of incoming requests
-* Fast model execution with CUDA/HIP graph
-* Quantization: `GPTQ <https://arxiv.org/abs/2210.17323>`_, `AWQ <https://arxiv.org/abs/2306.00978>`_, INT4, INT8, and FP8
-* Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
-* Speculative decoding
-* Chunked prefill
-
-vLLM is flexible and easy to use with:
-
-* Seamless integration with popular HuggingFace models
-* High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
-* Tensor parallelism and pipeline parallelism support for distributed inference
-* Streaming outputs
-* OpenAI-compatible API server
-* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
-* Prefix caching support
-* Multi-lora support
-
-For more information, check out the following:
-
-* `vLLM announcing blog post <https://vllm.ai>`_ (intro to PagedAttention)
-* `vLLM paper <https://arxiv.org/abs/2309.06180>`_ (SOSP 2023)
-* `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency <https://www.anyscale.com/blog/continuous-batching-llm-inference>`_ by Cade Daniel et al.
-* :ref:`vLLM Meetups <meetups>`.
-
-
-Documentation
--------------
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Getting Started
-
-   getting_started/installation
-   getting_started/amd-installation
-   getting_started/openvino-installation
-   getting_started/cpu-installation
-   getting_started/gaudi-installation
-   getting_started/arm-installation
-   getting_started/neuron-installation
-   getting_started/tpu-installation
-   getting_started/xpu-installation
-   getting_started/quickstart
-   getting_started/debugging
-   getting_started/examples/examples_index
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Serving
-
-   serving/openai_compatible_server
-   serving/deploying_with_docker
-   serving/deploying_with_k8s
-   serving/deploying_with_helm
-   serving/deploying_with_nginx
-   serving/distributed_serving
-   serving/metrics
-   serving/integrations
-   serving/tensorizer
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Models
-
-   models/supported_models
-   models/generative_models
-   models/pooling_models
-   models/adding_model
-   models/enabling_multimodal_inputs
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Usage
-
-   usage/lora
-   usage/multimodal_inputs
-   usage/tool_calling
-   usage/structured_outputs
-   usage/spec_decode
-   usage/compatibility_matrix
-   usage/performance
-   usage/faq
-   usage/engine_args
-   usage/env_vars
-   usage/usage_stats
-   usage/disagg_prefill
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Quantization
-
-   quantization/supported_hardware
-   quantization/auto_awq
-   quantization/bnb
-   quantization/gguf
-   quantization/int8
-   quantization/fp8
-   quantization/fp8_e5m2_kvcache
-   quantization/fp8_e4m3_kvcache
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Automatic Prefix Caching
-
-   automatic_prefix_caching/apc
-   automatic_prefix_caching/details
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Performance
-
-   performance/benchmarks
-
-.. Community: User community resources
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Community
-
-   community/meetups
-   community/sponsors
-
-.. API Documentation: API reference aimed at vllm library usage
-
-.. toctree::
-   :maxdepth: 2
-   :caption: API Documentation
-
-   dev/sampling_params
-   dev/pooling_params
-   dev/offline_inference/offline_index
-   dev/engine/engine_index
-
-.. Design: docs about vLLM internals
-
-.. toctree::
-   :maxdepth: 2
-   :caption: Design
-
-   design/arch_overview
-   design/huggingface_integration
-   design/plugin_system
-   design/input_processing/model_inputs_index
-   design/kernel/paged_attention
-   design/multimodal/multimodal_index
-   design/multiprocessing
-
-.. For Developers: contributing to the vLLM project
-
-.. toctree::
-   :maxdepth: 2
-   :caption: For Developers
-
-   contributing/overview
-   contributing/profiling/profiling_index
-   contributing/dockerfile/dockerfile
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
diff --git a/docs/source/models/adding_model.md b/docs/source/models/adding_model.md
new file mode 100644
index 0000000000000..02537fba020c4
--- /dev/null
+++ b/docs/source/models/adding_model.md
@@ -0,0 +1,155 @@
+(adding-a-new-model)=
+
+# Adding a New Model
+
+This document provides a high-level guide on integrating a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM.
+
+```{note}
+The complexity of adding a new model depends heavily on the model's architecture.
+The process is fairly straightforward if the model shares a similar architecture with an existing model in vLLM.
+However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
+```
+
+```{note}
+By default, vLLM models do not support multi-modal inputs. To enable multi-modal support,
+please follow [this guide](#enabling-multimodal-inputs) after implementing the model here.
+```
+
+```{tip}
+If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our [GitHub](https://github.com/vllm-project/vllm/issues) repository.
+We will be happy to help you out!
+```
+
+## 0. Fork the vLLM repository
+
+Start by forking our [GitHub](https://github.com/vllm-project/vllm) repository and then [build it from source](#build-from-source).
+This gives you the ability to modify the codebase and test your model.
+
+```{tip}
+If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below.
+```
+
+## 1. Bring your model code
+
+Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the <gh-dir:vllm/model_executor/models> directory.
+For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
+
+```{warning}
+When copying the model code, make sure to review and adhere to the code's copyright and licensing terms.
+```
+
+## 2. Make your code compatible with vLLM
+
+To ensure compatibility with vLLM, your model must meet the following requirements:
+
+### Initialization Code
+
+All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for:
+
+- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts.
+- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode.
+
+The initialization code should look like this:
+
+```python
+from torch import nn
+from vllm.config import VllmConfig
+from vllm.attention import Attention
+
+class MyAttention(nn.Module):
+    def __init__(self, vllm_config: VllmConfig, prefix: str):
+        super().__init__()
+        self.attn = Attention(prefix=f"{prefix}.attn")
+
+class MyDecoderLayer(nn.Module):
+    def __init__(self, vllm_config: VllmConfig, prefix: str):
+        super().__init__()
+        self.self_attn = MyAttention(vllm_config, prefix=f"{prefix}.self_attn")
+
+class MyModel(nn.Module):
+    def __init__(self, vllm_config: VllmConfig, prefix: str):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
+        )
+
+class MyModelForCausalLM(nn.Module):
+    def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
+```
+
+### Computation Code
+
+Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
+
+```python
+def forward(
+    self,
+    input_ids: torch.Tensor,
+    positions: torch.Tensor,
+    kv_caches: List[torch.Tensor],
+    attn_metadata: AttentionMetadata,
+) -> torch.Tensor:
+    ...
+```
+
+```{note}
+Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
+If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
+```
+
+For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
+
+## 3. (Optional) Implement tensor parallelism and quantization support
+
+If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
+To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
+For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with {code}`VocabParallelEmbedding`. For the output LM head, you can use {code}`ParallelLMHead`.
+When it comes to the linear layers, we provide the following options to parallelize them:
+
+- {code}`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
+- {code}`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
+- {code}`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
+- {code}`MergedColumnParallelLinear`: Column-parallel linear that merges multiple {code}`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
+- {code}`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.
+
+Note that all the linear layers above take {code}`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
+
+## 4. Implement the weight loading logic
+
+You now need to implement the {code}`load_weights` method in your {code}`*ForCausalLM` class.
+This method should load the weights from the HuggingFace checkpoint file and assign them to the corresponding layers in your model. Specifically, for {code}`MergedColumnParallelLinear` and {code}`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
+
+## 5. Register your model
+
+Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py>.
+
+## 6. Out-of-Tree Model Integration
+
+You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For a general introduction to the plugin system, see [plugin-system](#plugin-system).
+
+To register the model, use the following code:
+
+```python
+from vllm import ModelRegistry
+from your_code import YourModelForCausalLM
+ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
+```
+
+If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like {code}`RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
+
+```python
+from vllm import ModelRegistry
+
+ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
+```
+
+```{important}
+If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
+Read more about that [here](#enabling-multimodal-inputs).
+```
+
+```{note}
+Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
+```
diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
deleted file mode 100644
index df06d736ca86b..0000000000000
--- a/docs/source/models/adding_model.rst
+++ /dev/null
@@ -1,159 +0,0 @@
-.. _adding_a_new_model:
-
-Adding a New Model
-==================
-
-This document provides a high-level guide on integrating a `HuggingFace Transformers <https://github.com/huggingface/transformers>`_ model into vLLM.
-
-.. note::
-    The complexity of adding a new model depends heavily on the model's architecture.
-    The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
-    However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
-
-.. note::
-    By default, vLLM models do not support multi-modal inputs. To enable multi-modal support,
-    please follow :ref:`this guide <enabling_multimodal_inputs>` after implementing the model here.
-
-.. tip::
-    If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository.
-    We will be happy to help you out!
-
-
-0. Fork the vLLM repository
---------------------------------
-
-Start by forking our `GitHub`_ repository and then :ref:`build it from source <build_from_source>`.
-This gives you the ability to modify the codebase and test your model.
-
-.. tip::
-    If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below.
-
-1. Bring your model code
-------------------------
-
-Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `vllm/model_executor/models <https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models>`_ directory.
-For instance, vLLM's `OPT model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/opt.py>`_ was adapted from the HuggingFace's `modeling_opt.py <https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py>`_ file.
-
-.. warning::
-    When copying the model code, make sure to review and adhere to the code's copyright and licensing terms.
-
-
-2. Make your code compatible with vLLM
---------------------------------------
-
-To ensure compatibility with vLLM, your model must meet the following requirements:
-
-Initialization Code
-^^^^^^^^^^^^^^^^^^^
-
-All vLLM modules within the model must include a ``prefix`` argument in their constructor. This ``prefix`` is typically the full name of the module in the model's state dictionary and is crucial for:
-
-* Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts.
-* Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the ``prefix`` during initialization, vLLM can match the current layer's ``prefix`` with the quantization configuration to determine if the layer should be initialized in quantized mode.
-
-The initialization code should look like this:
-
-.. code-block:: python
-
-    from torch import nn
-    from vllm.config import VllmConfig
-    from vllm.attention import Attention
-
-    class MyAttention(nn.Module):
-        def __init__(self, vllm_config: VllmConfig, prefix: str):
-            super().__init__()
-            self.attn = Attention(prefix=f"{prefix}.attn")
-
-    class MyDecoderLayer(nn.Module):
-        def __init__(self, vllm_config: VllmConfig, prefix: str):
-            super().__init__()
-            self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
-
-    class MyModel(nn.Module):
-        def __init__(self, vllm_config: VllmConfig, prefix: str):
-            super().__init__()
-            self.layers = nn.ModuleList(
-                [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
-            )
-
-    class MyModelForCausalLM(nn.Module):
-        def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
-            super().__init__()
-            self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
-
-Computation Code
-^^^^^^^^^^^^^^^^
-
-Rewrite the :meth:`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat ``input_ids`` and ``positions`` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
-
-.. code-block:: python
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
-    ) -> torch.Tensor:
-        ...
-
-.. note::
-    Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
-    If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
-
-For reference, check out the `LLAMA model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama.py>`__. vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out the `vLLM models <https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models>`__ directory for more examples.
-
-3. (Optional) Implement tensor parallelism and quantization support
--------------------------------------------------------------------
-
-If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
-To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
-For the embedding layer, you can simply replace :class:`torch.nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`.
-When it comes to the linear layers, we provide the following options to parallelize them:
-
-* :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
-* :code:`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
-* :code:`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
-* :code:`MergedColumnParallelLinear`: Column-parallel linear that merges multiple :code:`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
-* :code:`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.
-
-Note that all the linear layers above take :code:`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
-
-4. Implement the weight loading logic
--------------------------------------
-
-You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class.
-This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for :code:`MergedColumnParallelLinear` and :code:`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
-
-5. Register your model
-----------------------
-
-Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/registry.py>`_.
-
-6. Out-of-Tree Model Integration
---------------------------------
-
-You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see :ref:`plugin_system`.
-
-To register the model, use the following code:
-
-.. code-block:: python
-
-    from vllm import ModelRegistry
-    from your_code import YourModelForCausalLM
-    ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
-
-If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
-
-.. code-block:: python
-
-    from vllm import ModelRegistry
-
-    ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
-
-.. important::
-    If your model is a multimodal model, ensure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
-    Read more about that :ref:`here <enabling_multimodal_inputs>`.
-
-.. note::
-    Although you can directly put these code snippets in your script using ``vllm.LLM``, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
diff --git a/docs/source/models/enabling_multimodal_inputs.md b/docs/source/models/enabling_multimodal_inputs.md
new file mode 100644
index 0000000000000..fdd770887900e
--- /dev/null
+++ b/docs/source/models/enabling_multimodal_inputs.md
@@ -0,0 +1,143 @@
+(enabling-multimodal-inputs)=
+
+# Enabling Multimodal Inputs
+
+This document walks you through the steps to extend a vLLM model so that it accepts [multi-modal inputs](#multimodal-inputs).
+
+```{seealso}
+[Adding a New Model](#adding-a-new-model)
+```
+
+## 1. Update the base vLLM model
+
+It is assumed that you have already implemented the model in vLLM according to [these steps](#adding-a-new-model).
+Further update the model as follows:
+
+- Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
+
+  ```diff
+  + from vllm.model_executor.models.interfaces import SupportsMultiModal
+
+  - class YourModelForImage2Seq(nn.Module):
+  + class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+  ```
+
+  ```{note}
+  The model class does not have to be named {code}`*ForCausalLM`.
+  Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples.
+  ```
+
+- If you haven't already done so, reserve a keyword parameter in {meth}`~torch.nn.Module.forward`
+  for each input tensor that corresponds to a multi-modal input, as shown in the following example:
+
+  ```diff
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+  +     pixel_values: torch.Tensor,
+    ) -> SamplerOutput:
+  ```
+
+## 2. Register input mappers
+
+For each modality type that the model accepts as input, decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
+This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in {meth}`~torch.nn.Module.forward`.
+
+```diff
+  from vllm.model_executor.models.interfaces import SupportsMultiModal
++ from vllm.multimodal import MULTIMODAL_REGISTRY
+
++ @MULTIMODAL_REGISTRY.register_image_input_mapper()
+  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+```
+
+A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function.
+
+```{seealso}
+[Input Processing Pipeline](#input-processing-pipeline)
+```
+
+## 3. Register maximum number of multi-modal tokens
+
+For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item
+and register it via {meth}`MULTIMODAL_REGISTRY.register_max_multimodal_tokens <vllm.multimodal.MultiModalRegistry.register_max_multimodal_tokens>`.
+
+```diff
+  from vllm.inputs import INPUT_REGISTRY
+  from vllm.model_executor.models.interfaces import SupportsMultiModal
+  from vllm.multimodal import MULTIMODAL_REGISTRY
+
+  @MULTIMODAL_REGISTRY.register_image_input_mapper()
++ @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
+  @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
+  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+```
+
+Here are some examples:
+
+- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py)
+- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py)
+
+```{seealso}
+[Input Processing Pipeline](#input-processing-pipeline)
+```
+
+## 4. (Optional) Register dummy data
+
+During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models.
+In such cases, you can define your own dummy data by registering a factory method via {meth}`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`.
+
+```diff
+  from vllm.inputs import INPUT_REGISTRY
+  from vllm.model_executor.models.interfaces import SupportsMultiModal
+  from vllm.multimodal import MULTIMODAL_REGISTRY
+
+  @MULTIMODAL_REGISTRY.register_image_input_mapper()
+  @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
++ @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
+  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+```
+
+```{note}
+The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step.
+```
+
+Here are some examples:
+
+- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py)
+- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py)
+
+```{seealso}
+[Input Processing Pipeline](#input-processing-pipeline)
+```
+
+## 5. (Optional) Register input processor
+
+Sometimes, there is a need to process inputs at the {class}`~vllm.LLMEngine` level before they are passed to the model executor.
+This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside the model's {meth}`~torch.nn.Module.forward` call.
+You can register input processors via {meth}`INPUT_REGISTRY.register_input_processor <vllm.inputs.registry.InputRegistry.register_input_processor>`.
+
+```diff
+  from vllm.inputs import INPUT_REGISTRY
+  from vllm.model_executor.models.interfaces import SupportsMultiModal
+  from vllm.multimodal import MULTIMODAL_REGISTRY
+
+  @MULTIMODAL_REGISTRY.register_image_input_mapper()
+  @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
+  @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
++ @INPUT_REGISTRY.register_input_processor(<your_input_processor>)
+  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+```
+
+A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation.
+Here are some examples:
+
+- Insert static number of image tokens: [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py)
+- Insert dynamic number of image tokens: [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py)
+
+```{seealso}
+[Input Processing Pipeline](#input-processing-pipeline)
+```
diff --git a/docs/source/models/enabling_multimodal_inputs.rst b/docs/source/models/enabling_multimodal_inputs.rst
deleted file mode 100644
index 5c1236e1a8972..0000000000000
--- a/docs/source/models/enabling_multimodal_inputs.rst
+++ /dev/null
@@ -1,147 +0,0 @@
-.. _enabling_multimodal_inputs:
-
-Enabling Multimodal Inputs
-==========================
-
-This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal inputs <multimodal_inputs>`.
-
-.. seealso::
-    :ref:`adding_a_new_model`
-
-
-1. Update the base vLLM model
------------------------------
-
-It is assumed that you have already implemented the model in vLLM according to :ref:`these steps <adding_a_new_model>`.
-Further update the model as follows:
-
-- Implement the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
-
-  .. code-block:: diff
-
-      + from vllm.model_executor.models.interfaces import SupportsMultiModal
-
-      - class YourModelForImage2Seq(nn.Module):
-      + class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
-
-  .. note::
-      The model class does not have to be named :code:`*ForCausalLM`.
-      Check out `the HuggingFace Transformers documentation <https://huggingface.co/docs/transformers/model_doc/auto#multimodal>`__ for some examples.
-
-- If you haven't already done so, reserve a keyword parameter in :meth:`~torch.nn.Module.forward`
-  for each input tensor that corresponds to a multi-modal input, as shown in the following example:
-
-  .. code-block:: diff
-
-        def forward(
-            self,
-            input_ids: torch.Tensor,
-            positions: torch.Tensor,
-            kv_caches: List[torch.Tensor],
-            attn_metadata: AttentionMetadata,
-      +     pixel_values: torch.Tensor,
-        ) -> SamplerOutput:
-
-
-2. Register input mappers
--------------------------
-
-For each modality type that the model accepts as input, decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
-This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in :meth:`~torch.nn.Module.forward`.
-
-.. code-block:: diff
-
-      from vllm.model_executor.models.interfaces import SupportsMultiModal
-    + from vllm.multimodal import MULTIMODAL_REGISTRY
-
-    + @MULTIMODAL_REGISTRY.register_image_input_mapper()
-      class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
-
-A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function.
-
-.. seealso::
-    :ref:`input_processing_pipeline`
-
-
-3. Register maximum number of multi-modal tokens
-------------------------------------------------
-
-For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item
-and register it via :meth:`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_max_multimodal_tokens>`.
-
-.. code-block:: diff
-
-      from vllm.inputs import INPUT_REGISTRY
-      from vllm.model_executor.models.interfaces import SupportsMultiModal
-      from vllm.multimodal import MULTIMODAL_REGISTRY
-
-      @MULTIMODAL_REGISTRY.register_image_input_mapper()
-    + @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
-      @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
-      class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
-
-Here are some examples:
-
-- Image inputs (static feature size): `LLaVA-1.5 Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py>`__
-- Image inputs (dynamic feature size): `LLaVA-NeXT Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py>`__
-
-.. seealso::
-    :ref:`input_processing_pipeline`
-
-
-4. (Optional) Register dummy data
----------------------------------
-
-During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models.
-In such cases, you can define your own dummy data by registering a factory method via :meth:`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`.
-
-.. code-block:: diff
-
-      from vllm.inputs import INPUT_REGISTRY
-      from vllm.model_executor.models.interfaces import SupportsMultiModal
-      from vllm.multimodal import MULTIMODAL_REGISTRY
-
-      @MULTIMODAL_REGISTRY.register_image_input_mapper()
-      @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
-    + @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
-      class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
-
-.. note::
-    The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step.
-
-Here are some examples:
-
-- Image inputs (static feature size): `LLaVA-1.5 Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py>`__
-- Image inputs (dynamic feature size): `LLaVA-NeXT Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py>`__
-
-.. seealso::
-    :ref:`input_processing_pipeline`
-
-
-5. (Optional) Register input processor
---------------------------------------
-
-Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor. 
-This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's :meth:`~torch.nn.Module.forward` call.
-You can register input processors via :meth:`INPUT_REGISTRY.register_input_processor <vllm.inputs.registry.InputRegistry.register_input_processor>`.
-
-.. code-block:: diff
-
-      from vllm.inputs import INPUT_REGISTRY
-      from vllm.model_executor.models.interfaces import SupportsMultiModal
-      from vllm.multimodal import MULTIMODAL_REGISTRY
-
-      @MULTIMODAL_REGISTRY.register_image_input_mapper()
-      @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
-      @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
-    + @INPUT_REGISTRY.register_input_processor(<your_input_processor>)
-      class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
-
-A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation.
-Here are some examples:
-
-- Insert static number of image tokens: `LLaVA-1.5 Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py>`__
-- Insert dynamic number of image tokens: `LLaVA-NeXT Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py>`__
-
-.. seealso::
-    :ref:`input_processing_pipeline`
diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md
new file mode 100644
index 0000000000000..35e0302b86619
--- /dev/null
+++ b/docs/source/models/generative_models.md
@@ -0,0 +1,126 @@
+(generative-models)=
+
+# Generative Models
+
+vLLM provides first-class support for generative models, which cover most LLMs.
+
+In vLLM, generative models implement the {class}`~vllm.model_executor.models.VllmModelForTextGeneration` interface.
+Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
+which are then passed through {class}`~vllm.model_executor.layers.Sampler` to obtain the final text.
+
+## Offline Inference
+
+The {class}`~vllm.LLM` class provides various methods for offline inference.
+See [Engine Arguments](#engine-args) for a list of options when initializing the model.
+
+For generative models, the only supported {code}`task` option is {code}`"generate"`.
+Usually, this is automatically inferred so you don't have to specify it.
+
+### `LLM.generate`
+
+The {meth}`~vllm.LLM.generate` method is available to all generative models in vLLM.
+It is similar to [its counterpart in HF Transformers](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate),
+except that tokenization and detokenization are also performed automatically.
+
+```python
+llm = LLM(model="facebook/opt-125m")
+outputs = llm.generate("Hello, my name is")
+
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+You can optionally control the language generation by passing {class}`~vllm.SamplingParams`.
+For example, you can use greedy sampling by setting {code}`temperature=0`:
+
+```python
+llm = LLM(model="facebook/opt-125m")
+params = SamplingParams(temperature=0)
+outputs = llm.generate("Hello, my name is", params)
+
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+A code example can be found here: <gh-file:examples/offline_inference.py>
+
+### `LLM.beam_search`
+
+The {meth}`~vllm.LLM.beam_search` method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search-decoding) on top of {meth}`~vllm.LLM.generate`.
+For example, to search using 5 beams and output at most 50 tokens:
+
+```python
+llm = LLM(model="facebook/opt-125m")
+params = BeamSearchParams(beam_width=5, max_tokens=50)
+outputs = llm.beam_search([{"prompt": "Hello, my name is"}], params)
+
+for output in outputs:
+    generated_text = output.sequences[0].text
+    print(f"Generated text: {generated_text!r}")
+```
+
+### `LLM.chat`
+
+The {meth}`~vllm.LLM.chat` method implements chat functionality on top of {meth}`~vllm.LLM.generate`.
+In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat)
+and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt.
+
+```{important}
+In general, only instruction-tuned models have a chat template.
+Base models may perform poorly as they are not trained to respond to the chat conversation.
+```
+
+```python
+llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+conversation = [
+    {
+        "role": "system",
+        "content": "You are a helpful assistant"
+    },
+    {
+        "role": "user",
+        "content": "Hello"
+    },
+    {
+        "role": "assistant",
+        "content": "Hello! How can I assist you today?"
+    },
+    {
+        "role": "user",
+        "content": "Write an essay about the importance of higher education.",
+    },
+]
+outputs = llm.chat(conversation)
+
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+A code example can be found here: <gh-file:examples/offline_inference_chat.py>
+
+If the model doesn't have a chat template or you want to specify another one,
+you can explicitly pass a chat template:
+
+```python
+from vllm.entrypoints.chat_utils import load_chat_template
+
+# You can find a list of existing chat templates under `examples/`
+custom_template = load_chat_template(chat_template="<path_to_template>")
+print("Loaded chat template:", custom_template)
+
+outputs = llm.chat(conversation, chat_template=custom_template)
+```
+
+## Online Inference
+
+Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
+
+- [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text.
+- [Chat API](#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template.
diff --git a/docs/source/models/generative_models.rst b/docs/source/models/generative_models.rst
deleted file mode 100644
index fb71185600863..0000000000000
--- a/docs/source/models/generative_models.rst
+++ /dev/null
@@ -1,146 +0,0 @@
-.. _generative_models:
-
-Generative Models
-=================
-
-vLLM provides first-class support for generative models, which covers most of LLMs.
-
-In vLLM, generative models implement the :class:`~vllm.model_executor.models.VllmModelForTextGeneration` interface.
-Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
-which are then passed through :class:`~vllm.model_executor.layers.Sampler` to obtain the final text.
-
-Offline Inference
------------------
-
-The :class:`~vllm.LLM` class provides various methods for offline inference.
-See :ref:`Engine Arguments <engine_args>` for a list of options when initializing the model.
-
-For generative models, the only supported :code:`task` option is :code:`"generate"`.
-Usually, this is automatically inferred so you don't have to specify it.
-
-``LLM.generate``
-^^^^^^^^^^^^^^^^
-
-The :class:`~vllm.LLM.generate` method is available to all generative models in vLLM.
-It is similar to `its counterpart in HF Transformers <https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate>`__,
-except that tokenization and detokenization are also performed automatically.
-
-.. code-block:: python
-
-    llm = LLM(model="facebook/opt-125m")
-    outputs = llm.generate("Hello, my name is")
-
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-You can optionally control the language generation by passing :class:`~vllm.SamplingParams`.
-For example, you can use greedy sampling by setting :code:`temperature=0`:
-
-.. code-block:: python
-
-    llm = LLM(model="facebook/opt-125m")
-    params = SamplingParams(temperature=0)
-    outputs = llm.generate("Hello, my name is", params)
-
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-A code example can be found in `examples/offline_inference.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference.py>`_.
-
-``LLM.beam_search``
-^^^^^^^^^^^^^^^^^^^
-
-The :class:`~vllm.LLM.beam_search` method implements `beam search <https://huggingface.co/docs/transformers/en/generation_strategies#beam-search-decoding>`__ on top of :class:`~vllm.LLM.generate`.
-For example, to search using 5 beams and output at most 50 tokens:
-
-.. code-block:: python
-
-    llm = LLM(model="facebook/opt-125m")
-    params = BeamSearchParams(beam_width=5, max_tokens=50)
-    outputs = llm.generate("Hello, my name is", params)
-
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-``LLM.chat``
-^^^^^^^^^^^^
-
-The :class:`~vllm.LLM.chat` method implements chat functionality on top of :class:`~vllm.LLM.generate`.
-In particular, it accepts input similar to `OpenAI Chat Completions API <https://platform.openai.com/docs/api-reference/chat>`__
-and automatically applies the model's `chat template <https://huggingface.co/docs/transformers/en/chat_templating>`__ to format the prompt.
-
-.. important::
-
-    In general, only instruction-tuned models have a chat template.
-    Base models may perform poorly as they are not trained to respond to the chat conversation.
-
-.. code-block:: python
-
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
-    conversation = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": "Hello"
-        },
-        {
-            "role": "assistant",
-            "content": "Hello! How can I assist you today?"
-        },
-        {
-            "role": "user",
-            "content": "Write an essay about the importance of higher education.",
-        },
-    ]
-    outputs = llm.chat(conversation)
-
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-A code example can be found in `examples/offline_inference_chat.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_chat.py>`_.
-
-If the model doesn't have a chat template or you want to specify another one,
-you can explicitly pass a chat template:
-
-.. code-block:: python
-
-    from vllm.entrypoints.chat_utils import load_chat_template
-
-    # You can find a list of existing chat templates under `examples/`
-    custom_template = load_chat_template(chat_template="<path_to_template>")
-    print("Loaded chat template:", custom_template)
-
-    outputs = llm.chat(conversation, chat_template=custom_template)
-
-Online Inference
-----------------
-
-Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference.
-Please click on the above link for more details on how to launch the server.
-
-Completions API
-^^^^^^^^^^^^^^^
-
-Our Completions API is similar to ``LLM.generate`` but only accepts text.
-It is compatible with `OpenAI Completions API <https://platform.openai.com/docs/api-reference/completions>`__
-so that you can use OpenAI client to interact with it.
-A code example can be found in `examples/openai_completion_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py>`_.
-
-Chat API
-^^^^^^^^
-
-Our Chat API is similar to ``LLM.chat``, accepting both text and :ref:`multi-modal inputs <multimodal_inputs>`.
-It is compatible with `OpenAI Chat Completions API <https://platform.openai.com/docs/api-reference/chat>`__
-so that you can use OpenAI client to interact with it.
-A code example can be found in `examples/openai_chat_completion_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client.py>`_.
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
new file mode 100644
index 0000000000000..76c96c9edcc5d
--- /dev/null
+++ b/docs/source/models/pooling_models.md
@@ -0,0 +1,113 @@
+(pooling-models)=
+
+# Pooling Models
+
+vLLM also supports pooling models, including embedding, reranking and reward models.
+
+In vLLM, pooling models implement the {class}`~vllm.model_executor.models.VllmModelForPooling` interface.
+These models use a {class}`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input
+before returning them.
+
+```{note}
+We currently support pooling models primarily as a matter of convenience.
+As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to
+pooling models as they only work on the generation or decode stage, so performance may not improve as much.
+```
+
+## Offline Inference
+
+The {class}`~vllm.LLM` class provides various methods for offline inference.
+See [Engine Arguments](#engine-args) for a list of options when initializing the model.
+
+For pooling models, we support the following {code}`task` options:
+
+- Embedding ({code}`"embed"` / {code}`"embedding"`)
+- Classification ({code}`"classify"`)
+- Sentence Pair Scoring ({code}`"score"`)
+- Reward Modeling ({code}`"reward"`)
+
+The selected task determines the default {class}`~vllm.model_executor.layers.Pooler` that is used:
+
+- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization.
+- Classification: Extract only the hidden states corresponding to the last token, and apply softmax.
+- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax.
+- Reward Modeling: Extract all of the hidden states and return them directly.
+
+When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
+we attempt to override the default pooler based on its Sentence Transformers configuration file ({code}`modules.json`).
+
+You can customize the model's pooling method via the {code}`override_pooler_config` option,
+which takes priority over both the model's and Sentence Transformers' defaults.
+
+### `LLM.encode`
+
+The {meth}`~vllm.LLM.encode` method is available to all pooling models in vLLM.
+It returns the extracted hidden states directly, which is useful for reward models.
+
+```python
+llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward")
+(output,) = llm.encode("Hello, my name is")
+
+data = output.outputs.data
+print(f"Data: {data!r}")
+```
+
+### `LLM.embed`
+
+The {meth}`~vllm.LLM.embed` method outputs an embedding vector for each prompt.
+It is primarily designed for embedding models.
+
+```python
+llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
+(output,) = llm.embed("Hello, my name is")
+
+embeds = output.outputs.embedding
+print(f"Embeddings: {embeds!r} (size={len(embeds)})")
+```
+
+A code example can be found here: <gh-file:examples/offline_inference_embedding.py>
+
+### `LLM.classify`
+
+The {meth}`~vllm.LLM.classify` method outputs a probability vector for each prompt.
+It is primarily designed for classification models.
+
+```python
+llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify")
+(output,) = llm.classify("Hello, my name is")
+
+probs = output.outputs.probs
+print(f"Class Probabilities: {probs!r} (size={len(probs)})")
+```
+
+A code example can be found here: <gh-file:examples/offline_inference_classification.py>
+
+### `LLM.score`
+
+The {meth}`~vllm.LLM.score` method outputs similarity scores between sentence pairs.
+It is primarily designed for [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html).
+These types of models serve as rerankers between candidate query-document pairs in RAG systems.
+
+```{note}
+vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG.
+To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain).
+```
+
+```python
+llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")
+(output,) = llm.score("What is the capital of France?",
+                      "The capital of Brazil is Brasilia.")
+
+score = output.outputs.score
+print(f"Score: {score}")
+```
+
+A code example can be found here: <gh-file:examples/offline_inference_scoring.py>
+
+## Online Inference
+
+Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
+
+- [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
+- [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models.
+- [Score API](#score-api) is similar to `LLM.score` for cross-encoder models.
diff --git a/docs/source/models/pooling_models.rst b/docs/source/models/pooling_models.rst
deleted file mode 100644
index 4e67677a2767a..0000000000000
--- a/docs/source/models/pooling_models.rst
+++ /dev/null
@@ -1,136 +0,0 @@
-.. _pooling_models:
-
-Pooling Models
-==============
-
-vLLM also supports pooling models, including embedding, reranking and reward models.
-
-In vLLM, pooling models implement the :class:`~vllm.model_executor.models.VllmModelForPooling` interface.
-These models use a :class:`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input
-before returning them.
-
-.. note::
-
-    We currently support pooling models primarily as a matter of convenience.
-    As shown in the :ref:`Compatibility Matrix <compatibility_matrix>`, most vLLM features are not applicable to
-    pooling models as they only work on the generation or decode stage, so performance may not improve as much.
-
-Offline Inference
------------------
-
-The :class:`~vllm.LLM` class provides various methods for offline inference.
-See :ref:`Engine Arguments <engine_args>` for a list of options when initializing the model.
-
-For pooling models, we support the following :code:`task` options:
-
-- Embedding (:code:`"embed"` / :code:`"embedding"`)
-- Classification (:code:`"classify"`)
-- Sentence Pair Scoring (:code:`"score"`)
-- Reward Modeling (:code:`"reward"`)
-
-The selected task determines the default :class:`~vllm.model_executor.layers.Pooler` that is used:
-
-- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization.
-- Classification: Extract only the hidden states corresponding to the last token, and apply softmax.
-- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax.
-- Reward Modeling: Extract all of the hidden states and return them directly.
-
-When loading `Sentence Transformers <https://huggingface.co/sentence-transformers>`__ models,
-we attempt to override the default pooler based on its Sentence Transformers configuration file (:code:`modules.json`).
-
-You can customize the model's pooling method via the :code:`override_pooler_config` option,
-which takes priority over both the model's and Sentence Transformers's defaults.
-
-``LLM.encode``
-^^^^^^^^^^^^^^
-
-The :class:`~vllm.LLM.encode` method is available to all pooling models in vLLM.
-It returns the extracted hidden states directly, which is useful for reward models.
-
-.. code-block:: python
-
-    llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward")
-    (output,) = llm.encode("Hello, my name is")
-
-    data = output.outputs.data
-    print(f"Data: {data!r}")
-
-``LLM.embed``
-^^^^^^^^^^^^^
-
-The :class:`~vllm.LLM.embed` method outputs an embedding vector for each prompt.
-It is primarily designed for embedding models.
-
-.. code-block:: python
-
-    llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
-    (output,) = llm.embed("Hello, my name is")
-
-    embeds = output.outputs.embedding
-    print(f"Embeddings: {embeds!r} (size={len(embeds)})")
-
-A code example can be found in `examples/offline_inference_embedding.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_embedding.py>`_.
-
-``LLM.classify``
-^^^^^^^^^^^^^^^^
-
-The :class:`~vllm.LLM.classify` method outputs a probability vector for each prompt.
-It is primarily designed for classification models.
-
-.. code-block:: python
-
-    llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify")
-    (output,) = llm.classify("Hello, my name is")
-
-    probs = output.outputs.probs
-    print(f"Class Probabilities: {probs!r} (size={len(probs)})")
-
-A code example can be found in `examples/offline_inference_classification.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_classification.py>`_.
-
-``LLM.score``
-^^^^^^^^^^^^^
-
-The :class:`~vllm.LLM.score` method outputs similarity scores between sentence pairs.
-It is primarily designed for `cross-encoder models <https://www.sbert.net/examples/applications/cross-encoder/README.html>`__.
-These types of models serve as rerankers between candidate query-document pairs in RAG systems.
-
-.. note::
-
-    vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG.
-    To handle RAG at a higher level, you should use integration frameworks such as `LangChain <https://github.com/langchain-ai/langchain>`_.
-
-.. code-block:: python
-
-    llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")
-    (output,) = llm.score("What is the capital of France?",
-                          "The capital of Brazil is Brasilia.")
-
-    score = output.outputs.score
-    print(f"Score: {score}")
-
-A code example can be found in `examples/offline_inference_scoring.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_scoring.py>`_.
-
-Online Inference
-----------------
-
-Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference.
-Please click on the above link for more details on how to launch the server.
-
-Embeddings API
-^^^^^^^^^^^^^^
-
-Our Embeddings API is similar to ``LLM.embed``, accepting both text and :ref:`multi-modal inputs <multimodal_inputs>`.
-
-The text-only API is compatible with `OpenAI Embeddings API <https://platform.openai.com/docs/api-reference/embeddings>`__
-so that you can use OpenAI client to interact with it.
-A code example can be found in `examples/openai_embedding_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_embedding_client.py>`_.
-
-The multi-modal API is an extension of the `OpenAI Embeddings API <https://platform.openai.com/docs/api-reference/embeddings>`__
-that incorporates `OpenAI Chat Completions API <https://platform.openai.com/docs/api-reference/chat>`__,
-so it is not part of the OpenAI standard. Please see :ref:`this page <multimodal_inputs>` for more details on how to use it.
-
-Score API
-^^^^^^^^^
-
-Our Score API is similar to ``LLM.score``.
-Please see `this page <../serving/openai_compatible_server.html#score-api-for-cross-encoder-models>`__ for more details on how to use it.
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.md
similarity index 68%
rename from docs/source/models/supported_models.rst
rename to docs/source/models/supported_models.md
index 3bef3f3226062..95add0d71bbab 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.md
@@ -1,84 +1,78 @@
-.. _supported_models:
+(supported-models)=
 
-Supported Models
-================
+# Supported Models
 
 vLLM supports generative and pooling models across various tasks.
-If a model supports more than one task, you can set the task via the :code:`--task` argument.
+If a model supports more than one task, you can set the task via the {code}`--task` argument.
 
 For each task, we list the model architectures that have been implemented in vLLM.
 Alongside each architecture, we include some popular models that use it.
 
-Loading a Model
-^^^^^^^^^^^^^^^
+## Loading a Model
 
-HuggingFace Hub
-+++++++++++++++
+### HuggingFace Hub
 
-By default, vLLM loads models from `HuggingFace (HF) Hub <https://huggingface.co/models>`_.
+By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models).
 
-To determine whether a given model is supported, you can check the :code:`config.json` file inside the HF repository.
-If the :code:`"architectures"` field contains a model architecture listed below, then it should be supported in theory.
+To determine whether a given model is supported, you can check the {code}`config.json` file inside the HF repository.
+If the {code}`"architectures"` field contains a model architecture listed below, then it should be supported in theory.
 
-.. tip::
-    The easiest way to check if your model is really supported at runtime is to run the program below:
+````{tip}
+The easiest way to check if your model is really supported at runtime is to run the program below:
 
-    .. code-block:: python
+```python
+from vllm import LLM
 
-        from vllm import LLM
+# For generative models (task=generate) only
+llm = LLM(model=..., task="generate")  # Name or path of your model
+output = llm.generate("Hello, my name is")
+print(output)
 
-        # For generative models (task=generate) only
-        llm = LLM(model=..., task="generate")  # Name or path of your model
-        output = llm.generate("Hello, my name is")
-        print(output)
+# For pooling models (task={embed,classify,reward,score}) only
+llm = LLM(model=..., task="embed")  # Name or path of your model
+output = llm.encode("Hello, my name is")
+print(output)
+```
 
-        # For pooling models (task={embed,classify,reward}) only
-        llm = LLM(model=..., task="embed")  # Name or path of your model
-        output = llm.encode("Hello, my name is")
-        print(output)
+If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
+````
 
-    If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
+Otherwise, please refer to [Adding a New Model](#adding-a-new-model) and [Enabling Multimodal Inputs](#enabling-multimodal-inputs) for instructions on how to implement your model in vLLM.
+Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
 
-Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Enabling Multimodal Inputs <enabling_multimodal_inputs>` 
-for instructions on how to implement your model in vLLM.
-Alternatively, you can `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ to request vLLM support.
+### ModelScope
 
-ModelScope
-++++++++++
+To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable:
 
-To use models from `ModelScope <https://www.modelscope.cn>`_ instead of HuggingFace Hub, set an environment variable:
+```shell
+$ export VLLM_USE_MODELSCOPE=True
+```
 
-.. code-block:: shell
+And use with {code}`trust_remote_code=True`.
 
-    $ export VLLM_USE_MODELSCOPE=True
+```python
+from vllm import LLM
 
-And use with :code:`trust_remote_code=True`.
+llm = LLM(model=..., revision=..., task=..., trust_remote_code=True)
 
-.. code-block:: python
+# For generative models (task=generate) only
+output = llm.generate("Hello, my name is")
+print(output)
 
-    from vllm import LLM
+# For pooling models (task={embed,classify,reward,score}) only
+output = llm.encode("Hello, my name is")
+print(output)
+```
 
-    llm = LLM(model=..., revision=..., task=..., trust_remote_code=True)
+## List of Text-only Language Models
 
-    # For generative models (task=generate) only
-    output = llm.generate("Hello, my name is")
-    print(output)
+### Generative Models
 
-    # For pooling models (task={embed,classify,reward}) only
-    output = llm.encode("Hello, my name is")
-    print(output)
+See [this page](#generative-models) for more information on how to use generative models.
 
-List of Text-only Language Models
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Generative Models
-+++++++++++++++++
-
-See :ref:`this page <generative_models>` for more information on how to use generative models.
-
-Text Generation (``--task generate``)
--------------------------------------
+#### Text Generation (`--task generate`)
 
+```{eval-rst}
 .. list-table::
   :widths: 25 25 50 5 5
   :header-rows: 1
@@ -86,8 +80,8 @@ Text Generation (``--task generate``)
   * - Architecture
     - Models
     - Example HF Models
-    - :ref:`LoRA <lora>`
-    - :ref:`PP <distributed_serving>`
+    - :ref:`LoRA <lora-adapter>`
+    - :ref:`PP <distributed-serving>`
   * - :code:`AquilaForCausalLM`
     - Aquila, Aquila2
     - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc.
@@ -111,8 +105,8 @@ Text Generation (``--task generate``)
   * - :code:`BartForConditionalGeneration`
     - BART
     - :code:`facebook/bart-base`, :code:`facebook/bart-large-cnn`, etc.
-    - 
-    - 
+    -
+    -
   * - :code:`ChatGLMModel`
     - ChatGLM
     - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc.
@@ -136,12 +130,17 @@ Text Generation (``--task generate``)
   * - :code:`DeepseekForCausalLM`
     - DeepSeek
     - :code:`deepseek-ai/deepseek-llm-67b-base`, :code:`deepseek-ai/deepseek-llm-7b-chat` etc.
-    - 
+    -
     - ✅︎
   * - :code:`DeepseekV2ForCausalLM`
     - DeepSeek-V2
     - :code:`deepseek-ai/DeepSeek-V2`, :code:`deepseek-ai/DeepSeek-V2-Chat` etc.
-    - 
+    -
+    - ✅︎
+  * - :code:`DeepseekV3ForCausalLM`
+    - DeepSeek-V3
+    - :code:`deepseek-ai/DeepSeek-V3-Base`, :code:`deepseek-ai/DeepSeek-V3` etc.
+    -
     - ✅︎
   * - :code:`ExaoneForCausalLM`
     - EXAONE-3
@@ -194,8 +193,8 @@ Text Generation (``--task generate``)
     -
     - ✅︎
   * - :code:`GraniteForCausalLM`
-    - Granite 3.0, PowerLM
-    - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.0-8b-instruct`, :code:`ibm/PowerLM-3b`, etc.
+    - Granite 3.0, Granite 3.1, PowerLM
+    - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.1-8b-instruct`, :code:`ibm/PowerLM-3b`, etc.
     - ✅︎
     - ✅︎
   * - :code:`GraniteMoeForCausalLM`
@@ -316,7 +315,7 @@ Text Generation (``--task generate``)
   * - :code:`PersimmonForCausalLM`
     - Persimmon
     - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc.
-    - 
+    -
     - ✅︎
   * - :code:`QWenLMHeadModel`
     - Qwen
@@ -325,7 +324,7 @@ Text Generation (``--task generate``)
     - ✅︎
   * - :code:`Qwen2ForCausalLM`
     - Qwen2
-    - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc.
+    - :code:`Qwen/QwQ-32B-Preview`, :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc.
     - ✅︎
     - ✅︎
   * - :code:`Qwen2MoeForCausalLM`
@@ -358,29 +357,24 @@ Text Generation (``--task generate``)
     - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc.
     - ✅︎
     - ✅︎
+```
 
-.. note::
-    Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
-
-Pooling Models
-++++++++++++++
-
-See :ref:`this page <pooling_models>` for more information on how to use pooling models.
+```{note}
+Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
+```
 
-.. important::
-    Since some model architectures support both generative and pooling tasks,
-    you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.
+### Pooling Models
 
-Text Embedding (``--task embed``)
----------------------------------
+See [this page](#pooling-models) for more information on how to use pooling models.
 
-Any text generation model can be converted into an embedding model by passing :code:`--task embed`.
+```{important}
+Since some model architectures support both generative and pooling tasks,
+you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.
+```
 
-.. note::
-    To get the best results, you should use pooling models that are specifically trained as such.
-
-The following table lists those that are tested in vLLM.
+#### Text Embedding (`--task embed`)
 
+```{eval-rst}
 .. list-table::
   :widths: 25 25 50 5 5
   :header-rows: 1
@@ -388,17 +382,17 @@ The following table lists those that are tested in vLLM.
   * - Architecture
     - Models
     - Example HF Models
-    - :ref:`LoRA <lora>`
-    - :ref:`PP <distributed_serving>`
+    - :ref:`LoRA <lora-adapter>`
+    - :ref:`PP <distributed-serving>`
   * - :code:`BertModel`
     - BERT-based
     - :code:`BAAI/bge-base-en-v1.5`, etc.
-    - 
-    - 
+    -
+    -
   * - :code:`Gemma2Model`
     - Gemma2-based
     - :code:`BAAI/bge-multilingual-gemma2`, etc.
-    - 
+    -
     - ✅︎
   * - :code:`GritLM`
     - GritLM
@@ -418,28 +412,35 @@ The following table lists those that are tested in vLLM.
   * - :code:`RobertaModel`, :code:`RobertaForMaskedLM`
     - RoBERTa-based
     - :code:`sentence-transformers/all-roberta-large-v1`, :code:`sentence-transformers/all-roberta-large-v1`, etc.
-    - 
-    - 
+    -
+    -
   * - :code:`XLMRobertaModel`
     - XLM-RoBERTa-based
     - :code:`intfloat/multilingual-e5-large`, etc.
-    - 
-    - 
+    -
+    -
+```
+
+```{note}
+{code}`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
+You should manually set mean pooling by passing {code}`--override-pooler-config '{"pooling_type": "MEAN"}'`.
+```
 
-.. note::
-  :code:`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
-  You should manually set mean pooling by passing :code:`--override-pooler-config '{"pooling_type": "MEAN"}'`.
+```{note}
+Unlike base Qwen2, {code}`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention.
+You can set {code}`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly.
 
-.. note::
-  Unlike base Qwen2, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention.
-  You can set :code:`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly.
+On the other hand, its 1.5B variant ({code}`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention
+despite being described otherwise on its model card.
+```
 
-  On the other hand, its 1.5B variant (:code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention
-  despite being described otherwise on its model card.
+If your model is not in the above list, we will try to automatically convert the model using
+{func}`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings
+of the whole prompt are extracted from the normalized hidden state corresponding to the last token.
 
-Reward Modeling (``--task reward``)
------------------------------------
+#### Reward Modeling (`--task reward`)
 
+```{eval-rst}
 .. list-table::
   :widths: 25 25 50 5 5
   :header-rows: 1
@@ -447,8 +448,8 @@ Reward Modeling (``--task reward``)
   * - Architecture
     - Models
     - Example HF Models
-    - :ref:`LoRA <lora>`
-    - :ref:`PP <distributed_serving>`
+    - :ref:`LoRA <lora-adapter>`
+    - :ref:`PP <distributed-serving>`
   * - :code:`LlamaForCausalLM`
     - Llama-based
     - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc.
@@ -459,14 +460,19 @@ Reward Modeling (``--task reward``)
     - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc.
     - ✅︎
     - ✅︎
+```
 
-.. important::
-  For process-supervised reward models such as :code:`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
-  e.g.: :code:`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
+If your model is not in the above list, we will try to automatically convert the model using
+{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
 
-Classification (``--task classify``)
-------------------------------------
+```{important}
+For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
+e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
+```
 
+#### Classification (`--task classify`)
+
+```{eval-rst}
 .. list-table::
   :widths: 25 25 50 5 5
   :header-rows: 1
@@ -474,17 +480,26 @@ Classification (``--task classify``)
   * - Architecture
     - Models
     - Example HF Models
-    - :ref:`LoRA <lora>`
-    - :ref:`PP <distributed_serving>`
+    - :ref:`LoRA <lora-adapter>`
+    - :ref:`PP <distributed-serving>`
+  * - :code:`JambaForSequenceClassification`
+    - Jamba
+    - :code:`ai21labs/Jamba-tiny-reward-dev`, etc.
+    - ✅︎
+    - ✅︎
   * - :code:`Qwen2ForSequenceClassification`
     - Qwen2-based
     - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc.
     - ✅︎
     - ✅︎
+```
 
-Sentence Pair Scoring (``--task score``)
-----------------------------------------
+If your model is not in the above list, we will try to automatically convert the model using
+{func}`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
 
+#### Sentence Pair Scoring (`--task score`)
+
+```{eval-rst}
 .. list-table::
   :widths: 25 25 50 5 5
   :header-rows: 1
@@ -492,54 +507,53 @@ Sentence Pair Scoring (``--task score``)
   * - Architecture
     - Models
     - Example HF Models
-    - :ref:`LoRA <lora>`
-    - :ref:`PP <distributed_serving>`
+    - :ref:`LoRA <lora-adapter>`
+    - :ref:`PP <distributed-serving>`
   * - :code:`BertForSequenceClassification`
     - BERT-based
     - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc.
-    - 
-    - 
+    -
+    -
   * - :code:`RobertaForSequenceClassification`
     - RoBERTa-based
     - :code:`cross-encoder/quora-roberta-base`, etc.
-    - 
-    - 
+    -
+    -
   * - :code:`XLMRobertaForSequenceClassification`
     - XLM-RoBERTa-based
     - :code:`BAAI/bge-reranker-v2-m3`, etc.
-    - 
-    - 
+    -
+    -
+```
 
-.. _supported_mm_models:
+(supported-mm-models)=
 
-List of Multimodal Language Models
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+## List of Multimodal Language Models
 
 The following modalities are supported depending on the model:
 
-- **T**\ ext
-- **I**\ mage
-- **V**\ ideo
-- **A**\ udio
+- **T**ext
+- **I**mage
+- **V**ideo
+- **A**udio
 
-Any combination of modalities joined by :code:`+` are supported.
+Any combination of modalities joined by {code}`+` is supported.
 
-- e.g.: :code:`T + I` means that the model supports text-only, image-only, and text-with-image inputs.
+- e.g.: {code}`T + I` means that the model supports text-only, image-only, and text-with-image inputs.
 
-On the other hand, modalities separated by :code:`/` are mutually exclusive.
+On the other hand, modalities separated by {code}`/` are mutually exclusive.
 
-- e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
+- e.g.: {code}`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
 
-See :ref:`this page <multimodal_inputs>` on how to pass multi-modal inputs to the model.
+See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model.
 
-Generative Models
-+++++++++++++++++
+### Generative Models
 
-See :ref:`this page <generative_models>` for more information on how to use generative models.
+See [this page](#generative-models) for more information on how to use generative models.
 
-Text Generation (``--task generate``)
--------------------------------------
+#### Text Generation (`--task generate`)
 
+```{eval-rst}
 .. list-table::
   :widths: 25 25 15 20 5 5 5
   :header-rows: 1
@@ -548,63 +562,63 @@ Text Generation (``--task generate``)
     - Models
     - Inputs
     - Example HF Models
-    - :ref:`LoRA <lora>`
-    - :ref:`PP <distributed_serving>`
+    - :ref:`LoRA <lora-adapter>`
+    - :ref:`PP <distributed-serving>`
     - V1
   * - :code:`AriaForConditionalGeneration`
     - Aria
     - T + I
     - :code:`rhymes-ai/Aria`
-    - 
+    -
     - ✅︎
-    - 
+    -
   * - :code:`Blip2ForConditionalGeneration`
     - BLIP-2
     - T + I\ :sup:`E`
     - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
     -
     - ✅︎
-    - 
+    -
   * - :code:`ChameleonForConditionalGeneration`
     - Chameleon
     - T + I
     - :code:`facebook/chameleon-7b` etc.
-    - 
+    -
     - ✅︎
-    - 
+    -
   * - :code:`FuyuForCausalLM`
     - Fuyu
     - T + I
     - :code:`adept/fuyu-8b` etc.
-    - 
+    -
     - ✅︎
-    - 
+    -
   * - :code:`ChatGLMModel`
     - GLM-4V
     - T + I
     - :code:`THUDM/glm-4v-9b` etc.
     - ✅︎
     - ✅︎
-    - 
+    -
   * - :code:`H2OVLChatModel`
     - H2OVL
     - T + I\ :sup:`E+`
     - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc.
-    - 
+    -
     - ✅︎
-    - 
+    -
   * - :code:`Idefics3ForConditionalGeneration`
     - Idefics3
     - T + I
     - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc.
     - ✅︎
     -
-    - 
+    -
   * - :code:`InternVLChatModel`
     - InternVL 2.5, Mono-InternVL, InternVL 2.0
     - T + I\ :sup:`E+`
     - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc.
-    - 
+    -
     - ✅︎
     - ✅︎
   * - :code:`LlavaForConditionalGeneration`
@@ -620,28 +634,28 @@ Text Generation (``--task generate``)
     - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
     -
     - ✅︎
-    - 
+    -
   * - :code:`LlavaNextVideoForConditionalGeneration`
     - LLaVA-NeXT-Video
     - T + V
     - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
     -
     - ✅︎
-    - 
+    -
   * - :code:`LlavaOnevisionForConditionalGeneration`
     - LLaVA-Onevision
     - T + I\ :sup:`+` + V\ :sup:`+`
     - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
     -
     - ✅︎
-    - 
+    -
   * - :code:`MiniCPMV`
     - MiniCPM-V
     - T + I\ :sup:`E+`
     - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
     - ✅︎
     - ✅︎
-    - 
+    -
   * - :code:`MllamaForConditionalGeneration`
     - Llama 3.2
     - T + I\ :sup:`+`
@@ -660,16 +674,16 @@ Text Generation (``--task generate``)
     - NVLM-D 1.0
     - T + I\ :sup:`E+`
     - :code:`nvidia/NVLM-D-72B`, etc.
-    - 
+    -
     - ✅︎
     - ✅︎
   * - :code:`PaliGemmaForConditionalGeneration`
     - PaliGemma, PaliGemma 2
     - T + I\ :sup:`E`
     - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, :code:`google/paligemma2-3b-ft-docci-448`, etc.
-    - 
+    -
     - ✅︎
-    - 
+    -
   * - :code:`Phi3VForCausalLM`
     - Phi-3-Vision, Phi-3.5-Vision
     - T + I\ :sup:`E+`
@@ -697,70 +711,79 @@ Text Generation (``--task generate``)
     - :code:`Qwen/Qwen2-Audio-7B-Instruct`
     -
     - ✅︎
-    - 
+    -
   * - :code:`Qwen2VLForConditionalGeneration`
     - Qwen2-VL
     - T + I\ :sup:`E+` + V\ :sup:`E+`
-    - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
+    - :code:`Qwen/QVQ-72B-Preview`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
     - ✅︎
     - ✅︎
-    - 
+    -
   * - :code:`UltravoxModel`
     - Ultravox
     - T + A\ :sup:`E+`
     - :code:`fixie-ai/ultravox-v0_3`
     -
     - ✅︎
-    - 
-
-| :sup:`E` Pre-computed embeddings can be inputted for this modality.
-| :sup:`+` Multiple items can be inputted per text prompt for this modality.
+    -
+```
 
-.. important::
-    To enable multiple multi-modal items per text prompt, you have to set :code:`limit_mm_per_prompt` (offline inference)
-    or :code:`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
+```{eval-rst}
+:sup:`E` Pre-computed embeddings can be inputted for this modality.
 
-    .. code-block:: python
+:sup:`+` Multiple items can be inputted per text prompt for this modality.
+```
 
-        llm = LLM(
-            model="Qwen/Qwen2-VL-7B-Instruct",
-            limit_mm_per_prompt={"image": 4},
-        )
+````{important}
+To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference)
+or {code}`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
 
-    .. code-block:: bash
+```python
+llm = LLM(
+    model="Qwen/Qwen2-VL-7B-Instruct",
+    limit_mm_per_prompt={"image": 4},
+)
+```
 
-        vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4
+```bash
+vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4
+```
+````
 
-.. note::
-  vLLM currently only supports adding LoRA to the language backbone of multimodal models.
+```{note}
+vLLM currently only supports adding LoRA to the language backbone of multimodal models.
+```
 
-.. note::
-  To use :code:`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo (:code:`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`)
-  and pass :code:`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
+```{note}
+To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo ({code}`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`)
+and pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
+```
 
-.. note::
-  The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
-  For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
+```{note}
+The official {code}`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork ({code}`HwwwH/MiniCPM-V-2`) for now.
+For more details, please see: <gh-pr:4087#issuecomment-2250397630>
+```
 
-Pooling Models
-++++++++++++++
+### Pooling Models
 
-See :ref:`this page <pooling_models>` for more information on how to use pooling models.
+See [this page](#pooling-models) for more information on how to use pooling models.
 
-.. important::
-    Since some model architectures support both generative and pooling tasks,
-    you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.
+```{important}
+Since some model architectures support both generative and pooling tasks,
+you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.
+```
 
-Text Embedding (``--task embed``)
----------------------------------
+#### Text Embedding (`--task embed`)
 
-Any text generation model can be converted into an embedding model by passing :code:`--task embed`.
+Any text generation model can be converted into an embedding model by passing {code}`--task embed`.
 
-.. note::
-    To get the best results, you should use pooling models that are specifically trained as such.
+```{note}
+To get the best results, you should use pooling models that are specifically trained as such.
+```
 
 The following table lists those that are tested in vLLM.
 
+```{eval-rst}
 .. list-table::
   :widths: 25 25 15 25 5 5
   :header-rows: 1
@@ -769,13 +792,13 @@ The following table lists those that are tested in vLLM.
     - Models
     - Inputs
     - Example HF Models
-    - :ref:`LoRA <lora>`
-    - :ref:`PP <distributed_serving>`
+    - :ref:`LoRA <lora-adapter>`
+    - :ref:`PP <distributed-serving>`
   * - :code:`LlavaNextForConditionalGeneration`
     - LLaVA-NeXT-based
     - T / I
     - :code:`royokong/e5-v`
-    - 
+    -
     - ✅︎
   * - :code:`Phi3VForCausalLM`
     - Phi-3-Vision-based
@@ -787,27 +810,25 @@ The following table lists those that are tested in vLLM.
     - Qwen2-VL-based
     - T + I
     - :code:`MrLight/dse-qwen2-2b-mrl-v1`
-    - 
+    -
     - ✅︎
+```
 
-----
+______________________________________________________________________
 
-Model Support Policy
-=====================
+# Model Support Policy
 
 At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
 
 1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated!
-
 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.
 
-.. tip::
-  When comparing the output of :code:`model.generate` from HuggingFace Transformers with the output of :code:`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., `generation_config.json <https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945>`__) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
+```{tip}
+When comparing the output of {code}`model.generate` from HuggingFace Transformers with the output of {code}`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
+```
 
 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.
-
 4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use.
-
 5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement.
 
 Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem.
@@ -816,7 +837,7 @@ Note that, as an inference engine, vLLM does not introduce new models. Therefore
 
 We have the following levels of testing for models:
 
-1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `models tests <https://github.com/vllm-project/vllm/blob/main/tests/models>`_ for the models that have passed this test.
+1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test.
 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
-3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests <https://github.com/vllm-project/vllm/tree/main/tests>`_ and `examples <https://github.com/vllm-project/vllm/tree/main/examples>`_ for the models that have passed this test.
+3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test.
 4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
diff --git a/docs/source/performance/benchmarks.md b/docs/source/performance/benchmarks.md
new file mode 100644
index 0000000000000..39dc470a1c708
--- /dev/null
+++ b/docs/source/performance/benchmarks.md
@@ -0,0 +1,28 @@
+(benchmarks)=
+
+# Benchmark Suites
+
+vLLM contains two sets of benchmarks:
+
+- [Performance benchmarks](#performance-benchmarks)
+- [Nightly benchmarks](#nightly-benchmarks)
+
+(performance-benchmarks)=
+
+## Performance Benchmarks
+
+The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
+
+The latest performance results are hosted on the public [vLLM Performance Dashboard](https://perf.vllm.ai).
+
+More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
+
+(nightly-benchmarks)=
+
+## Nightly Benchmarks
+
+These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.
+
+The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
+
+More information on the nightly benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/nightly-descriptions.md).
diff --git a/docs/source/performance/benchmarks.rst b/docs/source/performance/benchmarks.rst
deleted file mode 100644
index 6d4d7b544cb5d..0000000000000
--- a/docs/source/performance/benchmarks.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-.. _benchmarks:
-
-================
-Benchmark Suites
-================
-
-vLLM contains two sets of benchmarks:
-
-+ :ref:`Performance benchmarks <performance_benchmarks>`
-+ :ref:`Nightly benchmarks <nightly_benchmarks>`
-
-
-.. _performance_benchmarks:
-
-Performance Benchmarks
-----------------------
-
-The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the ``perf-benchmarks`` and ``ready`` labels, and when a PR is merged into vLLM.
-
-The latest performance results are hosted on the public `vLLM Performance Dashboard <https://perf.vllm.ai>`_.
-
-More information on the performance benchmarks and their parameters can be found `here <https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md>`__.
-
-.. _nightly_benchmarks:
-
-Nightly Benchmarks
-------------------
-
-These compare vLLM's performance against alternatives (``tgi``, ``trt-llm``, and ``lmdeploy``) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the ``perf-benchmarks`` and ``nightly-benchmarks`` labels. 
-
-The latest nightly benchmark results are shared in major release blog posts such as `vLLM v0.6.0 <https://blog.vllm.ai/2024/09/05/perf-update.html>`_.
-
-More information on the nightly benchmarks and their parameters can be found `here <https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/nightly-descriptions.md>`__.
\ No newline at end of file
diff --git a/docs/source/quantization/auto_awq.md b/docs/source/quantization/auto_awq.md
new file mode 100644
index 0000000000000..c02fbf0605a8c
--- /dev/null
+++ b/docs/source/quantization/auto_awq.md
@@ -0,0 +1,78 @@
+(auto-awq)=
+
+# AutoAWQ
+
+```{warning}
+Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better
+accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency
+inference with a small number of concurrent requests. vLLM's AWQ implementation has lower throughput than the unquantized version.
+```
+
+To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
+Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%.
+The main benefits are lower latency and memory usage.
+
+You can quantize your own models by installing AutoAWQ or picking one of the [400+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq).
+
+```console
+$ pip install autoawq
+```
+
+After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
+
+```python
+from awq import AutoAWQForCausalLM
+from transformers import AutoTokenizer
+
+model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
+quant_path = 'mistral-instruct-v0.2-awq'
+quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
+
+# Load model
+model = AutoAWQForCausalLM.from_pretrained(
+    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+)
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+# Quantize
+model.quantize(tokenizer, quant_config=quant_config)
+
+# Save quantized model
+model.save_quantized(quant_path)
+tokenizer.save_pretrained(quant_path)
+
+print(f'Model is quantized and saved at "{quant_path}"')
+```
+
+To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
+
+```console
+$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
+```
+
+AWQ models are also supported directly through the LLM entrypoint:
+
+```python
+from vllm import LLM, SamplingParams
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
diff --git a/docs/source/quantization/auto_awq.rst b/docs/source/quantization/auto_awq.rst
deleted file mode 100644
index 8eb6fa2f4cbe1..0000000000000
--- a/docs/source/quantization/auto_awq.rst
+++ /dev/null
@@ -1,79 +0,0 @@
-.. _auto_awq:
-
-AutoAWQ
-==================
-
-.. warning::
-
-   Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better
-   accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency
-   inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version.
-
-To create a new 4-bit quantized model, you can leverage `AutoAWQ <https://github.com/casper-hansen/AutoAWQ>`_. 
-Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%.
-The main benefits are lower latency and memory usage.
-
-You can quantize your own models by installing AutoAWQ or picking one of the `400+ models on Huggingface <https://huggingface.co/models?sort=trending&search=awq>`_. 
-
-.. code-block:: console
-
-    $ pip install autoawq
-
-After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
-
-.. code-block:: python
-
-    from awq import AutoAWQForCausalLM
-    from transformers import AutoTokenizer
-    
-    model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
-    quant_path = 'mistral-instruct-v0.2-awq'
-    quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
-    
-    # Load model
-    model = AutoAWQForCausalLM.from_pretrained(
-        model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
-    )
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    
-    # Quantize
-    model.quantize(tokenizer, quant_config=quant_config)
-    
-    # Save quantized model
-    model.save_quantized(quant_path)
-    tokenizer.save_pretrained(quant_path)
-    
-    print(f'Model is quantized and saved at "{quant_path}"')
-
-To run an AWQ model with vLLM, you can use `TheBloke/Llama-2-7b-Chat-AWQ <https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ>`_ with the following command:
-
-.. code-block:: console
-
-    $ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
-
-AWQ models are also supported directly through the LLM entrypoint:
-
-.. code-block:: python
-
-    from vllm import LLM, SamplingParams
-
-    # Sample prompts.
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    # Create a sampling params object.
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-    # Create an LLM.
-    llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
-    # Generate texts from the prompts. The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
diff --git a/docs/source/quantization/bnb.md b/docs/source/quantization/bnb.md
new file mode 100644
index 0000000000000..8240eca1c7e03
--- /dev/null
+++ b/docs/source/quantization/bnb.md
@@ -0,0 +1,39 @@
+(bits-and-bytes)=
+
+# BitsAndBytes
+
+vLLM now supports [BitsAndBytes](https://github.com/TimDettmers/bitsandbytes) for more efficient model inference.
+BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy.
+Compared to other quantization methods, BitsAndBytes eliminates the need for calibrating the quantized model with input data.
+
+Below are the steps to utilize BitsAndBytes with vLLM.
+
+```console
+$ pip install "bitsandbytes>=0.45.0"
+```
+
+vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoints.
+
+You can find bitsandbytes quantized models on <https://huggingface.co/models?other=bitsandbytes>.
+These repositories usually include a `config.json` file with a `quantization_config` section.
+
+## Read quantized checkpoint
+
+```python
+from vllm import LLM
+import torch
+# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
+model_id = "unsloth/tinyllama-bnb-4bit"
+llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
+quantization="bitsandbytes", load_format="bitsandbytes")
+```
+
+## Inflight quantization: load as 4bit quantization
+
+```python
+from vllm import LLM
+import torch
+model_id = "huggyllama/llama-7b"
+llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
+quantization="bitsandbytes", load_format="bitsandbytes")
+```
diff --git a/docs/source/quantization/bnb.rst b/docs/source/quantization/bnb.rst
deleted file mode 100644
index 84f805bb60c2a..0000000000000
--- a/docs/source/quantization/bnb.rst
+++ /dev/null
@@ -1,43 +0,0 @@
-.. _bits_and_bytes:
-
-BitsAndBytes
-==================
-
-vLLM now supports `BitsAndBytes <https://github.com/TimDettmers/bitsandbytes>`_ for more efficient model inference.
-BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy.
-Compared to other quantization methods,  BitsAndBytes eliminates the need for calibrating the quantized model with input data.
-
-Below are the steps to utilize BitsAndBytes with vLLM.
-
-.. code-block:: console
-
-    $ pip install bitsandbytes>=0.45.0
-
-vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
-
-You can find bitsandbytes quantized models on https://huggingface.co/models?other=bitsandbytes.
-And usually, these repositories have a config.json file that includes a quantization_config section.
-
-Read quantized checkpoint.
---------------------------
-
-.. code-block:: python
-
-    from vllm import LLM
-    import torch
-    # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
-    model_id = "unsloth/tinyllama-bnb-4bit"
-    llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-    quantization="bitsandbytes", load_format="bitsandbytes")
-
-Inflight quantization: load as 4bit quantization
-------------------------------------------------
-
-.. code-block:: python
-
-    from vllm import LLM
-    import torch
-    model_id = "huggyllama/llama-7b"
-    llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-    quantization="bitsandbytes", load_format="bitsandbytes")
-
diff --git a/docs/source/quantization/fp8.md b/docs/source/quantization/fp8.md
new file mode 100644
index 0000000000000..b2eda74fd1e3b
--- /dev/null
+++ b/docs/source/quantization/fp8.md
@@ -0,0 +1,192 @@
+(fp8)=
+
+# FP8 W8A8
+
+vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x.
+Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8.
+Ampere GPUs are supported for W8A16 (weight-only FP8) utilizing Marlin kernels.
+Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy.
+
+Please visit the HF collection of [quantized FP8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127).
+
+The FP8 types typically supported in hardware have two distinct representations, each useful in different scenarios:
+
+- **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`.
+- **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values.
+
+```{note}
+FP8 computation is supported on NVIDIA GPUs with compute capability >= 8.9 (Ada Lovelace, Hopper).
+FP8 models will run on compute capability >= 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin.
+```
+
+## Quick Start with Online Dynamic Quantization
+
+Dynamic quantization of an original precision BF16/FP16 model to FP8 can be achieved with vLLM without any calibration data required. You can enable the feature by specifying `--quantization="fp8"` in the command line or setting `quantization="fp8"` in the LLM constructor.
+
+In this mode, all Linear modules (except for the final `lm_head`) have their weights quantized down to FP8_E4M3 precision with a per-tensor scale. Activations have their minimum and maximum values calculated during each forward pass to provide a dynamic per-tensor scale for high accuracy. As a result, latency improvements are limited in this mode.
+
+```python
+from vllm import LLM
+model = LLM("facebook/opt-125m", quantization="fp8")
+# INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB
+result = model.generate("Hello, my name is")
+```
+
+```{warning}
+Currently, we load the model at original precision before quantizing down to 8 bits, so you need enough memory to load the whole model.
+```
+
+## Installation
+
+To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
+
+```console
+$ pip install llmcompressor
+```
+
+## Quantization Process
+
+The quantization process involves three main steps:
+
+1. Loading the model
+2. Applying quantization
+3. Evaluating accuracy in vLLM
+
+### 1. Loading the Model
+
+Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models:
+
+```python
+from llmcompressor.transformers import SparseAutoModelForCausalLM
+from transformers import AutoTokenizer
+
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+
+model = SparseAutoModelForCausalLM.from_pretrained(
+  MODEL_ID, device_map="auto", torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+```
+
+### 2. Applying Quantization
+
+For FP8 quantization, we can recover accuracy with simple RTN quantization. We recommend targeting all `Linear` layers using the `FP8_DYNAMIC` scheme, which uses:
+
+- Static, per-channel quantization on the weights
+- Dynamic, per-token quantization on the activations
+
+Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
+
+```python
+from llmcompressor.transformers import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+
+# Configure the simple PTQ quantization
+recipe = QuantizationModifier(
+  targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+
+# Apply the quantization algorithm.
+oneshot(model=model, recipe=recipe)
+
+# Save the model.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR)
+tokenizer.save_pretrained(SAVE_DIR)
+```
+
+### 3. Evaluating Accuracy
+
+Install `vllm` and `lm-evaluation-harness`:
+
+```console
+$ pip install vllm lm-eval==0.4.4
+```
+
+Load and run the model in `vllm`:
+
+```python
+from vllm import LLM
+model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
+model.generate("Hello my name is")
+```
+
+Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`):
+
+```{note}
+Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations.
+```
+
+```console
+$ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic
+$ lm_eval \
+  --model vllm \
+  --model_args pretrained=$MODEL,add_bos_token=True \
+  --tasks gsm8k  --num_fewshot 5 --batch_size auto --limit 250
+```
+
+Here's an example of the resulting scores:
+
+```text
+|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value|   |Stderr|
+|-----|------:|----------------|-----:|-----------|---|----:|---|-----:|
+|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.768|±  |0.0268|
+|     |       |strict-match    |     5|exact_match|↑  |0.768|±  |0.0268|
+```
+
+## Troubleshooting and Support
+
+If you encounter any issues or have feature requests, please open an issue on the `vllm-project/llm-compressor` GitHub repository.
+
+## Deprecated Flow
+
+```{note}
+The following information is preserved for reference and search purposes.
+The quantization method described below is deprecated in favor of the `llmcompressor` method described above.
+```
+
+For static per-tensor offline quantization to FP8, please install the [AutoFP8 library](https://github.com/neuralmagic/autofp8).
+
+```bash
+git clone https://github.com/neuralmagic/AutoFP8.git
+pip install -e AutoFP8
+```
+
+This package introduces the `AutoFP8ForCausalLM` and `BaseQuantizeConfig` objects for managing how your model will be compressed.
+
+## Offline Quantization with Static Activation Scaling Factors
+
+You can use AutoFP8 with calibration data to produce per-tensor static scales for both the weights and activations by enabling the `activation_scheme="static"` argument.
+
+```python
+from datasets import load_dataset
+from transformers import AutoTokenizer
+from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig
+
+pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
+quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8"
+
+tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
+tokenizer.pad_token = tokenizer.eos_token
+
+# Load and tokenize 512 dataset samples for calibration of activation scales
+ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512))
+examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds]
+examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda")
+
+# Define quantization config with static activation scales
+quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static")
+
+# Load the model, quantize, and save checkpoint
+model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
+model.quantize(examples)
+model.save_quantized(quantized_model_dir)
+```
+
+Your model checkpoint with quantized weights and activations should be available at `Meta-Llama-3-8B-Instruct-FP8/`.
+Finally, you can load the quantized model checkpoint directly in vLLM.
+
+```python
+from vllm import LLM
+model = LLM(model="Meta-Llama-3-8B-Instruct-FP8/")
+# INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB
+result = model.generate("Hello, my name is")
+```
diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst
deleted file mode 100644
index 4dbf8e9d346e1..0000000000000
--- a/docs/source/quantization/fp8.rst
+++ /dev/null
@@ -1,204 +0,0 @@
-.. _fp8:
-
-FP8 W8A8
-==================
-
-vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. 
-Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8. 
-Ampere GPUs are supported for W8A16 (weight-only FP8) utilizing Marlin kernels.
-Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy.
-
-Please visit the HF collection of `quantized FP8 checkpoints of popular LLMs ready to use with vLLM <https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127>`_.
-
-The FP8 types typically supported in hardware have two distinct representations, each useful in different scenarios:
-
-- **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and ``nan``.
-- **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- ``inf``, and ``nan``. The tradeoff for the increased dynamic range is lower precision of the stored values.
-
-.. note::
-
-   FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper).
-   FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin.
-
-Quick Start with Online Dynamic Quantization
---------------------------------------------
-
-Dynamic quantization of an original precision BF16/FP16 model to FP8 can be achieved with vLLM without any calibration data required. You can enable the feature by specifying ``--quantization="fp8"`` in the command line or setting ``quantization="fp8"`` in the LLM constructor.
-
-In this mode, all Linear modules (except for the final ``lm_head``) have their weights quantized down to FP8_E4M3 precision with a per-tensor scale. Activations have their minimum and maximum values calculated during each forward pass to provide a dynamic per-tensor scale for high accuracy. As a result, latency improvements are limited in this mode.
-
-.. code-block:: python
-
-    from vllm import LLM
-    model = LLM("facebook/opt-125m", quantization="fp8")
-    # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB
-    result = model.generate("Hello, my name is")
-
-.. warning::
-
-    Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model.
-
-Installation
-------------
-
-To produce performant FP8 quantized models with vLLM, you'll need to install the `llm-compressor <https://github.com/vllm-project/llm-compressor/>`_ library:
-
-.. code-block:: console
-
-   $ pip install llmcompressor
-
-Quantization Process
---------------------
-
-The quantization process involves three main steps:
-
-1. Loading the model
-2. Applying quantization
-3. Evaluating accuracy in vLLM
-
-1. Loading the Model
-^^^^^^^^^^^^^^^^^^^^
-
-Use ``SparseAutoModelForCausalLM``, which wraps ``AutoModelForCausalLM``, for saving and loading quantized models:
-
-.. code-block:: python
-
-   from llmcompressor.transformers import SparseAutoModelForCausalLM
-   from transformers import AutoTokenizer
-
-   MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-
-   model = SparseAutoModelForCausalLM.from_pretrained(
-     MODEL_ID, device_map="auto", torch_dtype="auto")
-   tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-
-2. Applying Quantization
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-For FP8 quantization, we can recover accuracy with simple RTN quantization. We recommend targeting all ``Linear`` layers using the ``FP8_DYNAMIC`` scheme, which uses:
-
-- Static, per-channel quantization on the weights
-- Dynamic, per-token quantization on the activations
-
-Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
-
-.. code-block:: python
-
-   from llmcompressor.transformers import oneshot
-   from llmcompressor.modifiers.quantization import QuantizationModifier
-
-   # Configure the simple PTQ quantization
-   recipe = QuantizationModifier(
-     targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
-
-   # Apply the quantization algorithm.
-   oneshot(model=model, recipe=recipe)
-
-   # Save the model.
-   SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
-   model.save_pretrained(SAVE_DIR)
-   tokenizer.save_pretrained(SAVE_DIR)
-
-3. Evaluating Accuracy
-^^^^^^^^^^^^^^^^^^^^^^
-
-Install ``vllm`` and ``lm-evaluation-harness``:
-
-.. code-block:: console
-
-   $ pip install vllm lm-eval==0.4.4
-
-Load and run the model in ``vllm``:
-
-.. code-block:: python
-
-   from vllm import LLM
-   model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
-   model.generate("Hello my name is")
-
-Evaluate accuracy with ``lm_eval`` (for example on 250 samples of ``gsm8k``):
-
-.. note::
-
-   Quantized models can be sensitive to the presence of the ``bos`` token. ``lm_eval`` does not add a ``bos`` token by default, so make sure to include the ``add_bos_token=True`` argument when running your evaluations.
-
-.. code-block:: console
-
-   $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic 
-   $ lm_eval \
-     --model vllm \
-     --model_args pretrained=$MODEL,add_bos_token=True \
-     --tasks gsm8k  --num_fewshot 5 --batch_size auto --limit 250
-
-Here's an example of the resulting scores:
-
-.. code-block:: text
-
-   |Tasks|Version|     Filter     |n-shot|  Metric   |   |Value|   |Stderr|
-   |-----|------:|----------------|-----:|-----------|---|----:|---|-----:|
-   |gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.768|±  |0.0268|
-   |     |       |strict-match    |     5|exact_match|↑  |0.768|±  |0.0268|
-
-Troubleshooting and Support
----------------------------
-
-If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository.
-
-
-Deprecated Flow
-------------------
-
-.. note::
-
-   The following information is preserved for reference and search purposes.
-   The quantization method described below is deprecated in favor of the ``llmcompressor`` method described above.
-
-For static per-tensor offline quantization to FP8, please install the `AutoFP8 library <https://github.com/neuralmagic/autofp8>`_.
-
-.. code-block:: bash
-
-    git clone https://github.com/neuralmagic/AutoFP8.git
-    pip install -e AutoFP8
-
-This package introduces the ``AutoFP8ForCausalLM`` and ``BaseQuantizeConfig`` objects for managing how your model will be compressed.
-
-Offline Quantization with Static Activation Scaling Factors
------------------------------------------------------------
-
-You can use AutoFP8 with calibration data to produce per-tensor static scales for both the weights and activations by enabling the ``activation_scheme="static"`` argument.
-
-.. code-block:: python
-
-    from datasets import load_dataset
-    from transformers import AutoTokenizer
-    from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig
-
-    pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
-    quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8"
-
-    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
-    tokenizer.pad_token = tokenizer.eos_token
-
-    # Load and tokenize 512 dataset samples for calibration of activation scales
-    ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512))
-    examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds]
-    examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda")
-
-    # Define quantization config with static activation scales
-    quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static")
-
-    # Load the model, quantize, and save checkpoint
-    model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
-    model.quantize(examples)
-    model.save_quantized(quantized_model_dir)
-
-Your model checkpoint with quantized weights and activations should be available at ``Meta-Llama-3-8B-Instruct-FP8/``.
-Finally, you can load the quantized model checkpoint directly in vLLM.
-
-.. code-block:: python
-
-    from vllm import LLM
-    model = LLM(model="Meta-Llama-3-8B-Instruct-FP8/")
-    # INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB
-    result = model.generate("Hello, my name is")
-
diff --git a/docs/source/quantization/fp8_e4m3_kvcache.md b/docs/source/quantization/fp8_e4m3_kvcache.md
new file mode 100644
index 0000000000000..f200c722d1d42
--- /dev/null
+++ b/docs/source/quantization/fp8_e4m3_kvcache.md
@@ -0,0 +1,44 @@
+(fp8-e4m3-kvcache)=
+
+# FP8 E4M3 KV Cache
+
+Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache,
+improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2
+(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of
+the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of
+FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside
+each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling
+factors of a finer granularity (e.g. per-channel).
+
+These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If
+this JSON is not specified, scaling factors default to 1.0. These scaling factors are typically obtained when running an
+unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO).
+
+To install AMMO (AlgorithMic Model Optimization):
+
+```console
+$ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo
+```
+
+Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon
+offerings, e.g. AMD MI300 and NVIDIA Hopper or later, support native hardware conversion to and from fp32, fp16, bf16, etc.
+Thus, LLM inference is greatly accelerated with minimal accuracy loss.
+
+Here is an example of how to enable this feature:
+
+```python
+# two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to
+# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own.
+
+from vllm import LLM, SamplingParams
+sampling_params = SamplingParams(temperature=1.3, top_p=0.8)
+llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
+          kv_cache_dtype="fp8",
+          quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
+prompt = "London is the capital of"
+out = llm.generate(prompt, sampling_params)[0].outputs[0].text
+print(out)
+
+# output w/ scaling factors:  England, the United Kingdom, and one of the world's leading financial,
+# output w/o scaling factors:  England, located in the southeastern part of the country. It is known
+```
diff --git a/docs/source/quantization/fp8_e4m3_kvcache.rst b/docs/source/quantization/fp8_e4m3_kvcache.rst
deleted file mode 100644
index cc52d8f40af8f..0000000000000
--- a/docs/source/quantization/fp8_e4m3_kvcache.rst
+++ /dev/null
@@ -1,47 +0,0 @@
-.. _fp8_e4m3_kvcache:
-
-FP8 E4M3 KV Cache
-==================
-
-Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, 
-improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 
-(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of 
-the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of 
-FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside 
-each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling 
-factors of a finer granularity (e.g. per-channel).
-
-These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If 
-this JSON is not specified, scaling factors default to 1.0. These scaling factors are typically obtained when running an 
-unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). 
-
-To install AMMO (AlgorithMic Model Optimization):
-
-.. code-block:: console
-
-        $ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo
-
-Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon 
-offerings e.g. AMD MI300, NVIDIA Hopper or later support native hardware conversion to and from fp32, fp16, bf16, etc. 
-Thus, LLM inference is greatly accelerated with minimal accuracy loss.
-
-
-Here is an example of how to enable this feature:
-
-.. code-block:: python
-
-        # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to 
-        # https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own.
-
-        from vllm import LLM, SamplingParams
-        sampling_params = SamplingParams(temperature=1.3, top_p=0.8)
-        llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
-                  kv_cache_dtype="fp8",
-                  quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
-        prompt = "London is the capital of"
-        out = llm.generate(prompt, sampling_params)[0].outputs[0].text
-        print(out)
-
-        # output w/ scaling factors:  England, the United Kingdom, and one of the world's leading financial,
-        # output w/o scaling factors:  England, located in the southeastern part of the country. It is known 
-
diff --git a/docs/source/quantization/fp8_e5m2_kvcache.md b/docs/source/quantization/fp8_e5m2_kvcache.md
new file mode 100644
index 0000000000000..3a81ab17f332f
--- /dev/null
+++ b/docs/source/quantization/fp8_e5m2_kvcache.md
@@ -0,0 +1,31 @@
+(fp8-kv-cache)=
+
+# FP8 E5M2 KV Cache
+
+The int8/int4 quantization scheme requires additional GPU memory to store scaling factors, which reduces the expected GPU memory benefits.
+The FP8 data format retains 2~3 mantissa bits and supports conversion between float/fp16/bfloat16 and fp8 in both directions.
+
+Here is an example of how to enable this feature:
+
+```python
+from vllm import LLM, SamplingParams
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+# Create an LLM.
+llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8")
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
diff --git a/docs/source/quantization/fp8_e5m2_kvcache.rst b/docs/source/quantization/fp8_e5m2_kvcache.rst
deleted file mode 100644
index b2d824427f786..0000000000000
--- a/docs/source/quantization/fp8_e5m2_kvcache.rst
+++ /dev/null
@@ -1,34 +0,0 @@
-.. _fp8_kv_cache:
-
-FP8 E5M2 KV Cache
-==================
-
-The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits.
-The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other.
-
-Here is an example of how to enable this feature:
-
-.. code-block:: python
-
-    from vllm import LLM, SamplingParams
-    # Sample prompts.
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    # Create a sampling params object.
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-    # Create an LLM.
-    llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8")
-    # Generate texts from the prompts. The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
diff --git a/docs/source/quantization/gguf.md b/docs/source/quantization/gguf.md
new file mode 100644
index 0000000000000..eebf11dfc1b2b
--- /dev/null
+++ b/docs/source/quantization/gguf.md
@@ -0,0 +1,72 @@
+(gguf)=
+
+# GGUF
+
+```{warning}
+Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, and it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team.
+```
+
+```{warning}
+Currently, vLLM only supports loading single-file GGUF models. If you have a multi-file GGUF model, you can use the [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge it into a single-file model.
+```
+
+To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
+
+```console
+$ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
+$ # We recommend using the tokenizer from the base model to avoid slow and buggy tokenizer conversion.
+$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
+```
+
+You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:
+
+```console
+$ # We recommend using the tokenizer from the base model to avoid slow and buggy tokenizer conversion.
+$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
+```
+
+```{warning}
+We recommend using the tokenizer from the base model instead of the one from the GGUF model, because the tokenizer conversion from GGUF is time-consuming and unstable, especially for models with a large vocabulary size.
+```
+
+You can also use the GGUF model directly through the LLM entrypoint:
+
+```python
+from vllm import LLM, SamplingParams
+
+# In this script, we demonstrate how to pass input to the chat method:
+conversation = [
+   {
+      "role": "system",
+      "content": "You are a helpful assistant"
+   },
+   {
+      "role": "user",
+      "content": "Hello"
+   },
+   {
+      "role": "assistant",
+      "content": "Hello! How can I assist you today?"
+   },
+   {
+      "role": "user",
+      "content": "Write an essay about the importance of higher education.",
+   },
+]
+
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+         tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.chat(conversation, sampling_params)
+
+# Print the outputs.
+for output in outputs:
+   prompt = output.prompt
+   generated_text = output.outputs[0].text
+   print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
diff --git a/docs/source/quantization/gguf.rst b/docs/source/quantization/gguf.rst
deleted file mode 100644
index 9f00dc5563909..0000000000000
--- a/docs/source/quantization/gguf.rst
+++ /dev/null
@@ -1,73 +0,0 @@
-.. _gguf:
-
-GGUF
-==================
-
-.. warning::
-
-   Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team.
-
-.. warning::
-
-   Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use `gguf-split <https://github.com/ggerganov/llama.cpp/pull/6135>`_ tool to merge them to a single-file model.
-
-To run a GGUF model with vLLM, you can download and use the local GGUF model from `TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF <https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF>`_ with the following command:
-
-.. code-block:: console
-
-   $ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
-   $ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-   $ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
-
-You can also add ``--tensor-parallel-size 2`` to enable tensor parallelism inference with 2 GPUs:
-
-.. code-block:: console
-
-   $ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-   $ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
-
-.. warning::
-
-   We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size.
-
-You can also use the GGUF model directly through the LLM entrypoint:
-
-.. code-block:: python
-
-   from vllm import LLM, SamplingParams
-
-   # In this script, we demonstrate how to pass input to the chat method:
-   conversation = [
-      {
-         "role": "system",
-         "content": "You are a helpful assistant"
-      },
-      {
-         "role": "user",
-         "content": "Hello"
-      },
-      {
-         "role": "assistant",
-         "content": "Hello! How can I assist you today?"
-      },
-      {
-         "role": "user",
-         "content": "Write an essay about the importance of higher education.",
-      },
-   ]
-
-   # Create a sampling params object.
-   sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-   # Create an LLM.
-   llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-            tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-   # Generate texts from the prompts. The output is a list of RequestOutput objects
-   # that contain the prompt, generated text, and other information.
-   outputs = llm.chat(conversation, sampling_params)
-
-   # Print the outputs.
-   for output in outputs:
-      prompt = output.prompt
-      generated_text = output.outputs[0].text
-      print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
diff --git a/docs/source/quantization/int8.md b/docs/source/quantization/int8.md
new file mode 100644
index 0000000000000..1ac50ba987dda
--- /dev/null
+++ b/docs/source/quantization/int8.md
@@ -0,0 +1,136 @@
+(int8)=
+
+# INT8 W8A8
+
+vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration.
+This quantization method is particularly useful for reducing model size while maintaining good performance.
+
+Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).
+
+```{note}
+INT8 computation is supported on NVIDIA GPUs with compute capability >= 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
+```
+
+## Prerequisites
+
+To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
+
+```console
+$ pip install llmcompressor
+```
+
+## Quantization Process
+
+The quantization process involves four main steps:
+
+1. Loading the model
+2. Preparing calibration data
+3. Applying quantization
+4. Evaluating accuracy in vLLM
+
+### 1. Loading the Model
+
+Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models:
+
+```python
+from llmcompressor.transformers import SparseAutoModelForCausalLM
+from transformers import AutoTokenizer
+
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = SparseAutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+```
+
+### 2. Preparing Calibration Data
+
+When quantizing activations to INT8, you need sample data to estimate the activation scales.
+It's best to use calibration data that closely matches your deployment data.
+For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
+
+```python
+from datasets import load_dataset
+
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load and preprocess the dataset
+ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
+ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+def preprocess(example):
+    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
+ds = ds.map(preprocess)
+
+def tokenize(sample):
+    return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+```
+
+### 3. Applying Quantization
+
+Now, apply the quantization algorithms:
+
+```python
+from llmcompressor.transformers import oneshot
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+
+# Configure the quantization algorithms
+recipe = [
+    SmoothQuantModifier(smoothing_strength=0.8),
+    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
+]
+
+# Apply quantization
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Save the compressed model
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+```
+
+This process creates a W8A8 model with weights and activations quantized to 8-bit integers.
+
+### 4. Evaluating Accuracy
+
+After quantization, you can load and run the model in vLLM:
+
+```python
+from vllm import LLM
+model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
+```
+
+To evaluate accuracy, you can use `lm_eval`:
+
+```console
+$ lm_eval --model vllm \
+  --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \
+  --tasks gsm8k \
+  --num_fewshot 5 \
+  --limit 250 \
+  --batch_size 'auto'
+```
+
+```{note}
+Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=true` argument when running evaluations.
+```
+
+## Best Practices
+
+- Start with 512 samples for calibration data (increase if accuracy drops)
+- Use a sequence length of 2048 as a starting point
+- Employ the chat template or instruction template that the model was trained with
+- If you've fine-tuned a model, consider using a sample of your training data for calibration
+
+## Troubleshooting and Support
+
+If you encounter any issues or have feature requests, please open an issue on the `vllm-project/llm-compressor` GitHub repository.
diff --git a/docs/source/quantization/int8.rst b/docs/source/quantization/int8.rst
deleted file mode 100644
index aa5b251becb1c..0000000000000
--- a/docs/source/quantization/int8.rst
+++ /dev/null
@@ -1,145 +0,0 @@
-.. _int8:
-
-INT8 W8A8
-==================
-
-vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration.
-This quantization method is particularly useful for reducing model size while maintaining good performance.
-
-Please visit the HF collection of `quantized INT8 checkpoints of popular LLMs ready to use with vLLM <https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415>`_.
-
-.. note::
-
-   INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
-
-Prerequisites
--------------
-
-To use INT8 quantization with vLLM, you'll need to install the `llm-compressor <https://github.com/vllm-project/llm-compressor/>`_ library:
-
-.. code-block:: console
-
-   $ pip install llmcompressor
-
-Quantization Process
---------------------
-
-The quantization process involves four main steps:
-
-1. Loading the model
-2. Preparing calibration data
-3. Applying quantization
-4. Evaluating accuracy in vLLM
-
-1. Loading the Model
-^^^^^^^^^^^^^^^^^^^^
-
-Use ``SparseAutoModelForCausalLM``, which wraps ``AutoModelForCausalLM``, for saving and loading quantized models:
-
-.. code-block:: python
-
-   from llmcompressor.transformers import SparseAutoModelForCausalLM
-   from transformers import AutoTokenizer
-
-   MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-   model = SparseAutoModelForCausalLM.from_pretrained(
-       MODEL_ID, device_map="auto", torch_dtype="auto",
-   )
-   tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-
-2. Preparing Calibration Data
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-When quantizing activations to INT8, you need sample data to estimate the activation scales.
-It's best to use calibration data that closely matches your deployment data. 
-For a general-purpose instruction-tuned model, you can use a dataset like ``ultrachat``:
-
-.. code-block:: python
-
-   from datasets import load_dataset
-
-   NUM_CALIBRATION_SAMPLES = 512
-   MAX_SEQUENCE_LENGTH = 2048
-
-   # Load and preprocess the dataset
-   ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
-   ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
-
-   def preprocess(example):
-       return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
-   ds = ds.map(preprocess)
-
-   def tokenize(sample):
-       return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
-   ds = ds.map(tokenize, remove_columns=ds.column_names)
-
-3. Applying Quantization
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-Now, apply the quantization algorithms:
-
-.. code-block:: python
-
-   from llmcompressor.transformers import oneshot
-   from llmcompressor.modifiers.quantization import GPTQModifier
-   from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
-
-   # Configure the quantization algorithms
-   recipe = [
-       SmoothQuantModifier(smoothing_strength=0.8),
-       GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
-   ]
-
-   # Apply quantization
-   oneshot(
-       model=model,
-       dataset=ds,
-       recipe=recipe,
-       max_seq_length=MAX_SEQUENCE_LENGTH,
-       num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-   )
-
-   # Save the compressed model
-   SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
-   model.save_pretrained(SAVE_DIR, save_compressed=True)
-   tokenizer.save_pretrained(SAVE_DIR)
-
-This process creates a W8A8 model with weights and activations quantized to 8-bit integers.
-
-4. Evaluating Accuracy
-^^^^^^^^^^^^^^^^^^^^^^
-
-After quantization, you can load and run the model in vLLM:
-
-.. code-block:: python
-
-   from vllm import LLM
-   model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
-
-To evaluate accuracy, you can use ``lm_eval``:
-
-.. code-block:: console
-
-   $ lm_eval --model vllm \
-     --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \
-     --tasks gsm8k \
-     --num_fewshot 5 \
-     --limit 250 \
-     --batch_size 'auto'
-
-.. note::
-
-   Quantized models can be sensitive to the presence of the ``bos`` token. Make sure to include the ``add_bos_token=True`` argument when running evaluations.
-
-Best Practices
---------------
-
-- Start with 512 samples for calibration data (increase if accuracy drops)
-- Use a sequence length of 2048 as a starting point
-- Employ the chat template or instruction template that the model was trained with
-- If you've fine-tuned a model, consider using a sample of your training data for calibration
-
-Troubleshooting and Support
----------------------------
-
-If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository.
diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.md
similarity index 82%
rename from docs/source/quantization/supported_hardware.rst
rename to docs/source/quantization/supported_hardware.md
index 09f8e7112cf0c..843ee21627d78 100644
--- a/docs/source/quantization/supported_hardware.rst
+++ b/docs/source/quantization/supported_hardware.md
@@ -1,132 +1,132 @@
-.. _supported_hardware_for_quantization:
-
-Supported Hardware for Quantization Kernels
-===========================================
-
-The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
-
-.. list-table::
-   :header-rows: 1
-   :widths: 20 8 8 8 8 8 8 8 8 8 8
-
-   * - Implementation
-     - Volta
-     - Turing
-     - Ampere
-     - Ada
-     - Hopper
-     - AMD GPU
-     - Intel GPU
-     - x86 CPU
-     - AWS Inferentia
-     - Google TPU
-   * - AWQ
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-   * - GPTQ
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-   * - Marlin (GPTQ/AWQ/FP8)
-     - ✗
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - INT8 (W8A8)
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✅︎
-     - ✗
-     - ✗
-   * - FP8 (W8A8)
-     - ✗
-     - ✗
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - AQLM
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - bitsandbytes
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - DeepSpeedFP
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - GGUF
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-
-Notes:
-^^^^^^
-
-- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
-- "✅︎" indicates that the quantization method is supported on the specified hardware.
-- "✗" indicates that the quantization method is not supported on the specified hardware.
-
-Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
-
-For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory <https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization>`_ or consult with the vLLM development team.
+(supported-hardware-for-quantization)=
+
+# Supported Hardware for Quantization Kernels
+
+The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
+
+```{eval-rst}
+.. list-table::
+   :header-rows: 1
+   :widths: 20 8 8 8 8 8 8 8 8 8 8
+
+   * - Implementation
+     - Volta
+     - Turing
+     - Ampere
+     - Ada
+     - Hopper
+     - AMD GPU
+     - Intel GPU
+     - x86 CPU
+     - AWS Inferentia
+     - Google TPU
+   * - AWQ
+     - ✗
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✗
+     - ✅︎
+     - ✅︎
+     - ✗
+     - ✗
+   * - GPTQ
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✗
+     - ✅︎
+     - ✅︎
+     - ✗
+     - ✗
+   * - Marlin (GPTQ/AWQ/FP8)
+     - ✗
+     - ✗
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+   * - INT8 (W8A8)
+     - ✗
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✗
+     - ✗
+     - ✅︎
+     - ✗
+     - ✗
+   * - FP8 (W8A8)
+     - ✗
+     - ✗
+     - ✗
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+   * - AQLM
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+   * - bitsandbytes
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+   * - DeepSpeedFP
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+   * - GGUF
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✅︎
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+```
+
+## Notes:
+
+- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
+- "✅︎" indicates that the quantization method is supported on the specified hardware.
+- "✗" indicates that the quantization method is not supported on the specified hardware.
+
+Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
+
+For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
diff --git a/docs/source/serving/deploying_with_bentoml.md b/docs/source/serving/deploying_with_bentoml.md
new file mode 100644
index 0000000000000..dfa0de4f0f6d7
--- /dev/null
+++ b/docs/source/serving/deploying_with_bentoml.md
@@ -0,0 +1,7 @@
+(deploying-with-bentoml)=
+
+# Deploying with BentoML
+
+[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
+
+For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html).
diff --git a/docs/source/serving/deploying_with_bentoml.rst b/docs/source/serving/deploying_with_bentoml.rst
deleted file mode 100644
index 4b9d19f5bdb72..0000000000000
--- a/docs/source/serving/deploying_with_bentoml.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-.. _deploying_with_bentoml:
-
-Deploying with BentoML
-======================
-
-`BentoML <https://github.com/bentoml/BentoML>`_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes.
-
-For details, see the tutorial `vLLM inference in the BentoML documentation <https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html>`_.
\ No newline at end of file
diff --git a/docs/source/serving/deploying_with_cerebrium.md b/docs/source/serving/deploying_with_cerebrium.md
new file mode 100644
index 0000000000000..4863936236119
--- /dev/null
+++ b/docs/source/serving/deploying_with_cerebrium.md
@@ -0,0 +1,109 @@
+(deploying-with-cerebrium)=
+
+# Deploying with Cerebrium
+
+```{raw} html
+<p align="center">
+    <img src="https://i.ibb.co/hHcScTT/Screenshot-2024-06-13-at-10-14-54.png" alt="vLLM_plus_cerebrium"/>
+</p>
+```
+
+vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications.
+
+To install the Cerebrium client, run:
+
+```console
+$ pip install cerebrium
+$ cerebrium login
+```
+
+Next, to create your Cerebrium project, run:
+
+```console
+$ cerebrium init vllm-project
+```
+
+Next, to install the required packages, add the following to your cerebrium.toml:
+
+```toml
+[cerebrium.deployment]
+docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
+
+[cerebrium.dependencies.pip]
+vllm = "latest"
+```
+
+Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example). Add the following code to your `main.py`:
+
+```python
+from vllm import LLM, SamplingParams
+
+llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")
+
+def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
+
+    sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Collect the outputs.
+    results = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        results.append({"prompt": prompt, "generated_text": generated_text})
+
+    return {"results": results}
+```
+
+Then, run the following command to deploy it to the cloud:
+
+```console
+$ cerebrium deploy
+```
+
+If successful, you should be returned a cURL command that you can call inference against. Just remember to end the URL with the function name you are calling (in our case, `/run`):
+
+```console
+curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
+ -H 'Content-Type: application/json' \
+ -H 'Authorization: <JWT TOKEN>' \
+ --data '{
+   "prompts": [
+     "Hello, my name is",
+     "The president of the United States is",
+     "The capital of France is",
+     "The future of AI is"
+   ]
+ }'
+```
+
+You should get a response like:
+
+```json
+{
+    "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
+    "result": {
+        "result": [
+            {
+                "prompt": "Hello, my name is",
+                "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of"
+            },
+            {
+                "prompt": "The president of the United States is",
+                "generated_text": " elected every four years. This is a democratic system.\n\n5. What"
+            },
+            {
+                "prompt": "The capital of France is",
+                "generated_text": " Paris.\n"
+            },
+            {
+                "prompt": "The future of AI is",
+                "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective."
+            }
+        ]
+    },
+    "run_time_ms": 152.53663063049316
+}
+```
+
+You now have an autoscaling endpoint where you only pay for the compute you use!
diff --git a/docs/source/serving/deploying_with_cerebrium.rst b/docs/source/serving/deploying_with_cerebrium.rst
deleted file mode 100644
index 9585b6ef5cb38..0000000000000
--- a/docs/source/serving/deploying_with_cerebrium.rst
+++ /dev/null
@@ -1,112 +0,0 @@
-.. _deploying_with_cerebrium:
-
-Deploying with Cerebrium
-============================
-
-.. raw:: html
-
-    <p align="center">
-        <img src="https://i.ibb.co/hHcScTT/Screenshot-2024-06-13-at-10-14-54.png" alt="vLLM_plus_cerebrium"/>
-    </p>
-
-vLLM can be run on a cloud based GPU machine with `Cerebrium <https://www.cerebrium.ai/>`__, a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications.
-
-To install the Cerebrium client, run:
-
-.. code-block:: console
-
-    $ pip install cerebrium
-    $ cerebrium login
-
-Next, create your Cerebrium project, run:
-    
-.. code-block:: console
-
-    $ cerebrium init vllm-project
-
-Next, to install the required packages, add the following to your cerebrium.toml:
-
-.. code-block:: toml
-
-    [cerebrium.deployment]
-    docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
-
-    [cerebrium.dependencies.pip]
-    vllm = "latest"
-
-Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py`:
-    
-.. code-block:: python
-
-    from vllm import LLM, SamplingParams
-
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")
-
-    def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
-    
-        sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
-        outputs = llm.generate(prompts, sampling_params)
-
-        # Print the outputs.
-        results = []
-        for output in outputs:
-            prompt = output.prompt
-            generated_text = output.outputs[0].text
-            results.append({"prompt": prompt, "generated_text": generated_text})
-
-        return {"results": results}
-
-
-Then, run the following code to deploy it to the cloud
-
-.. code-block:: console
-
-    $ cerebrium deploy
-
-If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run)
-
-.. code-block:: python
-
-    curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
-     -H 'Content-Type: application/json' \
-     -H 'Authorization: <JWT TOKEN>' \
-     --data '{
-       "prompts": [
-         "Hello, my name is",
-         "The president of the United States is",
-         "The capital of France is",
-         "The future of AI is"
-       ]
-     }'
-
-You should get a response like:
-
-.. code-block:: python
-    
-    {
-        "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
-        "result": {
-            "result": [
-                {
-                    "prompt": "Hello, my name is",
-                    "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of"
-                },
-                {
-                    "prompt": "The president of the United States is",
-                    "generated_text": " elected every four years. This is a democratic system.\n\n5. What"
-                },
-                {
-                    "prompt": "The capital of France is",
-                    "generated_text": " Paris.\n"
-                },
-                {
-                    "prompt": "The future of AI is",
-                    "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective."
-                }
-            ]
-        },
-        "run_time_ms": 152.53663063049316
-    }
-
-You now have an autoscaling endpoint where you only pay for the compute you use!
-
diff --git a/docs/source/serving/deploying_with_docker.md b/docs/source/serving/deploying_with_docker.md
new file mode 100644
index 0000000000000..844bd27800c7a
--- /dev/null
+++ b/docs/source/serving/deploying_with_docker.md
@@ -0,0 +1,81 @@
+(deploying-with-docker)=
+
+# Deploying with Docker
+
+## Use vLLM's Official Docker Image
+
+vLLM offers an official Docker image for deployment.
+The image can be used to run an OpenAI-compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).
+
+```console
+$ docker run --runtime nvidia --gpus all \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
+    -p 8000:8000 \
+    --ipc=host \
+    vllm/vllm-openai:latest \
+    --model mistralai/Mistral-7B-v0.1
+```
+
+```{note}
+You can either use the `--ipc=host` flag or `--shm-size` flag to allow the
+container to access the host's shared memory. vLLM uses PyTorch, which uses shared
+memory to share data between processes under the hood, particularly for tensor parallel inference.
+```
+
+## Building vLLM's Docker Image from Source
+
+You can build and run vLLM from source via the provided <gh-file:Dockerfile>. To build vLLM:
+
+```console
+$ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
+$ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
+```
+
+```{note}
+By default vLLM will build for all GPU types for widest distribution. If you are just building for the
+current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
+for vLLM to find the current GPU type and build for that.
+```
+
+## Building for Arm64/aarch64
+
+A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At the time of this writing, this requires the use
+of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
+
+```{note}
+Multiple modules must be compiled, so this process can take a while. We recommend using the `--build-arg max_jobs=` and `--build-arg nvcc_threads=`
+flags to speed up the build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefit.
+Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
+```
+
+```console
+# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
+$ python3 use_existing_torch.py
+$ DOCKER_BUILDKIT=1 docker build . \
+  --target vllm-openai \
+  --platform "linux/arm64" \
+  -t vllm/vllm-gh200-openai:latest \
+  --build-arg max_jobs=66 \
+  --build-arg nvcc_threads=2 \
+  --build-arg torch_cuda_arch_list="9.0+PTX" \
+  --build-arg vllm_fa_cmake_gpu_arches="90-real"
+```
+
+## Use the custom-built vLLM Docker image
+
+To run vLLM with the custom-built Docker image:
+
+```console
+$ docker run --runtime nvidia --gpus all \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    -p 8000:8000 \
+    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
+    vllm/vllm-openai <args...>
+```
+
+The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
+
+```{note}
+**For versions 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`, is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with the environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`.
+```
diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
deleted file mode 100644
index 56f0020a1011a..0000000000000
--- a/docs/source/serving/deploying_with_docker.rst
+++ /dev/null
@@ -1,76 +0,0 @@
-.. _deploying_with_docker:
-
-Deploying with Docker
-============================
-
-vLLM offers an official Docker image for deployment.
-The image can be used to run OpenAI compatible server and is available on Docker Hub as `vllm/vllm-openai <https://hub.docker.com/r/vllm/vllm-openai/tags>`_.
-
-.. code-block:: console
-
-    $ docker run --runtime nvidia --gpus all \
-        -v ~/.cache/huggingface:/root/.cache/huggingface \
-        --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
-        -p 8000:8000 \
-        --ipc=host \
-        vllm/vllm-openai:latest \
-        --model mistralai/Mistral-7B-v0.1
-
-
-.. note::
-
-        You can either use the ``ipc=host`` flag or ``--shm-size`` flag to allow the
-        container to access the host's shared memory. vLLM uses PyTorch, which uses shared
-        memory to share data between processes under the hood, particularly for tensor parallel inference.
-
-
-You can build and run vLLM from source via the provided `Dockerfile <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`_. To build vLLM:
-
-.. code-block:: console
-
-    $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
-
-
-.. note::
-
-        By default vLLM will build for all GPU types for widest distribution. If you are just building for the
-        current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""``
-        for vLLM to find the current GPU type and build for that.
-
-Building for Arm64/aarch64
---------------------------
-
-A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use
-of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
-
-.. note::
-
-        Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
-        flags to speed up build process. However, ensure your 'max_jobs' is substantially larger than 'nvcc_threads' to get the most benefits.
-        Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
-
-.. code-block:: console
-
-    # Example of building on Nvidia GH200 server. (Memory usage: ~12GB, Build time: ~1475s / ~25 min, Image size: 7.26GB)
-    $ DOCKER_BUILDKIT=1 sudo docker build . \
-      --target vllm-openai \
-      --platform "linux/arm64" \
-      -t vllm/vllm-gh200-openai:latest \
-      --build-arg max_jobs=66 \
-      --build-arg nvcc_threads=2 \
-      --build-arg torch_cuda_arch_list="9.0+PTX" \
-      --build-arg vllm_fa_cmake_gpu_arches="90-real"
-
-To run vLLM:
-
-.. code-block:: console
-
-    $ docker run --runtime nvidia --gpus all \
-        -v ~/.cache/huggingface:/root/.cache/huggingface \
-        -p 8000:8000 \
-        --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
-        vllm/vllm-openai <args...>
-
-.. note::
-
-        **For `v0.4.1` and `v0.4.2` only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` .
diff --git a/docs/source/serving/deploying_with_dstack.md b/docs/source/serving/deploying_with_dstack.md
new file mode 100644
index 0000000000000..65ef1c0016208
--- /dev/null
+++ b/docs/source/serving/deploying_with_dstack.md
@@ -0,0 +1,102 @@
+(deploying-with-dstack)=
+
+# Deploying with dstack
+
+```{raw} html
+<p align="center">
+    <img src="https://i.ibb.co/71kx6hW/vllm-dstack.png" alt="vLLM_plus_dstack"/>
+</p>
+```
+
+vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment.
+
+To install the dstack client, run:
+
+```console
+$ pip install "dstack[all]"
+$ dstack server
+```
+
+Next, to configure your dstack project, run:
+
+```console
+$ mkdir -p vllm-dstack
+$ cd vllm-dstack
+$ dstack init
+```
+
+Next, to provision a VM instance with the LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
+
+```yaml
+type: service
+
+python: "3.11"
+env:
+    - MODEL=NousResearch/Llama-2-7b-chat-hf
+port: 8000
+resources:
+    gpu: 24GB
+commands:
+    - pip install vllm
+    - vllm serve $MODEL --port 8000
+model:
+    format: openai
+    type: chat
+    name: NousResearch/Llama-2-7b-chat-hf
+```
+
+Then, run the following CLI for provisioning:
+
+```console
+$ dstack run . -f serve.dstack.yml
+
+⠸ Getting run plan...
+ Configuration  serve.dstack.yml
+ Project        deep-diver-main
+ User           deep-diver
+ Min resources  2..xCPU, 8GB.., 1xGPU (24GB)
+ Max price      -
+ Max duration   -
+ Spot policy    auto
+ Retry policy   no
+
+ #  BACKEND  REGION       INSTANCE       RESOURCES                               SPOT  PRICE
+ 1  gcp   us-central1  g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
+ 2  gcp   us-east1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
+ 3  gcp   us-west1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
+    ...
+ Shown 3 of 193 offers, $5.876 max
+
+Continue? [y/n]: y
+⠙ Submitting run...
+⠏ Launching spicy-treefrog-1 (pulling)
+spicy-treefrog-1 provisioning completed (running)
+Service is published at ...
+```
+
+After the provisioning, you can interact with the model by using the OpenAI SDK:
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="https://gateway.<gateway domain>",
+    api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
+)
+
+completion = client.chat.completions.create(
+    model="NousResearch/Llama-2-7b-chat-hf",
+    messages=[
+        {
+            "role": "user",
+            "content": "Compose a poem that explains the concept of recursion in programming.",
+        }
+    ]
+)
+
+print(completion.choices[0].message.content)
+```
+
+```{note}
+dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision a dstack `Task` instead of a `Service`. The `Task` is for development purposes only. For more hands-on material on how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm).
+```
diff --git a/docs/source/serving/deploying_with_dstack.rst b/docs/source/serving/deploying_with_dstack.rst
deleted file mode 100644
index e1eb45b225d9c..0000000000000
--- a/docs/source/serving/deploying_with_dstack.rst
+++ /dev/null
@@ -1,103 +0,0 @@
-.. _deploying_with_dstack:
-
-Deploying with dstack
-============================
-
-.. raw:: html
-
-    <p align="center">
-        <img src="https://i.ibb.co/71kx6hW/vllm-dstack.png" alt="vLLM_plus_dstack"/>
-    </p>
-
-vLLM can be run on a cloud based GPU machine with `dstack <https://dstack.ai/>`__, an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment.
-
-To install dstack client, run:
-
-.. code-block:: console
-
-    $ pip install "dstack[all]
-    $ dstack server
-
-Next, to configure your dstack project, run:
-    
-.. code-block:: console
-
-    $ mkdir -p vllm-dstack
-    $ cd vllm-dstack
-    $ dstack init
-
-Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
-    
-.. code-block:: yaml
-
-    type: service
-    
-    python: "3.11"
-    env:
-        - MODEL=NousResearch/Llama-2-7b-chat-hf
-    port: 8000
-    resources:
-        gpu: 24GB
-    commands:
-        - pip install vllm
-        - vllm serve $MODEL --port 8000
-    model:
-        format: openai
-        type: chat
-        name: NousResearch/Llama-2-7b-chat-hf
-
-Then, run the following CLI for provisioning:
-
-.. code-block:: console
-
-    $ dstack run . -f serve.dstack.yml
-    
-    ⠸ Getting run plan...
-     Configuration  serve.dstack.yml             
-     Project        deep-diver-main              
-     User           deep-diver                   
-     Min resources  2..xCPU, 8GB.., 1xGPU (24GB) 
-     Max price      -                            
-     Max duration   -                            
-     Spot policy    auto                         
-     Retry policy   no                           
-    
-     #  BACKEND  REGION       INSTANCE       RESOURCES                               SPOT  PRICE       
-     1  gcp   us-central1  g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804   
-     2  gcp   us-east1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804   
-     3  gcp   us-west1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804   
-        ...                                                                                            
-     Shown 3 of 193 offers, $5.876 max
-    
-    Continue? [y/n]: y
-    ⠙ Submitting run...
-    ⠏ Launching spicy-treefrog-1 (pulling)
-    spicy-treefrog-1 provisioning completed (running)
-    Service is published at ...
-
-After the provisioning, you can interact with the model by using the OpenAI SDK:
-
-.. code-block:: python
-
-    from openai import OpenAI
-    
-    client = OpenAI(
-        base_url="https://gateway.<gateway domain>",
-        api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
-    )
-    
-    completion = client.chat.completions.create(
-        model="NousResearch/Llama-2-7b-chat-hf",
-        messages=[
-            {
-                "role": "user",
-                "content": "Compose a poem that explains the concept of recursion in programming.",
-            }
-        ]
-    )
-
-    print(completion.choices[0].message.content)
-
-.. note::
-
-    dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out `this repository <https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm>`__
diff --git a/docs/source/serving/deploying_with_helm.rst b/docs/source/serving/deploying_with_helm.md
similarity index 88%
rename from docs/source/serving/deploying_with_helm.rst
rename to docs/source/serving/deploying_with_helm.md
index d185a6951d7ec..3b26575827011 100644
--- a/docs/source/serving/deploying_with_helm.rst
+++ b/docs/source/serving/deploying_with_helm.md
@@ -1,7 +1,6 @@
-.. _deploying_with_helm:
+(deploying-with-helm)=
 
-Deploying with Helm
-===================
+# Deploying with Helm
 
 A Helm chart to deploy vLLM for Kubernetes
 
@@ -9,44 +8,42 @@ Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s
 
 This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file.
 
-Prerequisites
--------------
+## Prerequisites
+
 Before you begin, ensure that you have the following:
 
 - A running Kubernetes cluster
-- NVIDIA Kubernetes Device Plugin (``k8s-device-plugin``): This can be found at `https://github.com/NVIDIA/k8s-device-plugin <https://github.com/NVIDIA/k8s-device-plugin>`__
+- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
 - Available GPU resources in your cluster
 - S3 with the model which will be deployed
 
-Installing the chart
---------------------
-
-To install the chart with the release name ``test-vllm``:
-
-.. code-block:: console
+## Installing the chart
 
-    helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
+To install the chart with the release name `test-vllm`:
 
-Uninstalling the Chart
-----------------------
+```console
+helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
+```
 
-To uninstall the ``test-vllm`` deployment:
+## Uninstalling the Chart
 
-.. code-block:: console
+To uninstall the `test-vllm` deployment:
 
-    helm uninstall test-vllm --namespace=ns-vllm
+```console
+helm uninstall test-vllm --namespace=ns-vllm
+```
 
 The command removes all the Kubernetes components associated with the
 chart **including persistent volumes** and deletes the release.
 
-Architecture
-------------
+## Architecture
 
-.. image:: architecture_helm_deployment.png
+```{image} architecture_helm_deployment.png
+```
 
-Values
-------
+## Values
 
+```{eval-rst}
 .. list-table:: Values
    :widths: 25 25 25 25
    :header-rows: 1
@@ -251,3 +248,4 @@ Values
      - string
      - test
      - Release name
+```
diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md
new file mode 100644
index 0000000000000..d27db826cd006
--- /dev/null
+++ b/docs/source/serving/deploying_with_k8s.md
@@ -0,0 +1,171 @@
+(deploying-with-k8s)=
+
+# Deploying with Kubernetes
+
+Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing.
+
+## Prerequisites
+
+Before you begin, ensure that you have the following:
+
+- A running Kubernetes cluster
+- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin/](https://github.com/NVIDIA/k8s-device-plugin/)
+- Available GPU resources in your cluster
+
+## Deployment Steps
+
+1. **Create a PVC, Secret, and Deployment for vLLM**
+
+The PVC is used to store the model cache. It is optional; you can use hostPath or other storage options instead.
+
+```yaml
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: mistral-7b
+  namespace: default
+spec:
+  accessModes:
+  - ReadWriteOnce
+  resources:
+    requests:
+      storage: 50Gi
+  storageClassName: default
+  volumeMode: Filesystem
+```
+
+The Secret is optional and only required for accessing gated models; you can skip this step if you are not using gated models. Note that values under the `data` field of a Secret must be base64-encoded.
+
+```yaml
+apiVersion: v1
+kind: Secret
+metadata:
+  name: hf-token-secret
+  namespace: default
+type: Opaque
+data:
+  token: "REPLACE_WITH_TOKEN"
+```
+
+Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model:
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: mistral-7b
+  namespace: default
+  labels:
+    app: mistral-7b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: mistral-7b
+  template:
+    metadata:
+      labels:
+        app: mistral-7b
+    spec:
+      volumes:
+      - name: cache-volume
+        persistentVolumeClaim:
+          claimName: mistral-7b
+      # vLLM needs to access the host's shared memory for tensor parallel inference.
+      - name: shm
+        emptyDir:
+          medium: Memory
+          sizeLimit: "2Gi"
+      containers:
+      - name: mistral-7b
+        image: vllm/vllm-openai:latest
+        command: ["/bin/sh", "-c"]
+        args: [
+          "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
+        ]
+        env:
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token-secret
+              key: token
+        ports:
+        - containerPort: 8000
+        resources:
+          limits:
+            cpu: "10"
+            memory: 20G
+            nvidia.com/gpu: "1"
+          requests:
+            cpu: "2"
+            memory: 6G
+            nvidia.com/gpu: "1"
+        volumeMounts:
+        - mountPath: /root/.cache/huggingface
+          name: cache-volume
+        - name: shm
+          mountPath: /dev/shm
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 8000
+          initialDelaySeconds: 60
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 8000
+          initialDelaySeconds: 60
+          periodSeconds: 5
+```
+
+2. **Create a Kubernetes Service for vLLM**
+
+Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
+
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: mistral-7b
+  namespace: default
+spec:
+  ports:
+  - name: http-mistral-7b
+    port: 80
+    protocol: TCP
+    targetPort: 8000
+  # The label selector should match the deployment labels & it is useful for prefix caching feature
+  selector:
+    app: mistral-7b
+  sessionAffinity: None
+  type: ClusterIP
+```
+
+3. **Deploy and Test**
+
+Apply the deployment and service configurations using `kubectl apply -f <filename>`:
+
+```console
+kubectl apply -f deployment.yaml
+kubectl apply -f service.yaml
+```
+
+To test the deployment, run the following `curl` command:
+
+```console
+curl http://mistral-7b.default.svc.cluster.local/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+        "model": "mistralai/Mistral-7B-Instruct-v0.3",
+        "prompt": "San Francisco is a",
+        "max_tokens": 7,
+        "temperature": 0
+      }'
+```
+
+If the service is correctly deployed, you should receive a response from the vLLM model.
+
+## Conclusion
+
+Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation.
diff --git a/docs/source/serving/deploying_with_k8s.rst b/docs/source/serving/deploying_with_k8s.rst
deleted file mode 100644
index cc3606f0df851..0000000000000
--- a/docs/source/serving/deploying_with_k8s.rst
+++ /dev/null
@@ -1,175 +0,0 @@
-.. _deploying_with_k8s:
-
-Deploying with Kubernetes
-==========================
-
-Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing.
-
-Prerequisites
--------------
-Before you begin, ensure that you have the following:
-
-- A running Kubernetes cluster
-- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at `https://github.com/NVIDIA/k8s-device-plugin/`
-- Available GPU resources in your cluster
-
-Deployment Steps
-----------------
-
-1.  **Create a PVC , Secret and Deployment for vLLM**
-
-
-PVC is used to store the model cache and it is optional, you can use hostPath or other storage options
-
-.. code-block:: yaml
-
-  apiVersion: v1
-  kind: PersistentVolumeClaim
-  metadata:
-    name: mistral-7b
-    namespace: default
-  spec:
-    accessModes:
-    - ReadWriteOnce
-    resources:
-      requests:
-        storage: 50Gi
-    storageClassName: default
-    volumeMode: Filesystem
-
-Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models
-
-.. code-block:: yaml
-
-  apiVersion: v1
-  kind: Secret
-  metadata:
-    name: hf-token-secret
-    namespace: default
-  type: Opaque
-  data:
-    token: "REPLACE_WITH_TOKEN"
-
-
-Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model:
-
-.. code-block:: yaml
-
-  apiVersion: apps/v1
-  kind: Deployment
-  metadata:
-    name: mistral-7b
-    namespace: default
-    labels:
-      app: mistral-7b
-  spec:
-    replicas: 1
-    selector:
-      matchLabels:
-        app: mistral-7b
-    template:
-      metadata:
-        labels:
-          app: mistral-7b
-      spec:
-        volumes:
-        - name: cache-volume
-          persistentVolumeClaim:
-            claimName: mistral-7b
-        # vLLM needs to access the host's shared memory for tensor parallel inference.
-        - name: shm
-          emptyDir:
-            medium: Memory
-            sizeLimit: "2Gi"
-        containers:
-        - name: mistral-7b
-          image: vllm/vllm-openai:latest
-          command: ["/bin/sh", "-c"]
-          args: [
-            "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
-          ]
-          env:
-          - name: HUGGING_FACE_HUB_TOKEN
-            valueFrom:
-              secretKeyRef:
-                name: hf-token-secret
-                key: token
-          ports:
-          - containerPort: 8000
-          resources:
-            limits:
-              cpu: "10"
-              memory: 20G
-              nvidia.com/gpu: "1"
-            requests:
-              cpu: "2"
-              memory: 6G
-              nvidia.com/gpu: "1"
-          volumeMounts:
-          - mountPath: /root/.cache/huggingface
-            name: cache-volume
-          - name: shm
-            mountPath: /dev/shm
-          livenessProbe:
-            httpGet:
-              path: /health
-              port: 8000
-            initialDelaySeconds: 60
-            periodSeconds: 10
-          readinessProbe:
-            httpGet:
-              path: /health
-              port: 8000
-            initialDelaySeconds: 60
-            periodSeconds: 5
-
-2. **Create a Kubernetes Service for vLLM**
-
-Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
-
-.. code-block:: yaml
-
-    apiVersion: v1
-    kind: Service
-    metadata:
-      name: mistral-7b
-      namespace: default
-    spec:
-      ports:
-      - name: http-mistral-7b
-        port: 80
-        protocol: TCP
-        targetPort: 8000
-      # The label selector should match the deployment labels & it is useful for prefix caching feature
-      selector:
-        app: mistral-7b
-      sessionAffinity: None
-      type: ClusterIP
-
-3. **Deploy and Test**
-
-Apply the deployment and service configurations using ``kubectl apply -f <filename>``:
-
-.. code-block:: console
-
-    kubectl apply -f deployment.yaml
-    kubectl apply -f service.yaml
-
-To test the deployment, run the following ``curl`` command:
-
-.. code-block:: console
-
-    curl http://mistral-7b.default.svc.cluster.local/v1/completions \
-      -H "Content-Type: application/json" \
-      -d '{
-            "model": "mistralai/Mistral-7B-Instruct-v0.3",
-            "prompt": "San Francisco is a",
-            "max_tokens": 7,
-            "temperature": 0
-          }'
-
-If the service is correctly deployed, you should receive a response from the vLLM model.
-
-Conclusion
-----------
-Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation.
diff --git a/docs/source/serving/deploying_with_kserve.md b/docs/source/serving/deploying_with_kserve.md
new file mode 100644
index 0000000000000..feaeb5d0ec8a2
--- /dev/null
+++ b/docs/source/serving/deploying_with_kserve.md
@@ -0,0 +1,7 @@
+(deploying-with-kserve)=
+
+# Deploying with KServe
+
+vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
+
+Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe.
diff --git a/docs/source/serving/deploying_with_kserve.rst b/docs/source/serving/deploying_with_kserve.rst
deleted file mode 100644
index 01d7ccc6e9300..0000000000000
--- a/docs/source/serving/deploying_with_kserve.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-.. _deploying_with_kserve:
-
-Deploying with KServe
-============================
-
-vLLM can be deployed with `KServe <https://github.com/kserve/kserve>`_ on Kubernetes for highly scalable distributed model serving.
-
-Please see `this guide <https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/>`_ for more details on using vLLM with KServe.
diff --git a/docs/source/serving/deploying_with_kubeai.md b/docs/source/serving/deploying_with_kubeai.md
new file mode 100644
index 0000000000000..3609d7e05acd3
--- /dev/null
+++ b/docs/source/serving/deploying_with_kubeai.md
@@ -0,0 +1,15 @@
+(deploying-with-kubeai)=
+
+# Deploying with KubeAI
+
+[KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.
+
+Please see the Installation Guides for environment specific instructions:
+
+- [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/)
+- [EKS](https://www.kubeai.org/installation/eks/)
+- [GKE](https://www.kubeai.org/installation/gke/)
+
+Once you have KubeAI installed, you can
+[configure text generation models](https://www.kubeai.org/how-to/configure-text-generation-models/)
+using vLLM.
diff --git a/docs/source/serving/deploying_with_kubeai.rst b/docs/source/serving/deploying_with_kubeai.rst
deleted file mode 100644
index ec3c065320fd9..0000000000000
--- a/docs/source/serving/deploying_with_kubeai.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-.. _deploying_with_kubeai:
-
-Deploying with KubeAI
-=====================
-
-`KubeAI <https://github.com/substratusai/kubeai>`_ is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.
-
-
-Please see the Installation Guides for environment specific instructions:
-
-* `Any Kubernetes Cluster <https://www.kubeai.org/installation/any/>`_
-* `EKS <https://www.kubeai.org/installation/eks/>`_
-* `GKE <https://www.kubeai.org/installation/gke/>`_
-
-Once you have KubeAI installed, you can
-`configure text generation models <https://www.kubeai.org/how-to/configure-text-generation-models/>`_
-using vLLM.
\ No newline at end of file
diff --git a/docs/source/serving/deploying_with_lws.md b/docs/source/serving/deploying_with_lws.md
new file mode 100644
index 0000000000000..22bab419eaca3
--- /dev/null
+++ b/docs/source/serving/deploying_with_lws.md
@@ -0,0 +1,11 @@
+(deploying-with-lws)=
+
+# Deploying with LWS
+
+LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
+A major use case is for multi-host/multi-node distributed inference.
+
+vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving.
+
+Please see [this guide](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm) for more details on
+deploying vLLM on Kubernetes using LWS.
diff --git a/docs/source/serving/deploying_with_lws.rst b/docs/source/serving/deploying_with_lws.rst
deleted file mode 100644
index b63a432dde0d5..0000000000000
--- a/docs/source/serving/deploying_with_lws.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-.. _deploying_with_lws:
-
-Deploying with LWS
-============================
-
-LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
-A major use case is for multi-host/multi-node distributed inference.
-
-vLLM can be deployed with `LWS <https://github.com/kubernetes-sigs/lws>`_ on Kubernetes for distributed model serving.
-
-Please see `this guide <https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm>`_ for more details on
-deploying vLLM on Kubernetes using LWS.
diff --git a/docs/source/serving/deploying_with_nginx.md b/docs/source/serving/deploying_with_nginx.md
new file mode 100644
index 0000000000000..a1f00d8536465
--- /dev/null
+++ b/docs/source/serving/deploying_with_nginx.md
@@ -0,0 +1,133 @@
+(nginxloadbalancer)=
+
+# Deploying with Nginx Loadbalancer
+
+This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
+
+Table of contents:
+
+1. [Build Nginx Container](#nginxloadbalancer-nginx-build)
+2. [Create Simple Nginx Config file](#nginxloadbalancer-nginx-conf)
+3. [Build vLLM Container](#nginxloadbalancer-nginx-vllm-container)
+4. [Create Docker Network](#nginxloadbalancer-nginx-docker-network)
+5. [Launch vLLM Containers](#nginxloadbalancer-nginx-launch-container)
+6. [Launch Nginx](#nginxloadbalancer-nginx-launch-nginx)
+7. [Verify That vLLM Servers Are Ready](#nginxloadbalancer-nginx-verify-nginx)
+
+(nginxloadbalancer-nginx-build)=
+
+## Build Nginx Container
+
+This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.
+
+```console
+export vllm_root=`pwd`
+```
+
+Create a file named `Dockerfile.nginx`:
+
+```console
+FROM nginx:latest
+RUN rm /etc/nginx/conf.d/default.conf
+EXPOSE 80
+CMD ["nginx", "-g", "daemon off;"]
+```
+
+Build the container:
+
+```console
+docker build . -f Dockerfile.nginx --tag nginx-lb
+```
+
+(nginxloadbalancer-nginx-conf)=
+
+## Create Simple Nginx Config file
+
+Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.
+
+```console
+upstream backend {
+    least_conn;
+    server vllm0:8000 max_fails=3 fail_timeout=10000s;
+    server vllm1:8000 max_fails=3 fail_timeout=10000s;
+}
+server {
+    listen 80;
+    location / {
+        proxy_pass http://backend;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+    }
+}
+```
+
+(nginxloadbalancer-nginx-vllm-container)=
+
+## Build vLLM Container
+
+```console
+cd $vllm_root
+docker build -f Dockerfile . --tag vllm
+```
+
+If you are behind a proxy, you can pass the proxy settings to the docker build command as shown below:
+
+```console
+cd $vllm_root
+docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
+```
+
+(nginxloadbalancer-nginx-docker-network)=
+
+## Create Docker Network
+
+```console
+docker network create vllm_nginx
+```
+
+(nginxloadbalancer-nginx-launch-container)=
+
+## Launch vLLM Containers
+
+Notes:
+
+- If you have your HuggingFace models cached somewhere else, update `hf_cache_dir` below.
+- If you don't have an existing HuggingFace cache you will want to start `vllm0` and wait for the model to complete downloading and the server to be ready. This will ensure that `vllm1` can leverage the model you just downloaded and it won't have to be downloaded again.
+- The example below assumes the GPU backend is used. If you are using the CPU backend, remove `--gpus all` and add the `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command.
+- Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`.
+
+```console
+mkdir -p ~/.cache/huggingface/hub/
+hf_cache_dir=~/.cache/huggingface/
+docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf
+docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf
+```
+
+```{note}
+If you are behind a proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
+```
+
+(nginxloadbalancer-nginx-launch-nginx)=
+
+## Launch Nginx
+
+```console
+docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest
+```
+
+(nginxloadbalancer-nginx-verify-nginx)=
+
+## Verify That vLLM Servers Are Ready
+
+```console
+docker logs vllm0 | grep Uvicorn
+docker logs vllm1 | grep Uvicorn
+```
+
+Both outputs should look like this:
+
+```console
+INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
+```
diff --git a/docs/source/serving/deploying_with_nginx.rst b/docs/source/serving/deploying_with_nginx.rst
deleted file mode 100644
index b5dff02b6bae6..0000000000000
--- a/docs/source/serving/deploying_with_nginx.rst
+++ /dev/null
@@ -1,142 +0,0 @@
-.. _nginxloadbalancer:
-
-Deploying with Nginx Loadbalancer
-=================================
-
-This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. 
-
-Table of contents:
-
-#. :ref:`Build Nginx Container <nginxloadbalancer_nginx_build>`
-#. :ref:`Create Simple Nginx Config file <nginxloadbalancer_nginx_conf>`
-#. :ref:`Build vLLM Container <nginxloadbalancer_nginx_vllm_container>`
-#. :ref:`Create Docker Network <nginxloadbalancer_nginx_docker_network>`
-#. :ref:`Launch vLLM Containers <nginxloadbalancer_nginx_launch_container>`
-#. :ref:`Launch Nginx <nginxloadbalancer_nginx_launch_nginx>`
-#. :ref:`Verify That vLLM Servers Are Ready <nginxloadbalancer_nginx_verify_nginx>`
-
-.. _nginxloadbalancer_nginx_build:
-
-Build Nginx Container
----------------------
-
-This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.
-
-.. code-block:: console
-
-    export vllm_root=`pwd`
-
-Create a file named ``Dockerfile.nginx``:
-
-.. code-block:: console
-
-    FROM nginx:latest
-    RUN rm /etc/nginx/conf.d/default.conf
-    EXPOSE 80
-    CMD ["nginx", "-g", "daemon off;"]
-
-Build the container:
-
-.. code-block:: console
-
-    docker build . -f Dockerfile.nginx --tag nginx-lb
-
-.. _nginxloadbalancer_nginx_conf:
-
-Create Simple Nginx Config file
--------------------------------
-
-Create a file named ``nginx_conf/nginx.conf``. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another ``server vllmN:8000 max_fails=3 fail_timeout=10000s;`` entry to ``upstream backend``.
-
-.. code-block:: console
-
-    upstream backend {
-        least_conn;
-        server vllm0:8000 max_fails=3 fail_timeout=10000s;
-        server vllm1:8000 max_fails=3 fail_timeout=10000s;
-    }     
-    server {
-        listen 80;
-        location / {
-            proxy_pass http://backend;
-            proxy_set_header Host $host;
-            proxy_set_header X-Real-IP $remote_addr;
-            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
-            proxy_set_header X-Forwarded-Proto $scheme;
-        }
-    }
-
-.. _nginxloadbalancer_nginx_vllm_container:
-
-Build vLLM Container
---------------------
-
-.. code-block:: console
-
-    cd $vllm_root
-    docker build -f Dockerfile . --tag vllm
-
-
-If you are behind proxy, you can pass the proxy settings to the docker build command as shown below:
-
-.. code-block:: console
-
-    cd $vllm_root
-    docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
-
-.. _nginxloadbalancer_nginx_docker_network:
-
-Create Docker Network
----------------------
-
-.. code-block:: console
-
-    docker network create vllm_nginx
-
-
-.. _nginxloadbalancer_nginx_launch_container:
-
-Launch vLLM Containers
-----------------------
-
-Notes:
-
-* If you have your HuggingFace models cached somewhere else, update ``hf_cache_dir`` below. 
-* If you don't have an existing HuggingFace cache you will want to start ``vllm0`` and wait for the model to complete downloading and the server to be ready. This will ensure that ``vllm1`` can leverage the model you just downloaded and it won't have to be downloaded again.
-* The below example assumes GPU backend used. If you are using CPU backend, remove ``--gpus all``, add ``VLLM_CPU_KVCACHE_SPACE`` and ``VLLM_CPU_OMP_THREADS_BIND`` environment variables to the docker run command.
-* Adjust the model name that you want to use in your vLLM servers if you don't want to use ``Llama-2-7b-chat-hf``. 
-
-.. code-block:: console
-
-    mkdir -p ~/.cache/huggingface/hub/
-    hf_cache_dir=~/.cache/huggingface/
-    docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf
-    docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf
-
-.. note::
-    If you are behind proxy, you can pass the proxy settings to the docker run command via ``-e http_proxy=$http_proxy -e https_proxy=$https_proxy``.
-
-.. _nginxloadbalancer_nginx_launch_nginx:
-
-Launch Nginx
-------------
-
-.. code-block:: console
-
-    docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest
-    
-.. _nginxloadbalancer_nginx_verify_nginx:
-
-Verify That vLLM Servers Are Ready
-----------------------------------
-
-.. code-block:: console
-    
-    docker logs vllm0 | grep Uvicorn
-    docker logs vllm1 | grep Uvicorn
-
-Both outputs should look like this:
-
-.. code-block:: console
-
-    INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
diff --git a/docs/source/serving/deploying_with_triton.md b/docs/source/serving/deploying_with_triton.md
new file mode 100644
index 0000000000000..9b0a6f1d54ae8
--- /dev/null
+++ b/docs/source/serving/deploying_with_triton.md
@@ -0,0 +1,5 @@
+(deploying-with-triton)=
+
+# Deploying with NVIDIA Triton
+
+The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details.
diff --git a/docs/source/serving/deploying_with_triton.rst b/docs/source/serving/deploying_with_triton.rst
deleted file mode 100644
index 5ce7c3d03dd2d..0000000000000
--- a/docs/source/serving/deploying_with_triton.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-.. _deploying_with_triton:
-
-Deploying with NVIDIA Triton
-============================
-
-The `Triton Inference Server <https://github.com/triton-inference-server>`_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m <https://huggingface.co/facebook/opt-125m>`_ model using vLLM. Please see `Deploying a vLLM model in Triton <https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton>`_ for more details.
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
new file mode 100644
index 0000000000000..c0a4b23f6dc70
--- /dev/null
+++ b/docs/source/serving/distributed_serving.md
@@ -0,0 +1,105 @@
+(distributed-serving)=
+
+# Distributed Inference and Serving
+
+## How to decide the distributed inference strategy?
+
+Before going into the details of distributed inference and serving, let's first make it clear when to use distributed inference and what are the strategies available. The common practice is:
+
+- **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference.
+- **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4.
+- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2.
+
+In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes.
+
+After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like `# GPU blocks: 790`. Multiply the number by `16` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough.
+
+```{note}
+There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.
+```
+
+## Details for Distributed Inference and Serving
+
+vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node; multi-node inferencing currently requires Ray.
+
+Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed_executor_backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
+
+To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
+
+```python
+from vllm import LLM
+llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
+output = llm.generate("San Francisco is a")
+```
+
+To run multi-GPU serving, pass in the {code}`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
+
+```console
+$ vllm serve facebook/opt-13b \
+$     --tensor-parallel-size 4
+```
+
+You can also additionally specify {code}`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
+
+```console
+$ vllm serve gpt2 \
+$     --tensor-parallel-size 4 \
+$     --pipeline-parallel-size 2
+```
+
+## Multi-Node Inference and Serving
+
+If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path and the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration.
+
+The first step is to start containers and organize them into a cluster. We have provided the helper script <gh-file:examples/run_cluster.sh> to start the cluster. Please note, this script launches docker without the administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can add `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command.
+
+Pick a node as the head node, and run the following command:
+
+```console
+$ bash run_cluster.sh \
+$                   vllm/vllm-openai \
+$                   ip_of_head_node \
+$                   --head \
+$                   /path/to/the/huggingface/home/in/this/node
+```
+
+On the rest of the worker nodes, run the following command:
+
+```console
+$ bash run_cluster.sh \
+$                   vllm/vllm-openai \
+$                   ip_of_head_node \
+$                   --worker \
+$                   /path/to/the/huggingface/home/in/this/node
+```
+
+Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct.
+
+Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
+
+After that, on any node, you can use vLLM as usual, just as if you had all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
+
+```console
+$ vllm serve /path/to/the/model/in/the/container \
+$     --tensor-parallel-size 8 \
+$     --pipeline-parallel-size 2
+```
+
+You can also use tensor parallelism without pipeline parallelism: just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16:
+
+```console
+$ vllm serve /path/to/the/model/in/the/container \
+$     --tensor-parallel-size 16
+```
+
+To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient.
+
+```{warning}
+After you start the Ray cluster, you should also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](../getting_started/debugging.md) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information.
+```
+
+```{warning}
+Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes.
+
+When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model.
+```
diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst
deleted file mode 100644
index b24ba53e59694..0000000000000
--- a/docs/source/serving/distributed_serving.rst
+++ /dev/null
@@ -1,107 +0,0 @@
-.. _distributed_serving:
-
-Distributed Inference and Serving
-=================================
-
-How to decide the distributed inference strategy?
--------------------------------------------------
-
-Before going into the details of distributed inference and serving, let's first make it clear when to use distributed inference and what are the strategies available. The common practice is:
-
-- **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference.
-- **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4.
-- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2.
-
-In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes.
-
-After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like ``# GPU blocks: 790``. Multiply the number by ``16`` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough.
-
-.. note::
-    There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.
-
-Details for Distributed Inference and Serving
-----------------------------------------------
-
-vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_. We manage the distributed runtime with either `Ray <https://github.com/ray-project/ray>`_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
-
-Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured :code:`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the :code:`LLM` class :code:`distributed-executor-backend` argument or :code:`--distributed-executor-backend` API server argument. Set it to :code:`mp` for multiprocessing or :code:`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
-
-To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
-
-.. code-block:: python
-
-    from vllm import LLM
-    llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
-    output = llm.generate("San Franciso is a")
-
-To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
-
-.. code-block:: console
-
-    $ vllm serve facebook/opt-13b \
-    $     --tensor-parallel-size 4
-
-You can also additionally specify :code:`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
-
-.. code-block:: console
-
-    $ vllm serve gpt2 \
-    $     --tensor-parallel-size 4 \
-    $     --pipeline-parallel-size 2
-
-Multi-Node Inference and Serving
---------------------------------
-
-If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration.
-
-The first step, is to start containers and organize them into a cluster. We have provided a helper `script <https://github.com/vllm-project/vllm/tree/main/examples/run_cluster.sh>`_ to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have ``CAP_SYS_ADMIN`` to the docker container by using the ``--cap-add`` option in the docker run command.
-
-Pick a node as the head node, and run the following command:
-
-.. code-block:: console
-
-    $ bash run_cluster.sh \
-    $                   vllm/vllm-openai \
-    $                   ip_of_head_node \
-    $                   --head \
-    $                   /path/to/the/huggingface/home/in/this/node
-
-On the rest of the worker nodes, run the following command:
-
-.. code-block:: console
-
-    $ bash run_cluster.sh \
-    $                   vllm/vllm-openai \
-    $                   ip_of_head_node \
-    $                   --worker \
-    $                   /path/to/the/huggingface/home/in/this/node
-
-Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument ``ip_of_head_node`` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct.
-
-Then, on any node, use ``docker exec -it node /bin/bash`` to enter the container, execute ``ray status`` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
-
-After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
-
-.. code-block:: console
-
-    $ vllm serve /path/to/the/model/in/the/container \
-    $     --tensor-parallel-size 8 \
-    $     --pipeline-parallel-size 2
-
-You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16:
-
-.. code-block:: console
-
-    $ vllm serve /path/to/the/model/in/the/container \
-    $     --tensor-parallel-size 16
-
-To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like ``--privileged -e NCCL_IB_HCA=mlx5`` to the ``run_cluster.sh`` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with ``NCCL_DEBUG=TRACE`` environment variable set, e.g. ``NCCL_DEBUG=TRACE vllm serve ...`` and check the logs for the NCCL version and the network used. If you find ``[send] via NET/Socket`` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find ``[send] via NET/IB/GDRDMA`` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient.
-
-.. warning::
-    After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script <https://docs.vllm.ai/en/latest/getting_started/debugging.html>`_ for more information. If you need to set some environment variables for the communication configuration, you can append them to the ``run_cluster.sh`` script, e.g. ``-e NCCL_SOCKET_IFNAME=eth0``. Note that setting environment variables in the shell (e.g. ``NCCL_SOCKET_IFNAME=eth0 vllm serve ...``) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See the `discussion <https://github.com/vllm-project/vllm/issues/6803>`_ for more information.
-
-.. warning::
-
-    Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes.
-
-    When you use huggingface repo id to refer to the model, you should append your huggingface token to the ``run_cluster.sh`` script, e.g. ``-e HF_TOKEN=``. The recommended way is to download the model first, and then use the path to refer to the model.
diff --git a/docs/source/serving/integrations.md b/docs/source/serving/integrations.md
new file mode 100644
index 0000000000000..d214c77254257
--- /dev/null
+++ b/docs/source/serving/integrations.md
@@ -0,0 +1,17 @@
+# Integrations
+
+```{toctree}
+:maxdepth: 1
+
+run_on_sky
+deploying_with_kserve
+deploying_with_kubeai
+deploying_with_triton
+deploying_with_bentoml
+deploying_with_cerebrium
+deploying_with_lws
+deploying_with_dstack
+serving_with_langchain
+serving_with_llamaindex
+serving_with_llamastack
+```
diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst
deleted file mode 100644
index 0dd505a739863..0000000000000
--- a/docs/source/serving/integrations.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-Integrations
-------------
-
-.. toctree::
-   :maxdepth: 1
-
-   run_on_sky
-   deploying_with_kserve
-   deploying_with_kubeai
-   deploying_with_triton
-   deploying_with_bentoml
-   deploying_with_cerebrium
-   deploying_with_lws
-   deploying_with_dstack
-   serving_with_langchain
-   serving_with_llamaindex
-   serving_with_llamastack
diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md
new file mode 100644
index 0000000000000..2dc78643f6d8f
--- /dev/null
+++ b/docs/source/serving/metrics.md
@@ -0,0 +1,38 @@
+# Production Metrics
+
+vLLM exposes a number of metrics that can be used to monitor the health of the
+system. These metrics are exposed via the `/metrics` endpoint on the vLLM
+OpenAI compatible API server.
+
+You can start the server using Python, or using [Docker](deploying_with_docker.md):
+
+```console
+$ vllm serve unsloth/Llama-3.2-1B-Instruct
+```
+
+Then query the endpoint to get the latest metrics from the server:
+
+```console
+$ curl http://0.0.0.0:8000/metrics
+
+# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step.
+# TYPE vllm:iteration_tokens_total histogram
+vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0
+vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
+vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
+vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
+vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
+vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
+vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
+vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
+vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
+...
+```
+
+The following metrics are exposed:
+
+```{literalinclude} ../../../vllm/engine/metrics.py
+:end-before: end-metrics-definitions
+:language: python
+:start-after: begin-metrics-definitions
+```
diff --git a/docs/source/serving/metrics.rst b/docs/source/serving/metrics.rst
deleted file mode 100644
index 231111cd7b738..0000000000000
--- a/docs/source/serving/metrics.rst
+++ /dev/null
@@ -1,38 +0,0 @@
-Production Metrics
-==================
-
-vLLM exposes a number of metrics that can be used to monitor the health of the
-system. These metrics are exposed via the ``/metrics`` endpoint on the vLLM
-OpenAI compatible API server.
-
-You can start the server using Python, or using [Docker](deploying_with_docker.rst):
-
-.. code-block:: console
-
-    $ vllm serve unsloth/Llama-3.2-1B-Instruct
-
-Then query the endpoint to get the latest metrics from the server:
-
-.. code-block:: console
-
-    $ curl http://0.0.0.0:8000/metrics
-    
-    # HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step.
-    # TYPE vllm:iteration_tokens_total histogram
-    vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0
-    vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
-    vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
-    vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
-    vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
-    vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
-    vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
-    vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
-    vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
-    ...
-
-The following metrics are exposed:
-
-.. literalinclude:: ../../../vllm/engine/metrics.py
-    :language: python
-    :start-after: begin-metrics-definitions
-    :end-before: end-metrics-definitions
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 1bc8d32d2d161..caf5e8cafd9aa 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -2,7 +2,7 @@
 
 vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API, and more!
 
-You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.rst):
+You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.md):
 ```bash
 vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
 ```
@@ -30,20 +30,22 @@ print(completion.choices[0].message)
 We currently support the following OpenAI APIs:
 
 - [Completions API](#completions-api) (`/v1/completions`)
-  - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`).
+  - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`).
   - *Note: `suffix` parameter is not supported.*
 - [Chat Completions API](#chat-api) (`/v1/chat/completions`)
-  - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`) with a [chat template](#chat-template).
+  - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`) with a [chat template](#chat-template).
   - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
 - [Embeddings API](#embeddings-api) (`/v1/embeddings`)
-  - Only applicable to [embedding models](../models/pooling_models.rst) (`--task embed`).
+  - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`).
 
 In addition, we have the following custom APIs:
 
 - [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`)
   - Applicable to any model with a tokenizer.
+- [Pooling API](#pooling-api) (`/pooling`)
+  - Applicable to all [pooling models](../models/pooling_models.md).
 - [Score API](#score-api) (`/score`)
-  - Only applicable to [cross-encoder models](../models/pooling_models.rst) (`--task score`).
+  - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`).
 
 (chat-template)=
 ## Chat Template
@@ -63,8 +65,7 @@ and all chat requests will error.
 vllm serve <model> --chat-template ./path-to-chat-template.jinja
 ```
 
-vLLM community provides a set of chat templates for popular models. You can find them in the examples
-directory [here](https://github.com/vllm-project/vllm/tree/main/examples/)
+vLLM community provides a set of chat templates for popular models. You can find them under the <gh-dir:examples> directory.
 
 With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies
 both a `type` and a `text` field. An example is provided below:
@@ -111,7 +112,13 @@ completion = client.chat.completions.create(
 
 ## Extra HTTP Headers
 
-Only `X-Request-Id` HTTP request header is supported for now.
+Only `X-Request-Id` HTTP request header is supported for now. It can be enabled
+with `--enable-request-id-headers`. 
+
+> Note that enablement of the headers can impact performance significantly at high QPS
+> rates. We recommend implementing HTTP headers at the router level (e.g. via Istio),
+> rather than within the vLLM layer for this reason.
+> See https://github.com/vllm-project/vllm/pull/11529 for more details.
 
 ```python
 completion = client.chat.completions.create(
@@ -179,11 +186,14 @@ The order of priorities is `command line > config file values > defaults`.
 (completions-api)=
 ### Completions API
 
-Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/completions) for more details.
+Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
+you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
+
+Code example: <gh-file:examples/openai_completion_client.py>
 
 #### Extra parameters
 
-The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
+The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
@@ -200,18 +210,21 @@ The following extra parameters are supported:
 ```
 
 (chat-api)=
-### Chat Completions API
+### Chat API
 
-Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/chat) for more details.
+Our Chat API is compatible with [OpenAI's Chat Completions API](https://platform.openai.com/docs/api-reference/chat);
+you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
 
 We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
 [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters;
-see our [Multimodal Inputs](../usage/multimodal_inputs.rst) guide for more information.
+see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information.
 - *Note: `image_url.detail` parameter is not supported.*
 
+Code example: <gh-file:examples/openai_chat_completion_client.py>
+
 #### Extra parameters
 
-The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
+The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
@@ -230,18 +243,21 @@ The following extra parameters are supported:
 (embeddings-api)=
 ### Embeddings API
 
-Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/embeddings) for more details.
+Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
+you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
 
-If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat Completions API](#chat-api))
+If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
 which will be treated as a single prompt to the model.
 
 ```{tip}
-This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.rst) for details.
+This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details.
 ```
 
+Code example: <gh-file:examples/openai_embedding_client.py>
+
 #### Extra parameters
 
-The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported.
+The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
@@ -268,20 +284,31 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s
 (tokenizer-api)=
 ### Tokenizer API
 
-The Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer).
+Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer).
 It consists of two endpoints:
 
 - `/tokenize` corresponds to calling `tokenizer.encode()`.
 - `/detokenize` corresponds to calling `tokenizer.decode()`.
 
+(pooling-api)=
+### Pooling API
+
+Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states.
+
+The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
+
+Code example: <gh-file:examples/openai_pooling_client.py>
+
 (score-api)=
 ### Score API
 
-The Score API applies a cross-encoder model to predict scores for sentence pairs.
+Our Score API applies a cross-encoder model to predict scores for sentence pairs.
 Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1.
 
 You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
 
+Code example: <gh-file:examples/openai_cross_encoder_score.py>
+
 #### Single inference
 
 You can pass a string to both `text_1` and `text_2`, forming a single sentence pair.
@@ -418,7 +445,7 @@ Response:
 
 #### Extra parameters
 
-The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported.
+The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
diff --git a/docs/source/serving/run_on_sky.md b/docs/source/serving/run_on_sky.md
new file mode 100644
index 0000000000000..115873ae49292
--- /dev/null
+++ b/docs/source/serving/run_on_sky.md
@@ -0,0 +1,345 @@
+(on-cloud)=
+
+# Deploying and scaling up with SkyPilot
+
+```{raw} html
+<p align="center">
+  <img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
+</p>
+```
+
+vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html).
+
+## Prerequisites
+
+- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model {code}`meta-llama/Meta-Llama-3-8B-Instruct`.
+- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)).
+- Check that {code}`sky check` shows clouds or Kubernetes are enabled.
+
+```console
+pip install skypilot-nightly
+sky check
+```
+
+## Run on a single instance
+
+See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml).
+
+```yaml
+resources:
+  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
+  use_spot: True
+  disk_size: 512  # Ensure model checkpoints can fit.
+  disk_tier: best
+  ports: 8081  # Expose to internet traffic.
+
+envs:
+  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+  HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
+
+setup: |
+  conda create -n vllm python=3.10 -y
+  conda activate vllm
+
+  pip install vllm==0.4.0.post1
+  # Install Gradio for web UI.
+  pip install gradio openai
+  pip install flash-attn==2.5.7
+
+run: |
+  conda activate vllm
+  echo 'Starting vllm api server...'
+  python -u -m vllm.entrypoints.openai.api_server \
+    --port 8081 \
+    --model $MODEL_NAME \
+    --trust-remote-code \
+    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+    2>&1 | tee api_server.log &
+
+  echo 'Waiting for vllm api server to start...'
+  while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
+
+  echo 'Starting gradio server...'
+  git clone https://github.com/vllm-project/vllm.git || true
+  python vllm/examples/gradio_openai_chatbot_webserver.py \
+    -m $MODEL_NAME \
+    --port 8811 \
+    --model-url http://localhost:8081/v1 \
+    --stop-token-ids 128009,128001
+```
+
+Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
+
+```console
+HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
+```
+
+Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion.
+
+```console
+(task, pid=7431) Running on public URL: https://<gradio-hash>.gradio.live
+```
+
+**Optional**: Serve the 70B model instead of the default 8B and use more GPU:
+
+```console
+HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
+```
+
+## Scale up to multiple replicas
+
+SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a `service` section to the YAML file.
+
+```yaml
+service:
+  replicas: 2
+  # An actual request for readiness probe.
+  readiness_probe:
+    path: /v1/chat/completions
+    post_data:
+      model: $MODEL_NAME
+      messages:
+        - role: user
+          content: Hello! What is your name?
+      max_completion_tokens: 1
+```
+
+```{raw} html
+<details>
+<summary>Click to see the full recipe YAML</summary>
+```
+
+```yaml
+service:
+  replicas: 2
+  # An actual request for readiness probe.
+  readiness_probe:
+    path: /v1/chat/completions
+    post_data:
+      model: $MODEL_NAME
+      messages:
+        - role: user
+          content: Hello! What is your name?
+      max_completion_tokens: 1
+
+resources:
+  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
+  use_spot: True
+  disk_size: 512  # Ensure model checkpoints can fit.
+  disk_tier: best
+  ports: 8081  # Expose to internet traffic.
+
+envs:
+  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+  HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
+
+setup: |
+  conda create -n vllm python=3.10 -y
+  conda activate vllm
+
+  pip install vllm==0.4.0.post1
+  # Install Gradio for web UI.
+  pip install gradio openai
+  pip install flash-attn==2.5.7
+
+run: |
+  conda activate vllm
+  echo 'Starting vllm api server...'
+  python -u -m vllm.entrypoints.openai.api_server \
+    --port 8081 \
+    --model $MODEL_NAME \
+    --trust-remote-code \
+    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+    2>&1 | tee api_server.log
+```
+
+```{raw} html
+</details>
+```
+
+Start serving the Llama-3 8B model on multiple replicas:
+
+```console
+HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN
+```
+
+Wait until the service is ready:
+
+```console
+watch -n10 sky serve status vllm
+```
+
+```{raw} html
+<details>
+<summary>Example outputs:</summary>
+```
+
+```console
+Services
+NAME  VERSION  UPTIME  STATUS  REPLICAS  ENDPOINT
+vllm  1        35s     READY   2/2       xx.yy.zz.100:30001
+
+Service Replicas
+SERVICE_NAME  ID  VERSION  IP            LAUNCHED     RESOURCES                STATUS  REGION
+vllm          1   1        xx.yy.zz.121  18 mins ago  1x GCP([Spot]{'L4': 1})  READY   us-east4
+vllm          2   1        xx.yy.zz.245  18 mins ago  1x GCP([Spot]{'L4': 1})  READY   us-east4
+```
+
+```{raw} html
+</details>
+```
+
+After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
+
+```console
+ENDPOINT=$(sky serve status --endpoint 8081 vllm)
+curl -L http://$ENDPOINT/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "messages": [
+    {
+      "role": "system",
+      "content": "You are a helpful assistant."
+    },
+    {
+      "role": "user",
+      "content": "Who are you?"
+    }
+    ],
+    "stop_token_ids": [128009,  128001]
+  }'
+```
+
+To enable autoscaling, you could replace the `replicas` with the following configs in `service`:
+
+```yaml
+service:
+  replica_policy:
+    min_replicas: 2
+    max_replicas: 4
+    target_qps_per_replica: 2
+```
+
+This will scale the service up when the QPS exceeds 2 for each replica.
+
+```{raw} html
+<details>
+<summary>Click to see the full recipe YAML</summary>
+```
+
+```yaml
+service:
+  replica_policy:
+    min_replicas: 2
+    max_replicas: 4
+    target_qps_per_replica: 2
+  # An actual request for readiness probe.
+  readiness_probe:
+    path: /v1/chat/completions
+    post_data:
+      model: $MODEL_NAME
+      messages:
+        - role: user
+          content: Hello! What is your name?
+      max_completion_tokens: 1
+
+resources:
+  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
+  use_spot: True
+  disk_size: 512  # Ensure model checkpoints can fit.
+  disk_tier: best
+  ports: 8081  # Expose to internet traffic.
+
+envs:
+  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+  HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
+
+setup: |
+  conda create -n vllm python=3.10 -y
+  conda activate vllm
+
+  pip install vllm==0.4.0.post1
+  # Install Gradio for web UI.
+  pip install gradio openai
+  pip install flash-attn==2.5.7
+
+run: |
+  conda activate vllm
+  echo 'Starting vllm api server...'
+  python -u -m vllm.entrypoints.openai.api_server \
+    --port 8081 \
+    --model $MODEL_NAME \
+    --trust-remote-code \
+    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+    2>&1 | tee api_server.log
+```
+
+```{raw} html
+</details>
+```
+
+To update the service with the new config:
+
+```console
+HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN
+```
+
+To stop the service:
+
+```console
+sky serve down vllm
+```
+
+### **Optional**: Connect a GUI to the endpoint
+
+It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests sent to the GUI will be load-balanced across replicas.
+
+```{raw} html
+<details>
+<summary>Click to see the full GUI YAML</summary>
+```
+
+```yaml
+envs:
+  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+  ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.
+
+resources:
+  cpus: 2
+
+setup: |
+  conda create -n vllm python=3.10 -y
+  conda activate vllm
+
+  # Install Gradio for web UI.
+  pip install gradio openai
+
+run: |
+  conda activate vllm
+  export PATH=$PATH:/sbin
+
+  echo 'Starting gradio server...'
+  git clone https://github.com/vllm-project/vllm.git || true
+  python vllm/examples/gradio_openai_chatbot_webserver.py \
+    -m $MODEL_NAME \
+    --port 8811 \
+    --model-url http://$ENDPOINT/v1 \
+    --stop-token-ids 128009,128001 | tee ~/gradio.log
+```
+
+```{raw} html
+</details>
+```
+
+1. Start the chat web UI:
+
+```console
+sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
+```
+
+2. Then, we can access the GUI at the returned gradio link:
+
+```console
+| INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
+```
diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst
deleted file mode 100644
index 227e6fd2a7818..0000000000000
--- a/docs/source/serving/run_on_sky.rst
+++ /dev/null
@@ -1,366 +0,0 @@
-.. _on_cloud:
-
-Deploying and scaling up with SkyPilot
-================================================
-
-.. raw:: html
-
-  <p align="center">
-    <img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
-  </p>
-
-vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in `SkyPilot AI gallery <https://skypilot.readthedocs.io/en/latest/gallery/index.html>`__.
-
-
-Prerequisites
--------------
-
-- Go to the `HuggingFace model page <https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct>`__ and request access to the model :code:`meta-llama/Meta-Llama-3-8B-Instruct`.
-- Check that you have installed SkyPilot (`docs <https://skypilot.readthedocs.io/en/latest/getting-started/installation.html>`__).
-- Check that :code:`sky check` shows clouds or Kubernetes are enabled.
-
-.. code-block:: console
-
-  pip install skypilot-nightly
-  sky check
-
-
-Run on a single instance
-------------------------
-
-See the vLLM SkyPilot YAML for serving, `serving.yaml <https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml>`__.
-
-.. code-block:: yaml
-
-  resources:
-    accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
-    use_spot: True
-    disk_size: 512  # Ensure model checkpoints can fit.
-    disk_tier: best
-    ports: 8081  # Expose to internet traffic.
-
-  envs:
-    MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
-    HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
-
-  setup: |
-    conda create -n vllm python=3.10 -y
-    conda activate vllm
-
-    pip install vllm==0.4.0.post1
-    # Install Gradio for web UI.
-    pip install gradio openai
-    pip install flash-attn==2.5.7
-
-  run: |
-    conda activate vllm
-    echo 'Starting vllm api server...'
-    python -u -m vllm.entrypoints.openai.api_server \
-      --port 8081 \
-      --model $MODEL_NAME \
-      --trust-remote-code \
-      --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
-      2>&1 | tee api_server.log &
-    
-    echo 'Waiting for vllm api server to start...'
-    while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
-
-    echo 'Starting gradio server...'
-    git clone https://github.com/vllm-project/vllm.git || true
-    python vllm/examples/gradio_openai_chatbot_webserver.py \
-      -m $MODEL_NAME \
-      --port 8811 \
-      --model-url http://localhost:8081/v1 \
-      --stop-token-ids 128009,128001
-
-Start the serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): 
-
-.. code-block:: console
-
-  HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
-
-Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion.
-
-.. code-block:: console
-
-  (task, pid=7431) Running on public URL: https://<gradio-hash>.gradio.live
-
-**Optional**: Serve the 70B model instead of the default 8B and use more GPU:
-
-.. code-block:: console
-
-  HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
-
-
-Scale up to multiple replicas
------------------------------
-
-SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file.
-
-.. code-block:: yaml
-
-  service:
-    replicas: 2
-    # An actual request for readiness probe.
-    readiness_probe:
-      path: /v1/chat/completions
-      post_data:
-      model: $MODEL_NAME
-      messages:
-        - role: user
-          content: Hello! What is your name?
-    max_completion_tokens: 1
-    
-.. raw:: html
-
-  <details>
-  <summary>Click to see the full recipe YAML</summary>
-
-
-.. code-block:: yaml
-
-  service:
-    replicas: 2
-    # An actual request for readiness probe.
-    readiness_probe:
-      path: /v1/chat/completions
-      post_data:
-        model: $MODEL_NAME
-        messages:
-          - role: user
-            content: Hello! What is your name?
-        max_completion_tokens: 1
-
-  resources:
-    accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
-    use_spot: True
-    disk_size: 512  # Ensure model checkpoints can fit.
-    disk_tier: best
-    ports: 8081  # Expose to internet traffic.
-
-  envs:
-    MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
-    HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
-
-  setup: |
-    conda create -n vllm python=3.10 -y
-    conda activate vllm
-
-    pip install vllm==0.4.0.post1
-    # Install Gradio for web UI.
-    pip install gradio openai
-    pip install flash-attn==2.5.7
-
-  run: |
-    conda activate vllm
-    echo 'Starting vllm api server...'
-    python -u -m vllm.entrypoints.openai.api_server \
-      --port 8081 \
-      --model $MODEL_NAME \
-      --trust-remote-code \
-      --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
-      2>&1 | tee api_server.log
-
-.. raw:: html
-
-  </details>
-
-Start the serving the Llama-3 8B model on multiple replicas:
-
-.. code-block:: console
-
-  HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN
-
-
-Wait until the service is ready:
-
-.. code-block:: console
-
-  watch -n10 sky serve status vllm
-
-
-.. raw:: html
-
-  <details>
-  <summary>Example outputs:</summary>
-
-.. code-block:: console
-
-  Services
-  NAME  VERSION  UPTIME  STATUS  REPLICAS  ENDPOINT
-  vllm  1        35s     READY   2/2       xx.yy.zz.100:30001
-
-  Service Replicas
-  SERVICE_NAME  ID  VERSION  IP            LAUNCHED     RESOURCES                STATUS  REGION
-  vllm          1   1        xx.yy.zz.121  18 mins ago  1x GCP([Spot]{'L4': 1})  READY   us-east4
-  vllm          2   1        xx.yy.zz.245  18 mins ago  1x GCP([Spot]{'L4': 1})  READY   us-east4
-
-.. raw:: html
-  
-  </details>
-
-After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
-
-.. code-block:: console
-
-  ENDPOINT=$(sky serve status --endpoint 8081 vllm)
-  curl -L http://$ENDPOINT/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
-      "messages": [
-      {
-        "role": "system",
-        "content": "You are a helpful assistant."
-      },
-      {
-        "role": "user",
-        "content": "Who are you?"
-      }
-      ],
-      "stop_token_ids": [128009,  128001]
-    }'
-
-To enable autoscaling, you could replace the `replicas` with the following configs in `service`:
-
-.. code-block:: yaml
-
-  service:
-    replica_policy:
-      min_replicas: 2
-      max_replicas: 4
-      target_qps_per_replica: 2
-
-This will scale the service up to when the QPS exceeds 2 for each replica.
-
-    
-.. raw:: html
-
-  <details>
-  <summary>Click to see the full recipe YAML</summary>
-
-
-.. code-block:: yaml
-
-  service:
-    replica_policy:
-      min_replicas: 2
-      max_replicas: 4
-      target_qps_per_replica: 2
-    # An actual request for readiness probe.
-    readiness_probe:
-      path: /v1/chat/completions
-      post_data:
-        model: $MODEL_NAME
-        messages:
-          - role: user
-            content: Hello! What is your name?
-        max_completion_tokens: 1
-
-  resources:
-    accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
-    use_spot: True
-    disk_size: 512  # Ensure model checkpoints can fit.
-    disk_tier: best
-    ports: 8081  # Expose to internet traffic.
-
-  envs:
-    MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
-    HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
-
-  setup: |
-    conda create -n vllm python=3.10 -y
-    conda activate vllm
-
-    pip install vllm==0.4.0.post1
-    # Install Gradio for web UI.
-    pip install gradio openai
-    pip install flash-attn==2.5.7
-
-  run: |
-    conda activate vllm
-    echo 'Starting vllm api server...'
-    python -u -m vllm.entrypoints.openai.api_server \
-      --port 8081 \
-      --model $MODEL_NAME \
-      --trust-remote-code \
-      --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
-      2>&1 | tee api_server.log
-
-
-.. raw:: html
-  
-  </details>
-
-To update the service with the new config:
-
-.. code-block:: console
-
-  HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN
-
-
-To stop the service:
-
-.. code-block:: console
-
-  sky serve down vllm
-
-
-**Optional**: Connect a GUI to the endpoint
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-
-It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas.
-
-.. raw:: html
-
-  <details>
-  <summary>Click to see the full GUI YAML</summary>
-
-.. code-block:: yaml
-
-  envs:
-    MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
-    ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. 
-
-  resources:
-    cpus: 2
-
-  setup: |
-    conda create -n vllm python=3.10 -y
-    conda activate vllm
-
-    # Install Gradio for web UI.
-    pip install gradio openai
-
-  run: |
-    conda activate vllm
-    export PATH=$PATH:/sbin
-
-    echo 'Starting gradio server...'
-    git clone https://github.com/vllm-project/vllm.git || true
-    python vllm/examples/gradio_openai_chatbot_webserver.py \
-      -m $MODEL_NAME \
-      --port 8811 \
-      --model-url http://$ENDPOINT/v1 \
-      --stop-token-ids 128009,128001 | tee ~/gradio.log
-
-
-.. raw:: html
-  
-  </details>
-
-1. Start the chat web UI:
-
-.. code-block:: console
-
-  sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
-
-
-2. Then, we can access the GUI at the returned gradio link:
-
-.. code-block:: console
-
-  | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
-
-
diff --git a/docs/source/serving/runai_model_streamer.md b/docs/source/serving/runai_model_streamer.md
new file mode 100644
index 0000000000000..1b5756a95075a
--- /dev/null
+++ b/docs/source/serving/runai_model_streamer.md
@@ -0,0 +1,53 @@
+(runai-model-streamer)=
+
+# Loading Models with Run:ai Model Streamer
+
+Run:ai Model Streamer is a library that reads tensors concurrently while streaming them to GPU memory.
+Further reading can be found in [Run:ai Model Streamer Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md).
+
+vLLM supports loading weights in Safetensors format using the Run:ai Model Streamer.
+You first need to install the vLLM RunAI optional dependency:
+
+```console
+$ pip3 install vllm[runai]
+```
+
+To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag:
+
+```console
+$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer
+```
+
+To run a model from an AWS S3 object store, run:
+
+```console
+$ vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer
+```
+
+To run a model from an S3-compatible object store, run:
+
+```console
+$ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer
+```
+
+## Tunable parameters
+
+You can tune parameters using `--model-loader-extra-config`:
+
+You can tune `concurrency` that controls the level of concurrency and number of OS threads reading tensors from the file to the CPU buffer.
+For reading from S3, it will be the number of client instances the host is opening to the S3 server.
+
+```console
+$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}'
+```
+
+You can control the size of the CPU memory buffer to which tensors are read from the file, and limit this size.
+You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit).
+
+```console
+$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}'
+```
+
+```{note}
+For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md).
+```
diff --git a/docs/source/serving/serving_with_langchain.md b/docs/source/serving/serving_with_langchain.md
new file mode 100644
index 0000000000000..96bd5943f3d64
--- /dev/null
+++ b/docs/source/serving/serving_with_langchain.md
@@ -0,0 +1,30 @@
+(run-on-langchain)=
+
+# Serving with Langchain
+
+vLLM is also available via [Langchain](https://github.com/langchain-ai/langchain).
+
+To install langchain, run
+
+```console
+$ pip install langchain langchain_community -q
+```
+
+To run inference on a single or multiple GPUs, use the `VLLM` class from `langchain`.
+
+```python
+from langchain_community.llms import VLLM
+
+llm = VLLM(model="mosaicml/mpt-7b",
+           trust_remote_code=True,  # mandatory for hf models
+           max_new_tokens=128,
+           top_k=10,
+           top_p=0.95,
+           temperature=0.8,
+           # tensor_parallel_size=... # for distributed inference
+)
+
+print(llm("What is the capital of France ?"))
+```
+
+Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details.
diff --git a/docs/source/serving/serving_with_langchain.rst b/docs/source/serving/serving_with_langchain.rst
deleted file mode 100644
index 6440c8aad5986..0000000000000
--- a/docs/source/serving/serving_with_langchain.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-.. _run_on_langchain:
-
-Serving with Langchain
-============================
-
-vLLM is also available via `Langchain <https://github.com/langchain-ai/langchain>`_ .
-
-To install langchain, run
-
-.. code-block:: console
-
-    $ pip install langchain langchain_community -q
-
-To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``.
-
-.. code-block:: python
-
-    from langchain_community.llms import VLLM
-
-    llm = VLLM(model="mosaicml/mpt-7b",
-               trust_remote_code=True,  # mandatory for hf models
-               max_new_tokens=128,
-               top_k=10,
-               top_p=0.95,
-               temperature=0.8,
-               # tensor_parallel_size=... # for distributed inference
-    )
-
-    print(llm("What is the capital of France ?"))
-
-Please refer to this `Tutorial <https://python.langchain.com/docs/integrations/llms/vllm>`_ for more details.
diff --git a/docs/source/serving/serving_with_llamaindex.md b/docs/source/serving/serving_with_llamaindex.md
new file mode 100644
index 0000000000000..98859d8e3f828
--- /dev/null
+++ b/docs/source/serving/serving_with_llamaindex.md
@@ -0,0 +1,26 @@
+(run-on-llamaindex)=
+
+# Serving with llama_index
+
+vLLM is also available via [llama_index](https://github.com/run-llama/llama_index).
+
+To install llamaindex, run
+
+```console
+$ pip install llama-index-llms-vllm -q
+```
+
+To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`.
+
+```python
+from llama_index.llms.vllm import Vllm
+
+llm = Vllm(
+    model="microsoft/Orca-2-7b",
+    tensor_parallel_size=4,
+    max_new_tokens=100,
+    vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
+)
+```
+
+Please refer to this [Tutorial](https://docs.llamaindex.ai/en/latest/examples/llm/vllm/) for more details.
diff --git a/docs/source/serving/serving_with_llamaindex.rst b/docs/source/serving/serving_with_llamaindex.rst
deleted file mode 100644
index 038e961344e47..0000000000000
--- a/docs/source/serving/serving_with_llamaindex.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-.. _run_on_llamaindex:
-
-Serving with llama_index
-============================
-
-vLLM is also available via `llama_index <https://github.com/run-llama/llama_index>`_ .
-
-To install llamaindex, run
-
-.. code-block:: console
-
-    $ pip install llama-index-llms-vllm -q
-
-To run inference on a single or multiple GPUs, use ``Vllm`` class from ``llamaindex``.
-
-.. code-block:: python
-
-    from llama_index.llms.vllm import Vllm
-
-    llm = Vllm(
-        model="microsoft/Orca-2-7b",
-        tensor_parallel_size=4,
-        max_new_tokens=100,
-        vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
-    )
-
-Please refer to this `Tutorial <https://docs.llamaindex.ai/en/latest/examples/llm/vllm/>`_ for more details.
diff --git a/docs/source/serving/serving_with_llamastack.md b/docs/source/serving/serving_with_llamastack.md
new file mode 100644
index 0000000000000..71dadca7ad47c
--- /dev/null
+++ b/docs/source/serving/serving_with_llamastack.md
@@ -0,0 +1,38 @@
+(run-on-llamastack)=
+
+# Serving with Llama Stack
+
+vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack).
+
+To install Llama Stack, run
+
+```console
+$ pip install llama-stack -q
+```
+
+## Inference using OpenAI Compatible API
+
+Then start Llama Stack server pointing to your vLLM server with the following configuration:
+
+```yaml
+inference:
+  - provider_id: vllm0
+    provider_type: remote::vllm
+    config:
+      url: http://127.0.0.1:8000
+```
+
+Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) for more details on this remote vLLM provider.
+
+## Inference via Embedded vLLM
+
+An [inline vLLM provider](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm)
+is also available. This is a sample of configuration using that method:
+
+```yaml
+inference:
+  - provider_type: vllm
+    config:
+      model: Llama3.1-8B-Instruct
+      tensor_parallel_size: 4
+```
diff --git a/docs/source/serving/serving_with_llamastack.rst b/docs/source/serving/serving_with_llamastack.rst
deleted file mode 100644
index a2acd7b39f887..0000000000000
--- a/docs/source/serving/serving_with_llamastack.rst
+++ /dev/null
@@ -1,42 +0,0 @@
-.. _run_on_llamastack:
-
-Serving with Llama Stack
-============================
-
-vLLM is also available via `Llama Stack <https://github.com/meta-llama/llama-stack>`_ .
-
-To install Llama Stack, run
-
-.. code-block:: console
-
-    $ pip install llama-stack -q
-
-Inference using OpenAI Compatible API
--------------------------------------
-
-Then start Llama Stack server pointing to your vLLM server with the following configuration:
-
-.. code-block:: yaml
-
-    inference:
-      - provider_id: vllm0
-        provider_type: remote::vllm
-        config:
-          url: http://127.0.0.1:8000
-
-Please refer to `this guide <https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html>`_ for more details on this remote vLLM provider.
-
-Inference via Embedded vLLM
----------------------------
-
-An `inline vLLM provider
-<https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm>`_
-is also available. This is a sample of configuration using that method:
-
-.. code-block:: yaml
-
-    inference
-      - provider_type: vllm
-        config:
-          model: Llama3.1-8B-Instruct
-          tensor_parallel_size: 4
diff --git a/docs/source/serving/tensorizer.md b/docs/source/serving/tensorizer.md
new file mode 100644
index 0000000000000..d3dd29d48f730
--- /dev/null
+++ b/docs/source/serving/tensorizer.md
@@ -0,0 +1,16 @@
+(tensorizer)=
+
+# Loading Models with CoreWeave's Tensorizer
+
+vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer).
+vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized
+at runtime extremely quickly directly to the GPU, resulting in significantly
+shorter Pod startup times and lower CPU memory usage. Tensor encryption is also supported.
+
+For more information on CoreWeave's Tensorizer, please refer to
+[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see
+the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html).
+
+```{note}
+Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
+```
diff --git a/docs/source/serving/tensorizer.rst b/docs/source/serving/tensorizer.rst
deleted file mode 100644
index 96a93db94871b..0000000000000
--- a/docs/source/serving/tensorizer.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-.. _tensorizer:
-
-Loading Models with CoreWeave's Tensorizer
-==========================================
-vLLM supports loading models with `CoreWeave's Tensorizer <https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer>`_.
-vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized
-at runtime extremely quickly directly to the GPU, resulting in significantly
-shorter Pod startup times and CPU memory usage. Tensor encryption is also supported.
-
-For more information on CoreWeave's Tensorizer, please refer to
-`CoreWeave's Tensorizer documentation <https://github.com/coreweave/tensorizer>`_. For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
-the `vLLM example script <https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html>`_.
-
-.. note::
-  Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
diff --git a/docs/source/usage/compatibility_matrix.md b/docs/source/usage/compatibility_matrix.md
new file mode 100644
index 0000000000000..3cefa12ea8a1d
--- /dev/null
+++ b/docs/source/usage/compatibility_matrix.md
@@ -0,0 +1,468 @@
+(compatibility-matrix)=
+
+# Compatibility Matrix
+
+The tables below show mutually exclusive features and the support on some hardware.
+
+```{note}
+Check the '✗' entries with links to see the tracking issue for each unsupported feature/hardware combination.
+```
+
+## Feature x Feature
+
+```{raw} html
+<style>
+  /* Make smaller to try to improve readability  */
+  td {
+    font-size: 0.8rem;
+    text-align: center;
+  }
+
+  th {
+    text-align: center;
+    font-size: 0.8rem;
+  }
+</style>
+```
+
+```{list-table}
+   :header-rows: 1
+   :stub-columns: 1
+   :widths: auto
+
+   * - Feature
+     - [CP](#chunked-prefill)
+     - [APC](#apc)
+     - [LoRA](#lora-adapter)
+     - <abbr title="Prompt Adapter">prmpt adptr</abbr>
+     - [SD](#spec_decode)
+     - CUDA graph
+     - <abbr title="Pooling Models">pooling</abbr>
+     - <abbr title="Encoder-Decoder Models">enc-dec</abbr>
+     - <abbr title="Logprobs">logP</abbr>
+     - <abbr title="Prompt Logprobs">prmpt logP</abbr>
+     - <abbr title="Async Output Processing">async output</abbr>
+     - multi-step
+     - <abbr title="Multimodal Inputs">mm</abbr>
+     - best-of
+     - beam-search
+     - <abbr title="Guided Decoding">guided dec</abbr>
+   * - [CP](#chunked-prefill)
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - [APC](#apc)
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - [LoRA](#lora-adapter)
+     - [✗](gh-pr:9057)
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - <abbr title="Prompt Adapter">prmpt adptr</abbr>
+     - ✅
+     - ✅
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - [SD](#spec_decode)
+     - ✅
+     - ✅
+     - ✗
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - CUDA graph
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - <abbr title="Pooling Models">pooling</abbr>
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+     - ✗
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - <abbr title="Encoder-Decoder Models">enc-dec</abbr>
+     - ✗
+     - [✗](gh-issue:7366)
+     - ✗
+     - ✗
+     - [✗](gh-issue:7366)
+     - ✅
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - <abbr title="Logprobs">logP</abbr>
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - <abbr title="Prompt Logprobs">prmpt logP</abbr>
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - [✗](gh-pr:8199)
+     - ✅
+     - ✗
+     - ✅
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+     -
+   * - <abbr title="Async Output Processing">async output</abbr>
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ✅
+     - ✗
+     - ✗
+     - ✅
+     - ✅
+     -
+     -
+     -
+     -
+     -
+     -
+   * - multi-step
+     - ✗
+     - ✅
+     - ✗
+     - ✅
+     - ✗
+     - ✅
+     - ✗
+     - ✗
+     - ✅
+     - [✗](gh-issue:8198)
+     - ✅
+     -
+     -
+     -
+     -
+     -
+   * - <abbr title="Multimodal Inputs">mm</abbr>
+     - ✅
+     -  [✗](gh-pr:8348)
+     -  [✗](gh-pr:7199)
+     - ?
+     - ?
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ?
+     -
+     -
+     -
+     -
+   * - best-of
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - [✗](gh-issue:6137)
+     - ✅
+     - ✗
+     - ✅
+     - ✅
+     - ✅
+     - ?
+     - [✗](gh-issue:7968)
+     - ✅
+     -
+     -
+     -
+   * - beam-search
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - [✗](gh-issue:6137)
+     - ✅
+     - ✗
+     - ✅
+     - ✅
+     - ✅
+     - ?
+     - [✗](gh-issue:7968)
+     - ?
+     - ✅
+     -
+     -
+   * - <abbr title="Guided Decoding">guided dec</abbr>
+     - ✅
+     - ✅
+     - ?
+     - ?
+     - ✅
+     - ✅
+     - ✗
+     - ?
+     - ✅
+     - ✅
+     - ✅
+     - [✗](gh-issue:9893)
+     - ?
+     - ✅
+     - ✅
+     -
+
+```
+
+### Feature x Hardware
+
+```{list-table}
+   :header-rows: 1
+   :stub-columns: 1
+   :widths: auto
+
+   * - Feature
+     - Volta
+     - Turing
+     - Ampere
+     - Ada
+     - Hopper
+     - CPU
+     - AMD
+   * - [CP](#chunked-prefill)
+     - [✗](gh-issue:2729)
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - [APC](#apc)
+     - [✗](gh-issue:3687)
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - [LoRA](#lora-adapter)
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - [✗](gh-pr:4830)
+     - ✅
+   * - <abbr title="Prompt Adapter">prmpt adptr</abbr>
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - [✗](gh-issue:8475)
+     - ✅
+   * - [SD](#spec_decode)
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - CUDA graph
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ✅
+   * - <abbr title="Pooling Models">pooling</abbr>
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ?
+   * - <abbr title="Encoder-Decoder Models">enc-dec</abbr>
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+   * - <abbr title="Multimodal Inputs">mm</abbr>
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - <abbr title="Logprobs">logP</abbr>
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - <abbr title="Prompt Logprobs">prmpt logP</abbr>
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - <abbr title="Async Output Processing">async output</abbr>
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ✗
+   * - multi-step
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - [✗](gh-issue:8477)
+     - ✅
+   * - best-of
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - beam-search
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - <abbr title="Guided Decoding">guided dec</abbr>
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+```
diff --git a/docs/source/usage/compatibility_matrix.rst b/docs/source/usage/compatibility_matrix.rst
deleted file mode 100644
index 04dd72b1e3527..0000000000000
--- a/docs/source/usage/compatibility_matrix.rst
+++ /dev/null
@@ -1,468 +0,0 @@
-.. _compatibility_matrix:
-
-Compatibility Matrix
-====================
-
-The tables below show mutually exclusive features and the support on some hardware. 
-
-.. note::
-
-   Check the '✗' with links to see tracking issue for unsupported feature/hardware combination.
-
-Feature x Feature
------------------
-
-
-.. raw:: html
-
-    <style>
-      /* Make smaller to try to improve readability  */
-      td {
-        font-size: 0.8rem;
-        text-align: center;
-      }
-
-      th {
-        text-align: center;
-        font-size: 0.8rem;
-      }
-    </style>
-
-.. list-table::
-   :header-rows: 1
-   :widths: auto
-
-   * - Feature
-     - :ref:`CP <chunked-prefill>`
-     - :ref:`APC <apc>`
-     - :ref:`LoRA <lora>`
-     - :abbr:`prmpt adptr (Prompt Adapter)`
-     - :ref:`SD <spec_decode>`
-     - CUDA graph
-     - :abbr:`pooling (Pooling Models)`
-     - :abbr:`enc-dec (Encoder-Decoder Models)`
-     - :abbr:`logP (Logprobs)`
-     - :abbr:`prmpt logP (Prompt Logprobs)`
-     - :abbr:`async output (Async Output Processing)`
-     - multi-step
-     - :abbr:`mm (Multimodal Inputs)`
-     - best-of
-     - beam-search
-     - :abbr:`guided dec (Guided Decoding)`
-   * - :ref:`CP <chunked-prefill>`
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-   * - :ref:`APC <apc>`
-     - ✅
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-   * - :ref:`LoRA <lora>`
-     - `✗ <https://github.com/vllm-project/vllm/pull/9057>`__ 
-     - ✅
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-   * - :abbr:`prmpt adptr (Prompt Adapter)`
-     - ✅
-     - ✅
-     - ✅
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-   * - :ref:`SD <spec_decode>`
-     - ✅
-     - ✅
-     - ✗
-     - ✅
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-   * - CUDA graph
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-   * - :abbr:`pooling (Pooling Models)`
-     - ✗
-     - ✗
-     - ✗ 
-     - ✗
-     - ✗
-     - ✗
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-   * - :abbr:`enc-dec (Encoder-Decoder Models)`
-     - ✗
-     - `✗ <https://github.com/vllm-project/vllm/issues/7366>`__ 
-     - ✗ 
-     - ✗
-     - `✗ <https://github.com/vllm-project/vllm/issues/7366>`__ 
-     - ✅
-     - ✅
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-   * - :abbr:`logP (Logprobs)`
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅ 
-     - ✗
-     - ✅
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-   * - :abbr:`prmpt logP (Prompt Logprobs)`
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - `✗ <https://github.com/vllm-project/vllm/pull/8199>`__ 
-     - ✅
-     - ✗
-     - ✅ 
-     - ✅
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-   * - :abbr:`async output (Async Output Processing)`
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✗
-     - ✅ 
-     - ✗ 
-     - ✗
-     - ✅
-     - ✅
-     - 
-     - 
-     - 
-     - 
-     - 
-     - 
-   * - multi-step
-     - ✗
-     - ✅
-     - ✗
-     - ✅
-     - ✗
-     - ✅
-     - ✗ 
-     - ✗
-     - ✅
-     - `✗ <https://github.com/vllm-project/vllm/issues/8198>`__ 
-     - ✅
-     - 
-     - 
-     - 
-     - 
-     - 
-   * - :abbr:`mm (Multimodal Inputs)`
-     - ✅
-     -  `✗ <https://github.com/vllm-project/vllm/pull/8348>`__ 
-     -  `✗ <https://github.com/vllm-project/vllm/pull/7199>`__ 
-     - ?
-     - ?
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ?
-     - 
-     - 
-     - 
-     - 
-   * - best-of
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - `✗ <https://github.com/vllm-project/vllm/issues/6137>`__ 
-     - ✅
-     - ✗
-     - ✅
-     - ✅
-     - ✅
-     - ?
-     - `✗ <https://github.com/vllm-project/vllm/issues/7968>`__ 
-     - ✅
-     - 
-     - 
-     - 
-   * - beam-search
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - `✗ <https://github.com/vllm-project/vllm/issues/6137>`__ 
-     - ✅
-     - ✗
-     - ✅
-     - ✅
-     - ✅
-     - ?
-     - `✗ <https://github.com/vllm-project/vllm/issues/7968>`__ 
-     - ?
-     - ✅
-     - 
-     - 
-   * - :abbr:`guided dec (Guided Decoding)`
-     - ✅
-     - ✅
-     - ?
-     - ?
-     - ✅
-     - ✅
-     - ✗
-     - ?
-     - ✅
-     - ✅
-     - ✅
-     - `✗ <https://github.com/vllm-project/vllm/issues/9893>`__ 
-     - ?
-     - ✅
-     - ✅
-     - 
-
-
-Feature x Hardware
-^^^^^^^^^^^^^^^^^^
-
-.. list-table::
-   :header-rows: 1
-   :widths: auto
-
-   * - Feature
-     - Volta
-     - Turing
-     - Ampere
-     - Ada
-     - Hopper
-     - CPU
-     - AMD
-   * - :ref:`CP <chunked-prefill>`
-     - `✗ <https://github.com/vllm-project/vllm/issues/2729>`__ 
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - :ref:`APC <apc>`
-     - `✗ <https://github.com/vllm-project/vllm/issues/3687>`__ 
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - :ref:`LoRA <lora>`
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - `✗ <https://github.com/vllm-project/vllm/pull/4830>`__ 
-     - ✅
-   * - :abbr:`prmpt adptr (Prompt Adapter)`
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - `✗ <https://github.com/vllm-project/vllm/issues/8475>`__ 
-     - ✅
-   * - :ref:`SD <spec_decode>`
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - CUDA graph
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✗
-     - ✅
-   * - :abbr:`pooling (Pooling Models)`
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ?
-   * - :abbr:`enc-dec (Encoder-Decoder Models)`
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✗
-   * - :abbr:`mm (Multimodal Inputs)`
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - :abbr:`logP (Logprobs)`
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - :abbr:`prmpt logP (Prompt Logprobs)`
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - :abbr:`async output (Async Output Processing)`
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✗
-     - ✗
-   * - multi-step
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - `✗ <https://github.com/vllm-project/vllm/issues/8477>`__ 
-     - ✅
-   * - best-of
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - beam-search
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - :abbr:`guided dec (Guided Decoding)`
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
diff --git a/docs/source/usage/disagg_prefill.md b/docs/source/usage/disagg_prefill.md
new file mode 100644
index 0000000000000..a61c00fad1e3c
--- /dev/null
+++ b/docs/source/usage/disagg_prefill.md
@@ -0,0 +1,64 @@
+(disagg-prefill)=
+
+# Disaggregated prefilling (experimental)
+
+This page introduces you to the disaggregated prefilling feature in vLLM. This feature is experimental and subject to change.
+
+## Why disaggregated prefilling?
+
+Two main reasons:
+
+- **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling puts the prefill and decode phases of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT.
+- **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL.
+
+```{note}
+Disaggregated prefill DOES NOT improve throughput.
+```
+
+## Usage example
+
+Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.
+
+## Benchmarks
+
+Please refer to `benchmarks/disagg_benchmarks/` for disaggregated prefilling benchmarks.
+
+## Development
+
+We implement disaggregated prefilling by running 2 vLLM instances. One for prefill (we call it prefill instance) and one for decode (we call it decode instance), and then use a connector to transfer the prefill KV caches and results from prefill instance to decode instance.
+
+All disaggregated prefilling implementation is under `vllm/distributed/kv_transfer`.
+
+Key abstractions for disaggregated prefilling:
+
+- **Connector**: Connector allows the **kv consumer** to retrieve the KV caches of a batch of requests from the **kv producer**.
+- **LookupBuffer**: LookupBuffer provides two APIs: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL, where `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drops it from the buffer.
+- **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`.
+
+```{note}
+`insert` is a non-blocking operation, but `drop_select` is a blocking operation.
+```
+
+Here is a figure illustrating how the above 3 abstractions are organized:
+
+```{image} /assets/usage/disagg_prefill/abstraction.jpg
+:alt: Disaggregated prefilling abstractions
+```
+
+The workflow of disaggregated prefilling is as follows:
+
+```{image} /assets/usage/disagg_prefill/overview.jpg
+:alt: Disaggregated prefilling workflow
+```
+
+The `buffer` corresponds to `insert` API in LookupBuffer, and the `drop_select` corresponds to `drop_select` API in LookupBuffer.
+
+## Third-party contributions
+
+Disaggregated prefilling is highly related to infrastructure, so vLLM relies on third-party connectors for production-level disaggregated prefilling (and vLLM team will actively review and merge new PRs for third-party connectors).
+
+We recommend three ways of implementation:
+
+- **Fully-customized connector**: Implement your own `Connector`, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions.
+- **Database-like connector**: Implement your own `LookupBuffer` and support the `insert` and `drop_select` APIs just like SQL.
+- **Distributed P2P connector**: Implement your own `Pipe` and support the `send_tensor` and `recv_tensor` APIs, just like `torch.distributed`.
diff --git a/docs/source/usage/disagg_prefill.rst b/docs/source/usage/disagg_prefill.rst
deleted file mode 100644
index 9fe714b4fd856..0000000000000
--- a/docs/source/usage/disagg_prefill.rst
+++ /dev/null
@@ -1,69 +0,0 @@
-.. _disagg_prefill:
-
-Disaggregated prefilling (experimental)
-=======================================
-
-This page introduces you the disaggregated prefilling feature in vLLM. This feature is experimental and subject to change. 
-
-Why disaggregated prefilling?
------------------------------
-
-Two main reasons:
-
-* **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. ``tp`` and ``pp``) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT.
-* **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL.
-
-.. note::
-    Disaggregated prefill DOES NOT improve throughput.
-
-Usage example
--------------
-
-Please refer to ``examples/disaggregated_prefill.sh`` for the example usage of disaggregated prefilling.
-
-
-Benchmarks
-----------
-
-Please refer to ``benchmarks/disagg_benchmarks/`` for disaggregated prefilling benchmarks.
-
-
-Development
------------
-
-We implement disaggregated prefilling by running 2 vLLM instances. One for prefill (we call it prefill instance) and one for decode (we call it decode instance), and then use a connector to transfer the prefill KV caches and results from prefill instance to decode instance.
-
-All disaggregated prefilling implementation is under ``vllm/distributed/kv_transfer``.
-
-Key abstractions for disaggregated prefilling:
-
-* **Connector**: Connector allows **kv consumer** to retrieve the KV caches of a batch of request from **kv producer**.
-* **LookupBuffer**: LookupBuffer provides two API: ``insert`` KV cache and ``drop_select`` KV cache. The semantics of ``insert`` and ``drop_select`` are similar to SQL, where ``insert`` inserts a KV cache into the buffer, and ``drop_select`` returns the KV cache that matches the given condition and drop it from the buffer.
-* **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports ``send_tensor`` and ``recv_tensor``.
-
-.. note::
-    ``insert`` is non-blocking operation but ``drop_select`` is blocking operation.
-
-Here is a figure illustrating how the above 3 abstractions are organized:
-
-.. image:: /assets/usage/disagg_prefill/abstraction.jpg
-    :alt: Disaggregated prefilling abstractions
-
-The workflow of disaggregated prefilling is as follows:
-
-.. image:: /assets/usage/disagg_prefill/overview.jpg
-    :alt: Disaggregated prefilling workflow
-
-The ``buffer`` corresponds to ``insert`` API in LookupBuffer, and the ``drop_select`` corresponds to ``drop_select`` API in LookupBuffer.
-
-
-Third-party contributions
--------------------------
-
-Disaggregated prefilling is highly related to infrastructure, so vLLM relies on third-party connectors for production-level disaggregated prefilling (and vLLM team will actively review and merge new PRs for third-party connectors).
-
-We recommend three ways of implementations:
-
-* **Fully-customized connector**: Implement your own ``Connector``, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions.
-* **Database-like connector**: Implement your own ``LookupBuffer`` and support the ``insert`` and ``drop_select`` APIs just like SQL.
-* **Distributed P2P connector**: Implement your own ``Pipe`` and support the ``send_tensor`` and ``recv_tensor`` APIs, just like `torch.distributed`.
diff --git a/docs/source/usage/engine_args.rst b/docs/source/usage/engine_args.md
similarity index 76%
rename from docs/source/usage/engine_args.rst
rename to docs/source/usage/engine_args.md
index e7ce8cdcabe88..cd3c6a430b7fa 100644
--- a/docs/source/usage/engine_args.rst
+++ b/docs/source/usage/engine_args.md
@@ -1,23 +1,25 @@
-.. _engine_args:
+(engine-args)=
 
-Engine Arguments
-================
+# Engine Arguments
 
 Below, you can find an explanation of every engine argument for vLLM:
 
+```{eval-rst}
 .. argparse::
     :module: vllm.engine.arg_utils
     :func: _engine_args_parser
     :prog: vllm serve
     :nodefaultconst:
+```
 
-Async Engine Arguments
-----------------------
+## Async Engine Arguments
 
 Below are the additional arguments related to the asynchronous engine:
 
+```{eval-rst}
 .. argparse::
     :module: vllm.engine.arg_utils
     :func: _async_engine_args_parser
     :prog: vllm serve
-    :nodefaultconst:
\ No newline at end of file
+    :nodefaultconst:
+```
diff --git a/docs/source/usage/env_vars.md b/docs/source/usage/env_vars.md
new file mode 100644
index 0000000000000..f9b08077a03b4
--- /dev/null
+++ b/docs/source/usage/env_vars.md
@@ -0,0 +1,15 @@
+# Environment Variables
+
+vLLM uses the following environment variables to configure the system:
+
+```{warning}
+Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and IP address for vLLM's **internal usage**. They are not the port and IP address of the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work.
+
+All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
+```
+
+```{literalinclude} ../../../vllm/envs.py
+:end-before: end-env-vars-definition
+:language: python
+:start-after: begin-env-vars-definition
+```
diff --git a/docs/source/usage/env_vars.rst b/docs/source/usage/env_vars.rst
deleted file mode 100644
index ff2259c0da3f1..0000000000000
--- a/docs/source/usage/env_vars.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Environment Variables
-========================
-
-vLLM uses the following environment variables to configure the system:
-
-.. warning::
-    Please note that ``VLLM_PORT`` and ``VLLM_HOST_IP`` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use ``--host $VLLM_HOST_IP`` and ``--port $VLLM_PORT`` to start the API server, it will not work.
-
-    All environment variables used by vLLM are prefixed with ``VLLM_``. **Special care should be taken for Kubernetes users**: please do not name the service as ``vllm``, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because `Kubernetes sets environment variables for each service with the capitalized service name as the prefix <https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables>`_.
-
-.. literalinclude:: ../../../vllm/envs.py
-    :language: python
-    :start-after: begin-env-vars-definition
-    :end-before: end-env-vars-definition
diff --git a/docs/source/usage/faq.rst b/docs/source/usage/faq.md
similarity index 61%
rename from docs/source/usage/faq.rst
rename to docs/source/usage/faq.md
index d88da32092924..fde2954f10c59 100644
--- a/docs/source/usage/faq.rst
+++ b/docs/source/usage/faq.md
@@ -1,34 +1,33 @@
-.. _faq:
+(faq)=
 
-Frequently Asked Questions
-===========================
+# Frequently Asked Questions
 
-    Q: How can I serve multiple models on a single port using the OpenAI API?
+> Q: How can I serve multiple models on a single port using the OpenAI API?
 
 A: Assuming that you're referring to using OpenAI compatible server to serve multiple models at once, that is not currently supported, you can run multiple instances of the server (each serving a different model) at the same time, and have another layer to route the incoming request to the correct server accordingly.
 
-----------------------------------------
+______________________________________________________________________
 
-    Q: Which model to use for offline inference embedding?
+> Q: Which model to use for offline inference embedding?
 
-A: You can try `e5-mistral-7b-instruct <https://huggingface.co/intfloat/e5-mistral-7b-instruct>`__ and `BAAI/bge-base-en-v1.5 <https://huggingface.co/BAAI/bge-base-en-v1.5>`__;
-more are listed :ref:`here <supported_models>`.
+A: You can try [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) and [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5);
+more are listed [here](#supported-models).
 
-By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B <https://huggingface.co/meta-llama/Meta-Llama-3-8B>`__,
-`Mistral-7B-Instruct-v0.3 <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`__ into embedding models,
+By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B),
+[Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models,
 but they are expected be inferior to models that are specifically trained on embedding tasks.
 
-----------------------------------------
+______________________________________________________________________
 
-    Q: Can the output of a prompt vary across runs in vLLM?
+> Q: Can the output of a prompt vary across runs in vLLM?
 
 A: Yes, it can. vLLM does not guarantee stable log probabilities (logprobs) for the output tokens. Variations in logprobs may occur due to
-numerical instability in Torch operations or non-deterministic behavior in batched Torch operations when batching changes. For more details, 
-see the `Numerical Accuracy section <https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations>`_.
+numerical instability in Torch operations or non-deterministic behavior in batched Torch operations when batching changes. For more details,
+see the [Numerical Accuracy section](https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations).
 
 In vLLM, the same requests might be batched differently due to factors such as other concurrent requests,
-changes in batch size, or batch expansion in speculative decoding. These batching variations, combined with numerical instability of Torch operations, 
-can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in 
+changes in batch size, or batch expansion in speculative decoding. These batching variations, combined with numerical instability of Torch operations,
+can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in
 different tokens being sampled. Once a different token is sampled, further divergence is likely.
 
 **Mitigation Strategies**
diff --git a/docs/source/usage/lora.md b/docs/source/usage/lora.md
new file mode 100644
index 0000000000000..cf06916d70f44
--- /dev/null
+++ b/docs/source/usage/lora.md
@@ -0,0 +1,214 @@
+(lora-adapter)=
+
+# LoRA Adapters
+
+This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09685) with vLLM on top of a base model.
+
+LoRA adapters can be used with any vLLM model that implements {class}`~vllm.model_executor.models.interfaces.SupportsLoRA`.
+
+Adapters can be efficiently served on a per-request basis with minimal overhead. First we download the adapter(s) and save
+them locally with
+
+```python
+from huggingface_hub import snapshot_download
+
+sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
+```
+
+Then we instantiate the base model and pass in the `enable_lora=True` flag:
+
+```python
+from vllm import LLM, SamplingParams
+from vllm.lora.request import LoRARequest
+
+llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)
+```
+
+We can now submit the prompts and call `llm.generate` with the `lora_request` parameter. The first parameter
+of `LoRARequest` is a human-identifiable name, the second parameter is a globally unique ID for the adapter and
+the third parameter is the path to the LoRA adapter.
+
+```python
+sampling_params = SamplingParams(
+    temperature=0,
+    max_tokens=256,
+    stop=["[/assistant]"]
+)
+
+prompts = [
+     "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
+     "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
+]
+
+outputs = llm.generate(
+    prompts,
+    sampling_params,
+    lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
+)
+```
+
+Check out <gh-file:examples/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
+
+## Serving LoRA Adapters
+
+LoRA adapted models can also be served with the OpenAI-compatible vLLM server. To do so, we use
+`--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kick off the server:
+
+```bash
+vllm serve meta-llama/Llama-2-7b-hf \
+    --enable-lora \
+    --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
+```
+
+```{note}
+The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one.
+```
+
+The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`,
+etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
+with its base model:
+
+```bash
+curl localhost:8000/v1/models | jq .
+{
+    "object": "list",
+    "data": [
+        {
+            "id": "meta-llama/Llama-2-7b-hf",
+            "object": "model",
+            ...
+        },
+        {
+            "id": "sql-lora",
+            "object": "model",
+            ...
+        }
+    ]
+}
+```
+
+Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
+processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
+LoRA adapter requests if they were provided and `max_loras` is set high enough).
+
+The following is an example request
+
+```bash
+curl http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "sql-lora",
+        "prompt": "San Francisco is a",
+        "max_tokens": 7,
+        "temperature": 0
+    }' | jq
+```
+
+## Dynamically serving LoRA Adapters
+
+In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading
+LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility
+to change models on-the-fly is needed.
+
+Note: Enabling this feature in production environments is risky, as users may participate in model adapter management.
+
+To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
+is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active.
+
+```bash
+export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
+```
+
+Loading a LoRA Adapter:
+
+To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary
+details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter.
+
+Example request to load a LoRA adapter:
+
+```bash
+curl -X POST http://localhost:8000/v1/load_lora_adapter \
+-H "Content-Type: application/json" \
+-d '{
+    "lora_name": "sql_adapter",
+    "lora_path": "/path/to/sql-lora-adapter"
+}'
+```
+
+Upon a successful request, the API will respond with a 200 OK status code. If an error occurs, such as if the adapter
+cannot be found or loaded, an appropriate error message will be returned.
+
+Unloading a LoRA Adapter:
+
+To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint
+with the name or ID of the adapter to be unloaded.
+
+Example request to unload a LoRA adapter:
+
+```bash
+curl -X POST http://localhost:8000/v1/unload_lora_adapter \
+-H "Content-Type: application/json" \
+-d '{
+    "lora_name": "sql_adapter"
+}'
+```
+
+## New format for `--lora-modules`
+
+In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example:
+
+```bash
+--lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
+```
+
+This would only include the `name` and `path` for each LoRA module, but did not provide a way to specify a `base_model_name`.
+Now, you can specify a `base_model_name` alongside the `name` and `path` using JSON format. For example:
+
+```bash
+--lora-modules '{"name": "sql-lora", "path": "/path/to/lora", "base_model_name": "meta-llama/Llama-2-7b"}'
+```
+
+For backward compatibility, you can still use the old key-value format (`name=path`), but the `base_model_name` will remain unspecified in that case.
+
+## LoRA model lineage in model card
+
+The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. Here's how the response of the `/v1/models` endpoint (shown below) reflects this:
+
+- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
+- The `root` field points to the artifact location of the LoRA adapter.
+
+```bash
+$ curl http://localhost:8000/v1/models
+
+{
+    "object": "list",
+    "data": [
+        {
+        "id": "meta-llama/Llama-2-7b-hf",
+        "object": "model",
+        "created": 1715644056,
+        "owned_by": "vllm",
+        "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
+        "parent": null,
+        "permission": [
+            {
+            .....
+            }
+        ]
+        },
+        {
+        "id": "sql-lora",
+        "object": "model",
+        "created": 1715644056,
+        "owned_by": "vllm",
+        "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
+        "parent": "meta-llama/Llama-2-7b-hf",
+        "permission": [
+            {
+            ....
+            }
+        ]
+        }
+    ]
+}
+```
diff --git a/docs/source/usage/lora.rst b/docs/source/usage/lora.rst
deleted file mode 100644
index c2c6fa2aebfaf..0000000000000
--- a/docs/source/usage/lora.rst
+++ /dev/null
@@ -1,225 +0,0 @@
-.. _lora:
-
-LoRA Adapters
-=============
-
-This document shows you how to use `LoRA adapters <https://arxiv.org/abs/2106.09685>`_ with vLLM on top of a base model.
-
-LoRA adapters can be used with any vLLM model that implements :class:`~vllm.model_executor.models.interfaces.SupportsLoRA`.
-
-Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save
-them locally with
-
-.. code-block:: python
-
-    from huggingface_hub import snapshot_download
-
-    sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
-
-
-Then we instantiate the base model and pass in the ``enable_lora=True`` flag:
-
-.. code-block:: python
-
-    from vllm import LLM, SamplingParams
-    from vllm.lora.request import LoRARequest
-
-    llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)
-
-
-We can now submit the prompts and call ``llm.generate`` with the ``lora_request`` parameter. The first parameter
-of ``LoRARequest`` is a human identifiable name, the second parameter is a globally unique ID for the adapter and
-the third parameter is the path to the LoRA adapter.
-
-.. code-block:: python
-
-    sampling_params = SamplingParams(
-        temperature=0,
-        max_tokens=256,
-        stop=["[/assistant]"]
-    )
-
-    prompts = [
-         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
-         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
-    ]
-
-    outputs = llm.generate(
-        prompts,
-        sampling_params,
-        lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
-    )
-
-
-Check out `examples/multilora_inference.py <https://github.com/vllm-project/vllm/blob/main/examples/multilora_inference.py>`_
-for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
-
-Serving LoRA Adapters
----------------------
-LoRA adapted models can also be served with the Open-AI compatible vLLM server. To do so, we use
-``--lora-modules {name}={path} {name}={path}`` to specify each LoRA module when we kickoff the server:
-
-.. code-block:: bash
-
-    vllm serve meta-llama/Llama-2-7b-hf \
-        --enable-lora \
-        --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
-
-.. note::
-   The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one.
-
-The server entrypoint accepts all other LoRA configuration parameters (``max_loras``, ``max_lora_rank``, ``max_cpu_loras``,
-etc.), which will apply to all forthcoming requests. Upon querying the ``/models`` endpoint, we should see our LoRA along
-with its base model:
-
-.. code-block:: bash
-
-    curl localhost:8000/v1/models | jq .
-    {
-        "object": "list",
-        "data": [
-            {
-                "id": "meta-llama/Llama-2-7b-hf",
-                "object": "model",
-                ...
-            },
-            {
-                "id": "sql-lora",
-                "object": "model",
-                ...
-            }
-        ]
-    }
-
-Requests can specify the LoRA adapter as if it were any other model via the ``model`` request parameter. The requests will be
-processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
-LoRA adapter requests if they were provided and ``max_loras`` is set high enough).
-
-The following is an example request
-
-.. code-block:: bash
-
-    curl http://localhost:8000/v1/completions \
-        -H "Content-Type: application/json" \
-        -d '{
-            "model": "sql-lora",
-            "prompt": "San Francisco is a",
-            "max_tokens": 7,
-            "temperature": 0
-        }' | jq
-
-
-Dynamically serving LoRA Adapters
----------------------------------
-
-In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading
-LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility
-to change models on-the-fly is needed.
-
-Note: Enabling this feature in production environments is risky as user may participate model adapter management.
-
-To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
-is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active.
-
-.. code-block:: bash
-
-    export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
-
-
-Loading a LoRA Adapter:
-
-To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary
-details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter.
-
-Example request to load a LoRA adapter:
-
-.. code-block:: bash
-
-    curl -X POST http://localhost:8000/v1/load_lora_adapter \
-    -H "Content-Type: application/json" \
-    -d '{
-        "lora_name": "sql_adapter",
-        "lora_path": "/path/to/sql-lora-adapter"
-    }'
-
-Upon a successful request, the API will respond with a 200 OK status code. If an error occurs, such as if the adapter
-cannot be found or loaded, an appropriate error message will be returned.
-
-Unloading a LoRA Adapter:
-
-To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint
-with the name or ID of the adapter to be unloaded.
-
-Example request to unload a LoRA adapter:
-
-.. code-block:: bash
-
-    curl -X POST http://localhost:8000/v1/unload_lora_adapter \
-    -H "Content-Type: application/json" \
-    -d '{
-        "lora_name": "sql_adapter"
-    }'
-
-
-New format for `--lora-modules`
--------------------------------
-
-In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example:
-
-.. code-block:: bash
-
-    --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
-
-This would only include the `name` and `path` for each LoRA module, but did not provide a way to specify a `base_model_name`.
-Now, you can specify a base_model_name alongside the name and path using JSON format. For example:
-
-.. code-block:: bash
-
-    --lora-modules '{"name": "sql-lora", "path": "/path/to/lora", "base_model_name": "meta-llama/Llama-2-7b"}'
-
-To provide the backward compatibility support, you can still use the old key-value format (name=path), but the `base_model_name` will remain unspecified in that case.
-
-
-Lora model lineage in model card
---------------------------------
-
-The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. Here's an explanation of how your current response supports this:
-
-- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
-- The `root` field points to the artifact location of the lora adapter.
-
-.. code-block:: bash
-
-    $ curl http://localhost:8000/v1/models
-
-    {
-        "object": "list",
-        "data": [
-            {
-            "id": "meta-llama/Llama-2-7b-hf",
-            "object": "model",
-            "created": 1715644056,
-            "owned_by": "vllm",
-            "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
-            "parent": null,
-            "permission": [
-                {
-                .....
-                }
-            ]
-            },
-            {
-            "id": "sql-lora",
-            "object": "model",
-            "created": 1715644056,
-            "owned_by": "vllm",
-            "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
-            "parent": meta-llama/Llama-2-7b-hf,
-            "permission": [
-                {
-                ....
-                }
-            ]
-            }
-        ]
-    }
diff --git a/docs/source/usage/multimodal_inputs.md b/docs/source/usage/multimodal_inputs.md
new file mode 100644
index 0000000000000..4f45a9f448cf0
--- /dev/null
+++ b/docs/source/usage/multimodal_inputs.md
@@ -0,0 +1,532 @@
+(multimodal-inputs)=
+
+# Multimodal Inputs
+
+This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM.
+
+```{note}
+We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
+and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
+```
+
+## Offline Inference
+
+To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`:
+
+- `prompt`: The prompt should follow the format that is documented on HuggingFace.
+- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.MultiModalDataDict`.
+
+### Image
+
+You can pass a single image to the {code}`'image'` field of the multi-modal dictionary, as shown in the following examples:
+
+```python
+llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+
+# Refer to the HuggingFace repo for the correct format to use
+prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
+
+# Load the image using PIL.Image
+image = PIL.Image.open(...)
+
+# Single prompt inference
+outputs = llm.generate({
+    "prompt": prompt,
+    "multi_modal_data": {"image": image},
+})
+
+for o in outputs:
+    generated_text = o.outputs[0].text
+    print(generated_text)
+
+# Batch inference
+image_1 = PIL.Image.open(...)
+image_2 = PIL.Image.open(...)
+outputs = llm.generate(
+    [
+        {
+            "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
+            "multi_modal_data": {"image": image_1},
+        },
+        {
+            "prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
+            "multi_modal_data": {"image": image_2},
+        }
+    ]
+)
+
+for o in outputs:
+    generated_text = o.outputs[0].text
+    print(generated_text)
+```
+
+Full example: <gh-file:examples/offline_inference_vision_language.py>
+
+To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
+
+```python
+llm = LLM(
+    model="microsoft/Phi-3.5-vision-instruct",
+    trust_remote_code=True,  # Required to load Phi-3.5-vision
+    max_model_len=4096,  # Otherwise, it may not fit in smaller GPUs
+    limit_mm_per_prompt={"image": 2},  # The maximum number to accept
+)
+
+# Refer to the HuggingFace repo for the correct format to use
+prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
+
+# Load the images using PIL.Image
+image1 = PIL.Image.open(...)
+image2 = PIL.Image.open(...)
+
+outputs = llm.generate({
+    "prompt": prompt,
+    "multi_modal_data": {
+        "image": [image1, image2]
+    },
+})
+
+for o in outputs:
+    generated_text = o.outputs[0].text
+    print(generated_text)
+```
+
+Full example: <gh-file:examples/offline_inference_vision_language_multi_image.py>
+
+Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
+
+```python
+# Specify the maximum number of frames per video to be 4. This can be changed.
+llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
+
+# Create the request payload.
+video_frames = ... # load your video making sure it only has the number of frames specified earlier.
+message = {
+    "role": "user",
+    "content": [
+        {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
+    ],
+}
+for i in range(len(video_frames)):
+    base64_image = encode_image(video_frames[i]) # base64 encoding.
+    new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
+    message["content"].append(new_image)
+
+# Perform inference and log output.
+outputs = llm.chat([message])
+
+for o in outputs:
+    generated_text = o.outputs[0].text
+    print(generated_text)
+```
+
+### Video
+
+You can pass a list of NumPy arrays directly to the {code}`'video'` field of the multi-modal dictionary
+instead of using multi-image input.
+
+Full example: <gh-file:examples/offline_inference_vision_language.py>
+
+### Audio
+
+You can pass a tuple {code}`(array, sampling_rate)` to the {code}`'audio'` field of the multi-modal dictionary.
+
+Full example: <gh-file:examples/offline_inference_audio_language.py>
+
+### Embedding
+
+To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
+pass a tensor of shape {code}`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
+
+```python
+# Inference with image embeddings as input
+llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+
+# Refer to the HuggingFace repo for the correct format to use
+prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
+
+# Embeddings for single image
+# torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
+image_embeds = torch.load(...)
+
+outputs = llm.generate({
+    "prompt": prompt,
+    "multi_modal_data": {"image": image_embeds},
+})
+
+for o in outputs:
+    generated_text = o.outputs[0].text
+    print(generated_text)
+```
+
+For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
+
+```python
+# Construct the prompt based on your model
+prompt = ...
+
+# Embeddings for multiple images
+# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
+image_embeds = torch.load(...)
+
+# Qwen2-VL
+llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
+mm_data = {
+    "image": {
+        "image_embeds": image_embeds,
+        # image_grid_thw is needed to calculate positional encoding.
+        "image_grid_thw": torch.load(...),  # torch.Tensor of shape (1, 3)
+    }
+}
+
+# MiniCPM-V
+llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
+mm_data = {
+    "image": {
+        "image_embeds": image_embeds,
+        # image_size_list is needed to calculate details of the sliced image.
+        "image_size_list": [image.size for image in images],  # list of image sizes
+    }
+}
+
+outputs = llm.generate({
+    "prompt": prompt,
+    "multi_modal_data": mm_data,
+})
+
+for o in outputs:
+    generated_text = o.outputs[0].text
+    print(generated_text)
+```
+
+## Online Inference
+
+Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
+
+```{important}
+A chat template is **required** to use Chat Completions API.
+
+Although most models come with a chat template, for others you have to define one yourself.
+The chat template can be inferred based on the documentation on the model's HuggingFace repo.
+For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found here: <gh-file:examples/template_llava.jinja>
+```
+
+### Image
+
+Image input is supported according to [OpenAI Vision API](https://platform.openai.com/docs/guides/vision).
+Here is a simple example using Phi-3.5-Vision.
+
+First, launch the OpenAI-compatible server:
+
+```bash
+vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
+  --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
+```
+
+Then, you can use the OpenAI client as follows:
+
+```python
+from openai import OpenAI
+
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+# Single-image input inference
+image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+chat_response = client.chat.completions.create(
+    model="microsoft/Phi-3.5-vision-instruct",
+    messages=[{
+        "role": "user",
+        "content": [
+            # NOTE: The prompt formatting with the image token `<image>` is not needed
+            # since the prompt will be processed automatically by the API server.
+            {"type": "text", "text": "What’s in this image?"},
+            {"type": "image_url", "image_url": {"url": image_url}},
+        ],
+    }],
+)
+print("Chat completion output:", chat_response.choices[0].message.content)
+
+# Multi-image input inference
+image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
+image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
+
+chat_response = client.chat.completions.create(
+    model="microsoft/Phi-3.5-vision-instruct",
+    messages=[{
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "What are the animals in these images?"},
+            {"type": "image_url", "image_url": {"url": image_url_duck}},
+            {"type": "image_url", "image_url": {"url": image_url_lion}},
+        ],
+    }],
+)
+print("Chat completion output:", chat_response.choices[0].message.content)
+```
+
+Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
+
+```{tip}
+Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
+and pass the file path as `url` in the API request.
+```
+
+```{tip}
+There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
+In fact, you can place image placeholders in the middle of the text by interleaving text and image content.
+```
+
+````{note}
+By default, the timeout for fetching images through HTTP URL is `5` seconds.
+You can override this by setting the environment variable:
+
+```console
+$ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
+```
+````
+
+### Video
+
+Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).
+
+First, launch the OpenAI-compatible server:
+
+```bash
+vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192
+```
+
+Then, you can use the OpenAI client as follows:
+```python
+from openai import OpenAI
+
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
+
+## Use video url in the payload
+chat_completion_from_url = client.chat.completions.create(
+    messages=[{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What's in this video?"
+            },
+            {
+                "type": "video_url",
+                "video_url": {
+                    "url": video_url
+                },
+            },
+        ],
+    }],
+    model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+    max_completion_tokens=64,
+)
+
+result = chat_completion_from_url.choices[0].message.content
+print("Chat completion output from video url:", result)
+```
+
+Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
+
+````{note}
+By default, the timeout for fetching videos through HTTP URL is `30` seconds.
+You can override this by setting the environment variable:
+
+```console
+$ export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
+```
+````
+
+### Audio
+
+Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in).
+Here is a simple example using Ultravox-v0.3.
+
+First, launch the OpenAI-compatible server:
+
+```bash
+vllm serve fixie-ai/ultravox-v0_3
+```
+
+Then, you can use the OpenAI client as follows:
+
+```python
+import base64
+import requests
+from openai import OpenAI
+from vllm.assets.audio import AudioAsset
+
+def encode_base64_content_from_url(content_url: str) -> str:
+    """Encode a content retrieved from a remote url to base64 format."""
+
+    with requests.get(content_url) as response:
+        response.raise_for_status()
+        result = base64.b64encode(response.content).decode('utf-8')
+
+    return result
+
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+# Any format supported by librosa is supported
+audio_url = AudioAsset("winning_call").url
+audio_base64 = encode_base64_content_from_url(audio_url)
+
+chat_completion_from_base64 = client.chat.completions.create(
+    messages=[{
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What's in this audio?"
+            },
+            {
+                "type": "input_audio",
+                "input_audio": {
+                    "data": audio_base64,
+                    "format": "wav"
+                },
+            },
+        ],
+    }],
+    model="fixie-ai/ultravox-v0_3",
+    max_completion_tokens=64,
+)
+
+result = chat_completion_from_base64.choices[0].message.content
+print("Chat completion output from input audio:", result)
+```
+
+Alternatively, you can pass {code}`audio_url`, which is the audio counterpart of {code}`image_url` for image input:
+
+```python
+chat_completion_from_url = client.chat.completions.create(
+    messages=[{
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What's in this audio?"
+            },
+            {
+                "type": "audio_url",
+                "audio_url": {
+                    "url": audio_url
+                },
+            },
+        ],
+    }],
+    model="fixie-ai/ultravox-v0_3",
+    max_completion_tokens=64,
+)
+
+result = chat_completion_from_url.choices[0].message.content
+print("Chat completion output from audio url:", result)
+```
+
+Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
+
+````{note}
+By default, the timeout for fetching audio files through an HTTP URL is `10` seconds.
+You can override this by setting the environment variable:
+
+```console
+$ export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
+```
+````
+
+### Embedding
+
+vLLM's Embeddings API is a superset of OpenAI's [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings),
+where a list of chat `messages` can be passed instead of batched `inputs`. This enables multi-modal inputs to be passed to embedding models.
+
+```{tip}
+The schema of `messages` is exactly the same as in Chat Completions API.
+You can refer to the above tutorials for more details on how to pass each type of multi-modal data.
+```
+
+Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images.
+Refer to the examples below for illustration.
+
+Here is an end-to-end example using VLM2Vec. To serve the model:
+
+```bash
+vllm serve TIGER-Lab/VLM2Vec-Full --task embed \
+  --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja
+```
+
+```{important}
+Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed`
+to run this model in embedding mode instead of text generation mode.
+
+The custom chat template is completely different from the original one for this model,
+and can be found here: <gh-file:examples/template_vlm2vec.jinja>
+```
+
+Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
+
+```python
+import requests
+
+image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+response = requests.post(
+    "http://localhost:8000/v1/embeddings",
+    json={
+        "model": "TIGER-Lab/VLM2Vec-Full",
+        "messages": [{
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": image_url}},
+                {"type": "text", "text": "Represent the given image."},
+            ],
+        }],
+        "encoding_format": "float",
+    },
+)
+response.raise_for_status()
+response_json = response.json()
+print("Embedding output:", response_json["data"][0]["embedding"])
+```
+
+Below is another example, this time using the `MrLight/dse-qwen2-2b-mrl-v1` model.
+
+```bash
+vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \
+  --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja
+```
+
+```{important}
+Like with VLM2Vec, we have to explicitly pass `--task embed`.
+
+Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
+by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja>
+```
+
+```{important}
+Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
+example below for details.
+```
+
+Full example: <gh-file:examples/openai_chat_embedding_client_for_multimodal.py>
diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst
deleted file mode 100644
index 680382e457cc5..0000000000000
--- a/docs/source/usage/multimodal_inputs.rst
+++ /dev/null
@@ -1,492 +0,0 @@
-.. _multimodal_inputs:
-
-Multimodal Inputs
-=================
-
-This page teaches you how to pass multi-modal inputs to :ref:`multi-modal models <supported_mm_models>` in vLLM.
-
-.. note::
-    We are actively iterating on multi-modal support. See `this RFC <https://github.com/vllm-project/vllm/issues/4194>`_ for upcoming changes,
-    and `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ if you have any feedback or feature requests.
-
-Offline Inference
------------------
-
-To input multi-modal data, follow this schema in :class:`vllm.inputs.PromptType`:
-
-* ``prompt``: The prompt should follow the format that is documented on HuggingFace.
-* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`.
-
-Image
-^^^^^
-
-You can pass a single image to the :code:`'image'` field of the multi-modal dictionary, as shown in the following examples:
-
-.. code-block:: python
-
-    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
-
-    # Refer to the HuggingFace repo for the correct format to use
-    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
-
-    # Load the image using PIL.Image
-    image = PIL.Image.open(...)
-
-    # Single prompt inference
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": {"image": image},
-    })
-
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-    # Batch inference
-    image_1 = PIL.Image.open(...)
-    image_2 = PIL.Image.open(...)
-    outputs = llm.generate(
-        [
-            {
-                "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
-                "multi_modal_data": {"image": image_1},
-            },
-            {
-                "prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
-                "multi_modal_data": {"image": image_2},
-            }
-        ]
-    )
-
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-A code example can be found in `examples/offline_inference_vision_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py>`_.
-
-To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
-
-.. code-block:: python
-
-    llm = LLM(
-        model="microsoft/Phi-3.5-vision-instruct",
-        trust_remote_code=True,  # Required to load Phi-3.5-vision
-        max_model_len=4096,  # Otherwise, it may not fit in smaller GPUs
-        limit_mm_per_prompt={"image": 2},  # The maximum number to accept
-    )
-
-    # Refer to the HuggingFace repo for the correct format to use
-    prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
-
-    # Load the images using PIL.Image
-    image1 = PIL.Image.open(...)
-    image2 = PIL.Image.open(...)
-
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": {
-            "image": [image1, image2]
-        },
-    })
-
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-A code example can be found in `examples/offline_inference_vision_language_multi_image.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py>`_.
-
-Multi-image input can be extended to perform video captioning. We show this with `Qwen2-VL <https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct>`_ as it supports videos:
-
-.. code-block:: python
-
-    # Specify the maximum number of frames per video to be 4. This can be changed.
-    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
-
-    # Create the request payload.
-    video_frames = ... # load your video making sure it only has the number of frames specified earlier.
-    message = {
-        "role": "user",
-        "content": [
-            {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
-        ],
-    }
-    for i in range(len(video_frames)):
-        base64_image = encode_image(video_frames[i]) # base64 encoding.
-        new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
-        message["content"].append(new_image)
-
-    # Perform inference and log output.
-    outputs = llm.chat([message])
-
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-Video
-^^^^^
-
-You can pass a list of NumPy arrays directly to the :code:`'video'` field of the multi-modal dictionary
-instead of using multi-image input.
-
-Please refer to `examples/offline_inference_vision_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py>`_ for more details.
-
-Audio
-^^^^^
-
-You can pass a tuple :code:`(array, sampling_rate)` to the :code:`'audio'` field of the multi-modal dictionary.
-
-Please refer to `examples/offline_inference_audio_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_audio_language.py>`_ for more details.
-
-Embedding
-^^^^^^^^^
-
-To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
-pass a tensor of shape :code:`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
-
-.. code-block:: python
-
-    # Inference with image embeddings as input
-    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
-
-    # Refer to the HuggingFace repo for the correct format to use
-    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
-
-    # Embeddings for single image
-    # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
-    image_embeds = torch.load(...)
-
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": {"image": image_embeds},
-    })
-
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
-
-.. code-block:: python
-
-    # Construct the prompt based on your model
-    prompt = ...
-
-    # Embeddings for multiple images
-    # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
-    image_embeds = torch.load(...)
-
-    # Qwen2-VL
-    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
-    mm_data = {
-        "image": {
-            "image_embeds": image_embeds,
-            # image_grid_thw is needed to calculate positional encoding.
-            "image_grid_thw": torch.load(...),  # torch.Tensor of shape (1, 3),
-        }
-    }
-
-    # MiniCPM-V
-    llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
-    mm_data = {
-        "image": {
-            "image_embeds": image_embeds,
-            # image_size_list is needed to calculate details of the sliced image.
-            "image_size_list": [image.size for image in images],  # list of image sizes
-        }
-    }
-
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": mm_data,
-    })
-
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-Online Inference
-----------------
-
-Our OpenAI-compatible server accepts multi-modal data via the `Chat Completions API <https://platform.openai.com/docs/api-reference/chat>`_.
-
-.. important::
-    A chat template is **required** to use Chat Completions API.
-
-    Although most models come with a chat template, for others you have to define one yourself.
-    The chat template can be inferred based on the documentation on the model's HuggingFace repo.
-    For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`__.
-
-Image
-^^^^^
-
-Image input is supported according to `OpenAI Vision API <https://platform.openai.com/docs/guides/vision>`_.
-Here is a simple example using Phi-3.5-Vision.
-
-First, launch the OpenAI-compatible server:
-
-.. code-block:: bash
-
-    vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
-      --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
-
-Then, you can use the OpenAI client as follows:
-
-.. code-block:: python
-
-    from openai import OpenAI
-
-    openai_api_key = "EMPTY"
-    openai_api_base = "http://localhost:8000/v1"
-
-    client = OpenAI(
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-
-    # Single-image input inference
-    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-
-    chat_response = client.chat.completions.create(
-        model="microsoft/Phi-3.5-vision-instruct",
-        messages=[{
-            "role": "user",
-            "content": [
-                # NOTE: The prompt formatting with the image token `<image>` is not needed
-                # since the prompt will be processed automatically by the API server.
-                {"type": "text", "text": "What’s in this image?"},
-                {"type": "image_url", "image_url": {"url": image_url}},
-            ],
-        }],
-    )
-    print("Chat completion output:", chat_response.choices[0].message.content)
-
-    # Multi-image input inference
-    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
-    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
-
-    chat_response = client.chat.completions.create(
-        model="microsoft/Phi-3.5-vision-instruct",
-        messages=[{
-            "role": "user",
-            "content": [
-                {"type": "text", "text": "What are the animals in these images?"},
-                {"type": "image_url", "image_url": {"url": image_url_duck}},
-                {"type": "image_url", "image_url": {"url": image_url_lion}},
-            ],
-        }],
-    )
-    print("Chat completion output:", chat_response.choices[0].message.content)
-
-A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py>`_.
-
-.. tip::
-    Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via ``--allowed-local-media-path`` when launching the API server/engine,
-    and pass the file path as ``url`` in the API request.
-
-.. tip::
-    There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
-    In fact, you can place image placeholders in the middle of the text by interleaving text and image content.
-
-.. note::
-
-    By default, the timeout for fetching images through HTTP URL is ``5`` seconds.
-    You can override this by setting the environment variable:
-
-    .. code-block:: console
-
-        $ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
-
-Video
-^^^^^
-
-Instead of :code:`image_url`, you can pass a video file via :code:`video_url`.
-
-You can use `these tests <https://github.com/vllm-project/vllm/blob/main/tests/entrypoints/openai/test_video.py>`_ as reference.
-
-.. note::
-
-    By default, the timeout for fetching videos through HTTP URL url is ``30`` seconds.
-    You can override this by setting the environment variable:
-
-    .. code-block:: console
-
-        $ export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
-
-Audio
-^^^^^
-
-Audio input is supported according to `OpenAI Audio API <https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in>`_.
-Here is a simple example using Ultravox-v0.3.
-
-First, launch the OpenAI-compatible server:
-
-.. code-block:: bash
-
-    vllm serve fixie-ai/ultravox-v0_3
-    
-Then, you can use the OpenAI client as follows:
-
-.. code-block:: python
-
-    import base64
-    import requests
-    from openai import OpenAI
-    from vllm.assets.audio import AudioAsset
-
-    def encode_base64_content_from_url(content_url: str) -> str:
-        """Encode a content retrieved from a remote url to base64 format."""
-
-        with requests.get(content_url) as response:
-            response.raise_for_status()
-            result = base64.b64encode(response.content).decode('utf-8')
-
-        return result
-
-    openai_api_key = "EMPTY"
-    openai_api_base = "http://localhost:8000/v1"
-
-    client = OpenAI(
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-
-    # Any format supported by librosa is supported
-    audio_url = AudioAsset("winning_call").url
-    audio_base64 = encode_base64_content_from_url(audio_url)
-
-    chat_completion_from_base64 = client.chat.completions.create(
-        messages=[{
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this audio?"
-                },
-                {
-                    "type": "input_audio",
-                    "input_audio": {
-                        "data": audio_base64,
-                        "format": "wav"
-                    },
-                },
-            ],
-        }],
-        model=model,
-        max_completion_tokens=64,
-    )
-
-    result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from input audio:", result)
-
-Alternatively, you can pass :code:`audio_url`, which is the audio counterpart of :code:`image_url` for image input:
-
-.. code-block:: python
-
-    chat_completion_from_url = client.chat.completions.create(
-        messages=[{
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this audio?"
-                },
-                {
-                    "type": "audio_url",
-                    "audio_url": {
-                        "url": audio_url
-                    },
-                },
-            ],
-        }],
-        model=model,
-        max_completion_tokens=64,
-    )
-
-    result = chat_completion_from_url.choices[0].message.content
-    print("Chat completion output from audio url:", result)
-
-A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py>`_.
-
-.. note::
-
-    By default, the timeout for fetching audios through HTTP URL is ``10`` seconds.
-    You can override this by setting the environment variable:
-
-    .. code-block:: console
-
-        $ export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
-
-Embedding
-^^^^^^^^^
-
-vLLM's Embeddings API is a superset of OpenAI's `Embeddings API <https://platform.openai.com/docs/api-reference/embeddings>`_,
-where a list of chat ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models.
-
-.. tip::
-    The schema of ``messages`` is exactly the same as in Chat Completions API.
-    You can refer to the above tutorials for more details on how to pass each type of multi-modal data.
-
-Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images.
-Refer to the examples below for illustration.
-
-Here is an end-to-end example using VLM2Vec. To serve the model:
-
-.. code-block:: bash
-
-    vllm serve TIGER-Lab/VLM2Vec-Full --task embed \
-      --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja
-
-.. important::
-
-    Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embed``
-    to run this model in embedding mode instead of text generation mode.
-
-    The custom chat template is completely different from the original one for this model,
-    and can be found `here <https://github.com/vllm-project/vllm/blob/main/examples/template_vlm2vec.jinja>`__.
-
-Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
-
-.. code-block:: python
-
-    import requests
-
-    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-
-    response = requests.post(
-        "http://localhost:8000/v1/embeddings",
-        json={
-            "model": "TIGER-Lab/VLM2Vec-Full",
-            "messages": [{
-                "role": "user",
-                "content": [
-                    {"type": "image_url", "image_url": {"url": image_url}},
-                    {"type": "text", "text": "Represent the given image."},
-                ],
-            }],
-            "encoding_format": "float",
-        },
-    )
-    response.raise_for_status()
-    response_json = response.json()
-    print("Embedding output:", response_json["data"][0]["embedding"])
-
-Below is another example, this time using the ``MrLight/dse-qwen2-2b-mrl-v1`` model.
-
-.. code-block:: bash
-
-    vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \
-      --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja
-
-.. important::
-
-    Like with VLM2Vec, we have to explicitly pass ``--task embed``.
-    
-    Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, which is handled
-    by `this custom chat template <https://github.com/vllm-project/vllm/blob/main/examples/template_dse_qwen2_vl.jinja>`__.
-
-.. important::
-
-    Also important, ``MrLight/dse-qwen2-2b-mrl-v1`` requires a placeholder image of the minimum image size for text query embeddings. See the full code 
-    example below for details.
-
-A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_embedding_client_for_multimodal.py>`_.
diff --git a/docs/source/usage/performance.rst b/docs/source/usage/performance.md
similarity index 54%
rename from docs/source/usage/performance.rst
rename to docs/source/usage/performance.md
index 23b5ab79a7378..f028e28627a9f 100644
--- a/docs/source/usage/performance.rst
+++ b/docs/source/usage/performance.md
@@ -1,16 +1,15 @@
-.. _performance:
+(performance)=
 
-Performance and Tuning
-======================
+# Performance and Tuning
+
+## Preemption
 
-Preemption
-----------
 Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
 The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes
 available again. When this occurs, the following warning is printed:
 
 ```
-WARNING 05-09 00:49:33 scheduler.py:1057] Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1
+WARNING 05-09 00:49:33 scheduler.py:1057] Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1
 ```
 
 While this mechanism ensures system robustness, preemption and recomputation can adversely affect end-to-end latency.
@@ -22,44 +21,44 @@ If you frequently encounter preemptions from the vLLM engine, consider the follo
 
 You can also monitor the number of preemption requests through Prometheus metrics exposed by the vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False.
 
-.. _chunked-prefill:
+(chunked-prefill)=
 
-Chunked Prefill
----------------
-vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests.
+## Chunked Prefill
 
-You can enable the feature by specifying ``--enable-chunked-prefill`` in the command line or setting ``enable_chunked_prefill=True`` in the LLM constructor.
+vLLM supports an experimental feature called chunked prefill. Chunked prefill allows large prefills to be split into smaller chunks and batched together with decode requests.
 
-.. code-block:: python
+You can enable the feature by specifying `--enable-chunked-prefill` in the command line or setting `enable_chunked_prefill=True` in the LLM constructor.
 
-    llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
-    # Set max_num_batched_tokens to tune performance.
-    # NOTE: 512 is the default max_num_batched_tokens for chunked prefill.
-    # llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512)
+```python
+llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
+# Set max_num_batched_tokens to tune performance.
+# NOTE: 512 is the default max_num_batched_tokens for chunked prefill.
+# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512)
+```
 
 By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch.
 This policy optimizes the TTFT (time to the first token), but incurs slower ITL (inter token latency) and inefficient GPU utilization.
 
 Once chunked prefill is enabled, the policy is changed to prioritize decode requests.
 It batches all pending decode requests to the batch before scheduling any prefill.
-When there are available token_budget (``max_num_batched_tokens``), it schedules pending prefills.
-If a last pending prefill request cannot fit into ``max_num_batched_tokens``, it chunks it.
+When there are available token_budget (`max_num_batched_tokens`), it schedules pending prefills.
+If a last pending prefill request cannot fit into `max_num_batched_tokens`, it chunks it.
 
 This policy has two benefits:
 
 - It improves ITL and generation decode because decode requests are prioritized.
 - It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch.
 
-You can tune the performance by changing ``max_num_batched_tokens``.
+You can tune the performance by changing `max_num_batched_tokens`.
 By default, it is set to 512, which has the best ITL on A100 in the initial benchmark (llama 70B and mixtral 8x22B).
-Smaller ``max_num_batched_tokens`` achieves better ITL because there are fewer prefills interrupting decodes.
-Higher ``max_num_batched_tokens`` achieves better TTFT as you can put more prefill to the batch.
+Smaller `max_num_batched_tokens` achieves better ITL because there are fewer prefills interrupting decodes.
+Higher `max_num_batched_tokens` achieves better TTFT as you can put more prefill to the batch.
 
-- If ``max_num_batched_tokens`` is the same as ``max_model_len``, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes).
-- Note that the default value (512) of ``max_num_batched_tokens`` is optimized for ITL, and it may have lower throughput than the default scheduler.
+- If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes).
+- Note that the default value (512) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler.
 
-We recommend you set ``max_num_batched_tokens > 2048`` for throughput.
+We recommend you set `max_num_batched_tokens > 2048` for throughput.
 
-See related papers for more details (https://arxiv.org/pdf/2401.08671 or https://arxiv.org/pdf/2308.16369).
+See related papers for more details (<https://arxiv.org/pdf/2401.08671> or <https://arxiv.org/pdf/2308.16369>).
 
-Please try out this feature and let us know your feedback via GitHub issues!
\ No newline at end of file
+Please try out this feature and let us know your feedback via GitHub issues!
diff --git a/docs/source/usage/spec_decode.md b/docs/source/usage/spec_decode.md
new file mode 100644
index 0000000000000..8302da81b6173
--- /dev/null
+++ b/docs/source/usage/spec_decode.md
@@ -0,0 +1,205 @@
+(spec-decode)=
+
+# Speculative decoding
+
+```{warning}
+Please note that speculative decoding in vLLM is not yet optimized and does
+not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
+The work to optimize it is ongoing and can be followed here: <gh-issue:4630>
+```
+
+```{warning}
+Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
+```
+
+This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM.
+Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference.
+
+## Speculating with a draft model
+
+The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
+
+```python
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+llm = LLM(
+    model="facebook/opt-6.7b",
+    tensor_parallel_size=1,
+    speculative_model="facebook/opt-125m",
+    num_speculative_tokens=5,
+)
+outputs = llm.generate(prompts, sampling_params)
+
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+To perform the same in online mode, launch the server:
+
+```bash
+python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
+    --seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \
+    --num_speculative_tokens 5 --gpu_memory_utilization 0.8
+```
+
+Then use a client:
+
+```python
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    # defaults to os.environ.get("OPENAI_API_KEY")
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+
+# Completion API
+stream = False
+completion = client.completions.create(
+    model=model,
+    prompt="The future of AI is",
+    echo=False,
+    n=1,
+    stream=stream,
+)
+
+print("Completion results:")
+if stream:
+    for c in completion:
+        print(c)
+else:
+    print(completion)
+```
+
+## Speculating by matching n-grams in the prompt
+
+The following code configures vLLM to use speculative decoding where proposals are generated by
+matching n-grams in the prompt. For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259)
+
+```python
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+llm = LLM(
+    model="facebook/opt-6.7b",
+    tensor_parallel_size=1,
+    speculative_model="[ngram]",
+    num_speculative_tokens=5,
+    ngram_prompt_lookup_max=4,
+)
+outputs = llm.generate(prompts, sampling_params)
+
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+## Speculating using MLP speculators
+
+The following code configures vLLM to use speculative decoding where proposals are generated by
+draft models that condition draft predictions on both context vectors and sampled tokens.
+For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or
+[this technical report](https://arxiv.org/abs/2404.19124).
+
+```python
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+llm = LLM(
+    model="meta-llama/Meta-Llama-3.1-70B-Instruct",
+    tensor_parallel_size=4,
+    speculative_model="ibm-fms/llama3-70b-accelerator",
+    speculative_draft_tensor_parallel_size=1,
+)
+outputs = llm.generate(prompts, sampling_params)
+
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+Note that these speculative models currently need to be run without tensor parallelism, although
+it is possible to run the main model using tensor parallelism (see example above). Since the
+speculative models are relatively small, we still see significant speedups. However, this
+limitation will be fixed in a future release.
+
+A variety of speculative models of this type are available on HF hub:
+
+- [llama-13b-accelerator](https://huggingface.co/ibm-fms/llama-13b-accelerator)
+- [llama3-8b-accelerator](https://huggingface.co/ibm-fms/llama3-8b-accelerator)
+- [codellama-34b-accelerator](https://huggingface.co/ibm-fms/codellama-34b-accelerator)
+- [llama2-70b-accelerator](https://huggingface.co/ibm-fms/llama2-70b-accelerator)
+- [llama3-70b-accelerator](https://huggingface.co/ibm-fms/llama3-70b-accelerator)
+- [granite-3b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator)
+- [granite-8b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator)
+- [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator)
+- [granite-20b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator)
+
+## Lossless guarantees of Speculative Decoding
+
+In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of
+speculative decoding, breaking down the guarantees into three key areas:
+
+1. **Theoretical Losslessness**
+   \- Speculative decoding sampling is theoretically lossless up to the precision limits of hardware numerics. Floating-point errors might
+   cause slight variations in output distributions, as discussed
+   in [Accelerating Large Language Model Decoding with Speculative Sampling](https://arxiv.org/pdf/2302.01318)
+
+2. **Algorithmic Losslessness**
+   \- vLLM’s implementation of speculative decoding is algorithmically validated to be lossless. Key validation tests include:
+
+   > - **Rejection Sampler Convergence**: Ensures that samples from vLLM’s rejection sampler align with the target
+   >   distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252)
+   > - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
+   >   without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
+   >   provides a lossless guarantee. Almost all of the tests in <gh-dir:tests/spec_decode/e2e>
+   >   verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291)
+
+3. **vLLM Logprob Stability**
+   \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the
+   same request across runs. For more details, see the FAQ section
+   titled *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs <faq>`.
+
+**Conclusion**
+
+While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding
+can occur due to the following factors:
+
+- **Floating-Point Precision**: Differences in hardware numerical precision may lead to slight discrepancies in the output distribution.
+- **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially
+  due to non-deterministic behavior in batched operations or numerical instability.
+
+**Mitigation Strategies**
+
+For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs <faq>`.
+
+## Resources for vLLM contributors
+
+- [A Hacker's Guide to Speculative Decoding in vLLM](https://www.youtube.com/watch?v=9wNAgpX6z_4)
+- [What is Lookahead Scheduling in vLLM?](https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a)
+- [Information on batch expansion](https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8)
+- [Dynamic speculative decoding](gh-issue:4565)
diff --git a/docs/source/usage/spec_decode.rst b/docs/source/usage/spec_decode.rst
deleted file mode 100644
index f1f1917f974bb..0000000000000
--- a/docs/source/usage/spec_decode.rst
+++ /dev/null
@@ -1,210 +0,0 @@
-.. _spec_decode:
-
-Speculative decoding
-====================
-
-.. warning::
-    Please note that speculative decoding in vLLM is not yet optimized and does
-    not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work
-    to optimize it is ongoing and can be followed in `this issue. <https://github.com/vllm-project/vllm/issues/4630>`_
-
-.. warning::
-    Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
-
-This document shows how to use `Speculative Decoding <https://x.com/karpathy/status/1697318534555336961>`_ with vLLM.
-Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference.
-
-Speculating with a draft model
-------------------------------
-
-The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
-
-.. code-block:: python
-
-    from vllm import LLM, SamplingParams
-
-    prompts = [
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-    llm = LLM(
-        model="facebook/opt-6.7b",
-        tensor_parallel_size=1,
-        speculative_model="facebook/opt-125m",
-        num_speculative_tokens=5,
-    )
-    outputs = llm.generate(prompts, sampling_params)
-
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-To perform the same with an online mode launch the server:
-
-.. code-block:: bash
-
-    python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
-        --seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \
-        --num_speculative_tokens 5 --gpu_memory_utilization 0.8
-
-Then use a client:
-
-.. code-block:: python
-
-    from openai import OpenAI
-
-    # Modify OpenAI's API key and API base to use vLLM's API server.
-    openai_api_key = "EMPTY"
-    openai_api_base = "http://localhost:8000/v1"
-
-    client = OpenAI(
-        # defaults to os.environ.get("OPENAI_API_KEY")
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-
-    models = client.models.list()
-    model = models.data[0].id
-
-    # Completion API
-    stream = False
-    completion = client.completions.create(
-        model=model,
-        prompt="The future of AI is",
-        echo=False,
-        n=1,
-        stream=stream,
-    )
-
-    print("Completion results:")
-    if stream:
-        for c in completion:
-            print(c)
-    else:
-        print(completion)
-
-Speculating by matching n-grams in the prompt
----------------------------------------------
-
-The following code configures vLLM to use speculative decoding where proposals are generated by
-matching n-grams in the prompt. For more information read `this thread. <https://x.com/joao_gante/status/1747322413006643259>`_
-
-.. code-block:: python
-
-    from vllm import LLM, SamplingParams
-
-    prompts = [
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-    llm = LLM(
-        model="facebook/opt-6.7b",
-        tensor_parallel_size=1,
-        speculative_model="[ngram]",
-        num_speculative_tokens=5,
-        ngram_prompt_lookup_max=4,
-    )
-    outputs = llm.generate(prompts, sampling_params)
-
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-Speculating using MLP speculators
----------------------------------
-
-The following code configures vLLM to use speculative decoding where proposals are generated by
-draft models that conditioning draft predictions on both context vectors and sampled tokens.
-For more information see `this blog <https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/>`_ or
-`this technical report <https://arxiv.org/abs/2404.19124>`_.
-
-.. code-block:: python
-
-    from vllm import LLM, SamplingParams
-
-    prompts = [
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-    llm = LLM(
-        model="meta-llama/Meta-Llama-3.1-70B-Instruct",
-        tensor_parallel_size=4,
-        speculative_model="ibm-fms/llama3-70b-accelerator",
-        speculative_draft_tensor_parallel_size=1,
-    )
-    outputs = llm.generate(prompts, sampling_params)
-
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-Note that these speculative models currently need to be run without tensor parallelism, although
-it is possible to run the main model using tensor parallelism (see example above). Since the
-speculative models are relatively small, we still see significant speedups. However, this
-limitation will be fixed in a future release.
-
-A variety of speculative models of this type are available on HF hub:
-
-* `llama-13b-accelerator <https://huggingface.co/ibm-fms/llama-13b-accelerator>`_
-* `llama3-8b-accelerator <https://huggingface.co/ibm-fms/llama3-8b-accelerator>`_
-* `codellama-34b-accelerator <https://huggingface.co/ibm-fms/codellama-34b-accelerator>`_
-* `llama2-70b-accelerator <https://huggingface.co/ibm-fms/llama2-70b-accelerator>`_
-* `llama3-70b-accelerator <https://huggingface.co/ibm-fms/llama3-70b-accelerator>`_
-* `granite-3b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator>`_
-* `granite-8b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator>`_
-* `granite-7b-instruct-accelerator <https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator>`_
-* `granite-20b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator>`_
-
-Lossless guarantees of Speculative Decoding
--------------------------------------------
-In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of 
-speculative decoding, breaking down the guarantees into three key areas:
-
-1. **Theoretical Losslessness**
-   - Speculative decoding sampling is theoretically lossless up to the precision limits of hardware numerics. Floating-point errors might 
-   cause slight variations in output distributions, as discussed 
-   in `Accelerating Large Language Model Decoding with Speculative Sampling <https://arxiv.org/pdf/2302.01318>`_
-
-2. **Algorithmic Losslessness**
-   - vLLM’s implementation of speculative decoding is algorithmically validated to be lossless. Key validation tests include:
-
-    - **Rejection Sampler Convergence**: Ensures that samples from vLLM’s rejection sampler align with the target 
-      distribution. `View Test Code <https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252>`_
-
-    - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
-      without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler, 
-      provides a lossless guarantee.  Almost all of the tests in `this directory <https://github.com/vllm-project/vllm/tree/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e>`_
-      verify this property using `this assertion implementation <https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291>`_
-
-3. **vLLM Logprob Stability**
-   - vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the 
-   same request across runs. For more details, see the FAQ section 
-   titled *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs <faq>`.
-
-
-**Conclusion**
-
-While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding 
-can occur due to following factors:
-
-- **Floating-Point Precision**: Differences in hardware numerical precision may lead to slight discrepancies in the output distribution.
-
-- **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially 
-  due to non-deterministic behavior in batched operations or numerical instability.
-
-**Mitigation Strategies**
-
-For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs <faq>`.
-
-Resources for vLLM contributors
--------------------------------
-* `A Hacker's Guide to Speculative Decoding in vLLM <https://www.youtube.com/watch?v=9wNAgpX6z_4>`_
-* `What is Lookahead Scheduling in vLLM? <https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a>`_
-* `Information on batch expansion <https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8>`_
-* `Dynamic speculative decoding <https://github.com/vllm-project/vllm/issues/4565>`_
diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md
new file mode 100644
index 0000000000000..3f5d9ffc26278
--- /dev/null
+++ b/docs/source/usage/structured_outputs.md
@@ -0,0 +1,260 @@
+(structured-outputs)=
+
+# Structured Outputs
+
+vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines) or [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer) as backends for the guided decoding.
+This document shows you some examples of the different options that are available to generate structured outputs.
+
+## Online Inference (OpenAI API)
+
+You can generate structured outputs using the OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API.
+
+The following parameters are supported, which must be added as extra parameters:
+
+- `guided_choice`: the output will be exactly one of the choices.
+- `guided_regex`: the output will follow the regex pattern.
+- `guided_json`: the output will follow the JSON schema.
+- `guided_grammar`: the output will follow the context free grammar.
+- `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding.
+- `guided_decoding_backend`: used to select the guided decoding backend to use.
+
+You can see the complete list of supported parameters on the [OpenAI Compatible Server](../serving/openai_compatible_server.md) page.
+
+Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one:
+
+```python
+from openai import OpenAI
+client = OpenAI(
+    base_url="http://localhost:8000/v1",
+    api_key="-",
+)
+
+completion = client.chat.completions.create(
+    model="Qwen/Qwen2.5-3B-Instruct",
+    messages=[
+        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+    ],
+    extra_body={"guided_choice": ["positive", "negative"]},
+)
+print(completion.choices[0].message.content)
+```
+
+The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
+
+```python
+completion = client.chat.completions.create(
+    model="Qwen/Qwen2.5-3B-Instruct",
+    messages=[
+        {
+            "role": "user",
+            "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
+        }
+    ],
+    extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]},
+)
+print(completion.choices[0].message.content)
+```
+
+One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
+For this we can use the `guided_json` parameter in two different ways:
+
+- Using directly a [JSON Schema](https://json-schema.org/)
+- Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option).
+
+The next example shows how to use the `guided_json` parameter with a Pydantic model:
+
+```python
+from pydantic import BaseModel
+from enum import Enum
+
+class CarType(str, Enum):
+    sedan = "sedan"
+    suv = "SUV"
+    truck = "Truck"
+    coupe = "Coupe"
+
+
+class CarDescription(BaseModel):
+    brand: str
+    model: str
+    car_type: CarType
+
+
+json_schema = CarDescription.model_json_schema()
+
+completion = client.chat.completions.create(
+    model="Qwen/Qwen2.5-3B-Instruct",
+    messages=[
+        {
+            "role": "user",
+            "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
+        }
+    ],
+    extra_body={"guided_json": json_schema},
+)
+print(completion.choices[0].message.content)
+```
+
+```{tip}
+While not strictly necessary, it's normally better to indicate in the prompt that a JSON needs to be generated, which fields it should contain, and how the LLM should fill them.
+This can improve the results notably in most cases.
+```
+
+Finally we have the `guided_grammar`, which is probably the most difficult one to use but it's really powerful, as it allows us to define complete languages like SQL queries.
+It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below:
+
+```python
+simplified_sql_grammar = """
+    ?start: select_statement
+
+    ?select_statement: "SELECT " column_list " FROM " table_name
+
+    ?column_list: column_name ("," column_name)*
+
+    ?table_name: identifier
+
+    ?column_name: identifier
+
+    ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+"""
+
+completion = client.chat.completions.create(
+    model="Qwen/Qwen2.5-3B-Instruct",
+    messages=[
+        {
+            "role": "user",
+            "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
+        }
+    ],
+    extra_body={"guided_grammar": simplified_sql_grammar},
+)
+print(completion.choices[0].message.content)
+```
+
+Full example: <gh-file:examples/openai_chat_completion_structured_outputs.py>
+
+## Experimental Automatic Parsing (OpenAI API)
+
+This section covers the OpenAI beta wrapper over the `client.chat.completions.create()` method that provides richer integrations with Python specific types.
+
+At the time of writing (`openai==1.54.4`), this is a "beta" feature in the OpenAI client library. Code reference can be found [here](https://github.com/openai/openai-python/blob/52357cff50bee57ef442e94d78a0de38b4173fc2/src/openai/resources/beta/chat/completions.py#L100-L104).
+
+For the following examples, vLLM was set up using `vllm serve meta-llama/Llama-3.1-8B-Instruct`
+
+Here is a simple example demonstrating how to get structured output using Pydantic models:
+
+```python
+from pydantic import BaseModel
+from openai import OpenAI
+
+
+class Info(BaseModel):
+    name: str
+    age: int
+
+
+client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
+completion = client.beta.chat.completions.parse(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
+    ],
+    response_format=Info,
+    extra_body=dict(guided_decoding_backend="outlines"),
+)
+
+message = completion.choices[0].message
+print(message)
+assert message.parsed
+print("Name:", message.parsed.name)
+print("Age:", message.parsed.age)
+```
+
+Output:
+
+```console
+ParsedChatCompletionMessage[Info](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Info(name='Cameron', age=28))
+Name: Cameron
+Age: 28
+```
+
+Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:
+
+```python
+from typing import List
+from pydantic import BaseModel
+from openai import OpenAI
+
+
+class Step(BaseModel):
+    explanation: str
+    output: str
+
+
+class MathResponse(BaseModel):
+    steps: List[Step]
+    final_answer: str
+
+
+client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
+completion = client.beta.chat.completions.parse(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    messages=[
+        {"role": "system", "content": "You are a helpful expert math tutor."},
+        {"role": "user", "content": "Solve 8x + 31 = 2."},
+    ],
+    response_format=MathResponse,
+    extra_body=dict(guided_decoding_backend="outlines"),
+)
+
+message = completion.choices[0].message
+print(message)
+assert message.parsed
+for i, step in enumerate(message.parsed.steps):
+    print(f"Step #{i}:", step)
+print("Answer:", message.parsed.final_answer)
+```
+
+Output:
+
+```console
+ParsedChatCompletionMessage[MathResponse](content='{ "steps": [{ "explanation": "First, let\'s isolate the term with the variable \'x\'. To do this, we\'ll subtract 31 from both sides of the equation.", "output": "8x + 31 - 31 = 2 - 31"}, { "explanation": "By subtracting 31 from both sides, we simplify the equation to 8x = -29.", "output": "8x = -29"}, { "explanation": "Next, let\'s isolate \'x\' by dividing both sides of the equation by 8.", "output": "8x / 8 = -29 / 8"}], "final_answer": "x = -29/8" }', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=MathResponse(steps=[Step(explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation.", output='8x + 31 - 31 = 2 - 31'), Step(explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.', output='8x = -29'), Step(explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8.", output='8x / 8 = -29 / 8')], final_answer='x = -29/8'))
+Step #0: explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation." output='8x + 31 - 31 = 2 - 31'
+Step #1: explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.' output='8x = -29'
+Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8." output='8x / 8 = -29 / 8'
+Answer: x = -29/8
+```
+
+## Offline Inference
+
+Offline inference allows for the same types of guided decoding.
+To use it, we'll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`.
+The main available options inside `GuidedDecodingParams` are:
+
+- `json`
+- `regex`
+- `choice`
+- `grammar`
+- `backend`
+- `whitespace_pattern`
+
+These parameters can be used in the same way as the parameters from the Online Inference examples above.
+One example for the usage of the `choice` parameter is shown below:
+
+```python
+from vllm import LLM, SamplingParams
+from vllm.sampling_params import GuidedDecodingParams
+
+llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
+
+guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
+sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
+outputs = llm.generate(
+    prompts="Classify this sentiment: vLLM is wonderful!",
+    sampling_params=sampling_params,
+)
+print(outputs[0].outputs[0].text)
+```
+
+Full example: <gh-file:examples/offline_inference_structured_outputs.py>
diff --git a/docs/source/usage/structured_outputs.rst b/docs/source/usage/structured_outputs.rst
deleted file mode 100644
index 484e1f17d191e..0000000000000
--- a/docs/source/usage/structured_outputs.rst
+++ /dev/null
@@ -1,267 +0,0 @@
-.. _structured_outputs:
-
-Structured Outputs
-==================
-
-vLLM supports the generation of structured outputs using `outlines <https://github.com/dottxt-ai/outlines>`_ or `lm-format-enforcer <https://github.com/noamgat/lm-format-enforcer>`_ as backends for the guided decoding.
-This document shows you some examples of the different options that are available to generate structured outputs. 
-
-
-Online Inference (OpenAI API)
------------------------------
-
-You can generate structured outputs using the OpenAI's `Completions <https://platform.openai.com/docs/api-reference/completions>`_ and `Chat <https://platform.openai.com/docs/api-reference/chat>`_  API.
-
-The following parameters are supported, which must be added as extra parameters:
-
-- ``guided_choice``: the output will be exactly one of the choices.
-- ``guided_regex``: the output will follow the regex pattern.
-- ``guided_json``: the output will follow the JSON schema.
-- ``guided_grammar``: the output will follow the context free grammar.
-- ``guided_whitespace_pattern``: used to override the default whitespace pattern for guided json decoding.
-- ``guided_decoding_backend``: used to select the guided decoding backend to use.
-
-You can see the complete list of supported parameters on the `OpenAI Compatible Server </../serving/openai_compatible_server.html>`_ page. 
-
-Now let´s see an example for each of the cases, starting with the ``guided_choice``, as it´s the easiest one: 
-
-.. code-block:: python
-
-    from openai import OpenAI
-    client = OpenAI(
-        base_url="http://localhost:8000/v1",
-        api_key="-",
-    )
-
-    completion = client.chat.completions.create(
-        model="Qwen/Qwen2.5-3B-Instruct",
-        messages=[
-            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
-        ],
-        extra_body={"guided_choice": ["positive", "negative"]},
-    )
-    print(completion.choices[0].message.content)
-
-
-The next example shows how to use the ``guided_regex``. The idea is to generate an email address, given a simple regex template: 
-
-.. code-block:: python
-
-    completion = client.chat.completions.create(
-        model="Qwen/Qwen2.5-3B-Instruct",
-        messages=[
-            {
-                "role": "user",
-                "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
-            }
-        ],
-        extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]},
-    )
-    print(completion.choices[0].message.content)
-
-One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. 
-For this we can use the ``guided_json`` parameter in two different ways:
-
-- Using directly a `JSON Schema <https://json-schema.org/>`_ 
-- Defining a `Pydantic model <https://docs.pydantic.dev/latest/>`_ and then extracting the JSON Schema from it (which is normally an easier option).
-
-The next example shows how to use the ``guided_json`` parameter with a Pydantic model:
-
-.. code-block:: python
-
-    from pydantic import BaseModel
-    from enum import Enum
-
-    class CarType(str, Enum):
-        sedan = "sedan"
-        suv = "SUV"
-        truck = "Truck"
-        coupe = "Coupe"
-
-
-    class CarDescription(BaseModel):
-        brand: str
-        model: str
-        car_type: CarType
-
-
-    json_schema = CarDescription.model_json_schema()
-
-    completion = client.chat.completions.create(
-        model="Qwen/Qwen2.5-3B-Instruct",
-        messages=[
-            {
-                "role": "user",
-                "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
-            }
-        ],
-        extra_body={"guided_json": json_schema},
-    )
-    print(completion.choices[0].message.content)
-
-.. tip::
-    While not strictly necessary, normally it´s better to indicate in the prompt that a JSON needs to be generated and which fields and how should the LLM fill them.
-    This can improve the results notably in most cases.
-
-
-Finally we have the ``guided_grammar``, which probably is the most difficult one to use but it´s really powerful, as it allows us to define complete languages like SQL queries.
-It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below:
-
-.. code-block:: python
-
-    simplified_sql_grammar = """
-        ?start: select_statement
-
-        ?select_statement: "SELECT " column_list " FROM " table_name
-
-        ?column_list: column_name ("," column_name)*
-
-        ?table_name: identifier
-
-        ?column_name: identifier
-
-        ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
-    """
-
-    completion = client.chat.completions.create(
-        model="Qwen/Qwen2.5-3B-Instruct",
-        messages=[
-            {
-                "role": "user",
-                "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
-            }
-        ],
-        extra_body={"guided_grammar": simplified_sql_grammar},
-    )
-    print(completion.choices[0].message.content)
-
-The complete code of the examples can be found on `examples/openai_chat_completion_structured_outputs.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_structured_outputs.py>`_.
-
-Experimental Automatic Parsing (OpenAI API)
---------------------------------------------
-
-This section covers the OpenAI beta wrapper over the ``client.chat.completions.create()`` method that provides richer integrations with Python specific types.
-
-At the time of writing (``openai==1.54.4``), this is a "beta" feature in the OpenAI client library. Code reference can be found `here <https://github.com/openai/openai-python/blob/52357cff50bee57ef442e94d78a0de38b4173fc2/src/openai/resources/beta/chat/completions.py#L100-L104>`_.
-
-For the following examples, vLLM was setup using ``vllm serve meta-llama/Llama-3.1-8B-Instruct``
-
-Here is a simple example demonstrating how to get structured output using Pydantic models:
-
-.. code-block:: python
-
-    from pydantic import BaseModel
-    from openai import OpenAI
-
-
-    class Info(BaseModel):
-        name: str
-        age: int
-
-
-    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
-    completion = client.beta.chat.completions.parse(
-        model="meta-llama/Llama-3.1-8B-Instruct",
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
-        ],
-        response_format=Info,
-        extra_body=dict(guided_decoding_backend="outlines"),
-    )
-
-    message = completion.choices[0].message
-    print(message)
-    assert message.parsed
-    print("Name:", message.parsed.name)
-    print("Age:", message.parsed.age)
-
-Output:
-
-.. code-block:: console
-
-    ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28))
-    Name: Cameron
-    Age: 28
-
-
-Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:
-
-.. code-block:: python
-
-    from typing import List
-    from pydantic import BaseModel
-    from openai import OpenAI
-
-
-    class Step(BaseModel):
-        explanation: str
-        output: str
-
-
-    class MathResponse(BaseModel):
-        steps: List[Step]
-        final_answer: str
-
-
-    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
-    completion = client.beta.chat.completions.parse(
-        model="meta-llama/Llama-3.1-8B-Instruct",
-        messages=[
-            {"role": "system", "content": "You are a helpful expert math tutor."},
-            {"role": "user", "content": "Solve 8x + 31 = 2."},
-        ],
-        response_format=MathResponse,
-        extra_body=dict(guided_decoding_backend="outlines"),
-    )
-
-    message = completion.choices[0].message
-    print(message)
-    assert message.parsed
-    for i, step in enumerate(message.parsed.steps):
-        print(f"Step #{i}:", step)
-    print("Answer:", message.parsed.final_answer)
-
-Output:
-
-.. code-block:: console
-
-    ParsedChatCompletionMessage[MathResponse](content='{ "steps": [{ "explanation": "First, let\'s isolate the term with the variable \'x\'. To do this, we\'ll subtract 31 from both sides of the equation.", "output": "8x + 31 - 31 = 2 - 31"}, { "explanation": "By subtracting 31 from both sides, we simplify the equation to 8x = -29.", "output": "8x = -29"}, { "explanation": "Next, let\'s isolate \'x\' by dividing both sides of the equation by 8.", "output": "8x / 8 = -29 / 8"}], "final_answer": "x = -29/8" }', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=MathResponse(steps=[Step(explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation.", output='8x + 31 - 31 = 2 - 31'), Step(explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.', output='8x = -29'), Step(explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8.", output='8x / 8 = -29 / 8')], final_answer='x = -29/8'))
-    Step #0: explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation." output='8x + 31 - 31 = 2 - 31'
-    Step #1: explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.' output='8x = -29'
-    Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8." output='8x / 8 = -29 / 8'
-    Answer: x = -29/8
-
-Offline Inference
------------------
-
-Offline inference allows for the same types of guided decoding.
-To use it, we´ll need to configure the guided decoding using the class ``GuidedDecodingParams`` inside ``SamplingParams``. 
-The main available options inside ``GuidedDecodingParams`` are: 
-
-- ``json`` 
-- ``regex`` 
-- ``choice``
-- ``grammar``
-- ``backend``
-- ``whitespace_pattern``
-
-These parameters can be used in the same way as the parameters from the Online Inference examples above. 
-One example for the usage of the ``choices`` parameter is shown below: 
-
-.. code-block:: python
-
-    from vllm import LLM, SamplingParams
-    from vllm.sampling_params import GuidedDecodingParams
-
-    llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
-
-    guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
-    sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
-    outputs = llm.generate(
-        prompts="Classify this sentiment: vLLM is wonderful!",
-        sampling_params=sampling_params,
-    )
-    print(outputs[0].outputs[0].text)
-
-A complete example with all options can be found in `examples/offline_inference_structured_outputs.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_structured_outputs.py>`_.
diff --git a/docs/source/usage/tool_calling.md b/docs/source/usage/tool_calling.md
index f8be023307b0c..34b26647a959f 100644
--- a/docs/source/usage/tool_calling.md
+++ b/docs/source/usage/tool_calling.md
@@ -170,6 +170,12 @@ Recommended flags: `--tool-call-parser granite --chat-template examples/tool_cha
 
 `examples/tool_chat_template_granite.jinja`: this is a modified chat template from the original on Huggingface. Parallel function calls are supported.
 
+* `ibm-granite/granite-3.1-8b-instruct`
+
+Recommended flags: `--tool-call-parser granite`
+
+The chat template from Huggingface can be used directly. Parallel function calls are supported.
+
 * `ibm-granite/granite-20b-functioncalling`
 
 Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja`
@@ -284,4 +290,3 @@ Then you can use this plugin in the command line like this.
     --tool-call-parser example \
     --chat-template <your chat template> \
 ```
-
diff --git a/docs/source/usage/usage_stats.md b/docs/source/usage/usage_stats.md
index a1e4b1c38acae..3d02fbab9216e 100644
--- a/docs/source/usage/usage_stats.md
+++ b/docs/source/usage/usage_stats.md
@@ -4,7 +4,7 @@ vLLM collects anonymous usage data by default to help the engineering team bette
 
 ## What data is collected?
 
-You can see the up to date list of data collected by vLLM in the [usage_lib.py](https://github.com/vllm-project/vllm/blob/main/vllm/usage/usage_lib.py).
+The list of data collected by the latest version of vLLM can be found here: <gh-file:vllm/usage/usage_lib.py>
 
 Here is an example as of v0.4.0:
 
@@ -47,7 +47,7 @@ tail ~/.config/vllm/usage_stats.json
 
 ## Opt-out of Usage Stats Collection
 
-You can opt-out of usage stats collection by setting the VLLM_NO_USAGE_STATS or DO_NOT_TRACK environment variable, or by creating a ~/.config/vllm/do_not_track file:
+You can opt-out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file:
 
 ```bash
 # Any of the following methods can disable usage stats collection
diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py
index 68b786961b14a..6fd74782a9aae 100644
--- a/examples/offline_inference_audio_language.py
+++ b/examples/offline_inference_audio_language.py
@@ -18,6 +18,10 @@
     2: "What sport and what nursery rhyme are referenced?"
 }
 
+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.
+
 
 # Ultravox 0.3
 def run_ultravox(question: str, audio_count: int):
@@ -33,6 +37,8 @@ def run_ultravox(question: str, audio_count: int):
                                            add_generation_prompt=True)
 
     llm = LLM(model=model_name,
+              max_model_len=4096,
+              max_num_seqs=5,
               trust_remote_code=True,
               limit_mm_per_prompt={"audio": audio_count})
     stop_token_ids = None
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index 6d0495fdd4054..d5a71862656e7 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -28,7 +28,7 @@ def run_aria(question: str, modality: str):
               tokenizer_mode="slow",
               trust_remote_code=True,
               dtype="bfloat16",
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
 
     prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>\n{question}"
               "<|im_end|>\n<|im_start|>assistant\n")
@@ -45,7 +45,7 @@ def run_blip2(question: str, modality: str):
     # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
     prompt = f"Question: {question} Answer:"
     llm = LLM(model="Salesforce/blip2-opt-2.7b",
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -57,7 +57,7 @@ def run_chameleon(question: str, modality: str):
     prompt = f"{question}<image>"
     llm = LLM(model="facebook/chameleon-7b",
               max_model_len=4096,
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -70,7 +70,7 @@ def run_fuyu(question: str, modality: str):
     llm = LLM(model="adept/fuyu-8b",
               max_model_len=2048,
               max_num_seqs=2,
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -85,7 +85,7 @@ def run_glm4v(question: str, modality: str):
               max_num_seqs=2,
               trust_remote_code=True,
               enforce_eager=True,
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     prompt = question
     stop_token_ids = [151329, 151336, 151338]
     return llm, prompt, stop_token_ids
@@ -101,7 +101,7 @@ def run_h2ovl(question: str, modality: str):
         model=model_name,
         trust_remote_code=True,
         max_model_len=8192,
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -134,7 +134,7 @@ def run_idefics3(question: str, modality: str):
                 "longest_edge": 3 * 364
             },
         },
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
     prompt = (
         f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
@@ -153,7 +153,7 @@ def run_internvl(question: str, modality: str):
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -180,7 +180,7 @@ def run_llava(question: str, modality: str):
 
     llm = LLM(model="llava-hf/llava-1.5-7b-hf",
               max_model_len=4096,
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -192,7 +192,7 @@ def run_llava_next(question: str, modality: str):
     prompt = f"[INST] <image>\n{question} [/INST]"
     llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
               max_model_len=8192,
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -205,7 +205,7 @@ def run_llava_next_video(question: str, modality: str):
     prompt = f"USER: <video>\n{question} ASSISTANT:"
     llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf",
               max_model_len=8192,
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -223,7 +223,7 @@ def run_llava_onevision(question: str, modality: str):
 
     llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
               max_model_len=16384,
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -239,7 +239,7 @@ def run_mantis(question: str, modality: str):
         model="TIGER-Lab/Mantis-8B-siglip-llama3",
         max_model_len=4096,
         hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
     stop_token_ids = [128009]
     return llm, prompt, stop_token_ids
@@ -266,7 +266,7 @@ def run_minicpmv(question: str, modality: str):
         max_model_len=4096,
         max_num_seqs=2,
         trust_remote_code=True,
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
     # NOTE The stop_token_ids are different for various versions of MiniCPM-V
     # 2.0
@@ -305,7 +305,7 @@ def run_mllama(question: str, modality: str):
         max_model_len=4096,
         max_num_seqs=16,
         enforce_eager=True,
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
     prompt = f"<|image|><|begin_of_text|>{question}"
@@ -323,7 +323,7 @@ def run_molmo(question, modality):
         model=model_name,
         trust_remote_code=True,
         dtype="bfloat16",
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
     prompt = question
@@ -343,7 +343,7 @@ def run_nvlm_d(question: str, modality: str):
         trust_remote_code=True,
         max_model_len=4096,
         tensor_parallel_size=4,
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -363,7 +363,7 @@ def run_paligemma(question: str, modality: str):
     # PaliGemma has special prompt format for VQA
     prompt = "caption en"
     llm = LLM(model="google/paligemma-3b-mix-224",
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -375,7 +375,7 @@ def run_paligemma2(question: str, modality: str):
     # PaliGemma 2 has special prompt format for VQA
     prompt = "caption en"
     llm = LLM(model="google/paligemma2-3b-ft-docci-448",
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -405,7 +405,7 @@ def run_phi3v(question: str, modality: str):
         max_num_seqs=2,
         # Note - mm_processor_kwargs can also be passed to generate/chat calls
         mm_processor_kwargs={"num_crops": 16},
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
     stop_token_ids = None
     return llm, prompt, stop_token_ids
@@ -420,7 +420,7 @@ def run_pixtral_hf(question: str, modality: str):
     llm = LLM(
         model=model_name,
         max_model_len=8192,
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
     prompt = f"<s>[INST]{question}\n[IMG][/INST]"
@@ -437,7 +437,7 @@ def run_qwen_vl(question: str, modality: str):
         trust_remote_code=True,
         max_model_len=1024,
         max_num_seqs=2,
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
     prompt = f"{question}Picture 1: <img></img>\n"
@@ -447,7 +447,6 @@ def run_qwen_vl(question: str, modality: str):
 
 # Qwen2-VL
 def run_qwen2_vl(question: str, modality: str):
-    assert modality == "image"
 
     model_name = "Qwen/Qwen2-VL-7B-Instruct"
 
@@ -460,11 +459,16 @@ def run_qwen2_vl(question: str, modality: str):
             "min_pixels": 28 * 28,
             "max_pixels": 1280 * 28 * 28,
         },
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
+
     prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+              f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
               f"{question}<|im_end|>\n"
               "<|im_start|>assistant\n")
     stop_token_ids = None
@@ -651,9 +655,9 @@ def main(args):
         ' (if enabled)')
 
     parser.add_argument(
-        '--mm-cache-preprocessor',
+        '--disable-mm-preprocessor-cache',
         action='store_true',
-        help='If True, enable caching of multi-modal preprocessor/mapper.')
+        help='If True, disables caching of multi-modal preprocessor/mapper.')
 
     parser.add_argument(
         '--time-generate',
diff --git a/examples/offline_inference_with_default_generation_config.py b/examples/offline_inference_with_default_generation_config.py
new file mode 100644
index 0000000000000..346bb80b1e23f
--- /dev/null
+++ b/examples/offline_inference_with_default_generation_config.py
@@ -0,0 +1,30 @@
+from vllm import LLM
+
+# Sample prompts to complete.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+# Create an LLM that uses the model's own default generation config.
+# `generation_config` is None by default, which keeps the behavior
+# consistent with the previous version.
+# To use the default generation config that comes from the model,
+# set `generation_config` to "auto", as done here.
+llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", generation_config="auto")
+
+# Load the default sampling parameters from the model.
+sampling_params = llm.get_default_sampling_params()
+# Override individual sampling parameters if needed (here: the temperature).
+sampling_params.temperature = 0.5
+
+# Generate texts from the prompts. The output is a list of RequestOutput
+# objects that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+# Print each prompt together with its first generated completion.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py
index 6a160fd70423f..213d075542e81 100644
--- a/examples/openai_chat_completion_client_for_multimodal.py
+++ b/examples/openai_chat_completion_client_for_multimodal.py
@@ -18,7 +18,6 @@
 import requests
 from openai import OpenAI
 
-from vllm.assets.audio import AudioAsset
 from vllm.utils import FlexibleArgumentParser
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
@@ -151,8 +150,66 @@ def run_multi_image() -> None:
     print("Chat completion output:", result)
 
 
+# Video input inference
+def run_video() -> None:
+    """Ask about a sample video, sent first by URL and then as base64 data."""
+    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
+    video_base64 = encode_base64_content_from_url(video_url)
+    ## Use video url in the payload: the server is given the link itself
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this video?"
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": video_url
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from video url:", result)
+
+    ## Use base64 encoded video in the payload, inlined as a data URL
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this video?"
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": f"data:video/mp4;base64,{video_base64}"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from base64 encoded video:", result)
+
+
 # Audio input inference
 def run_audio() -> None:
+    from vllm.assets.audio import AudioAsset
+
     audio_url = AudioAsset("winning_call").url
     audio_base64 = encode_base64_content_from_url(audio_url)
 
@@ -240,6 +297,7 @@ def run_audio() -> None:
     "text-only": run_text_only,
     "single-image": run_single_image,
     "multi-image": run_multi_image,
+    "video": run_video,
     "audio": run_audio,
 }
 
@@ -253,12 +311,11 @@ def main(args) -> None:
     parser = FlexibleArgumentParser(
         description='Demo on using OpenAI client for online inference with '
         'multimodal language models served with vLLM.')
-    parser.add_argument(
-        '--chat-type',
-        '-c',
-        type=str,
-        default="single-image",
-        choices=["text-only", "single-image", "multi-image", "audio"],
-        help='Conversation type with multimodal data.')
+    parser.add_argument('--chat-type',
+                        '-c',
+                        type=str,
+                        default="single-image",
+                        choices=list(example_function_map.keys()),
+                        help='Conversation type with multimodal data.')
     args = parser.parse_args()
     main(args)
diff --git a/examples/openai_cross_encoder_score.py b/examples/openai_cross_encoder_score.py
index a06af8df5d3fe..365a684d53f2b 100644
--- a/examples/openai_cross_encoder_score.py
+++ b/examples/openai_cross_encoder_score.py
@@ -20,9 +20,9 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3")
+
     args = parser.parse_args()
     api_url = f"http://{args.host}:{args.port}/score"
-
     model_name = args.model
 
     text_1 = "What is the capital of Brazil?"
diff --git a/examples/openai_pooling_client.py b/examples/openai_pooling_client.py
new file mode 100644
index 0000000000000..37ec8f2fb6be3
--- /dev/null
+++ b/examples/openai_pooling_client.py
@@ -0,0 +1,51 @@
+"""
+Example online usage of Pooling API.
+
+Run `vllm serve <model> --task <embed|classify|reward|score>`
+to start up the server in vLLM.
+"""
+import argparse
+import pprint
+
+import requests
+
+
+def post_http_request(prompt: dict, api_url: str) -> requests.Response:
+    """POST ``prompt`` as a JSON body to ``api_url`` and return the response."""
+    user_agent = {"User-Agent": "Test Client"}
+    return requests.post(api_url, headers=user_agent, json=prompt)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--model",
+                        type=str,
+                        default="jason9693/Qwen2.5-1.5B-apeach")
+
+    args = parser.parse_args()
+    api_url = f"http://{args.host}:{args.port}/pooling"
+    model_name = args.model
+
+    # Completions-style request: the text is passed as a plain "input" field.
+    prompt = {"model": model_name, "input": "vLLM is great!"}
+    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
+    print("Pooling Response:")
+    pprint.pprint(pooling_response.json())
+
+    # Chat-style request: the same text is wrapped in a "messages" list.
+    prompt = {
+        "model":
+        model_name,
+        "messages": [{
+            "role": "user",
+            "content": [{
+                "type": "text",
+                "text": "vLLM is great!"
+            }],
+        }]
+    }
+    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
+    print("Pooling Response:")
+    pprint.pprint(pooling_response.json())
diff --git a/pyproject.toml b/pyproject.toml
index c5a14ecf5aea9..45fa4bff4e680 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -83,7 +83,7 @@ exclude = [
 ]
 
 [tool.codespell]
-ignore-words-list = "dout, te, indicies, subtile"
+ignore-words-list = "dout, te, indicies, subtile, ElementE"
 skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
 
 [tool.isort]
diff --git a/requirements-build.txt b/requirements-build.txt
index 388b193403e88..fec01caaf25ef 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -4,6 +4,6 @@ ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
-torch==2.5.1; platform_machine != 'aarch64'
+torch==2.5.1
 wheel
 jinja2
diff --git a/requirements-common.txt b/requirements-common.txt
index bd2b4b7a01668..6c390bcfd18e6 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -11,15 +11,16 @@ protobuf # Required by LlamaTokenizer.
 fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
 fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
 aiohttp
-openai >= 1.45.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
+openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
 uvicorn[standard]
 pydantic >= 2.9  # Required for fastapi >= 0.113.0
-pillow  # Required for image processing
 prometheus_client >= 0.18.0
+pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.9, < 0.11
-outlines == 0.1.11
+outlines == 0.1.11 # Requires pytorch
+lark == 1.2.2
 xgrammar >= 0.1.6; platform_machine == "x86_64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
@@ -33,5 +34,6 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.8.1 # required for compressed-tensors
-depyf==0.18.0 # required for profiling and debugging torch.compile
+compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch
+depyf==0.18.0 # required for profiling and debugging with compilation config
+cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
diff --git a/requirements-cuda-arm64.txt b/requirements-cuda-arm64.txt
deleted file mode 100644
index bbcb5cb7012ce..0000000000000
--- a/requirements-cuda-arm64.txt
+++ /dev/null
@@ -1,3 +0,0 @@
---index-url https://download.pytorch.org/whl/nightly/cu124
-torchvision==0.22.0.dev20241215; platform_machine == 'aarch64'
-torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64'
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 5d4dee8c7129a..8002fbd8ee5b9 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -2,9 +2,9 @@
 -r requirements-common.txt
 
 # Dependencies for NVIDIA GPUs
-ray >= 2.9
+ray[default] >= 2.9
 nvidia-ml-py >= 12.560.30 # for pynvml package
-torch == 2.5.1; platform_machine != 'aarch64'
+torch == 2.5.1
 # These must be updated alongside torch
-torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.5.1
diff --git a/requirements-openvino.txt b/requirements-openvino.txt
index 95e5914757812..ac9d851d661b0 100644
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
@@ -4,5 +4,5 @@
 torch == 2.5.1 #  should be aligned with "common" vLLM torch version
 openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
 
-optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
-optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version
+optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version
+optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version
diff --git a/requirements-xpu.txt b/requirements-xpu.txt
index e41295792283f..42c6c321d040c 100644
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
@@ -9,8 +9,8 @@ setuptools-scm>=8
 wheel
 jinja2
 
-torch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl
-intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl
-oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl
+torch @ https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl
+intel-extension-for-pytorch @ https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl
+oneccl_bind_pt @ https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl
 
 triton-xpu == 3.0.0b1
diff --git a/setup.py b/setup.py
index fcfaa207c176a..61d2d710aa20e 100644
--- a/setup.py
+++ b/setup.py
@@ -455,9 +455,13 @@ def get_gaudi_sw_version():
 
 
 def get_vllm_version() -> str:
-    version = get_version(
-        write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
-    )
+    # TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236
+    try:
+        version = get_version(
+            write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
+        )
+    except LookupError:
+        version = "0.0.0"
 
     sep = "+" if "+" not in version else "."  # dev versions might contain +
 
@@ -466,7 +470,7 @@ def get_vllm_version() -> str:
             version += f"{sep}empty"
     elif _is_cuda():
         if envs.VLLM_USE_PRECOMPILED:
-            version += ".precompiled"
+            version += f"{sep}precompiled"
         else:
             cuda_version = str(get_nvcc_cuda_version())
             if cuda_version != MAIN_CUDA_VERSION:
@@ -630,6 +634,7 @@ def _read_requirements(filename: str) -> List[str]:
     ext_modules=ext_modules,
     extras_require={
         "tensorizer": ["tensorizer>=2.9.0"],
+        "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
         "audio": ["librosa", "soundfile"],  # Required for audio processing
         "video": ["decord"]  # Required for video processing
     },
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 11d05cefb7313..1c2193bb17a55 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -127,11 +127,6 @@ def test_models_distributed(
     if attention_backend:
         os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
 
-    # Import VLLM_USE_V1 dynamically to handle patching
-    from vllm.envs import VLLM_USE_V1
-    if VLLM_USE_V1 and distributed_executor_backend != "mp":
-        pytest.skip(f"Skip {distributed_executor_backend} for V1")
-
     dtype = "half"
     max_tokens = 5
 
diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py
index ed50ec6bbc9eb..ccb9906fc5c0f 100644
--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -10,7 +10,8 @@
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
+GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
 
 
 @pytest.fixture(scope="module")
@@ -26,11 +27,13 @@ def llm():
 
 
 @pytest.mark.skip_global_cleanup
-def test_guided_regex(sample_regex, llm):
-    sampling_params = SamplingParams(
-        temperature=0.8,
-        top_p=0.95,
-        guided_decoding=GuidedDecodingParams(regex=sample_regex))
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_regex(sample_regex, llm, guided_decoding_backend: str):
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     guided_decoding=GuidedDecodingParams(
+                                         regex=sample_regex,
+                                         backend=guided_decoding_backend))
     outputs = llm.generate(prompts=[
         f"Give an example IPv4 address with this regex: {sample_regex}"
     ] * 2,
@@ -50,11 +53,14 @@ def test_guided_regex(sample_regex, llm):
 
 
 @pytest.mark.skip_global_cleanup
-def test_guided_json_completion(sample_json_schema, llm):
-    sampling_params = SamplingParams(
-        temperature=1.0,
-        max_tokens=1000,
-        guided_decoding=GuidedDecodingParams(json=sample_json_schema))
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_json_completion(sample_json_schema, llm,
+                                guided_decoding_backend: str):
+    sampling_params = SamplingParams(temperature=1.0,
+                                     max_tokens=1000,
+                                     guided_decoding=GuidedDecodingParams(
+                                         json=sample_json_schema,
+                                         backend=guided_decoding_backend))
     outputs = llm.generate(prompts=[
         f"Give an example JSON for an employee profile "
         f"that fits this schema: {sample_json_schema}"
@@ -77,11 +83,14 @@ def test_guided_json_completion(sample_json_schema, llm):
 
 
 @pytest.mark.skip_global_cleanup
-def test_guided_complex_json_completion(sample_complex_json_schema, llm):
-    sampling_params = SamplingParams(
-        temperature=1.0,
-        max_tokens=1000,
-        guided_decoding=GuidedDecodingParams(json=sample_complex_json_schema))
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_complex_json_completion(sample_complex_json_schema, llm,
+                                        guided_decoding_backend: str):
+    sampling_params = SamplingParams(temperature=1.0,
+                                     max_tokens=1000,
+                                     guided_decoding=GuidedDecodingParams(
+                                         json=sample_complex_json_schema,
+                                         backend=guided_decoding_backend))
     outputs = llm.generate(prompts=[
         f"Give an example JSON for an assignment grade "
         f"that fits this schema: {sample_complex_json_schema}"
@@ -105,11 +114,14 @@ def test_guided_complex_json_completion(sample_complex_json_schema, llm):
 
 
 @pytest.mark.skip_global_cleanup
-def test_guided_definition_json_completion(sample_definition_json_schema, llm):
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_definition_json_completion(sample_definition_json_schema, llm,
+                                           guided_decoding_backend: str):
     sampling_params = SamplingParams(temperature=1.0,
                                      max_tokens=1000,
                                      guided_decoding=GuidedDecodingParams(
-                                         json=sample_definition_json_schema))
+                                         json=sample_definition_json_schema,
+                                         backend=guided_decoding_backend))
     outputs = llm.generate(prompts=[
         f"Give an example JSON for solving 8x + 7 = -23 "
         f"that fits this schema: {sample_definition_json_schema}"
@@ -133,11 +145,14 @@ def test_guided_definition_json_completion(sample_definition_json_schema, llm):
 
 
 @pytest.mark.skip_global_cleanup
-def test_guided_choice_completion(sample_guided_choice, llm):
-    sampling_params = SamplingParams(
-        temperature=0.8,
-        top_p=0.95,
-        guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_choice_completion(sample_guided_choice, llm,
+                                  guided_decoding_backend: str):
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     guided_decoding=GuidedDecodingParams(
+                                         choice=sample_guided_choice,
+                                         backend=guided_decoding_backend))
     outputs = llm.generate(
         prompts="The best language for type-safe systems programming is ",
         sampling_params=sampling_params,
@@ -156,13 +171,15 @@ def test_guided_choice_completion(sample_guided_choice, llm):
 
 
 @pytest.mark.skip_global_cleanup
-def test_guided_grammar(sample_sql_statements, llm):
-
-    sampling_params = SamplingParams(
-        temperature=0.8,
-        top_p=0.95,
-        max_tokens=1000,
-        guided_decoding=GuidedDecodingParams(grammar=sample_sql_statements))
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_grammar(sample_sql_statements, llm,
+                        guided_decoding_backend: str):
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     max_tokens=1000,
+                                     guided_decoding=GuidedDecodingParams(
+                                         grammar=sample_sql_statements,
+                                         backend=guided_decoding_backend))
     outputs = llm.generate(
         prompts=("Generate a sql state that select col_1 from "
                  "table_1 where it is equals to 1"),
@@ -218,15 +235,18 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm):
 
 
 @pytest.mark.skip_global_cleanup
-def test_guided_json_object(llm):
-    sampling_params = SamplingParams(
-        temperature=1.0,
-        max_tokens=100,
-        guided_decoding=GuidedDecodingParams(json_object=True))
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_json_object(llm, guided_decoding_backend: str):
+    sampling_params = SamplingParams(temperature=1.0,
+                                     max_tokens=100,
+                                     n=2,
+                                     guided_decoding=GuidedDecodingParams(
+                                         json_object=True,
+                                         backend=guided_decoding_backend))
 
     outputs = llm.generate(
-        prompts=("Generate a JSON object describing a person with name "
-                 "and age for John Smith who is 31 years old."),
+        prompts=("Generate a JSON object with curly braces for a person with "
+                 "name and age fields for John Smith who is 31 years old."),
         sampling_params=sampling_params,
         use_tqdm=True)
 
@@ -235,10 +255,11 @@ def test_guided_json_object(llm):
         assert output is not None
         assert isinstance(output, RequestOutput)
 
-        generated_text = output.outputs[0].text
-        print(generated_text)
-        assert generated_text is not None
+        for i in range(2):
+            generated_text = output.outputs[i].text
+            print(generated_text)
+            assert generated_text is not None
 
-        # Parse to verify it is valid JSON
-        parsed_json = json.loads(generated_text)
-        assert isinstance(parsed_json, dict)
+            # Parse to verify it is valid JSON
+            parsed_json = json.loads(generated_text)
+            assert isinstance(parsed_json, dict)
diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index 0a29d77e73abc..1116c0da1a6f0 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -74,6 +74,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
+        temperature=0.0,
         top_logprobs=5)
     assert len(chat_completion.choices) == 1
 
@@ -130,6 +131,7 @@ async def test_single_chat_session_audio_base64encoded(
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
+        temperature=0.0,
         top_logprobs=5)
     assert len(chat_completion.choices) == 1
 
@@ -150,6 +152,7 @@ async def test_single_chat_session_audio_base64encoded(
         model=model_name,
         messages=messages,
         max_completion_tokens=10,
+        temperature=0.0,
     )
     message = chat_completion.choices[0].message
     assert message.content is not None and len(message.content) >= 0
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 8d23a2be6f9bb..5e6499d8f563c 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -17,6 +17,8 @@
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
+GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
+
 
 @pytest.fixture(scope="module")
 def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
@@ -464,8 +466,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
 # will fail on the second `guided_decoding_backend` even when I swap their order
 # (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat(client: openai.AsyncOpenAI,
                                   guided_decoding_backend: str,
                                   sample_guided_choice):
@@ -482,6 +483,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=10,
+        temperature=0.7,
         extra_body=dict(guided_choice=sample_guided_choice,
                         guided_decoding_backend=guided_decoding_backend))
     choice1 = chat_completion.choices[0].message.content
@@ -496,6 +498,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=10,
+        temperature=0.7,
         extra_body=dict(guided_choice=sample_guided_choice,
                         guided_decoding_backend=guided_decoding_backend))
     choice2 = chat_completion.choices[0].message.content
@@ -504,8 +507,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_json_chat(client: openai.AsyncOpenAI,
                                 guided_decoding_backend: str,
                                 sample_json_schema):
@@ -552,8 +554,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_regex_chat(client: openai.AsyncOpenAI,
                                  guided_decoding_backend: str, sample_regex):
     messages = [{
@@ -611,8 +612,7 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
                                            guided_decoding_backend: str,
                                            sample_guided_choice):
@@ -644,8 +644,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_named_tool_use(client: openai.AsyncOpenAI,
                               guided_decoding_backend: str,
                               sample_json_schema):
@@ -679,7 +678,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
             "function": {
                 "name": "dummy_function_name"
             }
-        })
+        },
+        extra_body=dict(guided_decoding_backend=guided_decoding_backend))
     message = chat_completion.choices[0].message
     assert len(message.content) == 0
     json_string = message.tool_calls[0].function.arguments
@@ -714,6 +714,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
                 "name": "dummy_function_name"
             }
         },
+        extra_body=dict(guided_decoding_backend=guided_decoding_backend),
         stream=True)
 
     output = []
@@ -736,10 +737,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
-async def test_required_tool_use_not_yet_supported(
-        client: openai.AsyncOpenAI, guided_decoding_backend: str,
-        sample_json_schema):
+async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
+                                                   sample_json_schema):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -783,9 +782,7 @@ async def test_required_tool_use_not_yet_supported(
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
 async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
-                                                  guided_decoding_backend: str,
                                                   sample_json_schema):
     messages = [{
         "role": "system",
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index 9f2b77dde2a7f..b52a5b28c9cff 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -6,6 +6,7 @@
 import pytest_asyncio
 import requests
 
+from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 from ...utils import RemoteOpenAIServer
@@ -17,6 +18,8 @@
 @pytest.fixture(scope="module")
 def server():
     args = [
+        "--task",
+        "embed",
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
@@ -45,11 +48,14 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
     ]
 
     # test single embedding
-    embeddings = await client.embeddings.create(
+    embedding_response = await client.embeddings.create(
         model=model_name,
         input=input_texts,
         encoding_format="float",
     )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
     assert embeddings.id is not None
     assert len(embeddings.data) == 1
     assert len(embeddings.data[0].embedding) == 4096
@@ -59,11 +65,14 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
 
     # test using token IDs
     input_tokens = [1, 1, 1, 1, 1]
-    embeddings = await client.embeddings.create(
+    embedding_response = await client.embeddings.create(
         model=model_name,
         input=input_tokens,
         encoding_format="float",
     )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
     assert embeddings.id is not None
     assert len(embeddings.data) == 1
     assert len(embeddings.data[0].embedding) == 4096
@@ -80,11 +89,14 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
         "The cat sat on the mat.", "A feline was resting on a rug.",
         "Stars twinkle brightly in the night sky."
     ]
-    embeddings = await client.embeddings.create(
+    embedding_response = await client.embeddings.create(
         model=model_name,
         input=input_texts,
         encoding_format="float",
     )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
     assert embeddings.id is not None
     assert len(embeddings.data) == 3
     assert len(embeddings.data[0].embedding) == 4096
@@ -95,11 +107,14 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
     # test List[List[int]]
     input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                     [25, 32, 64, 77]]
-    embeddings = await client.embeddings.create(
+    embedding_response = await client.embeddings.create(
         model=model_name,
         input=input_tokens,
         encoding_format="float",
     )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
     assert embeddings.id is not None
     assert len(embeddings.data) == 4
     assert len(embeddings.data[0].embedding) == 4096
@@ -124,14 +139,16 @@ async def test_conversation_embedding(server: RemoteOpenAIServer,
         "content": "Stars twinkle brightly in the night sky.",
     }]
 
-    chat_response = requests.post(server.url_for("v1/embeddings"),
-                                  json={
-                                      "model": model_name,
-                                      "messages": messages,
-                                      "encoding_format": "float",
-                                  })
+    chat_response = requests.post(
+        server.url_for("v1/embeddings"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "encoding_format": "float",
+        },
+    )
     chat_response.raise_for_status()
-    chat_embeddings = chat_response.json()
+    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())
 
     tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
     prompt = tokenizer.apply_chat_template(
@@ -148,13 +165,15 @@ async def test_conversation_embedding(server: RemoteOpenAIServer,
         # To be consistent with chat
         extra_body={"add_special_tokens": False},
     )
-    completion_embeddings = completion_response.model_dump(mode="json")
+    completion_embeddings = EmbeddingResponse.model_validate(
+        completion_response.model_dump(mode="json"))
 
-    assert chat_embeddings.pop("id") is not None
-    assert completion_embeddings.pop("id") is not None
-    assert chat_embeddings.pop("created") <= completion_embeddings.pop(
-        "created")
-    assert chat_embeddings == completion_embeddings
+    assert chat_embeddings.id is not None
+    assert completion_embeddings.id is not None
+    assert chat_embeddings.created <= completion_embeddings.created
+    assert chat_embeddings.model_dump(
+        exclude={"id", "created"}) == (completion_embeddings.model_dump(
+            exclude={"id", "created"}))
 
 
 @pytest.mark.asyncio
@@ -204,10 +223,13 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
     ]
 
     # test single embedding
-    embeddings = await client.embeddings.create(
+    embedding_response = await client.embeddings.create(
         model=model_name,
         input=input_texts,
         extra_body={"truncate_prompt_tokens": 10})
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
     assert embeddings.id is not None
     assert len(embeddings.data) == 1
     assert len(embeddings.data[0].embedding) == 4096
@@ -219,10 +241,12 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
         1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
         9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
     ]
-    embeddings = await client.embeddings.create(
+    embedding_response = await client.embeddings.create(
         model=model_name,
         input=input_tokens,
         extra_body={"truncate_prompt_tokens": 10})
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
 
     assert embeddings.id is not None
     assert len(embeddings.data) == 1
@@ -241,10 +265,10 @@ async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
     ]
 
     with pytest.raises(openai.BadRequestError):
-        embeddings = await client.embeddings.create(
+        response = await client.embeddings.create(
             model=model_name,
             input=input_texts,
             extra_body={"truncate_prompt_tokens": 8193})
-        assert "error" in embeddings.object
+        assert "error" in response.object
         assert "truncate_prompt_tokens value is greater than max_model_len. "\
-               "Please, select a smaller truncation size." in embeddings.message
+               "Please, select a smaller truncation size." in response.message
diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/openai/test_pooling.py
new file mode 100644
index 0000000000000..9c49239398cd2
--- /dev/null
+++ b/tests/entrypoints/openai/test_pooling.py
@@ -0,0 +1,238 @@
+import base64
+
+import numpy as np
+import pytest
+import requests
+
+from vllm.entrypoints.openai.protocol import PoolingResponse
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
+DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--task",
+        "classify",
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--enforce-eager",
+        "--max-model-len",
+        "8192",
+        "--chat-template",
+        DUMMY_CHAT_TEMPLATE,
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
+    input_texts = [
+        "The chef prepared a delicious meal.",
+    ]
+
+    # test single pooling
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_texts,
+            "encoding_format": "float"
+        },
+    )
+    response.raise_for_status()
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert poolings.id is not None
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 2
+    assert poolings.usage.completion_tokens == 0
+    assert poolings.usage.prompt_tokens == 7
+    assert poolings.usage.total_tokens == 7
+
+    # test using token IDs
+    input_tokens = [1, 1, 1, 1, 1]
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_tokens,
+            "encoding_format": "float"
+        },
+    )
+    response.raise_for_status()
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert poolings.id is not None
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 2
+    assert poolings.usage.completion_tokens == 0
+    assert poolings.usage.prompt_tokens == 5
+    assert poolings.usage.total_tokens == 5
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
+    # test List[str]
+    input_texts = [
+        "The cat sat on the mat.", "A feline was resting on a rug.",
+        "Stars twinkle brightly in the night sky."
+    ]
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_texts,
+            "encoding_format": "float"
+        },
+    )
+    response.raise_for_status()
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert poolings.id is not None
+    assert len(poolings.data) == 3
+    assert len(poolings.data[0].data) == 2
+    assert poolings.usage.completion_tokens == 0
+    assert poolings.usage.prompt_tokens == 25
+    assert poolings.usage.total_tokens == 25
+
+    # test List[List[int]]
+    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
+                    [25, 32, 64, 77]]
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_tokens,
+            "encoding_format": "float"
+        },
+    )
+    response.raise_for_status()
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert poolings.id is not None
+    assert len(poolings.data) == 4
+    assert len(poolings.data[0].data) == 2
+    assert poolings.usage.completion_tokens == 0
+    assert poolings.usage.prompt_tokens == 17
+    assert poolings.usage.total_tokens == 17
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_conversation_pooling(server: RemoteOpenAIServer,
+                                    model_name: str):
+    messages = [{
+        "role": "user",
+        "content": "The cat sat on the mat.",
+    }, {
+        "role": "assistant",
+        "content": "A feline was resting on a rug.",
+    }, {
+        "role": "user",
+        "content": "Stars twinkle brightly in the night sky.",
+    }]
+
+    chat_response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "encoding_format": "float",
+        },
+    )
+    chat_response.raise_for_status()
+    chat_poolings = PoolingResponse.model_validate(chat_response.json())
+
+    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        chat_template=DUMMY_CHAT_TEMPLATE,
+        add_generation_prompt=True,
+        continue_final_message=False,
+        tokenize=False,
+    )
+    completions_response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": prompt,
+            "encoding_format": "float",
+            # To be consistent with chat
+            "add_special_tokens": False,
+        },
+    )
+    completions_response.raise_for_status()
+    completion_poolings = PoolingResponse.model_validate(
+        completions_response.json())
+
+    assert chat_poolings.id is not None
+    assert completion_poolings.id is not None
+    assert chat_poolings.created <= completion_poolings.created
+    assert chat_poolings.model_dump(
+        exclude={"id", "created"}) == (completion_poolings.model_dump(
+            exclude={"id", "created"}))
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_batch_base64_pooling(server: RemoteOpenAIServer,
+                                    model_name: str):
+    input_texts = [
+        "Hello my name is",
+        "The best thing about vLLM is that it supports many different models"
+    ]
+
+    float_response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "input": input_texts,
+            "model": model_name,
+            "encoding_format": "float",
+        },
+    )
+    float_response.raise_for_status()
+    responses_float = PoolingResponse.model_validate(float_response.json())
+
+    base64_response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "input": input_texts,
+            "model": model_name,
+            "encoding_format": "base64",
+        },
+    )
+    base64_response.raise_for_status()
+    responses_base64 = PoolingResponse.model_validate(base64_response.json())
+
+    decoded_responses_base64_data = []
+    for data in responses_base64.data:
+        decoded_responses_base64_data.append(
+            np.frombuffer(base64.b64decode(data.data),
+                          dtype="float32").tolist())
+
+    assert responses_float.data[0].data == decoded_responses_base64_data[0]
+    assert responses_float.data[1].data == decoded_responses_base64_data[1]
+
+    # Omitting encoding_format must give the same data as "float"
+    default_response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "input": input_texts,
+            "model": model_name,
+        },
+    )
+    default_response.raise_for_status()
+    responses_default = PoolingResponse.model_validate(default_response.json())
+
+    assert responses_float.data[0].data == responses_default.data[0].data
+    assert responses_float.data[1].data == responses_default.data[1].data
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 5b40a04db15ee..51b255bb2a6db 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -1,6 +1,7 @@
 import asyncio
 from contextlib import suppress
 from dataclasses import dataclass
+from typing import Optional
 from unittest.mock import MagicMock
 
 from vllm.config import MultiModalConfig
@@ -31,6 +32,10 @@ class MockModelConfig:
     multimodal_config = MultiModalConfig()
     hf_config = MockHFConfig()
     logits_processor_pattern = None
+    diff_sampling_param: Optional[dict] = None
+
+    def get_diff_sampling_param(self):
+        return self.diff_sampling_param or {}
 
 
 @dataclass
@@ -94,3 +99,59 @@ def test_serving_chat_should_set_correct_max_tokens():
         asyncio.run(serving_chat.create_chat_completion(req))
 
     assert mock_engine.generate.call_args.args[1].max_tokens == 10
+
+
+def test_serving_chat_could_load_correct_generation_config():
+
+    mock_model_config = MockModelConfig()
+    mock_model_config.diff_sampling_param = {
+        "temperature": 0.5,
+        "repetition_penalty": 1.05
+    }
+
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+
+    # Initialize the serving chat
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     BASE_MODEL_PATHS,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     lora_modules=None,
+                                     prompt_adapters=None,
+                                     request_logger=None)
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{
+            "role": "user",
+            "content": "what is 1+1?"
+        }],
+        guided_decoding_backend="outlines",
+    )
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].temperature == 0.5
+    assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05
+
+    # Test the param when user set it
+    req.temperature = 0.1
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].temperature == 0.1
+    assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05
+
+    # Test When temperature==0.0
+    req.temperature = 0.0
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].temperature == 0.0
+    assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05
diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py
index 294b250362699..e73449e406739 100644
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@@ -82,6 +82,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
+        temperature=0.0,
         top_logprobs=5)
     assert len(chat_completion.choices) == 1
 
@@ -174,6 +175,7 @@ async def test_single_chat_session_video_base64encoded(
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
+        temperature=0.0,
         top_logprobs=5)
     assert len(chat_completion.choices) == 1
 
@@ -194,6 +196,7 @@ async def test_single_chat_session_video_base64encoded(
         model=model_name,
         messages=messages,
         max_completion_tokens=10,
+        temperature=0.0,
     )
     message = chat_completion.choices[0].message
     assert message.content is not None and len(message.content) >= 0
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index a0b6edd566561..5f070ba3b12e9 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -83,6 +83,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
+        temperature=0.0,
         top_logprobs=5)
     assert len(chat_completion.choices) == 1
 
@@ -175,6 +176,7 @@ async def test_single_chat_session_image_base64encoded(
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
+        temperature=0.0,
         top_logprobs=5)
     assert len(chat_completion.choices) == 1
 
@@ -195,6 +197,7 @@ async def test_single_chat_session_image_base64encoded(
         model=model_name,
         messages=messages,
         max_completion_tokens=10,
+        temperature=0.0,
     )
     message = chat_completion.choices[0].message
     assert message.content is not None and len(message.content) >= 0
diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
index 43c63daacb17f..3731b2dcdeae1 100644
--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -1,9 +1,9 @@
 from typing import Dict
 
 import pytest
-import pytest_asyncio
 import requests
 
+from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.multimodal.utils import encode_image_base64, fetch_image
 
 from ...utils import VLLM_PATH, RemoteOpenAIServer
@@ -46,12 +46,6 @@ def server():
         yield remote_server
 
 
-@pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
-        yield async_client
-
-
 @pytest.fixture(scope="session")
 def base64_encoded_image() -> Dict[str, str]:
     return {
@@ -82,18 +76,20 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
         ],
     }]
 
-    response = requests.post(server.url_for("v1/embeddings"),
-                             json={
-                                 "model": model_name,
-                                 "messages": messages,
-                                 "encoding_format": "float"
-                             })
+    response = requests.post(
+        server.url_for("v1/embeddings"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "encoding_format": "float"
+        },
+    )
     response.raise_for_status()
-
-    embeddings = response.json()
-    assert embeddings["id"] is not None
-    assert len(embeddings["data"]) == 1
-    assert len(embeddings["data"][0]["embedding"]) == 3072
-    assert embeddings["usage"]["completion_tokens"] == 0
-    assert embeddings["usage"]["prompt_tokens"] == 765
-    assert embeddings["usage"]["total_tokens"] == 765
+    embeddings = EmbeddingResponse.model_validate(response.json())
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 3072
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 765
+    assert embeddings.usage.total_tokens == 765
diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/test_block_fp8.py
new file mode 100644
index 0000000000000..a16cc4582a180
--- /dev/null
+++ b/tests/kernels/test_block_fp8.py
@@ -0,0 +1,265 @@
+# Adapted from https://github.com/sgl-project/sglang/pull/2575
+import itertools
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8, w8a8_block_fp8_matmul)
+from vllm.platforms import current_platform
+
+if current_platform.get_device_capability() < (9, 0):
+    pytest.skip("FP8 Triton requires compute capability 9.0 or higher",
+                allow_module_level=True)
+
+# Test configurations
+DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
+NUM_TOKENS = [7, 83, 2048]
+D = [512, 4096, 5120, 13824]
+GROUP_SIZE = [64, 128, 256, 512]
+M = [1, 7, 83, 512, 2048]
+N = [128, 512, 1024, 4096, 7748, 13824]
+K = [256, 4096, 5120, 3884, 13824]
+# Deepseek-V3's intermediate size is 18432, so N is 18432*2/8=4608 at TP8,
+# and its hidden size is 7168.
+M_moe = [1, 7, 83, 512, 2048]
+N_moe = [4608]  # [128, 4608, 13824]
+K_moe = [7168]  # [256, 7168, 13824]
+BLOCK_SIZE = [[128, 128]]
+E = [256]  # [8, 24, 128, 256]
+TOP_KS = [1]  # [1, 2, 6]
+OUT_DTYPES = [torch.bfloat16]  # [torch.float32, torch.half, torch.bfloat16]
+SEEDS = [0]
+
+
+def native_per_token_group_quant_fp8(x,
+                                     group_size,
+                                     eps=1e-10,
+                                     dtype=torch.float8_e4m3fn):
+    """Quantize `x` per group of `group_size` elements along the last dim
+    using native torch; returns (quantized tensor, per-group scales)."""
+    assert x.shape[-1] % group_size == 0, ("the last dimension of `x` must "
+                                           "be divisible by `group_size`")
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    finfo = torch.finfo(dtype)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    x_ = x.reshape(x.numel() // group_size, group_size)
+    amax = x_.abs().max(dim=-1,
+                        keepdim=True)[0].clamp(min=eps).to(torch.float32)
+    x_s = amax / fp8_max
+    x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype)
+    x_q = x_q.reshape(x.shape)
+    x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, ))
+
+    return x_q, x_s
+
+
+def native_w8a8_block_fp8_matmul(A,
+                                 B,
+                                 As,
+                                 Bs,
+                                 block_size,
+                                 output_dtype=torch.float16):
+    """Matrix multiplication with block-wise quantization using native torch."""
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+    assert A.shape[-1] == B.shape[-1]
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+
+    M = A.numel() // A.shape[-1]
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (N, )
+    A = A.reshape(M, A.shape[-1])
+    As = As.reshape(M, As.shape[-1])
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+    assert n_tiles == Bs.shape[0]
+    assert k_tiles == Bs.shape[1]
+
+    C_shape = (M, N)
+    C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+    A_tiles = [
+        A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles)
+    ]
+    B_tiles = [[
+        B[j * block_n:min((j + 1) * block_n, N),
+          i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles)
+    ] for j in range(n_tiles)]
+    C_tiles = [
+        C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles)
+    ]
+    As_tiles = [As[:, i:i + 1] for i in range(k_tiles)]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            a = A_tiles[i]
+            b = B_tiles[j][i]
+            c = C_tiles[j]
+            s = As_tiles[i] * Bs[j][i]
+            c[:, :] += torch.matmul(a, b.t()) * s
+
+    C = C.reshape(origin_C_shape).to(output_dtype)
+    return C
+
+
+def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
+    """Fused moe with block-wise quantization using native torch."""
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+
+    _, block_k = block_shape[0], block_shape[1]
+    a_q, a_s = native_per_token_group_quant_fp8(a, block_k)
+    a_q = a_q.to(torch.float32)
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            inter_out = native_w8a8_block_fp8_matmul(a_q[mask],
+                                                     w1[i],
+                                                     a_s[mask],
+                                                     w1_s[i],
+                                                     block_shape,
+                                                     output_dtype=a.dtype)
+            act_out = SiluAndMul().forward_native(inter_out)
+            act_out_q, act_out_s = native_per_token_group_quant_fp8(
+                act_out, block_k)
+            act_out = act_out.to(torch.float32)
+            out[mask] = native_w8a8_block_fp8_matmul(act_out_q,
+                                                     w2[i],
+                                                     act_out_s,
+                                                     w2_s[i],
+                                                     block_shape,
+                                                     output_dtype=a.dtype)
+    return (out.view(B, -1, w2.shape[1]) *
+            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
+
+
+# Skip all tests if CUDA is not available
+pytest.importorskip("torch.cuda")
+
+
+@pytest.fixture(autouse=True)
+def setup_cuda():
+    torch.set_default_device("cuda")
+
+
+@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed",
+                         itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE,
+                                           SEEDS))
+@torch.inference_mode()
+def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
+    torch.manual_seed(seed)
+    x = torch.rand(num_tokens, d, dtype=dtype)
+
+    ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size)
+    out, scale = per_token_group_quant_fp8(x, group_size)
+
+    assert torch.allclose(out.to(torch.float32),
+                          ref_out.to(torch.float32),
+                          rtol=0.15)
+    assert torch.allclose(scale, ref_scale)
+
+
+@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed",
+                         itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES,
+                                           SEEDS))
+@torch.inference_mode()
+def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
+    torch.manual_seed(seed)
+    factor_for_scale = 1e-2
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+    A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+    B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+    B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+
+    As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale
+    Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
+
+    ref_out = native_w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size,
+                                           out_dtype)
+    out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+
+    rel_diff = (torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
+                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    assert rel_diff < 0.001
+
+
+@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed",
+                         itertools.product(M_moe, N_moe, K_moe, E, TOP_KS,
+                                           BLOCK_SIZE, DTYPES, SEEDS))
+@torch.inference_mode()
+def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
+    torch.manual_seed(seed)
+    factor_for_scale = 1e-2
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    a = torch.randn((M, K), dtype=dtype) / 10
+
+    w1_bf16 = (torch.rand(
+        (E, 2 * N, K), dtype=torch.bfloat16) - 0.5) * 2 * fp8_max
+    w1 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    del w1_bf16
+
+    w2_bf16 = (torch.rand((E, K, N), dtype=torch.bfloat16) - 0.5) * 2 * fp8_max
+    w2 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    del w2_bf16
+
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles_w1 = (2 * N + block_n - 1) // block_n
+    n_tiles_w2 = (K + block_n - 1) // block_n
+    k_tiles_w1 = (K + block_k - 1) // block_k
+    k_tiles_w2 = (N + block_k - 1) // block_k
+
+    w1_s = torch.rand(
+        (E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) * factor_for_scale
+    w2_s = torch.rand(
+        (E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) * factor_for_scale
+
+    score = torch.randn((M, E), dtype=dtype)
+
+    out = fused_moe(
+        a,
+        w1,
+        w2,
+        score,
+        topk,
+        renormalize=False,
+        use_fp8_w8a8=True,
+        w1_scale=w1_s,
+        w2_scale=w2_s,
+        block_shape=block_size,
+    )
+    ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk,
+                                       block_size)
+
+    print(f"{out.sum()=}")
+    print(f"{ref_out.sum()=}")
+
+    rel_diff = (torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
+                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    assert rel_diff < 0.03
diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py
new file mode 100644
index 0000000000000..4316d6ab30e33
--- /dev/null
+++ b/tests/kernels/test_semi_structured.py
@@ -0,0 +1,134 @@
+"""Tests for sparse cutlass kernels
+
+Run `pytest tests/kernels/test_semi_structured.py`.
+"""
+from typing import Optional, Tuple, Type
+
+import pytest
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    sparse_cutlass_supported)
+from vllm.platforms import current_platform
+
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+
+capability = current_platform.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+
+
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    return torch.round(tensor.clamp(
+        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
+
+
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:
+    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
+
+
+def rand_int8(shape: tuple, device: str = "cuda") -> torch.Tensor:
+    return to_int8(torch.rand(shape, device=device) * 255 - 128)
+
+
+def to_bf16(tensor: torch.Tensor) -> torch.Tensor:
+    return tensor.to(dtype=torch.bfloat16)
+
+
+def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
+    return tensor.to(dtype=torch.float16)
+
+
+def prune_to_2_4(tensor: torch.Tensor) -> torch.Tensor:
+    # View the tensor as [N, 4], where N is the number of groups of 4
+    original_shape = tensor.shape
+    reshaped = tensor.reshape(-1, 4)
+
+    # Indices of the 2 largest-magnitude values in each group of 4
+    _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1)
+
+    # Binary mask: 1 at the two kept positions per group, 0 elsewhere
+    mask = torch.zeros_like(reshaped)
+    mask.scatter_(dim=1,
+                  index=indices,
+                  src=torch.ones_like(indices, dtype=mask.dtype))
+
+    # Zero out the masked-off entries (still in the [N, 4] view)
+    pruned = reshaped * mask
+
+    # Turn all -0.0 to 0.0 (the comparison matches every zero entry)
+    pruned[pruned == -0.0] = 0.0
+
+    return pruned.reshape(original_shape)
+
+
+def make_rand_sparse_tensors(
+        dtype: torch.dtype, m: int, n: int, k: int
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    a = torch.randn((m, k), device='cuda') * 5
+    b = torch.randn((n, k), device='cuda').t() * 5
+
+    b = prune_to_2_4(b.t()).t()
+
+    if dtype == torch.int8:
+        a, b = to_int8(a), to_int8(b)
+    elif dtype == torch.float8_e4m3fn:
+        a, b = to_fp8(a), to_fp8(b)
+    elif dtype == torch.float16:
+        a, b = to_fp16(a), to_fp16(b)
+    elif dtype == torch.bfloat16:
+        a, b = to_bf16(a), to_bf16(b)
+    else:
+        raise ValueError("unsupported dtype")
+
+    b_compressed, e = ops.cutlass_sparse_compress(b.t())
+
+    # Compressed B, Metadata, Original A, B
+    return b_compressed, e, a, b
+
+
+def baseline_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: Type[torch.dtype],
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    output = (scale_a * (scale_b * (torch.mm(
+        a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype)
+    if bias is not None:
+        output = output + bias
+
+    return output
+
+
+@pytest.mark.skipif(not sparse_cutlass_supported(),
+                    reason="Sparse FP8 is not yet supported on this GPU type.")
+# Test working with a subset of A and B for sparse matmul
+def test_cutlass_sparse_subset():
+
+    big_m = 1024
+    m, n, k = 512, 512, 512
+
+    # Create tensors
+    b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn,
+                                                     big_m, n, k)
+    a = whole_a[0:m, 0:k]
+    scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
+    scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
+
+    out = ops.cutlass_scaled_sparse_mm(a,
+                                       b_comp,
+                                       e,
+                                       scale_a,
+                                       scale_b,
+                                       out_dtype=torch.bfloat16)
+    baseline = baseline_scaled_mm(a,
+                                  b,
+                                  scale_a,
+                                  scale_b,
+                                  out_dtype=torch.bfloat16)
+
+    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py
index 96b0e58713332..718730bb8cbbe 100644
--- a/tests/kv_transfer/test_lookup_buffer.py
+++ b/tests/kv_transfer/test_lookup_buffer.py
@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
         assert buffer.buffer_size == 0
         assert len(buffer.buffer) == 0
 
-    print("Test run passed!")
+    print("My rank: %d, Test run passed!" % (my_rank))
 
 
 def stress_test(my_rank, buf, device):
@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
     else:
         torch.distributed.send(torch.tensor([n]), 0)
 
-    print("Passed stress test!")
+    print("My rank: %d, Passed stress test!" % (my_rank))
 
 
 if __name__ == "__main__":
diff --git a/tests/kv_transfer/test_lookup_buffer.sh b/tests/kv_transfer/test_lookup_buffer.sh
index 09d7ee018c3f4..f2aeaee9ca6d5 100644
--- a/tests/kv_transfer/test_lookup_buffer.sh
+++ b/tests/kv_transfer/test_lookup_buffer.sh
@@ -1,3 +1,8 @@
 #!/bin/bash
-RANK=0 python test_lookup_buffer.py &
-RANK=1 python test_lookup_buffer.py &
\ No newline at end of file
+RANK=0 python3 test_lookup_buffer.py &
+PID0=$!
+RANK=1 python3 test_lookup_buffer.py &
+PID1=$!
+
+wait $PID0
+wait $PID1
diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py
index 65973bf10a4d7..4beba4dc05dde 100644
--- a/tests/kv_transfer/test_send_recv.py
+++ b/tests/kv_transfer/test_send_recv.py
@@ -10,39 +10,42 @@
 
 
 def test_run(my_rank, pipe):
+    print(f"rank {my_rank} test_run starts....")
     # test run
     x = torch.tensor([1]).to(pipe.device)
     y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device)
     if my_rank == 0:
         pipe.send_tensor(x)
-        print("sent tensor x")
+        print(f"rank {my_rank} sent tensor x")
         pipe.send_tensor(y)
-        print("sent tensor y")
+        print(f"rank {my_rank} sent tensor y")
         x2 = pipe.recv_tensor()
-        print("received x2 = ", x2)
+        print(f"rank {my_rank} received x2 = ", x2)
         y2 = pipe.recv_tensor()
-        print("received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", y2)
 
     else:
         x2 = pipe.recv_tensor()
-        print("received x2 = ", x2)
+        print(f"rank {my_rank} received x2 = ", x2)
         y2 = pipe.recv_tensor()
-        print("received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", y2)
         pipe.send_tensor(x)
-        print("sent tensor x")
+        print(f"rank {my_rank} sent tensor x")
         pipe.send_tensor(y)
-        print("sent tensor y")
+        print(f"rank {my_rank} sent tensor y")
 
     assert torch.allclose(x, x2)
     assert torch.allclose(y, y2)
 
+    print(f"rank {my_rank} test_run passed!")
 
-def stress_test(my_rank, pipe):
 
-    torch.distributed.barrier()
+def stress_test(my_rank, pipe):
+    print(f"rank {my_rank} stress_test starts....")
 
     tensors: List[torch.Tensor] = []
 
+    torch.distributed.barrier()
     torch.manual_seed(0)
 
     for i in tqdm(range(500)):
@@ -86,7 +89,6 @@ def stress_test(my_rank, pipe):
 
 
 def latency_test(my_rank, pipe, nelement, ntensor):
-
     latencies = []
 
     torch.distributed.barrier()
@@ -149,6 +151,7 @@ def latency_test(my_rank, pipe, nelement, ntensor):
     )
 
     test_run(my_rank, pipe)
+
     stress_test(my_rank, pipe)
 
     # Use this function if you want to test the latency of pipe impl.
diff --git a/tests/kv_transfer/test_send_recv.sh b/tests/kv_transfer/test_send_recv.sh
index 1e89e246b4992..54e0604806841 100644
--- a/tests/kv_transfer/test_send_recv.sh
+++ b/tests/kv_transfer/test_send_recv.sh
@@ -1,3 +1,9 @@
 #!/bin/bash
+
 RANK=0 python3 test_send_recv.py &
-RANK=1 python3 test_send_recv.py &
\ No newline at end of file
+PID0=$!
+RANK=1 python3 test_send_recv.py &
+PID1=$!
+STATUS=0  # becomes non-zero if either rank fails (a bare wait keeps only the last)
+wait "$PID0" || STATUS=$?
+wait "$PID1" || STATUS=$?; exit "$STATUS"
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 29ecf37808205..8b247fb9b2388 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -200,6 +200,11 @@ def minicpmv_lora_files():
     return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon")
 
 
+@pytest.fixture(scope="session")
+def qwen2vl_lora_files():
+    return snapshot_download(repo_id="jeeejeee/qwen2-vl-lora-pokemon")
+
+
 @pytest.fixture(scope="session")
 def tinyllama_lora_files():
     return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index 9a529e27b4cd8..537d95b025a9d 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -4,6 +4,7 @@
 
 from vllm.lora.models import LoRAModel
 from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
+from vllm.model_executor.models.utils import WeightsMapper
 
 lora_lst = [
     "baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"
@@ -71,3 +72,37 @@ def test_load_checkpoints(
                 device="cpu",
                 embedding_modules=embedding_modules,
                 embedding_padding_modules=embed_padding_modules)
+
+
+def test_lora_weights_mapping(baichuan_lora_files):
+    """Check that a WeightsMapper renames the loaded LoRA module names."""
+    model_cls = BaiChuanBaseForCausalLM
+    packed_mapping = model_cls.packed_modules_mapping
+
+    # Packed modules are replaced by the sub-modules listed in the mapping.
+    expected_lora_modules: List[str] = []
+    for module_name in model_cls.supported_lora_modules:
+        if module_name not in packed_mapping:
+            expected_lora_modules.append(module_name)
+            continue
+        expected_lora_modules.extend(packed_mapping[module_name])
+
+    layers_substr = ".baichuan_layers."
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={"model.": "language_model.model."},
+        orig_to_new_substr={".layers.": layers_substr},
+    )
+    lora_model = LoRAModel.from_local_checkpoint(
+        baichuan_lora_files,
+        expected_lora_modules,
+        lora_model_id=1,
+        device="cpu",
+        embedding_modules=model_cls.embedding_modules,
+        embedding_padding_modules=model_cls.embedding_padding_modules,
+        weights_mapper=hf_to_vllm_mapper,
+    )
+    # Every loaded module name must carry both the prefix and substr rename.
+    expected_prefix = hf_to_vllm_mapper.orig_to_new_prefix["model."]
+    for lora_name in lora_model.loras:
+        assert lora_name.startswith(expected_prefix)
+        assert layers_substr in lora_name
diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py
index 1f3de9edc0d0f..78bf5a1617233 100644
--- a/tests/lora/test_minicpmv.py
+++ b/tests/lora/test_minicpmv.py
@@ -67,7 +67,6 @@ def test_minicpmv_lora(minicpmv_lora_files):
         max_loras=4,
         max_lora_rank=64,
         trust_remote_code=True,
-        gpu_memory_utilization=0.97,  # This model is pretty big for CI gpus
         enable_chunked_prefill=True,
     )
     output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index 150221dfce6ab..797a495201d33 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -62,8 +62,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
 
 
 @pytest.mark.parametrize("tp_size", [4])
+@pytest.mark.parametrize("fully_shard", [True, False])
 def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
-                                         tp_size):
+                                         tp_size, fully_shard):
     """This LoRA model has all supported Mixtral target modules"""
 
     if torch.cuda.device_count() < tp_size:
@@ -82,6 +83,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
         max_loras=4,
         distributed_executor_backend="ray",
         tensor_parallel_size=tp_size,
+        fully_sharded_loras=fully_shard,
         max_lora_rank=32,
     )
 
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
new file mode 100644
index 0000000000000..c9f48402b0268
--- /dev/null
+++ b/tests/lora/test_qwen2vl.py
@@ -0,0 +1,82 @@
+from typing import List
+
+import pytest
+
+import vllm
+from vllm.assets.image import ImageAsset
+from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
+
+MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"
+
+PROMPT_TEMPLATE = (
+    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
+    "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+    "What is in the image?<|im_end|>\n"
+    "<|im_start|>assistant\n")
+
+IMAGE_ASSETS = [
+    ImageAsset("stop_sign"),
+    ImageAsset("cherry_blossom"),
+]
+
+# After fine-tuning with LoRA, all generated content should begin with `A`.
+EXPECTED_OUTPUT = [
+    "A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.",  # noqa: E501
+    "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.",  # noqa: E501
+]
+
+
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
+    """Generate one short completion per image in IMAGE_ASSETS.
+
+    The LoRA adapter at `lora_path` is applied only when `lora_id` is truthy.
+    Returns the stripped generated texts (temperature 0, at most 5 tokens).
+    """
+    lora_request = None
+    if lora_id:
+        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
+
+    requests = []
+    for asset in IMAGE_ASSETS:
+        mm_data = {"image": asset.pil_image}
+        requests.append({
+            "prompt": PROMPT_TEMPLATE,
+            "multi_modal_data": mm_data,
+        })
+
+    params = vllm.SamplingParams(temperature=0, max_tokens=5)
+    results = llm.generate(requests, params, lora_request=lora_request)
+
+    generated_texts: List[str] = []
+    for result in results:
+        text = result.outputs[0].text.strip()
+        print(f"Prompt: {result.prompt!r}, Generated text: {text!r}")
+        generated_texts.append(text)
+    return generated_texts
+
+
+@pytest.mark.xfail(current_platform.is_rocm(),
+                   reason="Qwen2-VL dependency xformers incompatible with ROCm"
+                   )
+def test_qwen2vl_lora(qwen2vl_lora_files):
+    """Check LoRA outputs are non-empty prefixes of EXPECTED_OUTPUT."""
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_num_seqs=2,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=16,
+        trust_remote_code=True,
+        mm_processor_kwargs={
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+        },
+        max_model_len=4096,
+    )
+    # An empty string is a prefix of anything, so also require some output.
+    for lora_id in (1, 2):
+        outputs = do_sample(llm, qwen2vl_lora_files, lora_id=lora_id)
+        assert len(outputs) == len(EXPECTED_OUTPUT)
+        for expected, actual in zip(EXPECTED_OUTPUT, outputs):
+            assert actual and expected.startswith(actual)
diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py
index 9f4d81b583141..be5282d9c8223 100644
--- a/tests/model_executor/test_guided_processors.py
+++ b/tests/model_executor/test_guided_processors.py
@@ -1,13 +1,20 @@
+import pickle
+
 import pytest
 import torch
 from transformers import AutoTokenizer
 
+from vllm.config import ModelConfig
 from vllm.model_executor.guided_decoding import (
-    get_guided_decoding_logits_processor)
+    get_guided_decoding_logits_processor,
+    get_local_guided_decoding_logits_processor)
 from vllm.model_executor.guided_decoding.outlines_logits_processors import (
     JSONLogitsProcessor, RegexLogitsProcessor)
 from vllm.sampling_params import GuidedDecodingParams
 
+MODEL_NAME = 'HuggingFaceH4/zephyr-7b-beta'
+GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
+
 
 def test_guided_logits_processors(sample_regex, sample_json_schema):
     """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
@@ -36,16 +43,30 @@ def test_guided_logits_processors(sample_regex, sample_json_schema):
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("backend",
-                         ["outlines", "lm-format-enforcer", "xgrammar"])
-async def test_guided_logits_processor_black_box(backend: str, sample_regex,
+@pytest.mark.parametrize("backend", GUIDED_DECODING_BACKENDS)
+@pytest.mark.parametrize("is_local", [True, False])
+async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
+                                                 sample_regex,
                                                  sample_json_schema):
-    tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
+
+    config = ModelConfig(
+        MODEL_NAME,
+        task="generate",
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="bfloat16",
+    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     token_ids = tokenizer.encode(
         f"Give an example IPv4 address with this regex: {sample_regex}")
     regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)
-    regex_lp = await get_guided_decoding_logits_processor(
-        regex_request, tokenizer)
+
+    regex_lp = get_local_guided_decoding_logits_processor(
+            regex_request, tokenizer, config) if is_local else \
+            await get_guided_decoding_logits_processor(
+                    regex_request, tokenizer, config)
     assert regex_lp is not None
     tensor = torch.rand(32000)
     original_tensor = torch.clone(tensor)
@@ -59,7 +80,7 @@ async def test_guided_logits_processor_black_box(backend: str, sample_regex,
     json_request = GuidedDecodingParams(json=sample_json_schema,
                                         backend=backend)
     json_lp = await get_guided_decoding_logits_processor(
-        json_request, tokenizer)
+        json_request, tokenizer, config)
     assert json_lp is not None
     tensor = torch.rand(32000)
     original_tensor = torch.clone(tensor)
@@ -84,3 +105,24 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex):
     with pytest.raises(ValueError,
                        match="You can only use one kind of guided"):
         GuidedDecodingParams(json=sample_json_schema, grammar="test grammar")
+
+
+def test_pickle_xgrammar_tokenizer_data():
+    """TokenizerData must keep its vocab_type across a pickle round trip."""
+    # TODO: move to another test file for xgrammar
+    try:
+        import xgrammar as xgr
+    except ImportError:
+        pytest.skip("Could not import xgrammar to run test")
+
+    from vllm.model_executor.guided_decoding.xgrammar_decoding import (
+        TokenizerData)
+
+    raw_vocab = xgr.VocabType.RAW
+    payload = pickle.dumps(TokenizerData(vocab_type=raw_vocab))
+    assert payload is not None
+
+    # The bytes being unpickled were produced by this test a few lines above.
+    restored: TokenizerData = pickle.loads(payload)
+    assert restored is not None
+    assert restored.vocab_type == raw_vocab
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index c548cfdf53414..0bb98df1b58e6 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -5,6 +5,7 @@
 import pytest_asyncio
 from transformers import AutoModel, AutoTokenizer, BatchEncoding
 
+from vllm.multimodal.audio import resample_audio
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 
@@ -130,16 +131,14 @@ def process(hf_inputs: BatchEncoding, **kwargs):
                    dtype=dtype,
                    postprocess_inputs=process,
                    auto_cls=AutoModel) as hf_model:
-        import librosa
-
         hf_outputs_per_audio = [
             hf_model.generate_greedy_logprobs_limit(
                 [hf_prompt],
                 max_tokens,
                 num_logprobs=num_logprobs,
-                audios=[(librosa.resample(audio[0],
-                                          orig_sr=audio[1],
-                                          target_sr=16000), 16000)])
+                audios=[(resample_audio(audio[0],
+                                        orig_sr=audio[1],
+                                        target_sr=16000), 16000)])
             for _, hf_prompt, audio in prompts_and_audios
         ]
 
diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py
index 99b5d5694f9f7..bdc1571784b5d 100644
--- a/tests/models/decoder_only/language/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
@@ -3,17 +3,20 @@
 Run `pytest tests/models/test_mistral.py`.
 """
 import copy
+import json
 
+import jsonschema
+import jsonschema.exceptions
 import pytest
 
-from vllm import SamplingParams
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (  # noqa
     MistralToolParser)
+from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 
 from ...utils import check_logprobs_close
 
 MODELS = [
-    "mistralai/Mistral-7B-Instruct-v0.1",
+    "mistralai/Mistral-7B-Instruct-v0.3",
 ]
 
 MISTRAL_FORMAT_MODELS = [
@@ -126,6 +129,45 @@
     }
 ]
 
+SAMPLE_JSON_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "name": {
+            "type": "string"
+        },
+        "age": {
+            "type": "integer"
+        },
+        "skills": {
+            "type": "array",
+            "items": {
+                "type": "string",
+                "maxLength": 10
+            },
+            "minItems": 3
+        },
+        "work_history": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "company": {
+                        "type": "string"
+                    },
+                    "duration": {
+                        "type": "number"
+                    },
+                    "position": {
+                        "type": "string"
+                    }
+                },
+                "required": ["company", "position"]
+            }
+        }
+    },
+    "required": ["name", "age", "skills", "work_history"]
+}
+
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
@@ -251,3 +293,43 @@ def test_mistral_function_calling(
         assert parsed_message.tool_calls[
             0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'  # noqa
         assert parsed_message.content is None
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("guided_backend",
+                         ["outlines", "lm-format-enforcer", "xgrammar"])
+def test_mistral_guided_decoding(
+    vllm_runner,
+    model: str,
+    guided_backend: str,
+) -> None:
+    """Chat output guided by SAMPLE_JSON_SCHEMA must validate against it."""
+    with vllm_runner(model, dtype='bfloat16',
+                     tokenizer_mode="mistral") as vllm_model:
+        guided_decoding = GuidedDecodingParams(json=SAMPLE_JSON_SCHEMA,
+                                               backend=guided_backend)
+        params = SamplingParams(max_tokens=512,
+                                temperature=0.7,
+                                guided_decoding=guided_decoding)
+
+        messages = [{
+            "role": "system",
+            "content": "you are a helpful assistant"
+        }, {
+            "role":
+            "user",
+            "content":
+            f"Give an example JSON for an employee profile that "
+            f"fits this schema: {SAMPLE_JSON_SCHEMA}"
+        }]
+        outputs = vllm_model.model.chat(messages, sampling_params=params)
+        # Check the result before indexing into it below.
+        assert outputs is not None
+        generated_text = outputs[0].outputs[0].text
+        json_response = json.loads(generated_text)
+
+        try:
+            jsonschema.validate(instance=json_response,
+                                schema=SAMPLE_JSON_SCHEMA)
+        except jsonschema.exceptions.ValidationError:
+            pytest.fail("Generated response is not valid with JSON schema")
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
index ce8ac8d8e0ceb..f95cee277f4e6 100644
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
@@ -58,16 +58,14 @@ def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
 
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
-    "num_crops,expected_toks_per_img,num_imgs",
+    "num_crops,expected_toks_per_img",
     [
-        (4, 757, 1),
-        (4, 757, 2),
-        (16, 1921, 1),
-        (16, 1921, 2),
+        (4, 757),
+        (16, 1921),
         # the default num_crops of phi-3.5-vision is 4
-        (None, 757, 2),
-        (None, 757, 2),
+        (None, 757),
     ])
+@pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets,
                             model: str, num_crops: Optional[int],
                             expected_toks_per_img: int, num_imgs: int):
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
index 7e2bea130583e..cd8954ffc48c2 100644
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
@@ -1,12 +1,9 @@
 from typing import Any, Dict, Tuple
 
 import pytest
-import torch
-from PIL.Image import Image
 from transformers import AutoTokenizer
 
-from vllm.inputs import InputContext, token_inputs
-from vllm.multimodal import MultiModalRegistry
+from vllm.inputs import InputContext, InputProcessingContext
 
 from .....conftest import _ImageAssets
 from ....utils import build_model_context
@@ -20,22 +17,9 @@
 # NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
 # input mappers.
 @pytest.fixture()
-def image_input_mapper_for_qwen2_vl():
-    from vllm.model_executor.models.qwen2_vl import (
-        image_input_mapper_for_qwen2_vl)
-    return image_input_mapper_for_qwen2_vl
-
-
-@pytest.fixture()
-def input_processor_for_qwen2_vl():
-    from vllm.model_executor.models.qwen2_vl import (
-        input_processor_for_qwen2_vl)
-    return input_processor_for_qwen2_vl
-
-
-@pytest.fixture()
-def qwen2_vl_context() -> InputContext:
-    return build_model_context(model_name=MODEL)
+def processor_for_qwen2_vl():
+    from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor
+    return Qwen2VLMultiModalProcessor
 
 
 @pytest.fixture()
@@ -45,12 +29,6 @@ def get_max_qwen2_vl_image_tokens():
     return get_max_qwen2_vl_image_tokens
 
 
-@pytest.fixture()
-def dummy_data_for_qwen2_vl():
-    from vllm.model_executor.models.qwen2_vl import dummy_data_for_qwen2_vl
-    return dummy_data_for_qwen2_vl
-
-
 @pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [
     ({}, 1225),
     ({
@@ -58,110 +36,70 @@ def dummy_data_for_qwen2_vl():
         MAX_PIXELS: 512**2
     }, 324),
 ])
-def test_qwen2_vl_max_image_tokens(get_max_qwen2_vl_image_tokens,
-                                   qwen2_vl_context: InputContext,
-                                   mm_processor_kwargs: Dict[str, Any],
-                                   expected_max_tokens: int):
+@pytest.mark.parametrize("model", [MODEL])
+def test_qwen2_vl_max_image_tokens(
+    get_max_qwen2_vl_image_tokens,
+    model: str,
+    mm_processor_kwargs: Dict[str, Any],
+    expected_max_tokens: int,
+):
     """Ensure that the max token calc handles min/max pixels properly."""
-    actual_max_tokens = get_max_qwen2_vl_image_tokens(qwen2_vl_context,
-                                                      **mm_processor_kwargs)
-    assert actual_max_tokens == expected_max_tokens
-
-
-@pytest.mark.parametrize("mm_processor_kwargs,token_count,img_size", [
-    [{}, 1225, (980, 980)],
-    [{
-        MIN_PIXELS: 64**2,
-        MAX_PIXELS: 512**2
-    }, 324, (504, 504)],
-])
-def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl,
-                             qwen2_vl_context: InputContext,
-                             mm_processor_kwargs: Dict[str, Any],
-                             token_count: int, img_size: Tuple[int, int]):
-    """Ensure that the dummy data handles min/max pixels properly."""
-    seq_len = 3000
-    hf_config = qwen2_vl_context.get_hf_config()
-    image_token_id = hf_config.image_token_id
-
-    # NOTE: video value is required, but isn't actually used
-    # when making the dummy data except for error handling currently
-    dummy_data = dummy_data_for_qwen2_vl(
-        ctx=qwen2_vl_context,
-        seq_len=seq_len,
-        mm_counts={
-            "image": 1,
-            "video": 0
-        },
-        **mm_processor_kwargs,
+    ctx = build_model_context(
+        model_name=model,
+        tokenizer_name=model,
+        mm_processor_kwargs=None,
     )
-    seq_data = dummy_data.seq_data
-    mm_data = dummy_data.multi_modal_data
-
-    # Ensure we have the right number of placeholders for min/max pixel values
-    assert seq_data.get_token_ids().count(image_token_id) == token_count
 
-    # Ensure the images were resized correctly
-    image = mm_data["image"]
-    assert isinstance(image, Image)
-    assert image.size == img_size
+    actual_max_tokens = get_max_qwen2_vl_image_tokens(
+        InputContext(ctx.model_config), **mm_processor_kwargs)
+    assert actual_max_tokens == expected_max_tokens
 
 
-@pytest.mark.parametrize("mm_processor_kwargs,num_placeholders", [
-    ({}, 1426),
-    ({
-        MIN_PIXELS: 64**2,
-        MAX_PIXELS: 512**2
-    }, 330),
-])
-def test_input_processor(input_processor_for_qwen2_vl,
-                         qwen2_vl_context: InputContext,
-                         image_assets: _ImageAssets, num_placeholders: int,
-                         mm_processor_kwargs: Dict[str, Any]):
-    """Ensure that the image processor handles min/max pixels properly."""
-    tokenizer = AutoTokenizer.from_pretrained(MODEL)
-    prompt = "<|vision_start|><|image_pad|><|vision_end|>"
-
-    image = image_assets[0].pil_image
-    hf_config = qwen2_vl_context.get_hf_config()
-    image_token_id = hf_config.image_token_id
-
-    inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
-                          prompt=prompt,
-                          multi_modal_data={"image": [image]})
-
-    processed_inputs = input_processor_for_qwen2_vl(qwen2_vl_context, inputs,
-                                                    **mm_processor_kwargs)
-    assert processed_inputs["prompt_token_ids"].count(
-        image_token_id) == num_placeholders
-    assert len(processed_inputs["multi_modal_data"]["image"]) == 1
-
-
-@pytest.mark.parametrize("mm_processor_kwargs,pixels_shape", [
-    ({}, [5704, 1176]),
-    ({
-        MIN_PIXELS: 64**2,
-        MAX_PIXELS: 512**2
-    }, [1320, 1176]),
-])
-def test_image_mapper_override(qwen2_vl_context: InputContext,
-                               image_assets: _ImageAssets,
-                               mm_processor_kwargs: Dict[str, Any],
-                               pixels_shape: Tuple[int, int]):
-    """Ensure that the image mapper handles min/max pixels properly."""
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(qwen2_vl_context.model_config)
-
-    image = image_assets[0].pil_image
-
-    mapped_output = mm_registry.map_input(
-        qwen2_vl_context.model_config,
-        {"image": image},
-        mm_processor_kwargs=mm_processor_kwargs,
+@pytest.mark.parametrize(
+    "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [
+        ({}, 1426, (5704, 1176)),
+        ({
+            MIN_PIXELS: 64**2,
+            MAX_PIXELS: 512**2
+        }, 330, (1320, 1176)),
+    ])
+@pytest.mark.parametrize("model", [MODEL])
+@pytest.mark.parametrize("num_imgs", [1, 2])
+def test_processor_override(
+    processor_for_qwen2_vl,
+    image_assets: _ImageAssets,
+    model: str,
+    mm_processor_kwargs: Dict[str, Any],
+    expected_toks_per_img: int,
+    expected_pixels_shape: Tuple[int, int],
+    num_imgs: int,
+):
+    """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
+    # Same as the previous test - don't initialize mm_processor_kwargs
+    # in this test and assume that the kwargs will be correctly expanded by
+    # the partial when calling the custom input processor.
+    ctx = build_model_context(
+        model_name=model,
+        tokenizer_name=model,
+        mm_processor_kwargs=None,
     )
-
-    # Dimension 0 of pixel values should match the product of image_grid_thw
-    actual_pixels_shape = mapped_output["pixel_values"].shape
-    assert list(actual_pixels_shape) == pixels_shape
-    assert actual_pixels_shape[0] == torch.prod(
-        mapped_output["image_grid_thw"])
+    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+    ctx = InputProcessingContext(ctx.model_config, tokenizer)
+    # Build the image str / prompt based on the number of images we pass
+    prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
+    images = [image_assets[0].pil_image] * num_imgs
+
+    mm_data = {"image": images}
+
+    processor = processor_for_qwen2_vl(ctx)
+    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
+
+    # Ensure we have the right number of placeholders for the min/max pixels
+    hf_processor = processor._get_hf_processor(**mm_processor_kwargs)
+    image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
+    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
+    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
+
+    assert img_tok_count == expected_toks_per_img * num_imgs
+    assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
+    assert pixel_shape[1] == expected_pixels_shape[1]
diff --git a/tests/models/decoder_only/vision_language/test_awq.py b/tests/models/decoder_only/vision_language/test_awq.py
index 6e6e5b40d6a35..18ceb34a4e042 100644
--- a/tests/models/decoder_only/vision_language/test_awq.py
+++ b/tests/models/decoder_only/vision_language/test_awq.py
@@ -3,7 +3,7 @@
 import pytest
 import torch
 
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 
 from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
 from ...utils import check_logprobs_close
diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py
index 45a7365204403..7406df253e7f0 100644
--- a/tests/models/decoder_only/vision_language/test_h2ovl.py
+++ b/tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -8,7 +8,7 @@
 # Import the functions to test
 from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
                                               image_to_pixel_values_wrapper)
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 
 models = [
     "h2oai/h2ovl-mississippi-800m",  # Replace with your actual model names
diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py
index 82eae0705c9ba..3a8934adfb076 100644
--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -5,7 +5,7 @@
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs
 
diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
index 71b6ba4dca435..51fe7d2ad32a8 100644
--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
@@ -6,8 +6,8 @@
 from PIL import Image
 
 from vllm.entrypoints.llm import LLM
-from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
-                                   sample_frames_from_video)
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
 
 from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
                           PromptVideoInput, VllmRunner)
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/builders.py b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
index 66668296139f5..59773be709fa8 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
@@ -5,8 +5,9 @@
 
 import torch
 
-from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
-                                   resize_video, sample_frames_from_video)
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.video import (rescale_video_size, resize_video,
+                                   sample_frames_from_video)
 
 from .....conftest import _ImageAssets, _VideoAssets
 from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
index e698d8d3f6f56..2291f4fa0d0ac 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
@@ -1,8 +1,9 @@
 """Custom input builders for edge-cases in different models."""
 from typing import Callable
 
-from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
-                                   resize_video, sample_frames_from_video)
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.video import (rescale_video_size, resize_video,
+                                   sample_frames_from_video)
 
 from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
 from .builders import build_multi_image_inputs, build_single_image_inputs
diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py
index 6321503e7b248..6673a9fc22f69 100644
--- a/tests/models/embedding/language/test_cls_models.py
+++ b/tests/models/embedding/language/test_cls_models.py
@@ -1,7 +1,4 @@
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-This test only tests small models. Big models such as 7B should be tested from
-test_big_models.py because it could use a larger instance to run tests.
+"""Compare the classification outputs of HF and vLLM models.
 
 Run `pytest tests/models/test_cls_models.py`.
 """
diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py
index af31e1a635f65..be6e3842821e2 100644
--- a/tests/models/embedding/language/test_scoring.py
+++ b/tests/models/embedding/language/test_scoring.py
@@ -1,6 +1,6 @@
-"""Compare the embedding outputs of HF and vLLM models.
+"""Compare the scoring outputs of HF and vLLM models.
 
-Run `pytest tests/models/embedding/language/test_embedding.py`.
+Run `pytest tests/models/embedding/language/test_scoring.py`.
 """
 import math
 
diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py
index 77dd1d81f84d7..636a3eedff31b 100644
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -6,7 +6,7 @@
 
 from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
                                      global_force_attn_backend_context_manager)
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
 from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
diff --git a/tests/models/registry.py b/tests/models/registry.py
index fac8c4b2e9b19..f5a37420a2909 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -61,6 +61,8 @@ class _HfExamplesInfo:
     "DeepseekForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-llm-7b-chat"),
     "DeepseekV2ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V2-Lite-Chat",  # noqa: E501
                                          trust_remote_code=True),
+    "DeepseekV3ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3",  # noqa: E501
+                                         trust_remote_code=True),
     "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"),  # noqa: E501
     "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
     "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"),
@@ -138,6 +140,7 @@ class _HfExamplesInfo:
     "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
     "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"),
     "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
+    "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"),  # noqa: E501
     "LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
     "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
     "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py
index b5368aab3ecf1..73b70d65e8e0b 100644
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -6,7 +6,9 @@
 from vllm.model_executor.models import (is_pooling_model,
                                         is_text_generation_model,
                                         supports_multimodal)
-from vllm.model_executor.models.adapters import as_embedding_model
+from vllm.model_executor.models.adapters import (as_classification_model,
+                                                 as_embedding_model,
+                                                 as_reward_model)
 from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
                                                  _SPECULATIVE_DECODING_MODELS,
                                                  _TEXT_GENERATION_MODELS,
@@ -29,9 +31,10 @@ def test_registry_imports(model_arch):
             or model_arch in _MULTIMODAL_MODELS):
         assert is_text_generation_model(model_cls)
 
-    # All vLLM models should be convertible to an embedding model
-    embed_model = as_embedding_model(model_cls)
-    assert is_pooling_model(embed_model)
+    # All vLLM models should be convertible to a pooling model
+    assert is_pooling_model(as_classification_model(model_cls))
+    assert is_pooling_model(as_embedding_model(model_cls))
+    assert is_pooling_model(as_reward_model(model_cls))
 
     if model_arch in _MULTIMODAL_MODELS:
         assert supports_multimodal(model_cls)
diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index 71832acbd17b8..81f2a06182bcc 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -6,7 +6,7 @@
 
 from vllm.config import ModelConfig
 from vllm.multimodal import MultiModalRegistry
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 
 
 @pytest.fixture
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
index d676eacffb056..5e7d7d1877e61 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
@@ -13,6 +13,7 @@
 
 
 class MyGemma2Embedding(nn.Module):
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -62,8 +63,8 @@ def pooler(
         return self._pooler(hidden_states, pooling_metadata)
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
-        weights = hf_to_vllm_mapper.apply(weights)
+        # Rename checkpoint weights with the class-level mapper first.
+        weights = self.hf_to_vllm_mapper.apply(weights)
         weights = ((name, data) for name, data in weights
                    if not name.startswith("lm_head."))
         return self.model.load_weights(weights)
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 26add5bf6d90d..92436889ecffe 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -10,9 +10,13 @@
 
 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
-    CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
-    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
-    CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
+    CompressedTensors24, CompressedTensorsLinearMethod,
+    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
+    CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
+    CompressedTensorsWNA16)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    sparse_cutlass_supported)
+from vllm.platforms import current_platform
 
 
 @pytest.mark.parametrize(
@@ -75,12 +79,12 @@ def zp_valid(zp: Optional[torch.Tensor]):
         assert output
 
 
-@pytest.mark.parametrize(
-    "model_path",
-    [
-        "neuralmagic/Llama-3.2-1B-quantized.w8a8"
-        # TODO static & asymmetric
-    ])
+@pytest.mark.parametrize("model_path", [
+    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
+])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
@@ -88,6 +92,10 @@ def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
                                           max_tokens, num_logprobs):
     dtype = "bfloat16"
 
+    # skip language translation prompt for the static per tensor asym model
+    if model_path == "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym":  # noqa: E501
+        example_prompts = example_prompts[0:-1]
+
     with hf_runner(model_path, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
             example_prompts, max_tokens, num_logprobs)
@@ -208,3 +216,98 @@ def test_compressed_tensors_kv_cache(vllm_runner):
     with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
         output = llm.generate_greedy("Hello world!", max_tokens=20)
         assert output
+
+
+def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy):
+    """Assert qkv_proj uses a quantized 2:4 compressed-tensors scheme."""
+    # Helper only: pytest never collects it, so callers own the hardware skip.
+    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+    assert isinstance(qkv_proj.scheme, CompressedTensors24)
+    assert qkv_proj.scheme.weight_quant.strategy == weight_strategy
+    assert qkv_proj.scheme.input_quant.strategy == input_strategy
+    assert qkv_proj.scheme.quantized
+    sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
+    assert sparsity_map
+    linear_scheme = sparsity_map.get("Linear")
+    assert linear_scheme.format == "dense"
+    assert linear_scheme.sparsity_structure == "2:4"
+
+
+@pytest.mark.skipif(not current_platform.has_device_capability(90),
+                    reason="Sparse FP8 is not yet supported on this GPU type.")
+@pytest.mark.parametrize("args_2of4", [
+    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", "channel",
+     "token"),
+    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
+     "channel", "tensor"),
+    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", "tensor",
+     "tensor"),
+    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
+     "tensor", "token"),
+])
+def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
+    """Check FP8 2:4 checkpoints load with the expected scheme and generate."""
+    model_id, weight_strategy, input_strategy = args_2of4
+    with vllm_runner(model_id) as llm:
+        # Inspect qkv_proj of the first layer of the loaded model.
+        worker = llm.model.llm_engine.model_executor.driver_worker
+        qkv_proj = worker.model_runner.model.model.layers[0].self_attn.qkv_proj
+        assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn
+        _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)
+
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        print(output)
+        assert output
+
+
+@pytest.mark.skipif(not sparse_cutlass_supported(),
+                    reason="Sparse INT8 is not supported on this GPU type.")
+@pytest.mark.parametrize("args_2of4", [
+    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
+     "channel", "token"),
+    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", "tensor",
+     "tensor"),
+    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
+     "tensor", "token"),
+])
+def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
+    """Check INT8 2:4 checkpoints load with the right scheme and generate."""
+    model_id, weight_strategy, input_strategy = args_2of4
+    with vllm_runner(model_id) as llm:
+        # Inspect qkv_proj of the first layer of the loaded model.
+        worker = llm.model.llm_engine.model_executor.driver_worker
+        qkv_proj = worker.model_runner.model.model.layers[0].self_attn.qkv_proj
+        assert qkv_proj.scheme.weights_dtype == torch.int8
+        _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)
+
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        print(output)
+        assert output
+
+
+@pytest.mark.skipif(not sparse_cutlass_supported(),
+                    reason="2:4 sparsity is not supported on this GPU type.")
+@pytest.mark.parametrize(
+    "args_2of4",
+    ["nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor"])
+def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
+    """Check a 2:4-sparse, unquantized checkpoint loads and generates."""
+    with vllm_runner(args_2of4) as llm:
+        worker = llm.model.llm_engine.model_executor.driver_worker
+        qkv_proj = worker.model_runner.model.model.layers[0].self_attn.qkv_proj
+        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(qkv_proj.scheme, CompressedTensors24)
+
+        # Sparse-only checkpoint: no weight or input quantization expected.
+        assert qkv_proj.scheme.weight_quant is None
+        assert qkv_proj.scheme.input_quant is None
+        assert not qkv_proj.scheme.quantized
+        sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
+        assert sparsity_map
+        linear_scheme = sparsity_map.get("Linear")
+        assert linear_scheme.format == "dense"
+        assert linear_scheme.sparsity_structure == "2:4"
+
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        print(output)
+        assert output
diff --git a/tests/runai_model_streamer/__init__.py b/tests/runai_model_streamer/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/runai_model_streamer/test_runai_model_streamer_loader.py b/tests/runai_model_streamer/test_runai_model_streamer_loader.py
new file mode 100644
index 0000000000000..c5722fbae5c8a
--- /dev/null
+++ b/tests/runai_model_streamer/test_runai_model_streamer_loader.py
@@ -0,0 +1,31 @@
+from vllm import SamplingParams
+from vllm.config import LoadConfig, LoadFormat
+from vllm.model_executor.model_loader.loader import (RunaiModelStreamerLoader,
+                                                     get_model_loader)
+
+test_model = "openai-community/gpt2"
+
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Seeded sampling parameters used by the generation test in this module.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
+
+
+def get_runai_model_loader():
+    """Build the model loader selected by the RUNAI_STREAMER load format."""
+    return get_model_loader(LoadConfig(load_format=LoadFormat.RUNAI_STREAMER))
+
+
+def test_get_model_loader_with_runai_flag():
+    """The RUNAI_STREAMER load format must map to RunaiModelStreamerLoader."""
+    assert isinstance(get_runai_model_loader(), RunaiModelStreamerLoader)
+
+
+def test_runai_model_loader_download_files(vllm_runner):
+    """Generation with the RUNAI_STREAMER load format must yield outputs."""
+    with vllm_runner(test_model, load_format=LoadFormat.RUNAI_STREAMER) as llm:
+        assert llm.generate(prompts, sampling_params)
diff --git a/tests/runai_model_streamer/test_weight_utils.py b/tests/runai_model_streamer/test_weight_utils.py
new file mode 100644
index 0000000000000..5c89bd78ad81d
--- /dev/null
+++ b/tests/runai_model_streamer/test_weight_utils.py
@@ -0,0 +1,39 @@
+import glob
+import tempfile
+
+import huggingface_hub.constants
+import torch
+
+from vllm.model_executor.model_loader.weight_utils import (
+    download_weights_from_hf, runai_safetensors_weights_iterator,
+    safetensors_weights_iterator)
+
+
+def test_runai_model_loader():
+    """Check both safetensors iterators yield identical tensors."""
+    # Force online mode for the download, but restore the previous value so
+    # the override does not leak into other tests in the same session.
+    was_offline = huggingface_hub.constants.HF_HUB_OFFLINE
+    huggingface_hub.constants.HF_HUB_OFFLINE = False
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            download_weights_from_hf("openai-community/gpt2",
+                                     allow_patterns=["*.safetensors"],
+                                     cache_dir=tmpdir)
+            safetensors = glob.glob(f"{tmpdir}/**/*.safetensors",
+                                    recursive=True)
+            assert len(safetensors) > 0
+
+            runai = dict(runai_safetensors_weights_iterator(safetensors))
+            hf = dict(safetensors_weights_iterator(safetensors))
+            assert len(runai) == len(hf)
+            for name, runai_tensor in runai.items():
+                assert runai_tensor.dtype == hf[name].dtype
+                assert runai_tensor.shape == hf[name].shape
+                assert torch.all(runai_tensor.eq(hf[name]))
+    finally:
+        huggingface_hub.constants.HF_HUB_OFFLINE = was_offline
+
+
+if __name__ == "__main__":  # allow running this check directly as a script
+    test_runai_model_loader()
diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py
index f5497976faf7a..397fa2cc85821 100644
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -200,6 +200,69 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
                 assert torch.equal(results[j][i], results[0][i])
 
 
+@pytest.mark.parametrize("k", [1, 3, 6])
+@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
+@pytest.mark.parametrize("batch_size", [3, 8, 32, 128])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("use_flashinfer", [True, False])
+@torch.inference_mode()
+def test_mixed_seeded_batch(k: int, vocab_size: int, batch_size: int,
+                            device: str, use_flashinfer: bool):
+    """Batched sampling must match sampling every request on its own.
+
+    Request 0 is unseeded; request i > 0 uses a generator seeded with i.
+    """
+    torch.set_default_device(device)
+    set_random_seed(0)
+    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    target_probs = torch.rand(batch_size,
+                              k + 1,
+                              vocab_size,
+                              dtype=torch.float32)
+    bonus_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, 1),
+                                    dtype=torch.int64)
+    draft_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, k),
+                                    dtype=torch.int64)
+
+    # Batch-of-one copies of every request, in the sampler's argument order.
+    single_batches = [(target_probs[i].clone().unsqueeze(0),
+                       bonus_token_ids[i].clone().unsqueeze(0),
+                       draft_probs[i].clone().unsqueeze(0),
+                       draft_token_ids[i].clone().unsqueeze(0))
+                      for i in range(batch_size)]
+
+    set_random_seed(0)
+    rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
+    rejection_sampler.init_gpu_tensors(device=device)
+    seeded_seqs = {
+        i: torch.Generator(device=device).manual_seed(i)
+        for i in range(1, batch_size)  # 0 is seed None
+    }
+    batch_result = rejection_sampler(target_probs.clone(),
+                                     bonus_token_ids.clone(),
+                                     draft_probs.clone(),
+                                     draft_token_ids.clone(), seeded_seqs)
+
+    # Replay each request alone with a fresh sampler and fresh generators.
+    set_random_seed(0)
+    rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
+    rejection_sampler.init_gpu_tensors(device=device)
+    results = []
+    for i, single_inputs in enumerate(single_batches):
+        request_seeded_seqs = {
+            0: torch.Generator(device=device).manual_seed(i)
+        } if i in seeded_seqs else None
+        results.append(rejection_sampler(*single_inputs, request_seeded_seqs))
+
+    # Row i of the batched output must equal request i's solo output.
+    for i, single_result in enumerate(results):
+        assert torch.equal(batch_result[i], single_result.squeeze(0))
+
+
 @pytest.mark.parametrize("k", [1, 3, 6])
 @pytest.mark.parametrize("vocab_size", [30_000, 50_000])
 @pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index edd079bc7a389..0b0792b6b845f 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -9,7 +9,6 @@
 import pytest
 import torch
 from huggingface_hub import snapshot_download
-from tensorizer import EncryptionParams
 
 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
@@ -23,12 +22,18 @@
                                                          serialize_vllm_model,
                                                          tensorize_vllm_model)
 # yapf: enable
-from vllm.utils import import_from_path
+from vllm.utils import PlaceholderModule, import_from_path
 
 from ..conftest import VllmRunner
 from ..utils import VLLM_PATH, RemoteOpenAIServer
 from .conftest import retry_until_skip
 
+try:
+    from tensorizer import EncryptionParams
+except ImportError:
+    tensorizer = PlaceholderModule("tensorizer")  # type: ignore[assignment]
+    EncryptionParams = tensorizer.placeholder_attr("EncryptionParams")
+
 EXAMPLES_PATH = VLLM_PATH / "examples"
 
 prompts = [
diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py
index 6818ac44b2478..2241f1846e746 100644
--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -103,7 +103,7 @@ def ensure_system_prompt(messages: List[Dict[str, Any]],
         "supports_rocm":
         False,
     },
-    "granite8b": {
+    "granite-3.0-8b": {
         "model":
         "ibm-granite/granite-3.0-8b-instruct",
         "arguments": [
@@ -111,6 +111,14 @@ def ensure_system_prompt(messages: List[Dict[str, Any]],
             str(VLLM_PATH / "examples/tool_chat_template_granite.jinja")
         ],
     },
+    "granite-3.1-8b": {
+        "model": "ibm-granite/granite-3.1-8b-instruct",
+        "arguments": [
+            "--tool-call-parser",
+            "granite",
+        ],
+        "supports_parallel": True,
+    },
     "internlm": {
         "model":
         "internlm/internlm2_5-7b-chat",
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 00f7b0fcfe1dc..ed04f0a373c51 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -2,16 +2,23 @@
 import pytest
 
 from vllm.inputs import token_inputs
+from vllm.multimodal.inputs import PlaceholderRange
 from vllm.sampling_params import SamplingParams
 from vllm.utils import cdiv
 from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
 from vllm.v1.core.kv_cache_utils import KVCacheBlock, hash_block_tokens
 
 
-def make_request(request_id, prompt_token_ids):
+def make_request(request_id,
+                 prompt_token_ids,
+                 mm_positions=None,
+                 mm_hashes=None):
     return Request(
         request_id=request_id,
-        inputs=token_inputs(prompt_token_ids=prompt_token_ids),
+        inputs=token_inputs(prompt_token_ids=prompt_token_ids,
+                            multi_modal_placeholders={"image": mm_positions}
+                            if mm_positions else None,
+                            multi_modal_hashes=mm_hashes),
         sampling_params=SamplingParams(max_tokens=17),
         eos_token_id=100,
         arrival_time=0,
@@ -38,6 +45,7 @@ def test_prefill():
     all_token_ids = common_token_ids + unique_token_ids
     req0 = make_request("0", all_token_ids)
     computed_blocks = manager.get_computed_blocks(req0)
+    assert len(req0.kv_block_hashes) == 3
     assert not computed_blocks
     blocks = manager.allocate_slots(req0, 55, computed_blocks)
     assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
@@ -61,6 +69,7 @@ def test_prefill():
     unique_token_ids = [3] * 5
     req1 = make_request("1", common_token_ids + unique_token_ids)
     computed_blocks = manager.get_computed_blocks(req1)
+    assert len(req1.kv_block_hashes) == 3
     assert [b.block_id for b in computed_blocks] == [0, 1, 2]
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks)
@@ -90,6 +99,7 @@ def test_prefill():
     unique_token_ids = [3] * 6
     req2 = make_request("2", common_token_ids + unique_token_ids)
     computed_block = manager.get_computed_blocks(req2)
+    assert len(req2.kv_block_hashes) == 3
     assert [b.block_id for b in computed_block] == [0, 1, 2]
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
@@ -416,3 +426,77 @@ def test_cache_blocks():
     )
     assert len(manager.cached_block_hash_to_block) == 3
     assert blocks[0].block_hash is not None
+
+
+def test_mm_prefix_caching():
+    """
+    Check that block hashes carry multi-modal extra keys and allow cache hits.
+    """
+    manager = KVCacheManager(
+        block_size=16,
+        num_gpu_blocks=10,
+        max_model_len=8192,
+        sliding_window=None,
+        enable_caching=True,
+        num_preallocate_tokens=16,
+    )
+
+    # Common prompt, one bracket per block (T: text, Pn: image n placeholder)
+    # [T,...,T, P0,...,P0], [P0,...,P0,T,...,T,P1,...,P1], [P1,...,P1]
+    common_token_ids = list(range(10)) + [-1] * 6
+    common_token_ids += [-1] * 4 + list(range(10, 20)) + [-1] * 2
+    common_token_ids += [-1] * 16
+
+    common_mm_positions = [
+        PlaceholderRange(offset=11, length=10),
+        PlaceholderRange(offset=30, length=18),
+    ]
+    common_mm_hashes = ["aaa", "bbb"]
+
+    # Request-specific suffix: one more image followed by text tokens.
+    unique_token_ids = [-1] * 7 + [100] * 4
+    all_token_ids = common_token_ids + unique_token_ids
+    mm_positions = common_mm_positions + [
+        PlaceholderRange(offset=48, length=7)
+    ]
+    mm_hashes = common_mm_hashes + ["ccc"]
+    req0 = make_request("0",
+                        all_token_ids,
+                        mm_positions=mm_positions,
+                        mm_hashes=mm_hashes)
+    computed_blocks = manager.get_computed_blocks(req0)
+
+    # Every full block is hashed together with its multi-modal extra keys.
+    assert not computed_blocks
+    assert len(req0.kv_block_hashes) == 3
+    assert req0.kv_block_hashes[0].extra_keys == (("aaa", 0), )
+    assert req0.kv_block_hashes[1].extra_keys == (("aaa", 5), ("bbb", 0))
+    assert req0.kv_block_hashes[2].extra_keys == (("bbb", 2), )
+
+    blocks = manager.allocate_slots(req0, 59, computed_blocks)
+    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
+    req0.num_computed_tokens = 59
+
+    # Append 5 output tokens; they fit without allocating a new block.
+    for _ in range(5):
+        req0.append_output_token_ids(8)
+    new_blocks = manager.append_slots(req0, 5)
+    assert new_blocks is not None and len(new_blocks) == 0
+
+    # The block completed by those tokens is hashed with its extra key too.
+    assert len(req0.kv_block_hashes) == 4
+    assert req0.kv_block_hashes[3].extra_keys == (("ccc", 0), )
+
+    # Cache hit: req1 reuses the 3 blocks covering the common prompt.
+    unique_token_ids = [-1] * 7 + [200] * 5
+    all_token_ids = common_token_ids + unique_token_ids
+    mm_positions = common_mm_positions + [
+        PlaceholderRange(offset=48, length=7)
+    ]
+    mm_hashes = common_mm_hashes + ["ccc"]
+    req1 = make_request("1",
+                        all_token_ids,
+                        mm_positions=mm_positions,
+                        mm_hashes=mm_hashes)
+    computed_blocks = manager.get_computed_blocks(req1)
+    assert len(computed_blocks) == 3
diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py
index ac5e7dde525a7..ff38a4568ecb1 100644
--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
@@ -31,14 +31,6 @@ def test_prefix_caching_from_cli():
     assert engine_args.enable_prefix_caching
 
 
-def test_defaults():
-    engine_args = EngineArgs(model="facebook/opt-125m")
-
-    # Assert V1 defaults
-    assert (engine_args.enable_prefix_caching
-            ), "V1 turns on prefix caching by default"
-
-
 def test_defaults_with_usage_context():
     engine_args = EngineArgs(model="facebook/opt-125m")
     vllm_config: VllmConfig = engine_args.create_engine_config(
@@ -52,10 +44,3 @@ def test_defaults_with_usage_context():
         UsageContext.OPENAI_API_SERVER)
     assert vllm_config.scheduler_config.max_num_seqs == 1024
     assert vllm_config.scheduler_config.max_num_batched_tokens == 2048
-
-
-def test_prefix_cache_disabled_with_multimodel():
-    engine_args = EngineArgs(model="llava-hf/llava-1.5-7b-hf")
-
-    vllm_config = engine_args.create_engine_config(UsageContext.LLM_CLASS)
-    assert not vllm_config.cache_config.enable_prefix_caching
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index a61ec63a365b5..c529cd21f384b 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -139,3 +139,41 @@ def test_engine_core(monkeypatch):
         engine_core.abort_requests([req2.request_id, req0.request_id])
         assert len(engine_core.scheduler.waiting) == 0
         assert len(engine_core.scheduler.running) == 0
+
+
+def test_engine_core_advanced_sampling(monkeypatch):
+    """
+    End-to-end smoke test: the engine must run a request to completion
+    when extra sampling parameters, such as min_tokens and
+    presence_penalty, are set.
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        # Set up the EngineCore.
+        usage_context = UsageContext.UNKNOWN_CONTEXT
+        vllm_config = EngineArgs(model=MODEL_NAME).create_engine_config(
+            usage_context=usage_context)
+        engine_core = EngineCore(
+            vllm_config=vllm_config,
+            executor_class=AsyncLLM._get_executor_cls(vllm_config),
+            usage_context=usage_context)
+
+        # Submit a single request carrying the extra sampling parameters.
+        request: EngineCoreRequest = make_request()
+        request.sampling_params = SamplingParams(
+            min_tokens=4,
+            presence_penalty=1.0,
+            frequency_penalty=1.0,
+            repetition_penalty=0.1,
+            stop_token_ids=[1001, 1002],
+        )
+        engine_core.add_request(request)
+        assert len(engine_core.scheduler.waiting) == 1
+        assert len(engine_core.scheduler.running) == 0
+
+        # Keep stepping until step() returns nothing.
+        while len(engine_core.step()) > 0:
+            pass
+
+        assert len(engine_core.scheduler.waiting) == 0
+        assert len(engine_core.scheduler.running) == 0
diff --git a/tests/v1/sample/__init__.py b/tests/v1/sample/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py
new file mode 100644
index 0000000000000..5ebf72927cfd6
--- /dev/null
+++ b/tests/v1/sample/test_sampler.py
@@ -0,0 +1,321 @@
+from typing import List, Set, Tuple
+
+import numpy as np
+import pytest
+import torch
+
+from vllm.utils import make_tensor_with_pad
+from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.sample.sampler import Sampler
+
+VOCAB_SIZE = 1024
+NUM_OUTPUT_TOKENS = 20
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+MAX_NUM_PROMPT_TOKENS = 64
+
+
+def _create_fake_logits(batch_size: int, vocab_size: int) -> torch.Tensor:
+    """Return a (batch_size, vocab_size) float tensor with every logit 1e-2."""
+    return torch.full((batch_size, vocab_size), 1e-2, dtype=torch.float)
+
+
+def _create_penalty_tensor(batch_size: int, penalty_value: float,
+                           device: torch.device) -> torch.Tensor:
+    """Return a 1-D float tensor of `batch_size` copies of `penalty_value`."""
+    shape = (batch_size, )
+    return torch.full(shape, fill_value=penalty_value, dtype=torch.float,
+                      device=device)
+
+
+def _create_prompt_tokens_tensor(
+    prompt_token_ids: List[List[int]],
+    vocab_size: int,
+    device: torch.device,
+) -> torch.Tensor:
+    """
+    Convert the prompt token lists into one int64 tensor on `device` via
+    `make_tensor_with_pad`, passing `vocab_size` as the `pad` argument.
+    """
+    padding_kwargs = dict(pad=vocab_size, device=device, dtype=torch.int64)
+    return make_tensor_with_pad(prompt_token_ids, pin_memory=False,
+                                **padding_kwargs)
+
+
+def _create_default_sampling_metadata(
+    num_output_tokens: int,
+    batch_size: int,
+    vocab_size: int,
+    device: torch.device,
+) -> SamplingMetadata:
+    """Build greedy, penalty-free SamplingMetadata over random token ids."""
+    output_token_ids: List[List[int]] = []
+    prompt_token_ids: List[List[int]] = []
+    for _ in range(batch_size):
+        generated = np.random.randint(0, vocab_size, size=num_output_tokens)
+        output_token_ids.append(generated.tolist())
+        # The prompt length is random too: [1, MAX_NUM_PROMPT_TOKENS).
+        prompt_len = np.random.randint(1, MAX_NUM_PROMPT_TOKENS)
+        prompt = np.random.randint(0, vocab_size, size=prompt_len)
+        prompt_token_ids.append(prompt.tolist())
+    prompt_tensor = _create_prompt_tokens_tensor(prompt_token_ids, vocab_size,
+                                                 device)
+    return SamplingMetadata(
+        temperature=torch.full((batch_size, ), 0.0),
+        all_greedy=True,
+        all_random=False,
+        top_p=torch.empty(batch_size, ),
+        top_k=torch.empty(batch_size, ),
+        no_top_p=True,
+        no_top_k=True,
+        generators={},
+        max_num_logprobs=0,
+        prompt_token_ids=prompt_tensor,
+        output_token_ids=output_token_ids,
+        frequency_penalties=_create_penalty_tensor(batch_size, 0.0, device),
+        presence_penalties=_create_penalty_tensor(batch_size, 0.0, device),
+        repetition_penalties=_create_penalty_tensor(batch_size, 1.0, device),
+        no_penalties=True,
+        min_tokens=[],
+        stop_token_ids=[],
+    )
+
+
+def _generate_min_token_penalties_and_stop_tokens(
+    num_output_tokens: int, batch_size: int, vocab_size: int,
+    batch_indices_for_min_token_penalty: List[int]
+) -> Tuple[List[int], List[Set[int]]]:
+    """
+    Generates and returns a list of `min_tokens` values and a corresponding
+    list of stop token ID sets (`stop_token_ids`), one entry per batch index.
+
+    If a batch index is included in `batch_indices_for_min_token_penalty`,
+    its `min_tokens` is drawn from (num_output_tokens, 2 * num_output_tokens)
+    and a random set of stop token IDs from the whole vocabulary is created.
+    Otherwise `min_tokens` is drawn from [0, num_output_tokens) and the stop
+    token ID set is empty.
+    """
+    penalized_indices = set(batch_indices_for_min_token_penalty)
+    stop_token_ids: List[Set[int]] = []
+    min_tokens: List[int] = []
+    for index in range(batch_size):
+        if index not in penalized_indices:
+            min_tokens.append(np.random.randint(0, num_output_tokens))
+            stop_token_ids.append(set())
+            continue
+        min_tokens.append(
+            np.random.randint(num_output_tokens + 1,
+                              2 * num_output_tokens))
+        # `high` is exclusive: pass vocab_size so the last id can be drawn.
+        num_stop_tokens = np.random.randint(0, vocab_size)
+        stop_token_ids.append(
+            set(np.random.randint(0, vocab_size, num_stop_tokens).tolist()))
+    return (min_tokens, stop_token_ids)
+
+
+def _create_weighted_output_token_list(
+        batch_size: int,
+        vocab_size: int) -> Tuple[List[List[int]], List[List[int]]]:
+    """
+    Builds, for every batch entry, an output token list in which each
+    distinct token appears a different number of times.
+
+    Between 1 and 9 distinct token IDs are sampled from the vocabulary
+    without replacement; the token at position `i` of that sample is then
+    emitted `i + 1` times, so later tokens are strictly more frequent.
+
+    Returns:
+        Tuple[List[List[int]], List[List[int]]]:
+            - The output token lists, one per batch entry, with tokens
+              grouped and repeated according to their weight.
+            - The distinct token IDs of each batch entry, ordered from
+              least to most frequent in the corresponding output list.
+    """
+    output_token_ids: List[List[int]] = []
+    sorted_token_ids_in_output: List[List[int]] = []
+    for _ in range(batch_size):
+        # Draw how many distinct tokens this entry uses, then the tokens.
+        num_distinct = np.random.randint(1, 10)
+        ranked_ids = np.random.choice(vocab_size,
+                                      size=num_distinct,
+                                      replace=False).tolist()
+        sorted_token_ids_in_output.append(ranked_ids)
+        # The token at (0-based) position i contributes i + 1 copies.
+        weighted: List[int] = []
+        for repeat, token_id in enumerate(ranked_ids, start=1):
+            weighted += [token_id] * repeat
+        output_token_ids.append(weighted)
+    return (output_token_ids, sorted_token_ids_in_output)
+
+
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("batch_size", [1, 2, 32])
+def test_sampler_min_tokens_penalty(device: str, batch_size: int):
+    """
+    Tests that if the number of output tokens is less than
+    SamplingParams.min_tokens then we will set the logits for
+    the stop token ids to -inf.
+    """
+    torch.set_default_device(device)
+    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
+    sampling_metadata = _create_default_sampling_metadata(
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
+    # `high` is exclusive, so these bounds let any request be selected.
+    batch_indices_for_min_token_penalty = np.random.randint(
+        0, batch_size, size=np.random.randint(0, batch_size + 1)).tolist()
+    min_tokens, stop_token_ids = _generate_min_token_penalties_and_stop_tokens(
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE,
+        batch_indices_for_min_token_penalty)
+    sampling_metadata.min_tokens = min_tokens
+    sampling_metadata.stop_token_ids = stop_token_ids
+    sampler = Sampler()
+    logits = sampler.apply_penalties(fake_logits, sampling_metadata).cpu()
+    for batch_idx in range(batch_size):
+        for token_id in range(VOCAB_SIZE):
+            if token_id in stop_token_ids[batch_idx]:
+                assert logits[batch_idx][token_id] == -float("inf")
+            else:
+                assert logits[batch_idx][token_id] != -float("inf")
+
+
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("batch_size", [1, 2, 32])
+@pytest.mark.parametrize("presence_penalty", [-2.0, 2.0])
+def test_sampler_presence_penalty(device: str, batch_size: int,
+                                  presence_penalty: float):
+    """
+    Checks that a non-zero presence penalty shifts logits according to
+    whether a token already appears in the request's output: a positive
+    penalty must lower the logits of seen tokens, a negative one must
+    raise them.
+    """
+    torch.set_default_device(device)
+    target_device = torch.device(device)
+    # Every token starts from the same logit, so any difference seen after
+    # `apply_penalties` comes from the penalty alone.
+    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
+    sampling_metadata = _create_default_sampling_metadata(
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, target_device)
+    output_token_ids = sampling_metadata.output_token_ids
+    sampling_metadata.presence_penalties = _create_penalty_tensor(
+        batch_size, presence_penalty, target_device)
+    sampling_metadata.no_penalties = False
+    sampler = Sampler()
+    logits = sampler.apply_penalties(fake_logits, sampling_metadata).cpu()
+    for batch_idx in range(batch_size):
+        row = logits[batch_idx]
+        seen_tokens = output_token_ids[batch_idx]
+        # With uniform starting logits, argmax is a token the penalty left
+        # highest and argmin is a token it pushed lowest.
+        highest_token_id = row.argmax().item()
+        lowest_token_id = row.argmin().item()
+        if presence_penalty > 0:
+            # A positive penalty favours new tokens: the lowest logit must
+            # belong to an already generated token and the highest logit
+            # to a token that has not been generated.
+            assert lowest_token_id in seen_tokens
+            assert highest_token_id not in seen_tokens
+        elif presence_penalty < 0:
+            # A negative penalty favours existing tokens: the highest logit
+            # must belong to an already generated token and the lowest
+            # logit to a token that has not been generated.
+            assert highest_token_id in seen_tokens
+            assert lowest_token_id not in seen_tokens
+
+
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("batch_size", [1, 2, 32])
+@pytest.mark.parametrize("frequency_penalty", [-2.0, 2.0])
+def test_sampler_frequency_penalty(device: str, batch_size: int,
+                                   frequency_penalty: float):
+    """
+    Checks that a non-zero frequency penalty shifts logits according to
+    how often each token occurs in the request's output.
+
+    With a positive penalty the most frequent token must end up with the
+    lowest logit; with a negative penalty it must end up with the highest.
+    """
+    torch.set_default_device(device)
+    target_device = torch.device(device)
+    # Uniform starting logits make the penalty the only source of ordering.
+    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
+    sampling_metadata = _create_default_sampling_metadata(
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, target_device)
+    sampling_metadata.frequency_penalties = _create_penalty_tensor(
+        batch_size, frequency_penalty, target_device)
+    # Replace the random outputs with ones whose token counts are known:
+    # the last distinct token of each entry is the most frequent one.
+    weighted_outputs = _create_weighted_output_token_list(
+        batch_size, VOCAB_SIZE)
+    output_token_ids, sorted_token_ids_in_output = weighted_outputs
+    sampling_metadata.output_token_ids = output_token_ids
+    sampling_metadata.no_penalties = False
+    sampler = Sampler()
+    logits = sampler.apply_penalties(fake_logits, sampling_metadata).cpu()
+
+    # Compare the extreme logits of every request with the known counts.
+    for batch_idx in range(batch_size):
+        row = logits[batch_idx]
+        highest_token_id = row.argmax().item()
+        lowest_token_id = row.argmin().item()
+        tokens_by_frequency = sorted_token_ids_in_output[batch_idx]
+        most_frequent_token_id = tokens_by_frequency[-1]
+        if frequency_penalty > 0:
+            # A positive penalty favours new tokens: the highest logit must
+            # belong to a token absent from the output, and the lowest
+            # logit to the token that occurs most often.
+            assert highest_token_id not in tokens_by_frequency
+            assert lowest_token_id == most_frequent_token_id
+        elif frequency_penalty < 0:
+            # A negative penalty favours existing tokens: the highest logit
+            # must belong to the most frequent token, and the lowest logit
+            # to a token absent from the output.
+            assert highest_token_id == most_frequent_token_id
+            assert lowest_token_id not in tokens_by_frequency
+
+
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("batch_size", [1, 2, 32])
+@pytest.mark.parametrize("repetition_penalty", [0.1, 1.9])
+def test_sampler_repetition_penalty(device: str, batch_size: int,
+                                    repetition_penalty: float):
+    """
+    Checks that a repetition penalty different from 1.0 shifts the logits
+    of tokens that already occur in the prompt or in the existing output,
+    relative to tokens that have not been seen at all.
+    """
+    torch.set_default_device(device)
+    target_device = torch.device(device)
+    # Uniform starting logits make the penalty the only source of ordering.
+    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
+    sampling_metadata = _create_default_sampling_metadata(
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, target_device)
+    sampling_metadata.repetition_penalties = _create_penalty_tensor(
+        batch_size, repetition_penalty, target_device)
+    sampling_metadata.no_penalties = False
+    sampler = Sampler()
+    logits = sampler.apply_penalties(fake_logits, sampling_metadata).cpu()
+    for batch_idx in range(batch_size):
+        row = logits[batch_idx]
+        highest_token_id = row.argmax().item()
+        lowest_token_id = row.argmin().item()
+        prompt_tokens = sampling_metadata.prompt_token_ids[batch_idx].tolist()
+        output_tokens = sampling_metadata.output_token_ids[batch_idx]
+        # "Seen" means present in the prompt or in the existing output.
+        highest_seen = (highest_token_id in prompt_tokens
+                        or highest_token_id in output_tokens)
+        lowest_seen = (lowest_token_id in prompt_tokens
+                       or lowest_token_id in output_tokens)
+        if repetition_penalty > 1.0:
+            # A penalty above 1.0 discourages repeats: the highest logit
+            # must belong to an unseen token and the lowest logit to a
+            # token present in the prompt or the output.
+            assert not highest_seen
+            assert lowest_seen
+        elif repetition_penalty < 1.0:
+            # A penalty below 1.0 encourages repeats: the lowest logit
+            # must belong to an unseen token and the highest logit to a
+            # token present in the prompt or the output.
+            assert not lowest_seen
+            assert highest_seen
diff --git a/tests/v1/worker/__init__.py b/tests/v1/worker/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
new file mode 100644
index 0000000000000..694ce81ff6e22
--- /dev/null
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -0,0 +1,224 @@
+from typing import Dict, List, Set, Tuple
+
+import numpy as np
+import pytest
+import torch
+
+from vllm.sampling_params import SamplingParams
+from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
+
+VOCAB_SIZE = 1024
+NUM_OUTPUT_TOKENS = 20
+MAX_PROMPT_SIZE = 100
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+MAX_NUM_PROMPT_TOKENS = 64
+
+
+def _remove_requests(
+        input_batch: InputBatch, batch_size: int,
+        reqs: List[CachedRequestState]) -> Tuple[Set[str], List[int]]:
+    """
+    Removes a random subset of the requests from the batch and returns a
+    tuple of 1) the set of removed request ids and 2) the batch indices of
+    the removed requests, sorted in descending order.
+    """
+    num_draws = np.random.randint(0, batch_size)
+    # Indices are drawn with replacement, so the set may end up smaller
+    # than `num_draws`.
+    req_indices_to_remove: Set[int] = {
+        np.random.randint(0, batch_size) for _ in range(num_draws)
+    }
+
+    req_ids_to_remove: Set[str] = set()
+    for index in req_indices_to_remove:
+        req_id = reqs[index].req_id
+        input_batch.remove_request(req_id)
+        req_ids_to_remove.add(req_id)
+    descending_indices = sorted(req_indices_to_remove, reverse=True)
+    return (req_ids_to_remove, descending_indices)
+
+
+def _construct_expected_sampling_metadata(
+        reqs: List[CachedRequestState], req_ids_retained: Set[str],
+        req_id_index_in_input_batch: Dict[str, int],
+        device: torch.device) -> SamplingMetadata:
+    """
+    Constructs and returns the expected SamplingMetadata for this
+    batch.
+
+    Args:
+        reqs: Candidate requests; those not retained are skipped.
+        req_ids_retained: Ids of the requests expected in the batch.
+        req_id_index_in_input_batch: Maps a request id to its batch index.
+        device: Device on which the expected tensors are created.
+    """
+    num_reqs = len(req_ids_retained)
+    output_token_ids: List[List[int]] = [[] for _ in range(num_reqs)]
+    prompt_token_ids: List[List[int]] = [[] for _ in range(num_reqs)]
+    presence_penalties = [0.0] * num_reqs
+    frequency_penalties = [0.0] * num_reqs
+    repetition_penalties = [1.0] * num_reqs
+    top_k = [0] * num_reqs
+    top_p = [0.0] * num_reqs
+    temperature = [0.0] * num_reqs
+    stop_token_ids: List[Set[int]] = [set() for _ in range(num_reqs)]
+    min_tokens = [0] * num_reqs
+    for req in reqs:
+        if req.req_id not in req_ids_retained:
+            continue
+        row = req_id_index_in_input_batch[req.req_id]
+        params = req.sampling_params
+        output_token_ids[row] = req.output_token_ids
+        prompt_token_ids[row] = req.prompt_token_ids
+        presence_penalties[row] = params.presence_penalty
+        frequency_penalties[row] = params.frequency_penalty
+        repetition_penalties[row] = params.repetition_penalty
+        top_k[row] = params.top_k
+        top_p[row] = params.top_p
+        temperature[row] = params.temperature
+        stop_token_ids[row] = params.all_stop_token_ids
+        min_tokens[row] = params.min_tokens
+
+    def as_float_tensor(values: List[float]) -> torch.Tensor:
+        return torch.tensor(values, dtype=torch.float, device=device)
+
+    no_penalties = (all(x == 0 for x in presence_penalties)
+                    and all(x == 0 for x in frequency_penalties)
+                    and all(x == 1 for x in repetition_penalties))
+    return SamplingMetadata(
+        temperature=as_float_tensor(temperature),
+        all_greedy=False,
+        all_random=True,
+        top_p=as_float_tensor(top_p),
+        top_k=torch.tensor(top_k, dtype=torch.int, device=device),
+        no_top_p=all(x == 1.0 for x in top_p),
+        no_top_k=all(x == 0 for x in top_k),
+        generators={},
+        max_num_logprobs=0,
+        prompt_token_ids=make_tensor_with_pad(
+            prompt_token_ids,
+            pad=VOCAB_SIZE,
+            device=device,
+            dtype=torch.int64,
+        ),
+        frequency_penalties=as_float_tensor(frequency_penalties),
+        presence_penalties=as_float_tensor(presence_penalties),
+        repetition_penalties=as_float_tensor(repetition_penalties),
+        output_token_ids=output_token_ids,
+        min_tokens=min_tokens,
+        stop_token_ids=stop_token_ids,
+        no_penalties=no_penalties,
+    )
+
+
+def _create_sampling_params():
+    """Return SamplingParams whose sampling settings are drawn at random."""
+    draws = dict(top_k=np.random.randint(1, 10),
+                 top_p=np.random.uniform(0.0, 1.0),
+                 presence_penalty=np.random.uniform(-2.0, 2.0),
+                 repetition_penalty=np.random.uniform(0.0, 2.0),
+                 frequency_penalty=np.random.uniform(-2.0, 2.0),
+                 min_tokens=np.random.randint(1, 10))
+    num_stop = np.random.randint(10)
+    stop_ids = [np.random.randint(0, VOCAB_SIZE) for _ in range(num_stop)]
+    return SamplingParams(stop_token_ids=stop_ids, **draws)
+
+
+def _construct_cached_request_state(req_id_suffix: int):
+    """Return a CachedRequestState `req_id_<suffix>` with random tokens."""
+    prompt_len = np.random.randint(0, MAX_PROMPT_SIZE)
+    prompt_token_ids = [
+        np.random.randint(0, VOCAB_SIZE) for _ in range(prompt_len)
+    ]
+    output_len = np.random.randint(0, NUM_OUTPUT_TOKENS)
+    output_token_ids = [
+        np.random.randint(0, VOCAB_SIZE) for _ in range(output_len)
+    ]
+    return CachedRequestState(
+        req_id=f"req_id_{req_id_suffix}",
+        prompt_token_ids=prompt_token_ids,
+        prompt=None,
+        sampling_params=_create_sampling_params(),
+        mm_inputs=[], mm_positions=[], block_ids=[],
+        generator=None,
+        num_computed_tokens=len(output_token_ids),
+        output_token_ids=output_token_ids)
+
+
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("batch_size", [1, 2, 32, 64])
+def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
+    """
+    Verifies the sampling metadata bookkeeping of the InputBatch.
+
+    The test fills an InputBatch with requests, removes a random subset of
+    them, condenses the batch and then calls `make_sampling_metadata`. The
+    returned metadata is compared field by field with the metadata expected
+    for the requests that remain.
+    """
+    target_device = torch.device(device)
+    input_batch: InputBatch = InputBatch(max_num_reqs=batch_size,
+                                         max_model_len=1024,
+                                         max_num_blocks_per_req=10,
+                                         device=target_device,
+                                         pin_memory=is_pin_memory_available(),
+                                         vocab_size=1024)
+
+    # Populate the batch, adding request `i` at index `i`.
+    reqs: List[CachedRequestState] = []
+    req_id_output_token_ids = {}
+    for req_index in range(batch_size):
+        req: CachedRequestState = _construct_cached_request_state(req_index)
+        input_batch.add_request(req, req_index)
+        reqs.append(req)
+        req_id_output_token_ids[req.req_id] = req.output_token_ids
+
+    # Remove a random subset and work out which request ids are left.
+    req_ids_to_remove, req_indices_to_remove = _remove_requests(
+        input_batch, batch_size, reqs)
+    req_ids_retained = {req.req_id for req in reqs} - req_ids_to_remove
+
+    # Compact the input batch after the removals.
+    input_batch.condense(req_indices_to_remove)
+
+    # Metadata produced by the batch under test.
+    actual = input_batch.make_sampling_metadata(req_id_output_token_ids,
+                                                skip_copy=False)
+
+    # Metadata rebuilt independently from the retained requests.
+    expected = _construct_expected_sampling_metadata(
+        reqs,
+        req_ids_retained,
+        input_batch.req_id_to_index,
+        device=target_device)
+
+    # Tensor-valued fields are compared with `torch.allclose`.
+    tensor_fields = (
+        "temperature",
+        "top_p",
+        "top_k",
+        "frequency_penalties",
+        "presence_penalties",
+        "repetition_penalties",
+        "prompt_token_ids",
+    )
+    for field in tensor_fields:
+        expected_value = getattr(expected, field)
+        actual_value = getattr(actual, field)
+        assert torch.allclose(expected_value, actual_value)
+
+    # The remaining fields are plain Python values and must be equal.
+    plain_fields = (
+        "output_token_ids",
+        "min_tokens",
+        "stop_token_ids",
+        "no_penalties",
+        "no_top_p",
+        "no_top_k",
+    )
+    for field in plain_fields:
+        assert getattr(expected, field) == getattr(actual, field)
diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt
index 2afffb5b9d1c8..a06956ce18a93 100644
--- a/tests/weight_loading/models.txt
+++ b/tests/weight_loading/models.txt
@@ -21,6 +21,8 @@ compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
 compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
 compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main
 compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
+compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-FP8-Dynamic-testing, main, 90
+compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing, main, 90
 awq, casperhansen/mixtral-instruct-awq, main
 awq_marlin, casperhansen/mixtral-instruct-awq, main
 fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
diff --git a/tests/weight_loading/run_model_weight_loading_test.sh b/tests/weight_loading/run_model_weight_loading_test.sh
index a4d0c44c22b51..693128640e07d 100755
--- a/tests/weight_loading/run_model_weight_loading_test.sh
+++ b/tests/weight_loading/run_model_weight_loading_test.sh
@@ -26,6 +26,10 @@ do
     export QUANTIZATION=${array[0]}
     export MODEL_NAME=${array[1]}
     export REVISION=${array[2]}
+    # A fourth field, when present, is MIN_CAPABILITY for this model. Unset
+    # it first so a value from an earlier model cannot leak into this one.
+    unset MIN_CAPABILITY
+    if [ ${#array[@]} -gt 3 ]; then export MIN_CAPABILITY=${array[3]}; fi
     pytest -s weight_loading/test_weight_loading.py || LOCAL_SUCCESS=$?
 
     if [[ $LOCAL_SUCCESS == 0 ]]; then
diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py
index d8bca05e204c0..199731bdc21fe 100644
--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py
@@ -1,14 +1,21 @@
 import os
 
+import pytest
 import torch
 
+from vllm.platforms import current_platform
+
 MAX_MODEL_LEN = 1024
 MODEL_NAME = os.environ.get("MODEL_NAME",
                             "robertgshaw2/zephyr-7b-beta-channelwise-gptq")
 REVISION = os.environ.get("REVISION", "main")
 QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin")
+MIN_CAPABILITY = os.environ.get("MIN_CAPABILITY", "89")
 
 
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(int(MIN_CAPABILITY)),
+    reason="Current system does not have minimum capability.")
 def test_weight_loading(vllm_runner):
     """
     Test parameter weight loading with tp>1.
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index d6002630ee02c..aeacf5dda5761 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1,5 +1,4 @@
 import contextlib
-import functools
 import importlib
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
@@ -36,34 +35,6 @@ def register_fake(fn):
         from torch.library import impl_abstract as register_fake
 
 
-def hint_on_error(fn):
-
-    @functools.wraps(fn)
-    def wrapper(*args, **kwargs):
-        try:
-            return fn(*args, **kwargs)
-
-        except NotImplementedError as e:
-            msg = (
-                "Error in calling custom op %s: %s\n"
-                "Not implemented or built, mostly likely because the current current device "
-                "does not support this kernel (less likely TORCH_CUDA_ARCH_LIST was set "
-                "incorrectly while building)")
-            logger.error(msg, fn.__name__, e)
-            raise NotImplementedError(msg % (fn.__name__, e)) from e
-        except AttributeError as e:
-            msg = (
-                "Error in calling custom op %s: %s\n"
-                "Possibly you have built or installed an obsolete version of vllm.\n"
-                "Please try a clean build and install of vllm,"
-                "or remove old built files such as vllm/*cpython*.so and build/ ."
-            )
-            logger.error(msg, fn.__name__, e)
-            raise e
-
-    return wrapper
-
-
 # activation ops
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
     torch.ops._C.silu_and_mul(out, x)
@@ -552,6 +523,114 @@ def cutlass_scaled_mm_azp(a: torch.Tensor,
     return out
 
 
+def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool:
+    is_supported = torch.ops._C.cutlass_sparse_scaled_mm_supported  # `_C` op
+    return is_supported(cuda_device_capability)
+
+
+def cutlass_sparse_compress(a: torch.Tensor) \
+    -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Compresses a sparse matrix for use with Cutlass sparse operations.
+
+    This function takes a dense tensor and compresses it into two components:
+    non-zero elements and metadata. The compressed representation is compatible
+    with Cutlass sparse kernels.
+
+    Args:
+        a (torch.Tensor):
+            The contiguous input matrix to be compressed; its number of
+            columns must be even. Supported data types: `torch.int8`,
+            `torch.float8_e4m3fn`, `torch.bfloat16` and `torch.float16`.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]:
+            A tuple containing:
+            - `a_nzs` (torch.Tensor): The non-zero elements of `a`.
+            - `a_meta` (torch.Tensor): Metadata for the sparse representation.
+
+    Raises:
+        AssertionError: If `a` violates the requirements listed under Args.
+        ValueError: If the compression operation fails.
+
+    Notes:
+        - The `a_meta` tensor has a data type of `torch.uint8`.
+        - Each metadata element covers 4 non-zero elements (`elemsPerMetaElem`).
+        - `a_nzs` has shape `(m, k // 2)` for an input of shape `(m, k)`.
+        - `a_meta` has shape `(m, k // 2 // elemsPerMetaElem)`.
+    """
+    supported_dtypes = (torch.int8, torch.float8_e4m3fn, torch.bfloat16,
+                        torch.float16)
+    assert a.dtype in supported_dtypes
+    assert a.is_contiguous()
+
+    # a_meta.dtype: torch.uint8 so elemsPerMetaElem = 8b / 2b_per_nz = 4
+    elemsPerMetaElem = 4
+
+    m, k = a.shape[0], a.shape[1]
+    assert k % 2 == 0
+    a_nzs = torch.empty((m, k // 2), dtype=a.dtype, device=a.device)
+    a_meta = torch.empty((m, k // 2 // elemsPerMetaElem),
+                         dtype=torch.uint8,
+                         device=a.device)
+
+    if not torch.ops._C.cutlass_sparse_compress_entry(a_nzs, a_meta, a):
+        raise ValueError(
+            "cutlass_sparse_compress_entry failed for input of shape "
+            f"{tuple(a.shape)} and dtype {a.dtype}")
+
+    assert a_nzs.is_contiguous()
+    assert a_meta.is_contiguous()
+
+    return a_nzs, a_meta
+
+
+def cutlass_scaled_sparse_mm(
+        a: torch.Tensor,
+        bt_nzs: torch.Tensor,
+        bt_meta: torch.Tensor,
+        scale_a: torch.Tensor,
+        scale_b: torch.Tensor,
+        out_dtype: torch.dtype,
+        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """
+    Performs a scaled sparse matrix multiplication using Cutlass.
+
+    `bt_nzs` and `bt_meta` are the tensors returned by
+    `cutlass_sparse_compress` for the transposed sparse operand. Both
+    dimensions of `bt_nzs` must be multiples of 16, `out_dtype` must be
+    `torch.bfloat16` or `torch.float16`, and `bias`, when given, must have
+    a first dimension of `bt_nzs.shape[0]` and dtype `out_dtype`.
+
+    Example usage:
+    1. Create a dense matrix `a` of shape (m, k) on the CUDA device:
+    `a = torch.randn((m, k), device='cuda')`.
+    2. Create a dense matrix `b` of shape (k, n) on the CUDA device:
+    `b = torch.randn((k, n), device='cuda')`.
+    3. Prune matrix `b` to 2:4 sparsity along the specified dimension:
+    `b = prune_to_2_4(b, dim=0)`.
+    4. Compress the transposed sparse matrix `b.t()`:
+    `bt_nzs, bt_meta = cutlass_sparse_compress(b.t())`.
+    5. Multiply, applying the scaling factors for `a` and `b`:
+    `out = cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b, out_dtype)`.
+
+    Returns:
+    - The output tensor of shape (`a.shape[0]`, `bt_nzs.shape[0]`).
+    """
+    rows_ok = bt_nzs.shape[0] % 16 == 0
+    assert rows_ok and bt_nzs.shape[1] % 16 == 0
+    assert out_dtype is torch.float16 or out_dtype is torch.bfloat16
+    if bias is not None:
+        assert bias.shape[0] == bt_nzs.shape[0] and bias.dtype == out_dtype
+
+    out_shape = (a.shape[0], bt_nzs.shape[0])
+    out = torch.empty(out_shape, dtype=out_dtype, device=a.device)
+
+    torch.ops._C.cutlass_scaled_sparse_mm(out, a, bt_nzs, bt_meta, scale_a,
+                                          scale_b, bias)
+    return out
+
+
 # aqlm
 def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor,
               codebooks: torch.Tensor, scales: torch.Tensor,
@@ -993,25 +1072,3 @@ def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
 def register_graph_buffers(fa: int, handles: List[List[int]],
                            offsets: List[List[int]]) -> None:
     torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)
-
-
-# temporary fix for https://github.com/vllm-project/vllm/issues/5456
-# TODO: remove this in v0.6.0
-names_and_values = globals()
-names_and_values_to_update = {}
-# prepare variables to avoid dict size change during iteration
-k, v, arg = None, None, None
-fn_type = type(lambda x: x)
-for k, v in names_and_values.items():
-    # find functions that are defined in this file and have torch.Tensor
-    # in their annotations. `arg == "torch.Tensor"` is used to handle
-    # the case when users use `import __annotations__` to turn type
-    # hints into strings.
-    if isinstance(v, fn_type) \
-        and v.__code__.co_filename == __file__ \
-        and any(arg is torch.Tensor or arg == "torch.Tensor"
-                for arg in v.__annotations__.values()):
-        names_and_values_to_update[k] = hint_on_error(v)
-
-names_and_values.update(names_and_values_to_update)
-del names_and_values_to_update, names_and_values, v, k, fn_type
diff --git a/vllm/adapter_commons/models.py b/vllm/adapter_commons/models.py
index a5c04ab78fbe8..468904c90fff4 100644
--- a/vllm/adapter_commons/models.py
+++ b/vllm/adapter_commons/models.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Hashable, Optional, TypeVar
+from typing import Any, Callable, Dict, Optional, TypeVar
 
 from torch import nn
 
@@ -24,14 +24,13 @@ def from_local_checkpoint(cls, model_dir, model_id=None, **kwargs):
 T = TypeVar('T')
 
 
-class AdapterLRUCache(LRUCache[T]):
+class AdapterLRUCache(LRUCache[int, T]):
 
-    def __init__(self, capacity: int, deactivate_fn: Callable[[Hashable],
-                                                              None]):
+    def __init__(self, capacity: int, deactivate_fn: Callable[[int], object]):
         super().__init__(capacity)
         self.deactivate_fn = deactivate_fn
 
-    def _on_remove(self, key: Hashable, value: Optional[T]):
+    def _on_remove(self, key: int, value: Optional[T]):
         logger.debug("Removing adapter int id: %d", key)
         self.deactivate_fn(key)
         return super()._on_remove(key, value)
diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py
index 49bb6aeee90bc..9033644e3264a 100644
--- a/vllm/assets/audio.py
+++ b/vllm/assets/audio.py
@@ -1,11 +1,17 @@
 from dataclasses import dataclass
-from typing import Literal, Tuple
+from typing import Literal
 from urllib.parse import urljoin
 
-import librosa
-import numpy as np
+import numpy.typing as npt
 
-from vllm.assets.base import get_vllm_public_assets, vLLM_S3_BUCKET_URL
+from vllm.utils import PlaceholderModule
+
+from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
+
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
 
 ASSET_DIR = "multimodal_asset"
 
@@ -15,8 +21,7 @@ class AudioAsset:
     name: Literal["winning_call", "mary_had_lamb"]
 
     @property
-    def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]:
-
+    def audio_and_sample_rate(self) -> tuple[npt.NDArray, int]:
         audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
                                             s3_prefix=ASSET_DIR)
         y, sr = librosa.load(audio_path, sr=None)
@@ -25,4 +30,4 @@ def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]:
 
     @property
     def url(self) -> str:
-        return urljoin(vLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
+        return urljoin(VLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
diff --git a/vllm/assets/base.py b/vllm/assets/base.py
index f97e8c218f65b..249173141106c 100644
--- a/vllm/assets/base.py
+++ b/vllm/assets/base.py
@@ -4,9 +4,8 @@
 
 import vllm.envs as envs
 from vllm.connections import global_http_connection
-from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
 
-vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
+VLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
 
 
 def get_cache_dir() -> Path:
@@ -32,8 +31,8 @@ def get_vllm_public_assets(filename: str,
         if s3_prefix is not None:
             filename = s3_prefix + "/" + filename
         global_http_connection.download_file(
-            f"{vLLM_S3_BUCKET_URL}/{filename}",
+            f"{VLLM_S3_BUCKET_URL}/{filename}",
             asset_path,
-            timeout=VLLM_IMAGE_FETCH_TIMEOUT)
+            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT)
 
     return asset_path
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index 389ecd5c869bc..cb831cb0b5bb4 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -4,7 +4,7 @@
 import torch
 from PIL import Image
 
-from vllm.assets.base import get_vllm_public_assets
+from .base import get_vllm_public_assets
 
 VLM_IMAGES_DIR = "vision_model_images"
 
@@ -15,7 +15,6 @@ class ImageAsset:
 
     @property
     def pil_image(self) -> Image.Image:
-
         image_path = get_vllm_public_assets(filename=f"{self.name}.jpg",
                                             s3_prefix=VLM_IMAGES_DIR)
         return Image.open(image_path)
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index e4dcab10466db..eca2ccc54482c 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -2,13 +2,13 @@
 from functools import lru_cache
 from typing import List, Literal
 
+import cv2
 import numpy as np
 import numpy.typing as npt
 from huggingface_hub import hf_hub_download
 from PIL import Image
 
-from vllm.multimodal.utils import (sample_frames_from_video,
-                                   try_import_video_packages)
+from vllm.multimodal.video import sample_frames_from_video
 
 from .base import get_cache_dir
 
@@ -19,7 +19,7 @@ def download_video_asset(filename: str) -> str:
     Download and open an image from huggingface
     repo: raushan-testing-hf/videos-test
     """
-    video_directory = get_cache_dir() / "video-eample-data"
+    video_directory = get_cache_dir() / "video-example-data"
     video_directory.mkdir(parents=True, exist_ok=True)
 
     video_path = video_directory / filename
@@ -35,8 +35,6 @@ def download_video_asset(filename: str) -> str:
 
 
 def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
-    cv2, _ = try_import_video_packages()
-
     cap = cv2.VideoCapture(path)
     if not cap.isOpened():
         raise ValueError(f"Could not open video file {path}")
@@ -59,7 +57,6 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
 
 def video_to_pil_images_list(path: str,
                              num_frames: int = -1) -> List[Image.Image]:
-    cv2, _ = try_import_video_packages()
     frames = video_to_ndarrays(path, num_frames)
     return [
         Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 19daeb729ee61..480901f71047f 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -430,7 +430,7 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
         # If the feature combo become valid
         if attn_type != AttentionType.DECODER:
             raise NotImplementedError("Encoder self-attention and "
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 05d997279893b..69b6d1e4648df 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -191,6 +191,7 @@ def __init__(
                                         kv_cache_dtype=None,
                                         block_size=16,
                                         is_attention_free=False)
+        attn_backend = backend_name_to_enum(attn_backend.get_name())
         if attn_backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
             attn_backend = _Backend.XFORMERS
 
diff --git a/vllm/block.py b/vllm/block.py
deleted file mode 100644
index 47c381c19383b..0000000000000
--- a/vllm/block.py
+++ /dev/null
@@ -1,88 +0,0 @@
-"""Token blocks."""
-from typing import TYPE_CHECKING, Iterator, List, Optional
-
-from vllm.utils import Device
-
-DEFAULT_LAST_ACCESSED_TIME: float = -1
-
-
-class PhysicalTokenBlock:
-    """Represents the state of a block in the KV cache."""
-
-    def __init__(
-        self,
-        device: Device,
-        block_number: int,
-        block_size: int,
-        block_hash: int,
-        num_hashed_tokens: int,
-    ) -> None:
-        self.device = device
-        self.block_number = block_number
-        self.block_size = block_size
-        self.block_hash = block_hash
-        self.num_hashed_tokens = num_hashed_tokens
-
-        self.ref_count = 0
-        self.last_accessed = DEFAULT_LAST_ACCESSED_TIME
-
-        self.computed = False
-
-    def __repr__(self) -> str:
-        return (f'PhysicalTokenBlock(device={self.device}, '
-                f'block_number={self.block_number}, '
-                f'num_hashed_tokens={self.num_hashed_tokens}, '
-                f'ref_count={self.ref_count}, '
-                f'last_accessed={self.last_accessed}, '
-                f'computed={self.computed})')
-
-
-class BlockTable:
-    """Holds a list of blocks with caching of their associated block_ids 
-    """
-
-    def __init__(self, blocks: Optional[List[PhysicalTokenBlock]] = None):
-        self._blocks: List[PhysicalTokenBlock] = []
-        self._block_ids: List[int] = []
-
-        if blocks is not None:
-            for block in blocks:
-                self.append(block)
-
-    def append(self, block: PhysicalTokenBlock):
-        self._blocks.append(block)
-        self._block_ids.append(block.block_number)
-
-    def __len__(self) -> int:
-        return len(self._blocks)
-
-    def __getitem__(self, key):
-        return self._blocks[key]
-
-    if TYPE_CHECKING:
-
-        def __iter__(self) -> Iterator[PhysicalTokenBlock]:
-            raise RuntimeError("Method should be automatically generated")
-
-    def __setitem__(self, key, value):
-        if isinstance(key, slice):
-            blocks = value
-            self._blocks[key] = blocks
-            self._block_ids[key] = [b.block_number for b in blocks]
-        else:
-            block = value
-            self._blocks[key] = block
-            self._block_ids[key] = block.block_number
-
-    def reset(self):
-        self._blocks = []
-        self._block_ids = []
-
-    def copy(self) -> "BlockTable":
-        return BlockTable(self._blocks)
-
-    def list(self) -> List[PhysicalTokenBlock]:
-        return self._blocks
-
-    def ids(self) -> List[int]:
-        return self._block_ids
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 0c7bbfe599b02..4f960b441f21d 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -141,14 +141,14 @@ def produce_guards_expression(self, *args, **kwargs):
         return ""
 
 
-def wrap_inductor(graph,
+def wrap_inductor(graph: fx.GraphModule,
                   example_inputs,
                   additional_inductor_config,
                   compilation_config: CompilationConfig,
                   graph_index: int = 0,
                   num_graphs: int = 1,
                   runtime_shape: Optional[int] = None,
-                  use_inductor: bool = True):
+                  use_inductor: bool = True) -> Any:
     if graph_index == 0:
         # before compiling the first graph, record the start time
         global compilation_start_time
@@ -208,7 +208,7 @@ def wrap_inductor(graph,
         from torch._inductor.compile_fx import graph_returns_tuple
         returns_tuple = graph_returns_tuple(graph)
 
-        # this is the graph we return to Dynamo to run
+        # this is the callable we return to Dynamo to run
         def compiled_graph(*args):
             # convert args to list
             list_args = list(args)
@@ -247,7 +247,7 @@ def _check_can_cache(*args, **kwargs):
             # see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa
             return
 
-        def _get_shape_env():
+        def _get_shape_env() -> AlwaysHitShapeEnv:
             return AlwaysHitShapeEnv()
 
         with patch(# for hijacking the hash of the compiled graph
@@ -537,6 +537,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
             example_inputs[x].clone() for x in self.sym_tensor_indices
         ]
 
+        # this is the callable we return to Dynamo to run
         def copy_and_call(*args):
             list_args = list(args)
             for i, index in enumerate(self.sym_tensor_indices):
diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py
index 0ad648abfbb3a..b6bcecdc89e26 100644
--- a/vllm/compilation/multi_output_match.py
+++ b/vllm/compilation/multi_output_match.py
@@ -7,6 +7,7 @@
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor import pattern_matcher as pm
 from torch._ops import OpOverload
+from torch.fx import Node
 
 from vllm.compilation.fx_utils import find_auto_fn
 
@@ -97,7 +98,7 @@ def insert_getitems(self, tuple_node: fx.Node,
                 self.graph.call_function(operator.getitem, (tuple_node, idx))
                 for idx in indices)
 
-    def insert_auto_fn(self, op: OpOverload, kwargs):
+    def insert_auto_fn(self, op: OpOverload, kwargs) -> Node:
         """
         Insert an auto_functionalized node with the given op and kwargs.
         """
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index fb522ae053e97..34f5f355798b2 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import Any, Dict, List
 
 from torch import fx as fx
 
@@ -53,7 +53,7 @@ def add(self, pass_: InductorPass):
         assert isinstance(pass_, InductorPass)
         self.passes.append(pass_)
 
-    def __getstate__(self):
+    def __getstate__(self) -> Dict[str, List[Any]]:
         """
         Custom pickling for the pass manager, as some passes cannot be pickled.
         Pickling occurs because the pass manager is set as the value of
diff --git a/vllm/config.py b/vllm/config.py
index 9ecd3e72afa9f..ac767bbe14be4 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -22,12 +22,15 @@
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                      get_quantization_config)
 from vllm.model_executor.models import ModelRegistry
-from vllm.platforms import current_platform
+from vllm.platforms import current_platform, interface
 from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
     get_hf_text_config, get_pooling_config,
-    get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope)
+    get_sentence_transformer_tokenizer_config, is_encoder_decoder,
+    try_get_generation_config, uses_mrope)
+from vllm.transformers_utils.s3_utils import S3Model
+from vllm.transformers_utils.utils import is_s3
 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
                         get_cpu_memory, print_warning_once, random_uuid,
                         resolve_obj_by_qualname)
@@ -148,9 +151,8 @@ class ModelConfig:
             HuggingFace config.
         mm_processor_kwargs: Arguments to be forwarded to the model's processor
             for multi-modal data, e.g., image processor.
-        mm_cache_preprocessor: If true, then enables caching of the multi-modal 
-            preprocessor/mapper. Otherwise, the mapper executes each time, and 
-            for better performance consider enabling frontend process.
+        disable_mm_preprocessor_cache: If true, then disables caching of the
+            multi-modal preprocessor/mapper. (not recommended)
         override_neuron_config: Initialize non default neuron config or
             override default neuron config that are specific to Neuron devices,
             this argument will be used to configure the neuron config that
@@ -159,8 +161,9 @@ class ModelConfig:
             override default pooling config for the pooling model.
         logits_processor_pattern: Optional regex pattern specifying valid
             logits processor qualified names that can be passed with the
-            `logits_processors` extra completion argument. Defaults to None, 
+            `logits_processors` extra completion argument. Defaults to None,
             which allows no processors.
+        generation_config: Configuration parameter file for generation.
     """
 
     def compute_hash(self) -> str:
@@ -216,10 +219,11 @@ def __init__(self,
                  config_format: ConfigFormat = ConfigFormat.AUTO,
                  hf_overrides: Optional[HfOverrides] = None,
                  mm_processor_kwargs: Optional[Dict[str, Any]] = None,
-                 mm_cache_preprocessor: bool = False,
+                 disable_mm_preprocessor_cache: bool = False,
                  override_neuron_config: Optional[Dict[str, Any]] = None,
                  override_pooler_config: Optional["PoolerConfig"] = None,
-                 logits_processor_pattern: Optional[str] = None) -> None:
+                 logits_processor_pattern: Optional[str] = None,
+                 generation_config: Optional[str] = None) -> None:
         self.model = model
         self.tokenizer = tokenizer
         self.tokenizer_mode = tokenizer_mode
@@ -254,6 +258,8 @@ def __init__(self,
                    f"'Please instead use `--hf-overrides '{hf_override!r}'`")
             warnings.warn(DeprecationWarning(msg), stacklevel=2)
 
+        self.maybe_pull_model_tokenizer_for_s3(model, tokenizer)
+
         # The tokenizer version is consistent with the model version by default.
         if tokenizer_revision is None:
             self.tokenizer_revision = revision
@@ -286,7 +292,7 @@ def __init__(self,
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
         self.use_async_output_proc = use_async_output_proc
         self.mm_processor_kwargs = mm_processor_kwargs
-        self.mm_cache_preprocessor = mm_cache_preprocessor
+        self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache
 
         # Set enforce_eager to False if the value is unset.
         if self.enforce_eager is None:
@@ -349,10 +355,36 @@ def __init__(self,
         self.pooler_config = self._init_pooler_config(override_pooler_config)
         self.logits_processor_pattern = logits_processor_pattern
 
+        self.generation_config = generation_config
+
         self._verify_quantization()
         self._verify_cuda_graph()
         self._verify_bnb_config()
 
+    def maybe_pull_model_tokenizer_for_s3(self, model: str,
+                                          tokenizer: str) -> None:
+        """
+        Pull model config / tokenizer files from S3 into temporary directories.
+
+        Args:
+            model: Model name or path; config files are pulled if S3.
+            tokenizer: Tokenizer name or path; pulled from its own S3
+                location (not from `model`) if S3.
+        """
+        if is_s3(model) or is_s3(tokenizer):
+            if is_s3(model):
+                self.s3_model = S3Model()
+                self.s3_model.pull_files(model, allow_pattern=["*config.json"])
+                self.model_weights = self.model
+                self.model = self.s3_model.dir
+
+            if is_s3(tokenizer):
+                self.s3_tokenizer = S3Model()
+                self.s3_tokenizer.pull_files(
+                    tokenizer,
+                    ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
+                self.tokenizer = self.s3_tokenizer.dir
+
     def _init_multimodal_config(
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]
     ) -> Optional["MultiModalConfig"]:
@@ -564,6 +596,12 @@ def _verify_cuda_graph(self) -> None:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
 
+        if (self.hf_config.model_type == 'deepseek_v3'
+                and not self.enforce_eager):
+            logger.warning("CUDA graph is not supported for Deepseek V3 yet, "
+                           "fallback to the eager mode.")
+            self.enforce_eager = True
+
     def _verify_bnb_config(self) -> None:
         """
         The current version of bitsandbytes (0.44.0) with 8-bit models does not
@@ -598,7 +636,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
             self.use_async_output_proc = False
             return
 
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
         # If the feature combo become valid
         if not current_platform.is_async_output_supported(self.enforce_eager):
             logger.warning(
@@ -618,7 +656,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
         if self.runner_type == "pooling":
             self.use_async_output_proc = False
 
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
         # If the feature combo become valid
         if speculative_config:
             logger.warning("Async output processing is not supported with"
@@ -680,8 +718,9 @@ def get_hidden_size(self) -> int:
 
     def get_head_size(self) -> int:
         # TODO remove hard code
-        if hasattr(self.hf_text_config, "model_type"
-                   ) and self.hf_text_config.model_type == 'deepseek_v2':
+        if hasattr(self.hf_text_config,
+                   "model_type") and (self.hf_text_config.model_type
+                                      in ('deepseek_v2', 'deepseek_v3')):
             # FlashAttention supports only head_size 32, 64, 128, 256,
             # we need to pad head_size 192 to 256
             return 256
@@ -814,6 +853,56 @@ def get_multimodal_config(self) -> "MultiModalConfig":
 
         return self.multimodal_config
 
+    def try_get_generation_config(self) -> Dict[str, Any]:
+        """Return `to_diff_dict()` of the loaded generation config, or `{}`.
+
+        When `generation_config` is None or "auto" the config is looked up
+        from `self.model` at `self.revision`; otherwise it is looked up from
+        the `generation_config` path itself.
+        """
+        location = self.generation_config
+        extra_kwargs: Dict[str, Any] = {}
+        if location is None or location == "auto":
+            # Fall back to the model's own files, pinned to its revision.
+            location = self.model
+            extra_kwargs["revision"] = self.revision
+
+        loaded = try_get_generation_config(
+            location, trust_remote_code=self.trust_remote_code, **extra_kwargs)
+        return {} if loaded is None else loaded.to_diff_dict()
+
+    def get_diff_sampling_param(self) -> Dict[str, Any]:
+        """
+        Collect the sampling parameters carried by the generation config.
+
+        Only `repetition_penalty`, `temperature`, `top_k`, `top_p` and
+        `min_p` are considered, and entries whose value is None are
+        dropped.
+
+        Returns:
+            Dict[str, Any]: The selected parameters, or an empty dictionary
+            when `generation_config` is unset or none of them is present.
+        """
+        # Nothing to override unless a generation config was requested.
+        if self.generation_config is None:
+            return {}
+
+        supported_keys = (
+            "repetition_penalty",
+            "temperature",
+            "top_k",
+            "top_p",
+            "min_p",
+        )
+        loaded = self.try_get_generation_config()
+        selected: Dict[str, Any] = {}
+        for key in supported_keys:
+            # `.get` yields None for absent keys, which are skipped below.
+            value = loaded.get(key)
+            if value is not None:
+                selected[key] = value
+        return selected
+
     @property
     def is_encoder_decoder(self) -> bool:
         """Extract the HF encoder/decoder model flag."""
@@ -917,6 +1006,10 @@ def _verify_args(self) -> None:
             raise ValueError(
                 "GPU memory utilization must be less than 1.0. Got "
                 f"{self.gpu_memory_utilization}.")
+        if (current_platform.is_cuda() and self.block_size is not None
+                and self.block_size > 32):
+            raise ValueError("CUDA Paged Attention kernel only supports "
+                             f"block sizes up to 32. Got {self.block_size}.")
 
     def _verify_cache_dtype(self) -> None:
         if self.cache_dtype == "auto":
@@ -1041,6 +1134,7 @@ class LoadFormat(str, enum.Enum):
     GGUF = "gguf"
     BITSANDBYTES = "bitsandbytes"
     MISTRAL = "mistral"
+    RUNAI_STREAMER = "runai_streamer"
 
 
 @dataclass
@@ -1963,7 +2057,7 @@ def verify_with_model_config(self, model_config: ModelConfig):
                            model_config.quantization)
 
     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
         # If the feature combo become valid
         if scheduler_config.chunked_prefill_enabled:
             logger.warning("LoRA with chunked prefill is still experimental "
@@ -2141,6 +2235,17 @@ def _get_and_verify_dtype(
             else:
                 torch_dtype = config_dtype
 
+            if (current_platform.is_cpu()
+                    and current_platform.get_cpu_architecture()
+                    == interface.CpuArchEnum.POWERPC
+                    and (config_dtype == torch.float16
+                         or config_dtype == torch.float32)):
+                logger.info(
+                    "For POWERPC, we cast models to bfloat16 instead of "
+                    "using float16 by default. Float16 is not currently "
+                    "supported for POWERPC.")
+                torch_dtype = torch.bfloat16
+
             if current_platform.is_hpu() and config_dtype == torch.float16:
                 logger.info(
                     "For HPU, we cast models to bfloat16 instead of"
@@ -3151,7 +3256,7 @@ def __str__(self):
             f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
             f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, "  # noqa
             f"use_async_output_proc={self.model_config.use_async_output_proc}, "
-            f"mm_cache_preprocessor={self.model_config.mm_cache_preprocessor!r}, "  # noqa
+            f"disable_mm_preprocessor_cache={self.model_config.disable_mm_preprocessor_cache!r}, "  # noqa
             f"mm_processor_kwargs={self.model_config.mm_processor_kwargs}, "
             f"pooler_config={self.model_config.pooler_config!r}, "
             f"compilation_config={self.compilation_config!r}")
diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py
index 44adc4158abec..c9306518223a3 100644
--- a/vllm/core/evictor.py
+++ b/vllm/core/evictor.py
@@ -13,7 +13,7 @@ class EvictionPolicy(enum.Enum):
 
 class Evictor(ABC):
     """The Evictor subclasses should be used by the BlockAllocator class to
-    handle eviction of freed PhysicalTokenBlocks.
+    handle eviction of freed Blocks.
     """
 
     @abstractmethod
@@ -70,7 +70,7 @@ def __init__(self, content_hash: int, num_hashed_tokens: int,
 
 class LRUEvictor(Evictor):
     """Evicts in a least-recently-used order using the last_accessed timestamp
-    that's recorded in the PhysicalTokenBlock. If there are multiple blocks with
+    that's recorded in the Block. If there are multiple blocks with
     the same last_accessed time, then the one with the largest num_hashed_tokens
     will be evicted. If two blocks each have the lowest last_accessed time and
     highest num_hashed_tokens value, then one will be chose arbitrarily
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index f6d276fe7c0c8..21966d003c7ef 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -141,7 +141,7 @@ class EngineArgs:
     tokenizer_pool_extra_config: Optional[Dict[str, Any]] = None
     limit_mm_per_prompt: Optional[Mapping[str, int]] = None
     mm_processor_kwargs: Optional[Dict[str, Any]] = None
-    mm_cache_preprocessor: bool = False
+    disable_mm_preprocessor_cache: bool = False
     enable_lora: bool = False
     enable_lora_bias: bool = False
     max_loras: int = 1
@@ -197,6 +197,8 @@ class EngineArgs:
 
     kv_transfer_config: Optional[KVTransferConfig] = None
 
+    generation_config: Optional[str] = None
+
     def __post_init__(self):
         if not self.tokenizer:
             self.tokenizer = self.model
@@ -205,6 +207,7 @@ def __post_init__(self):
         # by user.
         if self.enable_prefix_caching is None:
             self.enable_prefix_caching = bool(envs.VLLM_USE_V1)
+
         # Override max_num_seqs if it's not set by user.
         if self.max_num_seqs is None:
             self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024
@@ -313,6 +316,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             '* "tensorizer" will load the weights using tensorizer from '
             'CoreWeave. See the Tensorize vLLM Model script in the Examples '
             'section for more information.\n'
+            '* "runai_streamer" will load the Safetensors weights using Run:ai'
+            'Model Streamer \n'
             '* "bitsandbytes" will load the weights using bitsandbytes '
             'quantization.\n')
         parser.add_argument(
@@ -368,7 +373,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             choices=['outlines', 'lm-format-enforcer', 'xgrammar'],
             help='Which engine will be used for guided decoding'
             ' (JSON schema / regex etc) by default. Currently support '
-            'https://github.com/outlines-dev/outlines,'
+            'https://github.com/outlines-dev/outlines, '
             'https://github.com/mlc-ai/xgrammar, and '
             'https://github.com/noamgat/lm-format-enforcer.'
             ' Can be overridden per request via guided_decoding_backend'
@@ -423,10 +428,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parser.add_argument('--block-size',
                             type=int,
                             default=EngineArgs.block_size,
-                            choices=[8, 16, 32],
+                            choices=[8, 16, 32, 64, 128],
                             help='Token block size for contiguous chunks of '
                             'tokens. This is ignored on neuron devices and '
-                            'set to max-model-len')
+                            'set to max-model-len. On CUDA devices, '
+                            'only block sizes up to 32 are supported. '
+                            'On HPU devices, block size defaults to 128.')
 
         parser.add_argument(
             "--enable-prefix-caching",
@@ -603,11 +610,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             help=('Overrides for the multimodal input mapping/processing, '
                   'e.g., image processor. For example: {"num_crops": 4}.'))
         parser.add_argument(
-            '--mm-cache-preprocessor',
+            '--disable-mm-preprocessor-cache',
             action='store_true',
-            help='If true, then enables caching of the multi-modal '
-            'preprocessor/mapper. Otherwise, the mapper executes each time'
-            ', and for better performance consider enabling frontend process.')
+            help='If true, then disables caching of the multi-modal '
+            'preprocessor/mapper. (not recommended)')
 
         # LoRA related configs
         parser.add_argument('--enable-lora',
@@ -940,6 +946,16 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default="auto",
             help='The worker class to use for distributed execution.')
 
+        parser.add_argument(
+            "--generation-config",
+            type=nullable_str,
+            default=None,
+            help="The folder path to the generation config. "
+            "Defaults to None, will use the default generation config in vLLM. "
+            "If set to 'auto', the generation config will be automatically "
+            "loaded from model. If set to a folder path, the generation config "
+            "will be loaded from the specified folder path.")
+
         return parser
 
     @classmethod
@@ -980,10 +996,11 @@ def create_model_config(self) -> ModelConfig:
             use_async_output_proc=not self.disable_async_output_proc,
             config_format=self.config_format,
             mm_processor_kwargs=self.mm_processor_kwargs,
-            mm_cache_preprocessor=self.mm_cache_preprocessor,
+            disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache,
             override_neuron_config=self.override_neuron_config,
             override_pooler_config=self.override_pooler_config,
-            logits_processor_pattern=self.logits_processor_pattern)
+            logits_processor_pattern=self.logits_processor_pattern,
+            generation_config=self.generation_config)
 
     def create_load_config(self) -> LoadConfig:
         return LoadConfig(
@@ -1026,11 +1043,11 @@ def create_engine_config(self,
         device_config = DeviceConfig(device=self.device)
         model_config = self.create_model_config()
 
-        if model_config.is_multimodal_model:
-            if self.enable_prefix_caching:
-                logger.warning(
-                    "--enable-prefix-caching is currently not "
-                    "supported for multimodal models and has been disabled.")
+        if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
+                and self.enable_prefix_caching):
+            logger.warning("--enable-prefix-caching is currently not "
+                           "supported for multimodal models in v0 and "
+                           "has been disabled.")
             self.enable_prefix_caching = False
 
         cache_config = CacheConfig(
@@ -1131,7 +1148,7 @@ def create_engine_config(self,
             disable_logprobs=self.disable_logprobs_during_spec_decoding,
         )
 
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
         # If the feature combo become valid
         if self.num_scheduler_steps > 1:
             if speculative_config is not None:
@@ -1249,11 +1266,14 @@ def _override_v1_engine_args(self, usage_context: UsageContext) -> None:
         # When no user override, set the default values based on the usage
         # context.
         # TODO(woosuk): Tune the default values for different hardware.
-        if self.max_num_batched_tokens is None:
-            if usage_context == UsageContext.LLM_CLASS:
-                self.max_num_batched_tokens = 8192
-            elif usage_context == UsageContext.OPENAI_API_SERVER:
-                self.max_num_batched_tokens = 2048
+        default_max_num_batched_tokens = {
+            UsageContext.LLM_CLASS: 8192,
+            UsageContext.OPENAI_API_SERVER: 2048,
+        }
+        if (self.max_num_batched_tokens is None
+                and usage_context in default_max_num_batched_tokens):
+            self.max_num_batched_tokens = default_max_num_batched_tokens[
+                usage_context]
             logger.warning(
                 "Setting max_num_batched_tokens to %d for %s usage context.",
                 self.max_num_batched_tokens, usage_context.value)
@@ -1263,9 +1283,6 @@ def _override_v1_engine_config(self, engine_config: VllmConfig) -> None:
         Override the EngineConfig's configs based on the usage context for V1.
         """
         assert envs.VLLM_USE_V1, "V1 is not enabled"
-        if engine_config.model_config.is_multimodal_model:
-            # TODO (ywang96): Enable APC by default when VLM supports it.
-            assert not engine_config.cache_config.enable_prefix_caching
 
 
 @dataclass
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index f50e20cf70323..66a5089074ff5 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1256,3 +1256,10 @@ async def stop_profile(self) -> None:
             self.engine.model_executor.stop_profile()
         else:
             self.engine.model_executor._run_workers("stop_profile")
+
+
+# TODO(v1): Remove this class proxy when V1 goes default.
+if envs.VLLM_USE_V1:
+    from vllm.v1.engine.async_llm import AsyncLLM
+
+    AsyncLLMEngine = AsyncLLM  # type: ignore
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index dc2d77d6927cd..39f59e55da1f7 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -5,8 +5,8 @@
 from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import partial
-from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
-                    Iterable, List, Mapping, NamedTuple, Optional)
+from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable,
+                    List, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
 from typing import Set, Type, Union, cast, overload
 
@@ -52,7 +52,6 @@
                            SequenceGroupOutput, SequenceStatus)
 from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
                           init_tracer)
-from vllm.transformers_utils.config import try_get_generation_config
 from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import (
@@ -65,20 +64,6 @@
 logger = init_logger(__name__)
 _LOCAL_LOGGING_INTERVAL_SEC = 5
 
-
-def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:
-    config = try_get_generation_config(
-        model_config.model,
-        trust_remote_code=model_config.trust_remote_code,
-        revision=model_config.revision,
-    )
-
-    if config is None:
-        return {}
-
-    return config.to_diff_dict()
-
-
 _G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup)
 _O = TypeVar("_O", RequestOutput, PoolingRequestOutput)
 
@@ -148,7 +133,7 @@ class LLMEngine:
     and the :class:`AsyncLLMEngine` class wraps this class for online serving.
 
     The config arguments are derived from :class:`~vllm.EngineArgs`. (See
-    :ref:`engine_args`)
+    :ref:`engine-args`)
 
     Args:
         model_config: The configuration related to the LLM model.
@@ -274,8 +259,8 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
             return tokenizer_group.get_lora_tokenizer(sequence.lora_request)
 
         self.seq_counter = Counter()
-        self.generation_config_fields = _load_generation_config_dict(
-            self.model_config)
+        self.generation_config_fields = (
+            self.model_config.try_get_generation_config())
 
         self.input_preprocessor = InputPreprocessor(self.model_config,
                                                     self.tokenizer,
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index a9b638ed02a1e..1c6f735f39e04 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -65,7 +65,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
     @staticmethod
     @functools.lru_cache
     def _log_prompt_logprob_unsupported_warning_once():
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
         # If the feature combo become valid
         logger.warning(
             "Prompt logprob is not supported by multi step workers. "
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index 95da1c6e7b9bf..daefbff7e5178 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -21,7 +21,7 @@
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser, random_uuid
+from vllm.utils import FlexibleArgumentParser, random_uuid, set_ulimit
 from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger("vllm.entrypoints.api_server")
@@ -119,6 +119,8 @@ async def run_server(args: Namespace,
     logger.info("vLLM API server version %s", VLLM_VERSION)
     logger.info("args: %s", args)
 
+    set_ulimit()
+
     app = await init_app(args, llm_engine)
     assert engine is not None
 
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 58ab892676b9a..fadf297e9f6aa 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -115,7 +115,7 @@ class LLM:
             integer, it is used as the level of compilation optimization. If it
             is a dictionary, it can specify the full compilation configuration.
         **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
-            :ref:`engine_args`)
+            :ref:`engine-args`)
 
     Note:
         This class is intended to be used for offline inference. For online
@@ -233,7 +233,8 @@ def __init__(
         self.request_counter = Counter()
 
     def __del__(self):
-        if self.llm_engine and hasattr(self.llm_engine, "shutdown"):
+        if hasattr(self, 'llm_engine') and self.llm_engine and hasattr(
+                self.llm_engine, "shutdown"):
             self.llm_engine.shutdown()
 
     @staticmethod
@@ -258,6 +259,13 @@ def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
         else:
             tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer)
 
+    def get_default_sampling_params(self) -> SamplingParams:
+        diff_sampling_param = (
+            self.llm_engine.model_config.get_diff_sampling_param())
+        if diff_sampling_param:
+            return SamplingParams.from_optional(**diff_sampling_param)
+        return SamplingParams()
+
     @overload
     def generate(
         self,
@@ -441,7 +449,7 @@ def generate(
 
         if sampling_params is None:
             # Use default sampling params.
-            sampling_params = SamplingParams()
+            sampling_params = self.get_default_sampling_params()
 
         self._validate_and_add_requests(
             prompts=parsed_prompts,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 00e2d1a56f160..2e45b474237f9 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -27,6 +27,7 @@
 import vllm.envs as envs
 from vllm.config import ModelConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.engine.multiprocessing.engine import run_mp_engine
 from vllm.engine.protocol import EngineClient
@@ -44,8 +45,11 @@
                                               DetokenizeRequest,
                                               DetokenizeResponse,
                                               EmbeddingRequest,
-                                              EmbeddingResponse, ErrorResponse,
+                                              EmbeddingResponse,
+                                              EmbeddingResponseData,
+                                              ErrorResponse,
                                               LoadLoraAdapterRequest,
+                                              PoolingRequest, PoolingResponse,
                                               ScoreRequest, ScoreResponse,
                                               TokenizeRequest,
                                               TokenizeResponse,
@@ -55,6 +59,7 @@
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
 from vllm.entrypoints.openai.serving_score import OpenAIServingScores
 from vllm.entrypoints.openai.serving_tokenization import (
     OpenAIServingTokenization)
@@ -63,14 +68,9 @@
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path,
-                        is_valid_ipv6_address)
+                        is_valid_ipv6_address, set_ulimit)
 from vllm.version import __version__ as VLLM_VERSION
 
-if envs.VLLM_USE_V1:
-    from vllm.v1.engine.async_llm import AsyncLLMEngine  # type: ignore
-else:
-    from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore
-
 TIMEOUT_KEEP_ALIVE = 5  # seconds
 
 prometheus_multiproc_dir: tempfile.TemporaryDirectory
@@ -288,6 +288,10 @@ def completion(request: Request) -> Optional[OpenAIServingCompletion]:
     return request.app.state.openai_serving_completion
 
 
+def pooling(request: Request) -> Optional[OpenAIServingPooling]:
+    return request.app.state.openai_serving_pooling
+
+
 def embedding(request: Request) -> Optional[OpenAIServingEmbedding]:
     return request.app.state.openai_serving_embedding
 
@@ -399,10 +403,36 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
 async def create_embedding(request: EmbeddingRequest, raw_request: Request):
     handler = embedding(raw_request)
     if handler is None:
-        return base(raw_request).create_error_response(
-            message="The model does not support Embeddings API")
+        fallback_handler = pooling(raw_request)
+        if fallback_handler is None:
+            return base(raw_request).create_error_response(
+                message="The model does not support Embeddings API")
+
+        logger.warning(
+            "Embeddings API will become exclusive to embedding models "
+            "in a future release. To return the hidden states directly, "
+            "use the Pooling API (`/pooling`) instead.")
+
+        res = await fallback_handler.create_pooling(request, raw_request)
+        if isinstance(res, PoolingResponse):
+            generator = EmbeddingResponse(
+                id=res.id,
+                object=res.object,
+                created=res.created,
+                model=res.model,
+                data=[
+                    EmbeddingResponseData(
+                        index=d.index,
+                        embedding=d.data,  # type: ignore
+                    ) for d in res.data
+                ],
+                usage=res.usage,
+            )
+        else:
+            generator = res
+    else:
+        generator = await handler.create_embedding(request, raw_request)
 
-    generator = await handler.create_embedding(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
@@ -412,6 +442,24 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
     assert_never(generator)
 
 
+@router.post("/pooling")
+@with_cancellation
+async def create_pooling(request: PoolingRequest, raw_request: Request):
+    handler = pooling(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support Pooling API")
+
+    generator = await handler.create_pooling(request, raw_request)
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(content=generator.model_dump(),
+                            status_code=generator.code)
+    elif isinstance(generator, PoolingResponse):
+        return JSONResponse(content=generator.model_dump())
+
+    assert_never(generator)
+
+
 @router.post("/score")
 @with_cancellation
 async def create_score(request: ScoreRequest, raw_request: Request):
@@ -537,12 +585,18 @@ async def authentication(request: Request, call_next):
                                     status_code=401)
             return await call_next(request)
 
-    @app.middleware("http")
-    async def add_request_id(request: Request, call_next):
-        request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex
-        response = await call_next(request)
-        response.headers["X-Request-Id"] = request_id
-        return response
+    if args.enable_request_id_headers:
+        logger.warning(
+            "CAUTION: Enabling X-Request-Id headers in the API Server. "
+            "This can harm performance at high QPS.")
+
+        @app.middleware("http")
+        async def add_request_id(request: Request, call_next):
+            request_id = request.headers.get(
+                "X-Request-Id") or uuid.uuid4().hex
+            response = await call_next(request)
+            response.headers["X-Request-Id"] = request_id
+            return response
 
     for middleware in args.middleware:
         module_path, object_name = middleware.rsplit(".", 1)
@@ -609,7 +663,7 @@ def init_app_state(
         request_logger=request_logger,
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
     ) if model_config.runner_type == "generate" else None
-    state.openai_serving_embedding = OpenAIServingEmbedding(
+    state.openai_serving_pooling = OpenAIServingPooling(
         engine_client,
         model_config,
         base_model_paths,
@@ -617,13 +671,20 @@ def init_app_state(
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
     ) if model_config.runner_type == "pooling" else None
+    state.openai_serving_embedding = OpenAIServingEmbedding(
+        engine_client,
+        model_config,
+        base_model_paths,
+        request_logger=request_logger,
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
+    ) if model_config.task == "embed" else None
     state.openai_serving_scores = OpenAIServingScores(
         engine_client,
         model_config,
         base_model_paths,
         request_logger=request_logger
-    ) if (model_config.runner_type == "pooling" \
-          and model_config.is_cross_encoder) else None
+    ) if model_config.task == "score" else None
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
         model_config,
@@ -666,6 +727,10 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     sock_addr = (args.host or "", args.port)
     sock = create_server_socket(sock_addr)
 
+    # workaround to avoid footguns where uvicorn drops requests with too
+    # many concurrent requests active
+    set_ulimit()
+
     def signal_handler(*_) -> None:
         # Interrupt server on sigterm while initializing
         raise KeyboardInterrupt("terminated")
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 24c206a1261f2..908f8c3532c9e 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -196,7 +196,11 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         action="store_true",
         help="If specified, will run the OpenAI frontend server in the same "
         "process as the model serving engine.")
-
+    parser.add_argument(
+        "--enable-request-id-headers",
+        action="store_true",
+        help="If specified, API server will add X-Request-Id header to "
+        "responses. Caution: this hurts performance at high QPS.")
     parser.add_argument(
         "--enable-auto-tool-choice",
         action="store_true",
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 5a70e0952666b..14e41346df775 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -46,7 +46,15 @@ class OpenAIBaseModel(BaseModel):
     @classmethod
     def __log_extra_fields__(cls, data):
         if isinstance(data, dict):
-            extra_fields = data.keys() - cls.model_fields.keys()
+            # Get all class field names and their potential aliases
+            field_names = set()
+            for field_name, field in cls.model_fields.items():
+                field_names.add(field_name)
+                if hasattr(field, 'alias') and field.alias:
+                    field_names.add(field.alias)
+
+            # Compare against both field names and aliases
+            extra_fields = data.keys() - field_names
             if extra_fields:
                 logger.warning(
                     "The following fields were present in the request "
@@ -211,8 +219,8 @@ class ChatCompletionRequest(OpenAIBaseModel):
     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
     stream: Optional[bool] = False
     stream_options: Optional[StreamOptions] = None
-    temperature: Optional[float] = 1.0
-    top_p: Optional[float] = 1.0
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
     tools: Optional[List[ChatCompletionToolsParam]] = None
     tool_choice: Optional[Union[Literal["none"], Literal["auto"],
                                 ChatCompletionNamedToolChoiceParam]] = "none"
@@ -224,9 +232,9 @@ class ChatCompletionRequest(OpenAIBaseModel):
     # doc: begin-chat-completion-sampling-params
     best_of: Optional[int] = None
     use_beam_search: bool = False
-    top_k: int = -1
-    min_p: float = 0.0
-    repetition_penalty: float = 1.0
+    top_k: Optional[int] = None
+    min_p: Optional[float] = None
+    repetition_penalty: Optional[float] = None
     length_penalty: float = 1.0
     stop_token_ids: Optional[List[int]] = Field(default_factory=list)
     include_stop_str_in_output: bool = False
@@ -348,15 +356,32 @@ class ChatCompletionRequest(OpenAIBaseModel):
 
     # doc: end-chat-completion-extra-params
 
-    def to_beam_search_params(self,
-                              default_max_tokens: int) -> BeamSearchParams:
+    # Default sampling parameters for chat completion requests
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+    }
+
+    def to_beam_search_params(
+            self,
+            default_max_tokens: int,
+            default_sampling_params: Optional[dict] = None
+    ) -> BeamSearchParams:
         # TODO(#9845): remove max_tokens when field is removed from OpenAI API
         max_tokens = self.max_completion_tokens or self.max_tokens
         if max_tokens is None:
             max_tokens = default_max_tokens
 
+        if default_sampling_params is None:
+            default_sampling_params = {}
         n = self.n if self.n is not None else 1
-        temperature = self.temperature if self.temperature is not None else 0.0
+
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
 
         return BeamSearchParams(
             beam_width=n,
@@ -367,13 +392,36 @@ def to_beam_search_params(self,
             include_stop_str_in_output=self.include_stop_str_in_output)
 
     def to_sampling_params(
-            self, default_max_tokens: int,
-            logits_processor_pattern: Optional[str]) -> SamplingParams:
+            self,
+            default_max_tokens: int,
+            logits_processor_pattern: Optional[str],
+            default_sampling_params: Optional[dict] = None) -> SamplingParams:
         # TODO(#9845): remove max_tokens when field is removed from OpenAI API
         max_tokens = self.max_completion_tokens or self.max_tokens
         if max_tokens is None:
             max_tokens = default_max_tokens
 
+        if default_sampling_params is None:
+            default_sampling_params = {}
+        # Default parameters
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+            )
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
         prompt_logprobs = self.prompt_logprobs
         if prompt_logprobs is None and self.echo:
             prompt_logprobs = self.top_logprobs
@@ -403,11 +451,11 @@ def to_sampling_params(
             best_of=self.best_of,
             presence_penalty=self.presence_penalty,
             frequency_penalty=self.frequency_penalty,
-            repetition_penalty=self.repetition_penalty,
-            temperature=self.temperature,
-            top_p=self.top_p,
-            top_k=self.top_k,
-            min_p=self.min_p,
+            repetition_penalty=repetition_penalty,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
             seed=self.seed,
             stop=self.stop,
             stop_token_ids=self.stop_token_ids,
@@ -584,15 +632,15 @@ class CompletionRequest(OpenAIBaseModel):
     stream: Optional[bool] = False
     stream_options: Optional[StreamOptions] = None
     suffix: Optional[str] = None
-    temperature: Optional[float] = 1.0
-    top_p: Optional[float] = 1.0
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
     user: Optional[str] = None
 
     # doc: begin-completion-sampling-params
     use_beam_search: bool = False
-    top_k: int = -1
-    min_p: float = 0.0
-    repetition_penalty: float = 1.0
+    top_k: Optional[int] = None
+    min_p: Optional[float] = None
+    repetition_penalty: Optional[float] = None
     length_penalty: float = 1.0
     stop_token_ids: Optional[List[int]] = Field(default_factory=list)
     include_stop_str_in_output: bool = False
@@ -669,14 +717,30 @@ class CompletionRequest(OpenAIBaseModel):
 
     # doc: end-completion-extra-params
 
-    def to_beam_search_params(self,
-                              default_max_tokens: int) -> BeamSearchParams:
+    # Default sampling parameters for completion requests
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+    }
+
+    def to_beam_search_params(
+            self,
+            default_max_tokens: int,
+            default_sampling_params: Optional[dict] = None
+    ) -> BeamSearchParams:
         max_tokens = self.max_tokens
         if max_tokens is None:
             max_tokens = default_max_tokens
 
+        if default_sampling_params is None:
+            default_sampling_params = {}
         n = self.n if self.n is not None else 1
-        temperature = self.temperature if self.temperature is not None else 0.0
+
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get("temperature", 1.0)
 
         return BeamSearchParams(
             beam_width=n,
@@ -687,12 +751,35 @@ def to_beam_search_params(self,
             include_stop_str_in_output=self.include_stop_str_in_output)
 
     def to_sampling_params(
-            self, default_max_tokens: int,
-            logits_processor_pattern: Optional[str]) -> SamplingParams:
+            self,
+            default_max_tokens: int,
+            logits_processor_pattern: Optional[str],
+            default_sampling_params: Optional[dict] = None) -> SamplingParams:
         max_tokens = self.max_tokens
         if max_tokens is None:
             max_tokens = default_max_tokens
 
+        if default_sampling_params is None:
+            default_sampling_params = {}
+        # Default parameters
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+            )
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
         prompt_logprobs = self.prompt_logprobs
         if prompt_logprobs is None and self.echo:
             prompt_logprobs = self.logprobs
@@ -718,11 +805,11 @@ def to_sampling_params(
             best_of=self.best_of,
             presence_penalty=self.presence_penalty,
             frequency_penalty=self.frequency_penalty,
-            repetition_penalty=self.repetition_penalty,
-            temperature=self.temperature,
-            top_p=self.top_p,
-            top_k=self.top_k,
-            min_p=self.min_p,
+            repetition_penalty=repetition_penalty,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
             seed=self.seed,
             stop=self.stop,
             stop_token_ids=self.stop_token_ids,
@@ -876,6 +963,10 @@ def to_pooling_params(self):
 
 EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest]
 
+PoolingCompletionRequest = EmbeddingCompletionRequest
+PoolingChatRequest = EmbeddingChatRequest
+PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest]
+
 
 class ScoreRequest(OpenAIBaseModel):
     model: str
@@ -971,6 +1062,21 @@ class EmbeddingResponse(OpenAIBaseModel):
     usage: UsageInfo
 
 
+class PoolingResponseData(OpenAIBaseModel):  # one item of a pooling response
+    index: int  # position of this item in the response's `data` list
+    object: str = "pooling"
+    data: Union[List[List[float]], List[float], str]  # floats or base64 str
+
+
+class PoolingResponse(OpenAIBaseModel):  # returned by create_pooling
+    id: str = Field(default_factory=lambda: f"pool-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))  # epoch s
+    model: str
+    data: List[PoolingResponseData]  # one entry per input prompt
+    usage: UsageInfo  # prompt token counts
+
+
 class ScoreResponseData(OpenAIBaseModel):
     index: int
     object: str = "score"
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 675daf54c0d0d..572ed27b39083 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -232,7 +232,7 @@ async def main(args):
         request_logger=request_logger,
         chat_template=None,
         chat_template_content_format="auto",
-    ) if model_config.runner_type == "pooling" else None
+    ) if model_config.task == "embed" else None
 
     tracker = BatchProgressTracker()
     logger.info("Reading batch from %s...", args.input_file)
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 81bce0dd370bb..d085333563d19 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -91,6 +91,10 @@ def __init__(
                                 "been registered") from e
 
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
+        diff_sampling_param = self.model_config.get_diff_sampling_param()
+        if diff_sampling_param:
+            logger.info("Overwriting default chat sampling param with: %s",
+                        diff_sampling_param)
 
     async def create_chat_completion(
         self,
@@ -191,13 +195,17 @@ async def create_chat_completion(
                 sampling_params: Union[SamplingParams, BeamSearchParams]
                 default_max_tokens = self.max_model_len - len(
                     engine_prompt["prompt_token_ids"])
+                # Build default sampling params
+                default_sampling_params = (
+                    self.model_config.get_diff_sampling_param())
                 if request.use_beam_search:
                     sampling_params = request.to_beam_search_params(
-                        default_max_tokens)
+                        default_max_tokens, default_sampling_params)
                 else:
                     sampling_params = request.to_sampling_params(
                         default_max_tokens,
-                        self.model_config.logits_processor_pattern)
+                        self.model_config.logits_processor_pattern,
+                        default_sampling_params)
 
                 self._log_inputs(request_id,
                                  request_prompts[i],
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 5cf9df92e296e..aaad7b8c7f44c 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -55,6 +55,11 @@ def __init__(
                          prompt_adapters=prompt_adapters,
                          request_logger=request_logger,
                          return_tokens_as_token_ids=return_tokens_as_token_ids)
+        diff_sampling_param = self.model_config.get_diff_sampling_param()
+        if diff_sampling_param:
+            logger.info(
+                "Overwriting default completion sampling param with: %s",
+                diff_sampling_param)
 
     async def create_completion(
         self,
@@ -118,13 +123,17 @@ async def create_completion(
                 sampling_params: Union[SamplingParams, BeamSearchParams]
                 default_max_tokens = self.max_model_len - len(
                     engine_prompt["prompt_token_ids"])
+                # Build default sampling params
+                default_sampling_params = (
+                    self.model_config.get_diff_sampling_param())
                 if request.use_beam_search:
                     sampling_params = request.to_beam_search_params(
-                        default_max_tokens)
+                        default_max_tokens, default_sampling_params)
                 else:
                     sampling_params = request.to_sampling_params(
                         default_max_tokens,
-                        self.model_config.logits_processor_pattern)
+                        self.model_config.logits_processor_pattern,
+                        default_sampling_params)
 
                 request_id_item = f"{request_id}-{i}"
 
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 879276646d2ba..b8fb9d6bd77f2 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -40,36 +40,6 @@ def _get_embedding(
     assert_never(encoding_format)
 
 
-def request_output_to_embedding_response(
-        final_res_batch: List[PoolingRequestOutput], request_id: str,
-        created_time: int, model_name: str,
-        encoding_format: Literal["float", "base64"]) -> EmbeddingResponse:
-    data: List[EmbeddingResponseData] = []
-    num_prompt_tokens = 0
-    for idx, final_res in enumerate(final_res_batch):
-        embedding_res = EmbeddingRequestOutput.from_base(final_res)
-        prompt_token_ids = final_res.prompt_token_ids
-
-        embedding = _get_embedding(embedding_res.outputs, encoding_format)
-        embedding_data = EmbeddingResponseData(index=idx, embedding=embedding)
-        data.append(embedding_data)
-
-        num_prompt_tokens += len(prompt_token_ids)
-
-    usage = UsageInfo(
-        prompt_tokens=num_prompt_tokens,
-        total_tokens=num_prompt_tokens,
-    )
-
-    return EmbeddingResponse(
-        id=request_id,
-        created=created_time,
-        model=model_name,
-        data=data,
-        usage=usage,
-    )
-
-
 class OpenAIServingEmbedding(OpenAIServing):
 
     def __init__(
@@ -114,7 +84,7 @@ async def create_embedding(
 
         model_name = request.model
         request_id = f"embd-{self._base_request_id(raw_request)}"
-        created_time = int(time.monotonic())
+        created_time = int(time.time())
 
         truncate_prompt_tokens = None
 
@@ -218,9 +188,13 @@ async def create_embedding(
             final_res_batch_checked = cast(List[PoolingRequestOutput],
                                            final_res_batch)
 
-            response = request_output_to_embedding_response(
-                final_res_batch_checked, request_id, created_time, model_name,
-                encoding_format)
+            response = self.request_output_to_embedding_response(
+                final_res_batch_checked,
+                request_id,
+                created_time,
+                model_name,
+                encoding_format,
+            )
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
         except ValueError as e:
@@ -228,3 +202,40 @@ async def create_embedding(
             return self.create_error_response(str(e))
 
         return response
+
+    def request_output_to_embedding_response(
+        self,
+        final_res_batch: List[PoolingRequestOutput],
+        request_id: str,
+        created_time: int,
+        model_name: str,
+        encoding_format: Literal["float", "base64"],
+    ) -> EmbeddingResponse:
+        """Build the embeddings response from the finished engine outputs.
+
+        Entries keep the order of ``final_res_batch``; both usage fields
+        hold the summed prompt token count.
+        """
+        entries: List[EmbeddingResponseData] = []
+        prompt_token_total = 0
+
+        for position, result in enumerate(final_res_batch):
+            # View the generic pooling output as an embedding output.
+            converted = EmbeddingRequestOutput.from_base(result)
+            payload = _get_embedding(converted.outputs, encoding_format)
+            entries.append(
+                EmbeddingResponseData(index=position, embedding=payload))
+            prompt_token_total += len(result.prompt_token_ids)
+
+        token_usage = UsageInfo(
+            prompt_tokens=prompt_token_total,
+            total_tokens=prompt_token_total,
+        )
+
+        return EmbeddingResponse(
+            id=request_id,
+            created=created_time,
+            model=model_name,
+            data=entries,
+            usage=token_usage,
+        )
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
new file mode 100644
index 0000000000000..01852f0df1eca
--- /dev/null
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -0,0 +1,234 @@
+import asyncio
+import base64
+import time
+from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast
+
+import numpy as np
+from fastapi import Request
+from typing_extensions import assert_never
+
+from vllm.config import ModelConfig
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.protocol import (ErrorResponse,
+                                              PoolingChatRequest,
+                                              PoolingRequest, PoolingResponse,
+                                              PoolingResponseData, UsageInfo)
+from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.logger import init_logger
+from vllm.outputs import PoolingOutput, PoolingRequestOutput
+from vllm.utils import merge_async_iterators
+
+logger = init_logger(__name__)
+
+
+def _get_data(
+    output: PoolingOutput,
+    encoding_format: Literal["float", "base64"],
+) -> Union[List[float], str]:
+    """Serialize pooled data as a float list or a base64 float32 string."""
+    if encoding_format == "base64":
+        # float32 is forced here to match the OpenAI python client behavior
+        raw = np.array(output.data, dtype="float32").tobytes()
+        return base64.b64encode(raw).decode("utf-8")
+    if encoding_format == "float":
+        return output.data.tolist()
+
+    assert_never(encoding_format)
+
+
+class OpenAIServingPooling(OpenAIServing):
+
+    def __init__(
+        self,
+        engine_client: EngineClient,
+        model_config: ModelConfig,
+        base_model_paths: List[BaseModelPath],
+        *,
+        request_logger: Optional[RequestLogger],
+        chat_template: Optional[str],
+        chat_template_content_format: ChatTemplateContentFormatOption,
+    ) -> None:
+        """Set up the base server and store the chat template settings."""
+        super().__init__(
+            engine_client=engine_client, model_config=model_config,
+            base_model_paths=base_model_paths, request_logger=request_logger,
+            lora_modules=None, prompt_adapters=None)
+
+        # A chat request's own template takes precedence over this one.
+        self.chat_template_content_format: Final = chat_template_content_format
+        self.chat_template = chat_template
+
+    async def create_pooling(
+        self,
+        request: PoolingRequest,
+        raw_request: Optional[Request] = None,
+    ) -> Union[PoolingResponse, ErrorResponse]:
+        """
+        Pool the request inputs in the style of the OpenAI Embedding API;
+        see https://platform.openai.com/docs/api-reference/embeddings/create
+        """
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            return error_check_ret
+
+        encoding_format = request.encoding_format
+        if request.dimensions is not None:
+            return self.create_error_response(
+                "dimensions is currently not supported")
+
+        model_name = request.model
+        request_id = f"pool-{self._base_request_id(raw_request)}"
+        created_time = int(time.time())  # wall-clock epoch seconds
+
+        truncate_prompt_tokens = None  # stays None unless the request sets it
+
+        if request.truncate_prompt_tokens is not None:
+            if request.truncate_prompt_tokens <= self.max_model_len:
+                truncate_prompt_tokens = request.truncate_prompt_tokens
+            else:
+                return self.create_error_response(
+                    "truncate_prompt_tokens value is "
+                    "greater than max_model_len."
+                    " Please, select a smaller truncation size.")
+
+        try:
+            (
+                lora_request,
+                prompt_adapter_request,
+            ) = self._maybe_get_adapters(request)
+
+            tokenizer = await self.engine_client.get_tokenizer(lora_request)
+
+            if prompt_adapter_request is not None:
+                raise NotImplementedError("Prompt adapter is not supported "
+                                          "for pooling models")
+
+            if isinstance(request, PoolingChatRequest):
+                (
+                    _,  # first returned value is not needed here
+                    request_prompts,
+                    engine_prompts,
+                ) = await self._preprocess_chat(
+                    request,
+                    tokenizer,
+                    request.messages,
+                    chat_template=request.chat_template or self.chat_template,
+                    chat_template_content_format=self.
+                    chat_template_content_format,
+                    # In pooling requests, we are not generating tokens,
+                    # so there is no need to append extra tokens to the input
+                    add_generation_prompt=False,
+                    continue_final_message=False,
+                    truncate_prompt_tokens=truncate_prompt_tokens,
+                    add_special_tokens=request.add_special_tokens,
+                )
+            else:
+                (request_prompts,
+                 engine_prompts) = await self._preprocess_completion(
+                     request,
+                     tokenizer,
+                     request.input,
+                     truncate_prompt_tokens=truncate_prompt_tokens,
+                     add_special_tokens=request.add_special_tokens,
+                 )
+        except ValueError as e:  # other exception types propagate
+            logger.exception("Error in preprocessing prompt inputs")
+            return self.create_error_response(str(e))
+
+        # Submit one engine request per prompt, keeping each result generator.
+        generators: List[AsyncGenerator[PoolingRequestOutput, None]] = []
+        try:
+            pooling_params = request.to_pooling_params()  # reused per prompt
+
+            for i, engine_prompt in enumerate(engine_prompts):
+                request_id_item = f"{request_id}-{i}"
+
+                self._log_inputs(request_id_item,
+                                 request_prompts[i],
+                                 params=pooling_params,
+                                 lora_request=lora_request,
+                                 prompt_adapter_request=prompt_adapter_request)
+
+                trace_headers = (None if raw_request is None else await
+                                 self._get_trace_headers(raw_request.headers))
+
+                generator = self.engine_client.encode(
+                    engine_prompt,
+                    pooling_params,
+                    request_id_item,
+                    lora_request=lora_request,
+                    trace_headers=trace_headers,
+                    priority=request.priority,
+                )
+
+                generators.append(generator)
+        except ValueError as e:
+            # TODO: Use a vllm-specific Validation Error
+            return self.create_error_response(str(e))
+
+        result_generator = merge_async_iterators(*generators)
+
+        num_prompts = len(engine_prompts)
+
+        # Non-streaming response: gather every result before replying.
+        final_res_batch: List[Optional[PoolingRequestOutput]]
+        final_res_batch = [None] * num_prompts
+        try:
+            async for i, res in result_generator:
+                final_res_batch[i] = res  # slot i keeps the latest result
+
+            assert all(final_res is not None for final_res in final_res_batch)
+
+            final_res_batch_checked = cast(List[PoolingRequestOutput],
+                                           final_res_batch)
+
+            response = self.request_output_to_pooling_response(
+                final_res_batch_checked,
+                request_id,
+                created_time,
+                model_name,
+                encoding_format,
+            )
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
+        except ValueError as e:
+            # TODO: Use a vllm-specific Validation Error
+            return self.create_error_response(str(e))
+
+        return response
+
+    def request_output_to_pooling_response(
+        self,
+        final_res_batch: List[PoolingRequestOutput],
+        request_id: str,
+        created_time: int,
+        model_name: str,
+        encoding_format: Literal["float", "base64"],
+    ) -> PoolingResponse:
+        """Build the pooling response from the finished engine outputs.
+
+        Entries keep the order of ``final_res_batch``; both usage fields
+        hold the summed prompt token count.
+        """
+        entries: List[PoolingResponseData] = []
+        prompt_token_total = 0
+
+        for position, result in enumerate(final_res_batch):
+            payload = _get_data(result.outputs, encoding_format)
+            entries.append(PoolingResponseData(index=position, data=payload))
+            prompt_token_total += len(result.prompt_token_ids)
+
+        token_usage = UsageInfo(
+            prompt_tokens=prompt_token_total,
+            total_tokens=prompt_token_total,
+        )
+
+        return PoolingResponse(
+            id=request_id,
+            created=created_time,
+            model=model_name,
+            data=entries,
+            usage=token_usage,
+        )
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 101d170bee4d6..a8a126e697641 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -20,32 +20,6 @@
 logger = init_logger(__name__)
 
 
-def request_output_to_score_response(
-        final_res_batch: List[PoolingRequestOutput], request_id: str,
-        created_time: int, model_name: str) -> ScoreResponse:
-    data: List[ScoreResponseData] = []
-    num_prompt_tokens = 0
-    for idx, final_res in enumerate(final_res_batch):
-        classify_res = ScoringRequestOutput.from_base(final_res)
-
-        score_data = ScoreResponseData(index=idx,
-                                       score=classify_res.outputs.score)
-        data.append(score_data)
-
-    usage = UsageInfo(
-        prompt_tokens=num_prompt_tokens,
-        total_tokens=num_prompt_tokens,
-    )
-
-    return ScoreResponse(
-        id=request_id,
-        created=created_time,
-        model=model_name,
-        data=data,
-        usage=usage,
-    )
-
-
 def make_pairs(text_1: Union[List[str], str], text_2: Union[List[str],
                                                             str]) -> List:
     if isinstance(text_1, (str, dict)):
@@ -103,7 +77,7 @@ async def create_score(
 
         model_name = request.model
         request_id = f"score-{self._base_request_id(raw_request)}"
-        created_time = int(time.monotonic())
+        created_time = int(time.time())
         truncate_prompt_tokens = request.truncate_prompt_tokens
 
         request_prompts = []
@@ -203,8 +177,12 @@ async def create_score(
             final_res_batch_checked = cast(List[PoolingRequestOutput],
                                            final_res_batch)
 
-            response = request_output_to_score_response(
-                final_res_batch_checked, request_id, created_time, model_name)
+            response = self.request_output_to_score_response(
+                final_res_batch_checked,
+                request_id,
+                created_time,
+                model_name,
+            )
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
         except ValueError as e:
@@ -212,3 +190,38 @@ async def create_score(
             return self.create_error_response(str(e))
 
         return response
+
+    def request_output_to_score_response(
+        self,
+        final_res_batch: List[PoolingRequestOutput],
+        request_id: str,
+        created_time: int,
+        model_name: str,
+    ) -> ScoreResponse:
+        """Build the score response from the finished engine outputs.
+
+        Entries keep the order of ``final_res_batch``; both usage fields
+        hold the summed prompt token count.
+        """
+        entries: List[ScoreResponseData] = []
+        prompt_token_total = 0
+
+        for position, result in enumerate(final_res_batch):
+            # View the generic pooling output as a scoring output.
+            scored = ScoringRequestOutput.from_base(result)
+            entries.append(
+                ScoreResponseData(index=position, score=scored.outputs.score))
+            prompt_token_total += len(result.prompt_token_ids)
+
+        token_usage = UsageInfo(
+            prompt_tokens=prompt_token_total,
+            total_tokens=prompt_token_total,
+        )
+
+        return ScoreResponse(
+            id=request_id,
+            created=created_time,
+            model=model_name,
+            data=entries,
+            usage=token_usage,
+        )
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
index dae481a2154a1..8aefcd8d58a39 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
@@ -35,13 +35,18 @@ class GraniteToolParser(ToolParser):
 
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
+        # for granite 3.0, the token `<|tool_call|>`
         self.bot_token = "<|tool_call|>"
+        # for granite 3.1, the string `<tool_call>`
+        self.bot_string = "<tool_call>"
 
     def extract_tool_calls(
             self, model_output: str,
             request: ChatCompletionRequest) -> ExtractedToolCallInformation:
-        # remove whitespace and the BOT token if it exists
-        stripped = model_output.strip().removeprefix(self.bot_token).lstrip()
+        stripped = model_output.strip()\
+                    .removeprefix(self.bot_token)\
+                    .removeprefix(self.bot_string)\
+                    .lstrip()
         if not stripped or stripped[0] != '[':
             return ExtractedToolCallInformation(tools_called=False,
                                                 tool_calls=[],
@@ -91,6 +96,9 @@ def extract_tool_calls_streaming(
         if current_text[start_idx:].startswith(self.bot_token):
             start_idx = consume_space(start_idx + len(self.bot_token),
                                       current_text)
+        if current_text[start_idx:].startswith(self.bot_string):
+            start_idx = consume_space(start_idx + len(self.bot_string),
+                                      current_text)
         if not current_text or start_idx >= len(current_text)\
             or current_text[start_idx] != '[':
             return DeltaMessage(content=delta_text)
diff --git a/vllm/envs.py b/vllm/envs.py
index 18870c1c6b51a..c4a568c680db0 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -30,7 +30,7 @@
     VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
     VLLM_TRACE_FUNCTION: int = 0
     VLLM_ATTENTION_BACKEND: Optional[str] = None
-    VLLM_USE_FLASHINFER_SAMPLER: bool = False
+    VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
     VLLM_USE_FLASHINFER_REJECTION_SAMPLER: bool = False
     VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False
     VLLM_PP_LAYER_PARTITION: Optional[str] = None
@@ -277,7 +277,8 @@ def get_default_config_root():
 
     # If set, vllm will use flashinfer sampler
     "VLLM_USE_FLASHINFER_SAMPLER":
-    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_SAMPLER", "0"))),
+    lambda: bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"]))
+    if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ else None,
 
     # If set, vllm will force flashinfer to use tensor cores;
     # otherwise will use heuristic based on model architecture.
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 2816b5c5c1f88..5495bc50ede83 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -22,7 +22,7 @@ class CPUExecutor(ExecutorBase):
 
     def _init_executor(self) -> None:
         assert self.device_config.device_type == "cpu"
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
         # If the feature combo become valid
         assert self.lora_config is None, "cpu backend doesn't support LoRA"
 
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 4bf5cbbd18ffe..e2c549cbd5331 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -123,6 +123,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
 
         # Create the workers.
         driver_ip = get_ip()
+        workers = []
         for bundle_id, bundle in enumerate(placement_group.bundle_specs):
             if not bundle.get("GPU", 0):
                 continue
@@ -138,20 +139,30 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
                 scheduling_strategy=scheduling_strategy,
                 **ray_remote_kwargs,
             )(RayWorkerWrapper).remote(vllm_config=self.vllm_config)
+            workers.append(worker)
 
-            if self.use_ray_spmd_worker:
-                self.workers.append(worker)
-            else:
-                worker_ip = ray.get(worker.get_node_ip.remote())
-                if worker_ip == driver_ip and self.driver_dummy_worker is None:
+        worker_ip_refs = [
+            worker.get_node_ip.remote()  # type: ignore[attr-defined]
+            for worker in workers
+        ]
+        worker_ips = ray.get(worker_ip_refs)
+
+        if not self.use_ray_spmd_worker:
+            for i in range(len(workers)):
+                worker = workers[i]
+                worker_ip = worker_ips[i]
+                if self.driver_dummy_worker is None and worker_ip == driver_ip:
                     # If the worker is on the same node as the driver, we use it
                     # as the resource holder for the driver process.
                     self.driver_dummy_worker = worker
                     self.driver_worker = RayWorkerWrapper(
                         vllm_config=self.vllm_config)
-                else:
-                    # Else, added to the list of workers.
-                    self.workers.append(worker)
+                    workers.pop(i)
+                    worker_ips.pop(i)
+                    self.workers = workers
+                    break
+        else:
+            self.workers = workers
 
         logger.debug("workers: %s", self.workers)
         logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
@@ -161,14 +172,12 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
                 "adjusting the Ray placement group or running the driver on a "
                 "GPU node.")
 
-        worker_ips = [
-            ray.get(worker.get_node_ip.remote())  # type: ignore[attr-defined]
-            for worker in self.workers
-        ]
         ip_counts: Dict[str, int] = {}
         for ip in worker_ips:
             ip_counts[ip] = ip_counts.get(ip, 0) + 1
 
+        worker_to_ip = dict(zip(self.workers, worker_ips))
+
         def sort_by_driver_then_worker_ip(worker):
             """
             Sort the workers based on 3 properties:
@@ -179,7 +188,7 @@ def sort_by_driver_then_worker_ip(worker):
             3. Finally, if the work is on a node with smaller IP address, it
                 should be placed first.
             """
-            ip = ray.get(worker.get_node_ip.remote())
+            ip = worker_to_ip[worker]
             return (ip != driver_ip, ip_counts[ip], ip)
 
         # After sorting, the workers on the same node will be
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index d4402e77a3886..aaeecab7ffde1 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -13,7 +13,7 @@
 to dispatch data processing according to the target model.
 
 See also:
-    :ref:`input_processing_pipeline`
+    :ref:`input-processing-pipeline`
 """
 
 __all__ = [
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 85aaaa776907f..d54cbb5c37819 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -162,6 +162,11 @@ class TokenInputs(TypedDict):
     Placeholder ranges for the multi-modal data.
     """
 
+    multi_modal_hashes: NotRequired[List[str]]
+    """
+    The hashes of the multi-modal data.
+    """
+
     mm_processor_kwargs: NotRequired[Dict[str, Any]]
     """
     Optional multi-modal processor kwargs to be forwarded to the
@@ -177,6 +182,7 @@ def token_inputs(
     prompt: Optional[str] = None,
     multi_modal_data: Optional["MultiModalDataDict"] = None,
     multi_modal_inputs: Optional["MultiModalKwargs"] = None,
+    multi_modal_hashes: Optional[List[str]] = None,
     multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None,
     mm_processor_kwargs: Optional[Dict[str, Any]] = None,
 ) -> TokenInputs:
@@ -191,6 +197,8 @@ def token_inputs(
         inputs["multi_modal_data"] = multi_modal_data
     if multi_modal_inputs is not None:
         inputs["multi_modal_inputs"] = multi_modal_inputs
+    if multi_modal_hashes is not None:
+        inputs["multi_modal_hashes"] = multi_modal_hashes
     if multi_modal_placeholders is not None:
         inputs["multi_modal_placeholders"] = multi_modal_placeholders
     if mm_processor_kwargs is not None:
@@ -295,6 +303,18 @@ def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]:
 
         assert_never(inputs)
 
+    @cached_property
+    def multi_modal_hashes(self) -> List[str]:
+        inputs = self.inputs
+        # Each input kind keeps its hashes under its own key; [] if absent.
+        if inputs["type"] == "token":
+            return inputs.get("multi_modal_hashes", [])
+
+        if inputs["type"] == "multimodal":
+            return inputs.get("mm_hashes", [])
+
+        assert_never(inputs)
+
     @cached_property
     def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict":
         inputs = self.inputs
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 0b85484c48714..f3ec9d115c9ba 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -1,11 +1,11 @@
 import functools
 from collections import UserDict
 from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, NamedTuple,
-                    Optional, Protocol, Type)
+from typing import (TYPE_CHECKING, Any, Callable, Mapping, NamedTuple,
+                    Optional, Protocol, Union)
 
 from torch import nn
-from transformers import PretrainedConfig, ProcessorMixin
+from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
 from typing_extensions import TypeVar, assert_never
 
 from vllm.logger import init_logger
@@ -26,6 +26,7 @@
 logger = init_logger(__name__)
 
 C = TypeVar("C", bound=PretrainedConfig, default=PretrainedConfig)
+P = TypeVar("P", bound=ProcessorMixin, default=ProcessorMixin)
 
 
 @dataclass(frozen=True)
@@ -38,24 +39,28 @@ class InputContext:
     model_config: "ModelConfig"
     """The configuration of the model."""
 
-    def get_hf_config(self, hf_config_type: Type[C] = PretrainedConfig) -> C:
+    def get_hf_config(
+        self,
+        typ: Union[type[C], tuple[type[C], ...]] = PretrainedConfig,
+        /,
+    ) -> C:
         """
         Get the HuggingFace configuration
         (:class:`transformers.PretrainedConfig`) of the model,
         additionally checking its type.
 
         Raises:
-            TypeError: If the model is not of the specified type.
+            TypeError: If the configuration is not of the specified type.
         """
         hf_config = self.model_config.hf_config
-        if not isinstance(hf_config, hf_config_type):
+        if not isinstance(hf_config, typ):
             raise TypeError("Invalid type of HuggingFace config. "
-                            f"Expected type: {hf_config_type}, but "
+                            f"Expected type: {typ}, but "
                             f"found type: {type(hf_config)}")
 
         return hf_config
 
-    def get_hf_image_processor_config(self) -> Dict[str, Any]:
+    def get_hf_image_processor_config(self) -> dict[str, Any]:
         """
         Get the HuggingFace image processor configuration of the model.
         """
@@ -74,18 +79,37 @@ def get_mm_config(self):
 
         return mm_config
 
-    def get_hf_processor(self, **kwargs: object) -> ProcessorMixin:
+    def get_hf_processor(
+        self,
+        typ: Union[type[P], tuple[type[P], ...]] = ProcessorMixin,
+        /,
+        **kwargs: object,
+    ) -> P:
+        """
+        Get the HuggingFace processor
+        (:class:`transformers.ProcessorMixin`) of the model,
+        additionally checking its type.
+
+        Raises:
+            TypeError: If the processor is not of the specified type.
+        """
         base_kwargs = self.model_config.mm_processor_kwargs
         if base_kwargs is None:
             base_kwargs = {}
 
         merged_kwargs = {**base_kwargs, **kwargs}
 
-        return cached_get_processor(
+        hf_processor = cached_get_processor(
             self.model_config.model,
             trust_remote_code=self.model_config.trust_remote_code,
             **merged_kwargs,
         )
+        if not isinstance(hf_processor, typ):
+            raise TypeError("Invalid type of HuggingFace processor. "
+                            f"Expected type: {typ}, but "
+                            f"found type: {type(hf_processor)}")
+
+        return hf_processor
 
 
 @dataclass(frozen=True)
@@ -93,39 +117,55 @@ class InputProcessingContext(InputContext):
     tokenizer: AnyTokenizer
     """The tokenizer used to tokenize the inputs."""
 
-    def get_hf_processor(self, **kwargs: object) -> ProcessorMixin:
-        base_kwargs = self.model_config.mm_processor_kwargs
-        if base_kwargs is None:
-            base_kwargs = {}
-
-        merged_kwargs = {**base_kwargs, **kwargs}
-
-        return cached_get_processor(
-            self.model_config.model,
-            tokenizer=self.tokenizer,  # Override the tokenizer with ours
-            trust_remote_code=self.model_config.trust_remote_code,
-            **merged_kwargs,
+    def get_hf_processor(
+        self,
+        typ: Union[type[P], tuple[type[P], ...]] = ProcessorMixin,
+        /,
+        **kwargs: object,
+    ) -> P:
+        return super().get_hf_processor(
+            typ,
+            tokenizer=self.tokenizer,
+            **kwargs,
         )
 
-    def resolve_hf_processor_call_kwargs(
+    def call_hf_processor(
         self,
         hf_processor: ProcessorMixin,
+        prompt: str,
+        processor_data: Mapping[str, object],
         inference_kwargs: Mapping[str, object],
-    ) -> Mapping[str, object]:
+    ) -> BatchFeature:
         assert callable(hf_processor)
 
         base_kwargs = self.model_config.mm_processor_kwargs
         if base_kwargs is None:
             base_kwargs = {}
 
-        return resolve_mm_processor_kwargs(
+        merged_kwargs = resolve_mm_processor_kwargs(
             base_kwargs,
             inference_kwargs,
             hf_processor,
+            requires_kw_only=False,
+            allow_var_kwargs=True,
         )
 
+        try:
+            return hf_processor(
+                text=prompt,
+                **processor_data,
+                **merged_kwargs,
+                return_tensors="pt",
+            )
+        except Exception as exc:
+            data = dict(text=prompt, **processor_data)
+            msg = (f"Failed to apply {type(hf_processor).__name__} "
+                   f"on data={data} with kwargs={merged_kwargs}")
+
+            raise RuntimeError(msg) from exc
+
 
-N = TypeVar("N", bound=Type[nn.Module])
+N = TypeVar("N", bound=type[nn.Module])
 
 
 class DummyData(NamedTuple):
@@ -232,7 +272,7 @@ def wrapper(model_cls: N) -> N:
 
         return wrapper
 
-    def _get_dummy_data_factory(self, model_cls: Type[nn.Module]):
+    def _get_dummy_data_factory(self, model_cls: type[nn.Module]):
         return self._dummy_factories_by_model_type \
             .get(model_cls, self._default_dummy_data_factory)
 
@@ -257,7 +297,7 @@ def wrapper(model_cls: N) -> N:
 
         return wrapper
 
-    def _get_dummy_encoder_data_factory(self, model_cls: Type[nn.Module]):
+    def _get_dummy_encoder_data_factory(self, model_cls: type[nn.Module]):
         return self._dummy_encoder_factories_by_model_type \
             .get(model_cls, self._default_dummy_data_factory)
 
@@ -274,7 +314,7 @@ def dummy_data_for_profiling(
         The model is identified by ``model_config``.
 
         See also:
-            :ref:`enabling_multimodal_inputs`
+            :ref:`enabling-multimodal-inputs`
 
         Note:
             This should be called after
@@ -351,7 +391,7 @@ def register_input_processor(self, processor: InputProcessor):
         happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`.
 
         See also:
-            :ref:`input_processing_pipeline`
+            :ref:`input-processing-pipeline`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -368,14 +408,14 @@ def wrapper(model_cls: N) -> N:
 
         return wrapper
 
-    def _get_model_input_processor(self, model_cls: Type[nn.Module]):
+    def _get_model_input_processor(self, model_cls: type[nn.Module]):
         return self._input_processors_by_model_type \
             .get(model_cls, self._default_input_processor)
 
     def _ensure_mm_kwargs(
         self,
         inputs: SingletonInputs,
-        mm_processor_kwargs: Dict[str, Any],
+        mm_processor_kwargs: dict[str, Any],
     ):
         if inputs["type"] == "token":
             # In case the input processor for that model fails to set it
@@ -395,7 +435,7 @@ def process_input(self, model_config: "ModelConfig",
         The model is identified by ``model_config``.
 
         See also:
-            :ref:`input_processing_pipeline`
+            :ref:`input-processing-pipeline`
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index a6c93a3d8bfe9..85164c2165a3c 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -425,8 +425,9 @@ def forward(self, input_):
                        if self.base_layer.skip_bias_add else None)
         return output, output_bias
 
+    # ReplicatedLinear should always be replaced, regardless of the fully
+    # sharded LoRAs setting, because it is, by definition, copied per GPU.
     @classmethod
-    @_not_fully_sharded_can_replace
     def can_replace_layer(
         cls,
         source_layer: nn.Module,
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 70806a77b9fff..5c0e4e5cbc636 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -28,7 +28,7 @@
                              parse_fine_tuned_lora_name, replace_submodule)
 from vllm.model_executor.models import SupportsLoRA, supports_multimodal
 from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.model_executor.models.utils import PPMissingLayer
+from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
 from vllm.utils import is_pin_memory_available
 
 logger = init_logger(__name__)
@@ -113,13 +113,14 @@ def from_lora_tensors(
         target_embedding_padding: Optional[int] = None,
         embedding_modules: Optional[Dict[str, str]] = None,
         embedding_padding_modules: Optional[List[str]] = None,
+        weights_mapper: Optional[WeightsMapper] = None,
     ) -> "LoRAModel":
         """Create a LoRAModel from a dictionary of tensors."""
         pin_memory = str(device) == "cpu" and is_pin_memory_available()
         loras: Dict[str, LoRALayerWeights] = {}
         for tensor_name, tensor in tensors.items():
             module_name, is_lora_a, is_bias = parse_fine_tuned_lora_name(
-                tensor_name)
+                tensor_name, weights_mapper)
             if module_name not in loras:
                 lora_embeddings_tensor = None
                 if embeddings:
@@ -187,6 +188,7 @@ def from_local_checkpoint(
         target_embedding_padding: Optional[int] = None,
         embedding_modules: Optional[Dict[str, str]] = None,
         embedding_padding_modules: Optional[List[str]] = None,
+        weights_mapper: Optional[WeightsMapper] = None,
     ) -> "LoRAModel":
         """Create a LoRAModel from a local checkpoint.
         
@@ -229,7 +231,8 @@ def from_local_checkpoint(
             with safetensors.safe_open(lora_tensor_path,
                                        framework="pt") as f:  # type: ignore
                 for lora_module in f.keys():  # noqa
-                    module_name, _, _ = parse_fine_tuned_lora_name(lora_module)
+                    module_name, _, _ = parse_fine_tuned_lora_name(
+                        lora_module, weights_mapper)
                     part_name = module_name.split(".")[-1]
                     if part_name not in expected_lora_modules:
                         unexpected_modules.append(module_name)
@@ -289,7 +292,8 @@ def from_local_checkpoint(
             embeddings=embeddings,
             target_embedding_padding=target_embedding_padding,
             embedding_modules=embedding_modules,
-            embedding_padding_modules=embedding_padding_modules)
+            embedding_padding_modules=embedding_padding_modules,
+            weights_mapper=weights_mapper)
 
 
 class LoRAModelManager(AdapterModelManager):
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 5876494ce2824..d72b7638d84af 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -30,6 +30,7 @@
 # yapf: enable
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.models.utils import WeightsMapper
 
 logger = init_logger(__name__)
 
@@ -91,28 +92,46 @@ def replace_submodule(model: nn.Module, module_name: str,
     return new_module
 
 
-def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool, bool]:
+def parse_fine_tuned_lora_name(
+        name: str,
+        weights_mapper: Optional[WeightsMapper] = None
+) -> Tuple[str, bool, bool]:
     """Parse the name of lora weights.
 
     args:
         name: the name of the fine-tuned LoRA, e.g.
             base_model.model.dense1.weight
+        weights_mapper: optional mapper applied to the weight name, e.g.
+            `model.` -> `language_model.model.`.
     return:
         Tuple(module_name, is_lora_a):
             module_name: the name of the module, e.g. model.dense1,
             is_lora_a whether the tensor is lora_a or lora_b.
             is_bias whether the tensor is lora bias.
     """
+
+    # LoRA weight qualified names start with `base_model.model.`, so we
+    # strip that prefix first so that the mapping below is applied
+    # correctly.
+    if "base_model.model." in name:
+        name = name.replace("base_model.model.", "")
+        name = weights_mapper._map_name(name) if weights_mapper else name
+        # recover the prefix `base_model.model.`
+        name = "base_model.model." + name
+
     parts = name.split(".")
     if parts[-1] == "weight" and (parts[-2] == "lora_A"
                                   or parts[-2] == "lora_B"):
-        return ".".join(parts[2:-2]), parts[-2] == "lora_A", False
+        new_name = ".".join(parts[2:-2])
+        return new_name, parts[-2] == "lora_A", False
 
     if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B":
-        return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A", False
+        new_name = ".".join(parts[2:-1])
+        return new_name, parts[-1] == "lora_embedding_A", False
 
     if parts[-1] == "bias":
-        return ".".join(parts[2:-2]), False, True
+        new_name = ".".join(parts[2:-2])
+        return new_name, False, True
 
     raise ValueError(f"{name} is unsupported LoRA weight")
 
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 93a5e27621912..10976fac23028 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -91,7 +91,17 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
                         packed_modules_mapping[module])
                 else:
                     expected_lora_modules.append(module)
+
+            expected_lora_modules = list(set(expected_lora_modules))
             lora_path = get_adapter_absolute_path(lora_request.lora_path)
+
+            # For some models like Qwen2VL, we need to use hf_to_vllm_mapper
+            # to ensure correct loading of lora weights.
+            hf_to_vllm_mapper = None
+            if (hasattr(model, "hf_to_vllm_mapper")
+                    and model.hf_to_vllm_mapper is not None):
+                hf_to_vllm_mapper = model.hf_to_vllm_mapper
+
             lora = self._lora_model_cls.from_local_checkpoint(
                 lora_path,
                 expected_lora_modules,
@@ -103,7 +113,8 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
                 self.lora_config.lora_extra_vocab_size,
                 embedding_modules=self.embedding_modules,
                 embedding_padding_modules=self.embedding_padding_modules,
-            )
+                weights_mapper=hf_to_vllm_mapper)
+
         except Exception as e:
             raise RuntimeError(f"Loading lora {lora_path} failed") from e
         if lora.rank > self.lora_config.max_lora_rank:
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
index e631aec928ec5..694c5b68b1cbd 100644
--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -3,6 +3,9 @@
 from typing import TYPE_CHECKING
 
 from vllm.logger import init_logger
+from vllm.model_executor.guided_decoding.utils import (
+    convert_lark_to_gbnf, grammar_is_likely_lark,
+    has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features)
 from vllm.platforms import CpuArchEnum, current_platform
 
 if TYPE_CHECKING:
@@ -15,49 +18,24 @@
 logger = init_logger(__name__)
 
 
-def has_xgrammar_unsupported_json_features(schema: dict) -> bool:
-    """Check if JSON schema contains features unsupported by xgrammar."""
-
-    def check_object(obj: dict) -> bool:
-        if not isinstance(obj, dict):
-            return False
-
-        # Check for pattern restrictions
-        if "pattern" in obj:
-            return True
-
-        # Check for numeric ranges
-        if obj.get("type") in ("integer", "number") and any(
-                key in obj for key in [
-                    "minimum", "maximum", "exclusiveMinimum",
-                    "exclusiveMaximum", "multipleOf"
-                ]):
-            return True
-
-        # Recursively check all nested objects and arrays
-        for value in obj.values():
-            if isinstance(value, dict):
-                if check_object(value):
-                    return True
-            elif isinstance(value, list):
-                for item in value:
-                    if isinstance(item, dict) and check_object(item):
-                        return True
-
-        return False
-
-    return check_object(schema)
-
-
 def maybe_backend_fallback(
         guided_params: GuidedDecodingParams) -> GuidedDecodingParams:
     # lm-format-enforce doesn't support grammar, fallback to xgrammar
-    if (guided_params.backend == "lm-format-enforcer"
-            and guided_params.grammar is not None):
-        logger.warning(
-            "lm-format-enforcer does not support grammar guided decoding. "
-            "Falling back to use xgrammar instead.")
-        guided_params.backend = "xgrammar"
+    if guided_params.backend == "lm-format-enforcer":
+        if guided_params.grammar is not None:
+            logger.warning(
+                "lm-format-enforcer does not support grammar guided decoding. "
+                "Falling back to use xgrammar instead.")
+            guided_params.backend = "xgrammar"
+
+        # lm-format-enforcer doesn't support some JSON schema features
+        elif (guided_params.json is not None
+              and has_lmf_unsupported_json_features(guided_params.json)):
+            logger.warning(
+                "lm-format-enforcer does not support advanced JSON schema "
+                "features like patterns or numeric ranges. "
+                "Falling back to use outlines instead.")
+            guided_params.backend = "outlines"
 
     if guided_params.backend == "xgrammar":
         # xgrammar only has x86 wheels for linux, fallback to outlines
@@ -82,6 +60,27 @@ def maybe_backend_fallback(
                 "Falling back to use outlines instead.")
             guided_params.backend = "outlines"
 
+        # xgrammar only supports GBNF grammars, so we must convert Lark.
+        # We must check if the grammar is likely Lark and if that
+        # grammar is convertible to GBNF
+        elif (guided_params.grammar is not None
+              and grammar_is_likely_lark(guided_params.grammar)):
+            try:
+                convert_lark_to_gbnf(guided_params.grammar)
+            except Exception:
+                logger.warning(
+                    "xgrammar does not support Lark grammars and the "
+                    "grammar failed to convert to GBNF. "
+                    "Falling back to use outlines instead.")
+                guided_params.backend = "outlines"
+
+    if (guided_params.backend == "outlines"
+            and guided_params.json_object is not None):
+        # outlines doesn't support json_object, fallback to xgrammar
+        logger.warning("outlines does not support json_object. "
+                       "Falling back to use xgrammar instead.")
+        guided_params.backend = "xgrammar"
+
     return guided_params
 
 
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index b63fed1c8a8c3..e4eb3f16e56cf 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -21,10 +21,11 @@
 
 import numpy as np
 import torch
-from lark import Lark
 from outlines import grammars
 from outlines.caching import cache
-from outlines.fsm.guide import CFGGuide, Generate, Guide, RegexGuide, Write
+from outlines.fsm.guide import (CFGGuide, CFGState, Generate, Guide,
+                                RegexGuide, Write)
+from outlines.fsm.parsing import PartialLark
 from outlines_core.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel
 from transformers import PreTrainedTokenizerBase
@@ -34,7 +35,9 @@ class BaseLogitsProcessor:
 
     def __init__(self, guide: Guide):
         self._guide: Guide = guide
-        self._fsm_state: DefaultDict[int, int] = defaultdict(int)
+        # CFGState is used for the FSM state for CFGGuide
+        self._fsm_state: DefaultDict[int, Union[int,
+                                                CFGState]] = defaultdict(int)
 
     def __call__(self, input_ids: List[int],
                  scores: torch.Tensor) -> torch.Tensor:
@@ -54,15 +57,13 @@ def __call__(self, input_ids: List[int],
             # On the first time this is called, we simply re-create
             # the Lark object.
             if isinstance(self._guide, CFGGuide):
-                self._guide.parser = Lark(
+                self._guide.parser = PartialLark(
                     self._guide.cfg_string,
                     parser="lalr",
-                    lexer="contextual",
-                    propagate_positions=False,
-                    maybe_placeholders=False,
-                    regex=True,
                     import_paths=[grammars.GRAMMAR_PATH],
                 )
+                self._fsm_state[seq_id] = CFGState(
+                    parser_state=self._guide.parser.parse(""), prev_token=None)
 
         instruction = self._guide.get_next_instruction(
             state=self._fsm_state[seq_id])
@@ -200,7 +201,8 @@ def convert_token_to_string(token: str) -> str:
         string = tokenizer.convert_tokens_to_string([token])
 
         # A hack to handle missing spaces to HF's Llama tokenizers
-        if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
+        if (type(token) is str and token.startswith(SPIECE_UNDERLINE)
+                or token == "<0x20>"):
             return " " + string
 
         return string
@@ -211,6 +213,9 @@ def change_decoder(
         """Sync vLLM's decoder with the outlines by returning list."""
 
         def new_decoder(inp_tokens: List[int]) -> List[str]:
+            if (isinstance(inp_tokens, list) and len(inp_tokens) == 1
+                    and isinstance(inp_tokens[0], list)):
+                inp_tokens = inp_tokens[0]
             return [decoder(inp_tokens)]
 
         return new_decoder
diff --git a/vllm/model_executor/guided_decoding/xgrammar_utils.py b/vllm/model_executor/guided_decoding/utils.py
similarity index 72%
rename from vllm/model_executor/guided_decoding/xgrammar_utils.py
rename to vllm/model_executor/guided_decoding/utils.py
index 9a0463964de49..20abaefbacc51 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_utils.py
+++ b/vllm/model_executor/guided_decoding/utils.py
@@ -1,6 +1,76 @@
 import re
 
 
+def has_xgrammar_unsupported_json_features(schema: dict) -> bool:
+    """Check if JSON schema contains features unsupported by xgrammar."""
+
+    def check_object(obj: dict) -> bool:
+        if not isinstance(obj, dict):
+            return False
+
+        # Check for pattern restrictions
+        if "pattern" in obj:
+            return True
+
+        # Check for numeric ranges
+        if obj.get("type") in ("integer", "number") and any(
+                key in obj for key in [
+                    "minimum", "maximum", "exclusiveMinimum",
+                    "exclusiveMaximum", "multipleOf"
+                ]):
+            return True
+
+        # Recursively check all nested objects and arrays
+        for value in obj.values():
+            if isinstance(value, dict):
+                if check_object(value):
+                    return True
+            elif isinstance(value, list):
+                for item in value:
+                    if isinstance(item, dict) and check_object(item):
+                        return True
+
+        return False
+
+    return check_object(schema)
+
+
+def has_lmf_unsupported_json_features(schema: dict) -> bool:
+    """
+    Check if JSON schema contains features unsupported
+    by lm_format_enforcer.
+
+    Known issues:
+    - Regex patterns:
+        "grade": {
+            "type": "string",
+            "pattern": "^[A-D]$"  # Regex pattern
+        },
+    """
+
+    def check_object(obj: dict) -> bool:
+        if not isinstance(obj, dict):
+            return False
+
+        # Check for pattern restrictions
+        if "pattern" in obj:
+            return True
+
+        # Recursively check all nested objects and arrays
+        for value in obj.values():
+            if isinstance(value, dict):
+                if check_object(value):
+                    return True
+            elif isinstance(value, list):
+                for item in value:
+                    if isinstance(item, dict) and check_object(item):
+                        return True
+
+        return False
+
+    return check_object(schema)
+
+
 def grammar_is_likely_lark(grammar_str: str) -> bool:
     """
     Check if grammar appears to use Lark syntax.
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index fc45e37cf6f06..5e1948977bff4 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -3,7 +3,7 @@
 
 import json
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, NamedTuple
+from typing import TYPE_CHECKING, Any
 
 import torch
 from transformers import PreTrainedTokenizerFast
@@ -14,8 +14,9 @@
 except ImportError:
     pass
 
-from vllm.model_executor.guided_decoding.xgrammar_utils import (
-    convert_lark_to_gbnf, grammar_is_likely_lark)
+from vllm.model_executor.guided_decoding.utils import (convert_lark_to_gbnf,
+                                                       grammar_is_likely_lark)
+from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizer
@@ -37,11 +38,21 @@ def get_local_xgrammar_guided_decoding_logits_processor(
     return XGrammarLogitsProcessor(config)
 
 
-class TokenizerData(NamedTuple):
+@dataclass(frozen=True)
+class TokenizerData:
     """Immutable container for cached tokenizer data."""
-    encoded_vocab: list[str]
-    stop_token_ids: list[int] | None
-    backend_str: str
+    encoded_vocab: list[str] = field(default_factory=list)
+    stop_token_ids: list[int] | None = None
+    # These fields are mutually exclusive: `backend_str` is used to create a
+    # TokenizerInfo via `TokenizerInfo.from_huggingface`, while `vocab_type`
+    # is passed to the TokenizerInfo constructor.
+    backend_str: str | None = None
+    vocab_type: xgr.VocabType | None = None
+
+    def __post_init__(self):
+        # Enforce that backend_str and vocab_type are mutually exclusive
+        assert not (self.backend_str and self.vocab_type), \
+            "backend_str and vocab_type are mutual exclusive"
 
 
 class TokenizerDataCache:
@@ -68,18 +79,27 @@ def get_tokenizer_data(cls,
                     "get_vocab method.") from e
 
             stop_token_ids = None
-            backend_str = xgr.VocabType.RAW
+            backend_str = ""
+            vocab_type = xgr.VocabType.RAW
+
+            if stop_token_ids is None and hasattr(
+                    tokenizer,
+                    "eos_token_id") and tokenizer.eos_token_id is not None:
+                stop_token_ids = [tokenizer.eos_token_id]
+
             if isinstance(tokenizer, PreTrainedTokenizerFast):
                 backend_str = tokenizer.backend_tokenizer.to_str()
-                if stop_token_ids is None and hasattr(
-                        tokenizer,
-                        "eos_token_id") and tokenizer.eos_token_id is not None:
-                    stop_token_ids = [tokenizer.eos_token_id]
+                vocab_type = None
+
+            elif isinstance(tokenizer, MistralTokenizer):
+                # REF: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501
+                vocab_type = xgr.VocabType.BYTE_FALLBACK
 
             cls._cache[tokenizer_hash] = TokenizerData(
                 encoded_vocab=encoded_vocab,
                 stop_token_ids=stop_token_ids,
-                backend_str=backend_str)
+                backend_str=backend_str,
+                vocab_type=vocab_type)
 
         return cls._cache[tokenizer_hash]
 
@@ -98,11 +118,30 @@ def get_compiler(cls, config: GrammarConfig) -> xgr.GrammarCompiler:
         cache_key = str(config.tokenizer_hash)
 
         if cache_key not in cls._cache:
-            assert config.encoded_vocab is not None
-            tokenizer_info = xgr.TokenizerInfo._create_from_handle(
-                xgr_core.TokenizerInfo.from_huggingface(
-                    config.encoded_vocab, config.backend_str,
-                    config.vocab_size, config.stop_token_ids))
+            assert config.tokenizer_data is not None
+            assert config.tokenizer_data.encoded_vocab is not None
+
+            config_data = config.tokenizer_data
+
+            # In TokenizerDataCache.get_tokenizer_data, a serializable
+            # tokenizer_data is created and cached. This data is used to build
+            # a tokenizer_info and create an xgrammar compiler.
+            # - If tokenizer_data has backend_str set, use
+            #   xgr_core.TokenizerInfo.from_huggingface (a C++ binding).
+            # - Otherwise, use the default constructor with vocab_type.
+            # - xgr_core.TokenizerInfo.from_huggingface !=
+            #   xgr.TokenizerInfo.from_huggingface.
+            if config_data.backend_str:
+                tokenizer_info = xgr.TokenizerInfo._create_from_handle(
+                    xgr_core.TokenizerInfo.from_huggingface(
+                        config_data.encoded_vocab, config_data.backend_str,
+                        config.vocab_size, config_data.stop_token_ids))
+            else:
+                tokenizer_info = xgr.TokenizerInfo(
+                    config_data.encoded_vocab,
+                    config_data.vocab_type,
+                    vocab_size=config.vocab_size,
+                    stop_token_ids=config_data.stop_token_ids)
             cls._cache[cache_key] = xgr.GrammarCompiler(
                 tokenizer_info, max_threads=config.max_threads)
 
@@ -118,10 +157,7 @@ class GrammarConfig:
     grammar_str: str | None = None
     json_object: bool | None = None
     max_threads: int = 8
-    # Only populated if tokenizer_hash not in cache
-    encoded_vocab: list[str] | None = None
-    stop_token_ids: list[int] | None = None
-    backend_str: str | None = None
+    tokenizer_data: TokenizerData | None = None
 
     @classmethod
     def from_guided_params(cls,
@@ -132,9 +168,6 @@ def from_guided_params(cls,
 
         tokenizer_hash = hash(tokenizer)
         tokenizer_data = TokenizerDataCache.get_tokenizer_data(tokenizer)
-        encoded_vocab = tokenizer_data.encoded_vocab
-        stop_token_ids = tokenizer_data.stop_token_ids
-        backend_str = tokenizer_data.backend_str
 
         if guided_params.json:
             if not isinstance(guided_params.json, str):
@@ -152,11 +185,9 @@ def from_guided_params(cls,
 
             return cls(json_str=json_str,
                        vocab_size=model_config.hf_text_config.vocab_size,
-                       encoded_vocab=encoded_vocab,
-                       stop_token_ids=stop_token_ids,
-                       backend_str=backend_str,
                        tokenizer_hash=tokenizer_hash,
-                       max_threads=max_threads)
+                       max_threads=max_threads,
+                       tokenizer_data=tokenizer_data)
         elif guided_params.grammar:
             # XGrammar only supports GBNF grammars, so we must convert Lark
             if grammar_is_likely_lark(guided_params.grammar):
@@ -181,19 +212,17 @@ def from_guided_params(cls,
 
             return cls(grammar_str=grammar_str,
                        vocab_size=model_config.hf_text_config.vocab_size,
-                       encoded_vocab=encoded_vocab,
-                       stop_token_ids=stop_token_ids,
-                       backend_str=backend_str,
                        tokenizer_hash=tokenizer_hash,
-                       max_threads=max_threads)
+                       max_threads=max_threads,
+                       tokenizer_data=tokenizer_data)
         elif guided_params.json_object:
-            return cls(json_object=True,
-                       vocab_size=model_config.hf_text_config.vocab_size,
-                       encoded_vocab=encoded_vocab,
-                       stop_token_ids=stop_token_ids,
-                       backend_str=backend_str,
-                       tokenizer_hash=tokenizer_hash,
-                       max_threads=max_threads)
+            return cls(
+                json_object=True,
+                vocab_size=model_config.hf_text_config.vocab_size,
+                tokenizer_hash=tokenizer_hash,
+                max_threads=max_threads,
+                tokenizer_data=tokenizer_data,
+            )
         else:
             raise ValueError(
                 "Currently only support JSON and EBNF grammar mode for xgrammar"
@@ -269,10 +298,14 @@ def __call__(self, input_ids: list[int],
         # fill_next_token_bitmask so we move it to the device of scores
         device_type = scores.device.type
         if device_type != "cuda":
-            scores = scores.to("cpu")
+            scores = scores.to("cpu").unsqueeze(0)
+
+        # Note: In this method, tensors with different dimensions fail on
+        # the CPU device, but run without error on GPU. Hence the
+        # unsqueeze above for scores, to match the token bitmask shape.
         xgr.apply_token_bitmask_inplace(scores,
                                         self.token_bitmask.to(scores.device))
         if device_type != "cuda":
-            scores = scores.to(device_type)
+            scores = scores.to(device_type).squeeze()
 
         return scores
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index e6f9f01ef0f74..4101facbe7874 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -2,7 +2,7 @@
 import functools
 import json
 import os
-from typing import Any, Callable, Dict, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import torch
 import triton
@@ -11,6 +11,8 @@
 import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8)
 from vllm.platforms import current_platform
 from vllm.utils import direct_register_custom_op
 
@@ -45,8 +47,14 @@ def fused_moe_kernel(
         stride_bn,
         stride_cm,
         stride_cn,
+        stride_asm,
+        stride_ask,
         stride_bse,
+        stride_bsk,
         stride_bsn,
+        # Block size for block-wise quantization
+        group_n: tl.constexpr,
+        group_k: tl.constexpr,
         # Meta-parameters
         BLOCK_SIZE_M: tl.constexpr,
         BLOCK_SIZE_N: tl.constexpr,
@@ -125,8 +133,14 @@ def fused_moe_kernel(
         b_scale = tl.load(b_scale_ptrs)
 
     if use_fp8_w8a8:
-        a_scale = tl.load(a_scale_ptr)
-        b_scale = tl.load(b_scale_ptr + off_experts)
+        if group_k > 0 and group_n > 0:
+            a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
+            offs_bsn = offs_bn // group_n
+            b_scale_ptrs = (b_scale_ptr + off_experts * stride_bse +
+                            offs_bsn * stride_bsn)
+        else:
+            a_scale = tl.load(a_scale_ptr)
+            b_scale = tl.load(b_scale_ptr + off_experts)
 
     # -----------------------------------------------------------
     # Iterate to compute a block of the C matrix.
@@ -149,7 +163,18 @@ def fused_moe_kernel(
         if use_int8_w8a16:
             accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
         elif use_fp8_w8a8:
-            accumulator = tl.dot(a, b, acc=accumulator)
+            if group_k > 0 and group_n > 0:
+                k_start = k * BLOCK_SIZE_K
+                offs_ks = k_start // group_k
+                a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask,
+                                  mask=token_mask,
+                                  other=0.0)
+                b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
+
+                accumulator += tl.dot(a, b) * a_scale[:,
+                                                      None] * b_scale[None, :]
+            else:
+                accumulator = tl.dot(a, b, acc=accumulator)
         else:
             accumulator += tl.dot(a, b)
         # Advance the ptrs to the next K block.
@@ -164,7 +189,10 @@ def fused_moe_kernel(
     if use_int8_w8a16:
         accumulator = (accumulator * b_scale).to(compute_type)
     elif use_fp8_w8a8:
-        accumulator = (accumulator * a_scale * b_scale).to(compute_type)
+        if group_k > 0 and group_n > 0:
+            accumulator = accumulator.to(compute_type)
+        else:
+            accumulator = (accumulator * a_scale * b_scale).to(compute_type)
     else:
         accumulator = accumulator.to(compute_type)
     # -----------------------------------------------------------
@@ -233,22 +261,37 @@ def moe_align_block_size(
     return sorted_ids, expert_ids, num_tokens_post_pad
 
 
-def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
+def invoke_fused_moe_kernel(A: torch.Tensor,
+                            B: torch.Tensor,
+                            C: torch.Tensor,
                             A_scale: Optional[torch.Tensor],
                             B_scale: Optional[torch.Tensor],
-                            topk_weights: torch.Tensor, topk_ids: torch.Tensor,
+                            topk_weights: torch.Tensor,
+                            topk_ids: torch.Tensor,
                             sorted_token_ids: torch.Tensor,
                             expert_ids: torch.Tensor,
                             num_tokens_post_padded: torch.Tensor,
-                            mul_routed_weight: bool, top_k: int,
-                            config: Dict[str, Any], compute_type: tl.dtype,
-                            use_fp8_w8a8: bool, use_int8_w8a16: bool) -> None:
+                            mul_routed_weight: bool,
+                            top_k: int,
+                            config: Dict[str, Any],
+                            compute_type: tl.dtype,
+                            use_fp8_w8a8: bool,
+                            use_int8_w8a16: bool,
+                            block_shape: Optional[List[int]] = None) -> None:
     assert topk_weights.stride(1) == 1
     assert sorted_token_ids.stride(0) == 1
 
     if use_fp8_w8a8:
-        A, A_scale = ops.scaled_fp8_quant(A, A_scale)
         assert B_scale is not None
+        if block_shape is None:
+            A, A_scale = ops.scaled_fp8_quant(A, A_scale)
+        else:
+            assert len(block_shape) == 2
+            block_n, block_k = block_shape[0], block_shape[1]
+            A, A_scale = per_token_group_quant_fp8(A, block_k)
+            assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+            assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
+            assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
     elif use_int8_w8a16:
         assert B_scale is not None
     else:
@@ -279,8 +322,13 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
         B.stride(1),
         C.stride(1),
         C.stride(2),
-        B_scale.stride(0) if B_scale is not None and use_int8_w8a16 else 0,
-        B_scale.stride(1) if B_scale is not None and use_int8_w8a16 else 0,
+        A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0,
+        A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0,
+        B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0,
+        B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0,
+        B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0,
+        0 if block_shape is None else block_shape[0],
+        0 if block_shape is None else block_shape[1],
         MUL_ROUTED_WEIGHT=mul_routed_weight,
         top_k=top_k,
         compute_type=compute_type,
@@ -362,6 +410,7 @@ def try_get_optimal_moe_config(
     dtype: Optional[str],
     M: int,
     is_marlin: bool = False,
+    block_shape: Optional[List[int]] = None,
 ):
     from vllm.model_executor.layers.fused_moe import get_config
     override_config = get_config()
@@ -380,6 +429,12 @@ def try_get_optimal_moe_config(
             # Else use the default config
             config = get_default_config(M, E, N, w1_shape[2], top_k, dtype,
                                         is_marlin)
+    # NOTE: For block-wise quant,
+    # BLOCK_K must be divisible by block_shape[1]
+    # BLOCK_N and BLOCK_M have no requirements
+    if block_shape is not None:
+        config["BLOCK_SIZE_N"] = block_shape[0]
+        config["BLOCK_SIZE_K"] = block_shape[1]
     return config
 
 
@@ -421,18 +476,29 @@ def fused_topk(
     return topk_weights, topk_ids
 
 
-# This is used by the Deepseek-V2 model
+# This is used by the Deepseek-V2 and Deepseek-V3 models
 def grouped_topk(hidden_states: torch.Tensor,
                  gating_output: torch.Tensor,
                  topk: int,
                  renormalize: bool,
                  num_expert_group: int = 0,
-                 topk_group: int = 0):
+                 topk_group: int = 0,
+                 scoring_func: str = "softmax",
+                 e_score_correction_bias: Optional[torch.Tensor] = None):
 
     assert hidden_states.shape[0] == gating_output.shape[0], (
         "Number of tokens mismatch")
 
-    scores = torch.softmax(gating_output, dim=-1)
+    if scoring_func == "softmax":
+        scores = torch.softmax(gating_output, dim=-1)
+    elif scoring_func == "sigmoid":
+        scores = gating_output.sigmoid()
+    else:
+        raise ValueError(f"Unsupported scoring function: {scoring_func}")
+
+    if e_score_correction_bias is not None:
+        scores.add_(e_score_correction_bias.unsqueeze(0))
+
     num_token = scores.shape[0]
     group_scores = scores.view(num_token, num_expert_group,
                                -1).max(dim=-1).values  # [n, n_group]
@@ -479,10 +545,11 @@ def inplace_fused_experts(hidden_states: torch.Tensor,
                           w1_scale: Optional[torch.Tensor] = None,
                           w2_scale: Optional[torch.Tensor] = None,
                           a1_scale: Optional[torch.Tensor] = None,
-                          a2_scale: Optional[torch.Tensor] = None) -> None:
+                          a2_scale: Optional[torch.Tensor] = None,
+                          block_shape: Optional[List[int]] = None) -> None:
     fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True,
                        use_fp8_w8a8, use_int8_w8a16, w1_scale, w2_scale,
-                       a1_scale, a2_scale)
+                       a1_scale, a2_scale, block_shape)
 
 
 def inplace_fused_experts_fake(
@@ -496,7 +563,8 @@ def inplace_fused_experts_fake(
         w1_scale: Optional[torch.Tensor] = None,
         w2_scale: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
-        a2_scale: Optional[torch.Tensor] = None) -> None:
+        a2_scale: Optional[torch.Tensor] = None,
+        block_shape: Optional[List[int]] = None) -> None:
     pass
 
 
@@ -519,10 +587,11 @@ def outplace_fused_experts(
         w1_scale: Optional[torch.Tensor] = None,
         w2_scale: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
-        a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
+        a2_scale: Optional[torch.Tensor] = None,
+        block_shape: Optional[List[int]] = None) -> torch.Tensor:
     return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids,
                               False, use_fp8_w8a8, use_int8_w8a16, w1_scale,
-                              w2_scale, a1_scale, a2_scale)
+                              w2_scale, a1_scale, a2_scale, block_shape)
 
 
 def outplace_fused_experts_fake(
@@ -536,7 +605,8 @@ def outplace_fused_experts_fake(
         w1_scale: Optional[torch.Tensor] = None,
         w2_scale: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
-        a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
+        a2_scale: Optional[torch.Tensor] = None,
+        block_shape: Optional[List[int]] = None) -> torch.Tensor:
     return torch.empty_like(hidden_states)
 
 
@@ -559,18 +629,22 @@ def fused_experts(hidden_states: torch.Tensor,
                   w1_scale: Optional[torch.Tensor] = None,
                   w2_scale: Optional[torch.Tensor] = None,
                   a1_scale: Optional[torch.Tensor] = None,
-                  a2_scale: Optional[torch.Tensor] = None):
+                  a2_scale: Optional[torch.Tensor] = None,
+                  block_shape: Optional[List[int]] = None):
     if inplace:
         torch.ops.vllm.inplace_fused_experts(hidden_states, w1, w2,
                                              topk_weights, topk_ids,
                                              use_fp8_w8a8, use_int8_w8a16,
                                              w1_scale, w2_scale, a1_scale,
-                                             a2_scale)
+                                             a2_scale, block_shape)
         return hidden_states
     else:
-        return torch.ops.vllm.outplace_fused_experts(
-            hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8,
-            use_int8_w8a16, w1_scale, w2_scale, a1_scale, a2_scale)
+        return torch.ops.vllm.outplace_fused_experts(hidden_states, w1, w2,
+                                                     topk_weights, topk_ids,
+                                                     use_fp8_w8a8,
+                                                     use_int8_w8a16, w1_scale,
+                                                     w2_scale, a1_scale,
+                                                     a2_scale, block_shape)
 
 
 def fused_experts_impl(hidden_states: torch.Tensor,
@@ -584,7 +658,8 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                        w1_scale: Optional[torch.Tensor] = None,
                        w2_scale: Optional[torch.Tensor] = None,
                        a1_scale: Optional[torch.Tensor] = None,
-                       a2_scale: Optional[torch.Tensor] = None):
+                       a2_scale: Optional[torch.Tensor] = None,
+                       block_shape: Optional[List[int]] = None):
     # Check constraints.
     assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
     assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
@@ -611,6 +686,7 @@ def fused_experts_impl(hidden_states: torch.Tensor,
         w2.shape,
         topk_ids.shape[1],
         config_dtype,
+        block_shape=block_shape,
     )
 
     config = get_config_func(M)
@@ -674,7 +750,8 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                                 config,
                                 compute_type=compute_type,
                                 use_fp8_w8a8=use_fp8_w8a8,
-                                use_int8_w8a16=use_int8_w8a16)
+                                use_int8_w8a16=use_int8_w8a16,
+                                block_shape=block_shape)
 
         ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N))
 
@@ -693,7 +770,8 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                                 config,
                                 compute_type=compute_type,
                                 use_fp8_w8a8=use_fp8_w8a8,
-                                use_int8_w8a16=use_int8_w8a16)
+                                use_int8_w8a16=use_int8_w8a16,
+                                block_shape=block_shape)
 
         ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.shape),
                     out_hidden_states[begin_chunk_idx:end_chunk_idx])
@@ -718,6 +796,7 @@ def fused_moe(
     w2_scale: Optional[torch.Tensor] = None,
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[List[int]] = None,
 ) -> torch.Tensor:
     """
     This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -745,6 +824,12 @@ def fused_moe(
         w1.
     - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
         w2.
+    - a1_scale (Optional[torch.Tensor]): Optional scale to be used for
+        a1.
+    - a2_scale (Optional[torch.Tensor]): Optional scale to be used for
+        a2.
+    - block_shape (Optional[List[int]]): Optional block size for block-wise
+        quantization.
 
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
@@ -775,4 +860,5 @@ def fused_moe(
                          w1_scale=w1_scale,
                          w2_scale=w2_scale,
                          a1_scale=a1_scale,
-                         a2_scale=a2_scale)
+                         a2_scale=a2_scale,
+                         block_shape=block_shape)
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 8c6f7c6e06515..b108cbd52c218 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -29,6 +29,7 @@ class FusedMoeWeightScaleSupported(Enum):
     TENSOR = "tensor"
     CHANNEL = "channel"
     GROUP = "group"
+    BLOCK = "block"
 
 
 class FusedMoEMethodBase(QuantizeMethodBase):
@@ -40,9 +41,20 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
         raise NotImplementedError
 
     @abstractmethod
-    def apply(self, layer: torch.nn.Module, x: torch.Tensor,
-              router_logits: torch.Tensor, top_k: int, renormalize: bool,
-              use_grouped_topk: bool) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
         raise NotImplementedError
 
 
@@ -72,16 +84,18 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
     def apply(
-            self,
-            layer: torch.nn.Module,
-            x: torch.Tensor,
-            router_logits: torch.Tensor,
-            top_k: int,
-            renormalize: bool,
-            use_grouped_topk: bool,
-            topk_group: Optional[int] = None,
-            num_expert_group: Optional[int] = None,
-            custom_routing_function: Optional[Callable] = None
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
         return self.forward(x=x,
                             layer=layer,
@@ -91,19 +105,23 @@ def apply(
                             use_grouped_topk=use_grouped_topk,
                             topk_group=topk_group,
                             num_expert_group=num_expert_group,
-                            custom_routing_function=custom_routing_function)
+                            custom_routing_function=custom_routing_function,
+                            scoring_func=scoring_func,
+                            e_score_correction_bias=e_score_correction_bias)
 
     def forward_cuda(
-            self,
-            layer: torch.nn.Module,
-            x: torch.Tensor,
-            use_grouped_topk: bool,
-            top_k: int,
-            router_logits: torch.Tensor,
-            renormalize: bool,
-            topk_group: Optional[int] = None,
-            num_expert_group: Optional[int] = None,
-            custom_routing_function: Optional[Callable] = None
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -113,7 +131,9 @@ def forward_cuda(
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
 
         return fused_experts(hidden_states=x,
                              w1=layer.w13_weight,
@@ -127,21 +147,29 @@ def forward_cpu(self, *args, **kwargs):
             "The CPU backend currently does not support MoE.")
 
     def forward_tpu(
-            self,
-            layer: torch.nn.Module,
-            x: torch.Tensor,
-            use_grouped_topk: bool,
-            top_k: int,
-            router_logits: torch.Tensor,
-            renormalize: bool,
-            topk_group: Optional[int] = None,
-            num_expert_group: Optional[int] = None,
-            custom_routing_function: Optional[Callable] = None
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
         assert not use_grouped_topk
         assert num_expert_group is None
         assert topk_group is None
         assert custom_routing_function is None
+        if scoring_func != "softmax":
+            raise NotImplementedError(
+                "Only softmax scoring function is supported for TPU.")
+        if e_score_correction_bias is not None:
+            raise NotImplementedError(
+                "Expert score correction bias is not supported for TPU.")
         return fused_moe_pallas(hidden_states=x,
                                 w1=layer.w13_weight,
                                 w2=layer.w2_weight,
@@ -155,7 +183,7 @@ def forward_tpu(
 class FusedMoE(torch.nn.Module):
     """FusedMoE layer for MoE models.
 
-    This layer contains both MergedColumnParallel weights (gate_up_proj / 
+    This layer contains both MergedColumnParallel weights (gate_up_proj /
     w13) and RowParallelLinear weights (down_proj/ w2).
 
     Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
@@ -189,6 +217,8 @@ def __init__(
         tp_size: Optional[int] = None,
         prefix: str = "",
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
     ):
         super().__init__()
 
@@ -199,6 +229,7 @@ def __init__(
                         get_tensor_model_parallel_world_size())
         self.top_k = top_k
         self.num_experts = num_experts
+        assert intermediate_size % self.tp_size == 0
         self.intermediate_size_per_partition = intermediate_size // self.tp_size
         self.reduce_results = reduce_results
         self.renormalize = renormalize
@@ -208,6 +239,12 @@ def __init__(
         self.num_expert_group = num_expert_group
         self.topk_group = topk_group
         self.custom_routing_function = custom_routing_function
+        self.scoring_func = scoring_func
+        self.e_score_correction_bias = e_score_correction_bias
+
+        if self.scoring_func != "softmax" and not self.use_grouped_topk:
+            raise ValueError("Only softmax scoring function is supported for "
+                             "non-grouped topk.")
 
         if quant_config is None:
             self.quant_method: Optional[QuantizeMethodBase] = (
@@ -398,7 +435,10 @@ def weight_loader(self, param: torch.nn.Parameter,
                     loaded_weight=loaded_weight,
                     expert_data=expert_data,
                     tp_rank=tp_rank)
-            elif quant_method == FusedMoeWeightScaleSupported.GROUP.value:
+            elif quant_method in [
+                    FusedMoeWeightScaleSupported.GROUP.value,
+                    FusedMoeWeightScaleSupported.BLOCK.value,
+            ]:
                 self._load_model_weight_or_group_weight_scale(
                     shard_id=shard_id,
                     shard_dim=shard_dim,
@@ -441,7 +481,9 @@ def select_experts(hidden_states: torch.Tensor,
                        renormalize: bool,
                        topk_group: Optional[int] = None,
                        num_expert_group: Optional[int] = None,
-                       custom_routing_function: Optional[Callable] = None):
+                       custom_routing_function: Optional[Callable] = None,
+                       scoring_func: str = "softmax",
+                       e_score_correction_bias: Optional[torch.Tensor] = None):
         from vllm.model_executor.layers.fused_moe.fused_moe import (
             fused_topk, grouped_topk)
 
@@ -455,7 +497,9 @@ def select_experts(hidden_states: torch.Tensor,
                 topk=top_k,
                 renormalize=renormalize,
                 num_expert_group=num_expert_group,
-                topk_group=topk_group)
+                topk_group=topk_group,
+                scoring_func=scoring_func,
+                e_score_correction_bias=e_score_correction_bias)
         elif custom_routing_function is None:
             topk_weights, topk_ids = fused_topk(hidden_states=hidden_states,
                                                 gating_output=router_logits,
@@ -484,7 +528,9 @@ def forward(self, hidden_states: torch.Tensor,
             use_grouped_topk=self.use_grouped_topk,
             topk_group=self.topk_group,
             num_expert_group=self.num_expert_group,
-            custom_routing_function=self.custom_routing_function)
+            custom_routing_function=self.custom_routing_function,
+            scoring_func=self.scoring_func,
+            e_score_correction_bias=self.e_score_correction_bias)
 
         if self.reduce_results and self.tp_size > 1:
             final_hidden_states = tensor_model_parallel_all_reduce(
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 46ef11e7d02c6..33b221b994b2b 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -14,11 +14,14 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
+# yapf: disable
 from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           BlockQuantScaleParameter,
                                            PackedColumnParameter,
                                            PackedvLLMParameter,
                                            PerTensorScaleParameter,
                                            RowvLLMParameter)
+# yapf: enable
 from vllm.model_executor.utils import set_weight_attrs
 
 logger = init_logger(__name__)
@@ -623,8 +626,24 @@ def weight_loader_v2(self,
         assert loaded_shard_id < len(self.output_sizes)
 
         tp_size = get_tensor_model_parallel_world_size()
-        shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
-        shard_size = self.output_sizes[loaded_shard_id] // tp_size
+
+        if isinstance(param, BlockQuantScaleParameter):
+            from vllm.model_executor.layers.quantization.fp8 import (
+                Fp8LinearMethod, Fp8MoEMethod)
+            assert self.quant_method is not None
+            assert isinstance(self.quant_method,
+                              (Fp8LinearMethod, Fp8MoEMethod))
+            weight_block_size = self.quant_method.quant_config.weight_block_size
+            assert weight_block_size is not None
+            block_n, _ = weight_block_size[0], weight_block_size[1]
+            shard_offset = (
+                (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) //
+                block_n) // tp_size
+            shard_size = ((self.output_sizes[loaded_shard_id] + block_n - 1) //
+                          block_n // tp_size)
+        else:
+            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
+            shard_size = self.output_sizes[loaded_shard_id] // tp_size
 
         param.load_merged_column_weight(loaded_weight=loaded_weight,
                                         shard_id=loaded_shard_id,
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 4d1a837d11585..c28fd0c6737e0 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -440,11 +440,13 @@ def apply(
         x: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
         use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
         topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -454,7 +456,9 @@ def apply(
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
 
         return torch.ops.vllm.fused_marlin_moe(
             x,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 4f5758a42dbbc..0c1fc18228f5c 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,7 +1,9 @@
-from typing import Any, Dict, List, Optional, cast
+from typing import Any, Dict, List, Literal, Optional, cast
 
 import torch
-from compressed_tensors.config import CompressionFormat
+from compressed_tensors.config import (CompressionFormat,
+                                       SparsityCompressionConfig,
+                                       SparsityStructure)
 from compressed_tensors.quantization import (QuantizationArgs,
                                              QuantizationStrategy,
                                              QuantizationType)
@@ -15,7 +17,7 @@
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (  # noqa: E501
     CompressedTensorsMoEMethod)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS,
+    W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, CompressedTensors24,
     CompressedTensorsScheme, CompressedTensorsW4A16Sparse24,
     CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
@@ -27,20 +29,29 @@
 
 __all__ = ["CompressedTensorsLinearMethod"]
 
+SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config"
+QUANTIZATION_SCHEME_MAP_TYPE = Dict[str, Optional[Dict[str, QuantizationArgs]]]
+
 
 class CompressedTensorsConfig(QuantizationConfig):
 
-    def __init__(self,
-                 target_scheme_map: Dict[str, Any],
-                 ignore: List[str],
-                 quant_format: str,
-                 kv_cache_scheme: Optional[Dict[str, Any]] = None):
+    def __init__(
+        self,
+        target_scheme_map: Dict[str, Any],
+        ignore: List[str],
+        quant_format: str,
+        sparsity_scheme_map: Dict[str, SparsityCompressionConfig],
+        kv_cache_scheme: Optional[Dict[str, Any]] = None,
+        config: Optional[Dict[str, Any]] = None,
+    ):
 
         self.ignore = ignore
         self.quant_format = quant_format
         # Map from [target -> scheme]
         self.target_scheme_map = target_scheme_map
         self.kv_cache_scheme = kv_cache_scheme
+        self.sparsity_scheme_map = sparsity_scheme_map  # [target -> sparsity]
+        self.config = config  # source config dict as passed in (may be None)
 
     def get_linear_method(self) -> "CompressedTensorsLinearMethod":
         return CompressedTensorsLinearMethod(self)
@@ -78,8 +89,50 @@ def get_quant_method(
 
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
+        """Build the config from the `quantization_config` dictionary."""
+        ignore: List[str] = cast(List[str], config.get("ignore", []))
+        quant_format = cast(str, config.get("format"))
+        target_scheme_map = cls._quantization_scheme_map_from_config(config)
+        sparsity_scheme_map = cls._sparsity_scheme_map_from_config(config)
+        return cls(
+            target_scheme_map=target_scheme_map,
+            ignore=ignore,
+            quant_format=quant_format,
+            sparsity_scheme_map=sparsity_scheme_map,
+            # Forward the optional kv cache scheme so it is not dropped
+            kv_cache_scheme=config.get("kv_cache_scheme"),
+            config=config,
+        )
+
+    @classmethod
+    def _sparsity_scheme_map_from_config(
+            cls, config: Dict[str,
+                              Any]) -> Dict[str, SparsityCompressionConfig]:
+        """
+        Map every sparsity target to the single validated sparsity config.
+
+        :param config: The `quantization_config` dictionary from config.json
+        :return: A dictionary keyed by target layer name; empty when the
+            config has no sparsity section or the section lists no targets
+        """
+        raw_sparsity = config.get(SPARSITY_CONFIG_NAME)
+        if raw_sparsity is None:
+            return {}
+
+        validated = SparsityCompressionConfig.model_validate(raw_sparsity)
+        # Every target shares the same validated config object
+        targets = validated.targets or []
+        return {target: validated for target in targets}
+
+    @classmethod
+    def _quantization_scheme_map_from_config(
+            cls, config: Dict[str, Any]) -> QUANTIZATION_SCHEME_MAP_TYPE:
+        """
+        :param config: The `quantization_config` dictionary from config.json
+        :return: A dictionary mapping target layer names to their corresponding
+            quantization_args for weights and input activations
+        """
         target_scheme_map: Dict[str, Any] = dict()
-        ignore = cast(List[str], config.get("ignore"))
         quant_format = cast(str, config.get("format"))
 
         # The quant_config has multiple config_groups, each containing
@@ -90,12 +143,14 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
         # details follow the structure defined by the QuantizationArgs
         # pydantic model, which is used to verify the structure of the
         # quant_config and also store the details for later use.
-        for _, quant_config in config["config_groups"].items():
+
+        config_groups = config.get("config_groups", dict())
+        for _, quant_config in config_groups.items():
             targets = quant_config.get("targets")
             for target in targets:
                 target_scheme_map[target] = {}
                 target_scheme_map[target][
-                    "weights"] = QuantizationArgs.parse_obj(
+                    "weights"] = QuantizationArgs.model_validate(
                         quant_config.get("weights"))
 
                 target_scheme_map[target]["input_activations"] = None
@@ -110,13 +165,9 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
                             "weights"].type == QuantizationType.FLOAT
                     else:
                         target_scheme_map[target][
-                            "input_activations"] = QuantizationArgs.parse_obj(
+                            "input_activations"] = QuantizationArgs.model_validate(  # noqa: E501
                                 quant_config.get("input_activations"))
-
-        return cls(target_scheme_map=target_scheme_map,
-                   ignore=ignore,
-                   quant_format=quant_format,
-                   kv_cache_scheme=config.get("kv_cache_scheme"))
+        return target_scheme_map
 
     @classmethod
     def get_config_filenames(cls) -> List[str]:
@@ -315,23 +366,105 @@ def get_scheme(
         # TODO (@robertgshaw): add compressed-tensors as dep
         # so we do not have to re-write these functions
         # need to make accelerate optional in ct to do this
-        matched_target = find_matched_target(
-            layer_name=layer_name,
-            module=layer,
-            targets=self.target_scheme_map.keys())
 
-        # Find the quant_scheme
-        scheme_dict = self.target_scheme_map[matched_target]
-        scheme = self._get_scheme_from_parts(
-            weight_quant=scheme_dict["weights"],
-            input_quant=scheme_dict["input_activations"])
+        # Will be empty for models with only sparsity
+        if self.target_scheme_map:
+            matched_target = find_matched_target(
+                layer_name=layer_name,
+                module=layer,
+                targets=self.target_scheme_map.keys())
+
+            scheme_dict = self.target_scheme_map[matched_target]
+            weight_quant = scheme_dict.get("weights")
+            input_quant = scheme_dict.get("input_activations")
+        elif self.sparsity_scheme_map:
+            matched_target = find_matched_target(
+                layer_name=layer_name,
+                module=layer,
+                targets=self.sparsity_scheme_map.keys())
+            weight_quant = None
+            input_quant = None
+
+        # For models with sparsity, assumes that the sparse layers are also
+        # quantized for cutlass 2:4 support
+        sparsity_scheme: Optional[
+            SparsityCompressionConfig] = self.sparsity_scheme_map.get(
+                matched_target)
+
+        if self.supports_cutlass_24(weight_quant=weight_quant,
+                                    input_quant=input_quant,
+                                    sparsity_scheme=sparsity_scheme):
+            # Have a valid sparsity scheme
+            # Validate layer is supported by Cutlass 2:4 Kernel
+            scheme = CompressedTensors24(quantized=weight_quant is not None
+                                         or input_quant is not None,
+                                         weight_quant=weight_quant,
+                                         input_quant=input_quant)
+        else:
+            # Find the quant_scheme
+            scheme = self._get_scheme_from_parts(  # type: ignore
+                weight_quant=weight_quant,
+                input_quant=input_quant,
+            )
 
         # Raise error if device does not support the scheme
         # (e.g. fp8 needs ada lovelace)
         self._check_scheme_supported(scheme.get_min_capability())
-
         return scheme
 
+    @staticmethod
+    def supports_cutlass_24(
+            weight_quant: Optional[QuantizationArgs],
+            input_quant: Optional[QuantizationArgs],
+            sparsity_scheme: Optional[SparsityCompressionConfig] = None
+    ) -> bool:
+        """
+        Check if the layer is supported by the Cutlass 2:4 Kernel
+        Conditions:
+            - Overarching condition: Sparsity Structure is 2:4, format is dense
+            - Unquantized cases are supported
+            - Weight only or input only quantization is not-supported
+            - Supported weight quantization strategies are TENSOR and CHANNEL
+            - Supported input quantization strategies are TENSOR and TOKEN
+            - Only 8 bit quantization is supported
+
+        :param weight_quant: Weight quantization args, or None
+        :param input_quant: Input activation quantization args, or None
+        :param sparsity_scheme: Sparsity config matched for the layer, if any
+        :return: True if the layer is supported by the Cutlass 2:4 Kernel
+            False otherwise
+        """
+        is_valid_sparsity = (sparsity_scheme is not None
+                             and sparsity_scheme.sparsity_structure
+                             == SparsityStructure.TWO_FOUR.value
+                             and sparsity_scheme.format == "dense")
+        if not is_valid_sparsity:
+            return False
+
+        # Unquantized cases are supported
+        if weight_quant is None and input_quant is None:
+            return True
+
+        # Weight-only or input-only quantization is not supported; checked
+        # explicitly (not asserted) so -O cannot turn this into a crash
+        if weight_quant is None or input_quant is None:
+            return False
+
+        supported_weight_quant_strategies = [
+            QuantizationStrategy.TENSOR.value,
+            QuantizationStrategy.CHANNEL.value
+        ]
+        if weight_quant.strategy not in supported_weight_quant_strategies:
+            return False
+
+        supported_input_quant_strategies = [
+            QuantizationStrategy.TENSOR.value, QuantizationStrategy.TOKEN.value
+        ]
+        if input_quant.strategy not in supported_input_quant_strategies:
+            return False
+
+        return weight_quant.num_bits == input_quant.num_bits == 8
+
 
 class CompressedTensorsLinearMethod(LinearMethodBase):
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index dad04017d3212..5fd6b017f444b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -203,13 +203,14 @@ def apply(
         x: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
         use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
         topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-
         from vllm.model_executor.layers.fused_moe import fused_experts
 
         topk_weights, topk_ids = FusedMoE.select_experts(
@@ -220,7 +221,9 @@ def apply(
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
 
         return fused_experts(x,
                              layer.w13_weight,
@@ -476,12 +479,15 @@ def apply(
         x: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
         use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
         topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -490,7 +496,9 @@ def apply(
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
 
         return torch.ops.vllm.fused_marlin_moe(
             x,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index 5d259ec72051c..569ecaa6f5a76 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -7,13 +7,12 @@
 from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS,
                                        CompressedTensorsWNA16)
 
+from .compressed_tensors_24 import CompressedTensors24  # isort: skip
+
 __all__ = [
-    "CompressedTensorsScheme",
-    "CompressedTensorsWNA16",
-    "CompressedTensorsW8A16Fp8",
-    "CompressedTensorsW4A16Sparse24",
-    "CompressedTensorsW8A8Int8",
-    "CompressedTensorsW8A8Fp8",
-    "WNA16_SUPPORTED_BITS",
-    "W4A16SPARSE24_SUPPORTED_BITS",
+    "CompressedTensorsScheme", "CompressedTensorsWNA16",
+    "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24",
+    "CompressedTensorsW8A8Int8", "CompressedTensorsW8A8Fp8",
+    "WNA16_SUPPORTED_BITS", "W4A16SPARSE24_SUPPORTED_BITS",
+    "CompressedTensors24"
 ]
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
new file mode 100644
index 0000000000000..bc697ef93b34b
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
@@ -0,0 +1,208 @@
+from typing import Callable, List, Optional
+
+import torch
+from compressed_tensors.quantization import (QuantizationArgs,
+                                             QuantizationStrategy,
+                                             QuantizationType)
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    convert_to_channelwise, sparse_cutlass_supported)
+from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           ChannelQuantScaleParameter,
+                                           ModelWeightParameter,
+                                           PerTensorScaleParameter)
+
+__all__ = ["CompressedTensors24"]
+
+
+class CompressedTensors24(CompressedTensorsScheme):
+    """Scheme that runs 2:4 sparse linear layers on CUTLASS kernels."""
+
+    def __init__(self,
+                 quantized: bool = False,
+                 weight_quant: Optional[QuantizationArgs] = None,
+                 input_quant: Optional[QuantizationArgs] = None):
+        """Store whether the layer is quantized and its quantization args."""
+        self.quantized = quantized
+        self.weight_quant = weight_quant
+        self.input_quant = input_quant
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # Only cutlass 3.x kernels are implemented so far
+        return 90
+
+    def create_weights(self, layer: torch.nn.Module, input_size: int,
+                       output_partition_sizes: List[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+        """Register the dense weight and its weight/input scale parameters.
+
+        :raises ValueError: if sparse CUTLASS or the quantization is unsupported
+        """
+        if not sparse_cutlass_supported():
+            raise ValueError(
+                "Sparse CUTLASS not supported. vLLM must be built with "
+                "CUDA 12.2 or later to use this feature")
+
+        self.output_dtype = params_dtype
+        layer.logical_widths = output_partition_sizes
+        self.weights_dtype: torch.dtype = self._get_params_dtype(params_dtype)
+
+        # parameter to store uncompressed weight
+        weight = ModelWeightParameter(data=torch.empty(
+            sum(output_partition_sizes),
+            input_size_per_partition,
+            dtype=self.weights_dtype),
+                                      input_dim=1,
+                                      output_dim=0,
+                                      weight_loader=weight_loader)
+
+        # Check if quantized, not just 2:4 Sparse
+        if self.quantized:
+            if (self.weight_quant and self.weight_quant.strategy
+                    == QuantizationStrategy.CHANNEL.value):
+                weight_scale = ChannelQuantScaleParameter(
+                    data=torch.empty((sum(output_partition_sizes), 1),
+                                     dtype=torch.float32),
+                    output_dim=0,
+                    weight_loader=weight_loader)
+            else:
+                assert (self.weight_quant and self.weight_quant.strategy
+                        == QuantizationStrategy.TENSOR.value)
+                weight_scale = PerTensorScaleParameter(
+                    data=torch.empty(len(output_partition_sizes),
+                                     dtype=torch.float32),
+                    weight_loader=weight_loader)
+
+            layer.register_parameter("weight_scale", weight_scale)
+
+            # input quant will be non-none
+            if self.input_quant and not self.input_quant.dynamic:
+                # register input quant scale
+                assert (self.input_quant.strategy ==
+                        QuantizationStrategy.TENSOR.value)
+                input_scale = BasevLLMParameter(data=torch.empty(
+                    1, dtype=torch.float32),
+                                                weight_loader=weight_loader)
+
+                layer.register_parameter("input_scale", input_scale)
+
+        else:
+            # for sparse-only, pass in 1 for weight/input scales
+            weight_scale = torch.nn.Parameter(data=torch.ones(
+                1, dtype=torch.float32),
+                                              requires_grad=False)
+            input_scale = torch.nn.Parameter(data=torch.ones(
+                1, dtype=torch.float32),
+                                             requires_grad=False)
+            layer.register_parameter("input_scale", input_scale)
+            layer.register_parameter("weight_scale", weight_scale)
+
+        layer.register_parameter("weight", weight)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """
+        Compress weights after loading. Store compressed weight and meta
+            tensor
+
+        :post-condition: layer.weight and layer.meta are
+            set to the compressed weight and meta tensor in the
+            format expected by the Cutlass kernels
+        :param layer: The layer with the weights to be processed
+        """
+        # torch.compile workaround
+        if hasattr(layer, "input_scale"):
+            layer.input_scale = torch.nn.Parameter(layer.input_scale.data,
+                                                   requires_grad=False)
+
+        if self.weight_quant:
+            if self.weight_quant.strategy == QuantizationStrategy.TENSOR.value:
+                layer.weight_scale = torch.nn.Parameter(convert_to_channelwise(
+                    weight_scale=layer.weight_scale,
+                    logical_widths=layer.logical_widths),
+                                                        requires_grad=False)
+            else:
+                # torch.compile workaround
+                layer.weight_scale = torch.nn.Parameter(
+                    layer.weight_scale.data, requires_grad=False)
+
+        w_compressed, meta = ops.cutlass_sparse_compress(layer.weight.data)
+        layer.weight = torch.nn.Parameter(w_compressed, requires_grad=False)
+        layer.meta = torch.nn.Parameter(meta, requires_grad=False)
+
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """
+        Returns the output tensor for the layer with 2:4
+        sparse compressed weights, given the input tensor
+        and bias
+
+        :param layer: The layer with 2:4 sparse compressed
+            weights to be used for the computation
+        :param x: The input tensor to the layer
+        :param bias: The bias to be added to the output tensor
+        :return: The output tensor of the layer
+        """
+        if self.quantized:
+            # Static input scale if one was registered, else None (dynamic)
+            scale = getattr(layer, "input_scale", None)
+
+            if self.weights_dtype == torch.int8:
+                ops_output = ops.scaled_int8_quant(x, scale=scale)
+                q_input = ops_output[0]
+                input_scale = ops_output[1]
+            else:
+                assert self.weights_dtype == torch.float8_e4m3fn
+                if scale is not None:
+                    q_input, input_scale = ops.scaled_fp8_quant(x, scale=scale)
+                else:
+                    q_input, input_scale = ops.scaled_fp8_quant(
+                        x, use_per_token_if_dynamic=True)
+
+        else:
+            # Not quantized, nothing to do with the input_scales, use as is
+            input_scale = layer.input_scale
+            q_input = x
+
+        out = ops.cutlass_scaled_sparse_mm(a=q_input,
+                                           bt_nzs=layer.weight,
+                                           bt_meta=layer.meta,
+                                           scale_a=input_scale,
+                                           scale_b=layer.weight_scale,
+                                           out_dtype=self.output_dtype,
+                                           bias=bias)
+        assert out.is_contiguous()
+        return out
+
+    def _get_params_dtype(self, params_dtype: torch.dtype) -> torch.dtype:
+        """Return the weight dtype: params_dtype, int8 or float8_e4m3fn."""
+        if not self.quantized:
+            return params_dtype
+
+        assert self.weight_quant is not None
+        assert self.input_quant is not None
+        if not (self.weight_quant.num_bits == self.input_quant.num_bits == 8):
+            raise ValueError("Cutlass only supports 8-bit quantization")
+
+        if (self.weight_quant.type == QuantizationType.FLOAT
+                and self.input_quant.type == QuantizationType.FLOAT):
+            return torch.float8_e4m3fn
+
+        if (self.weight_quant.type == QuantizationType.INT
+                and self.input_quant.type == QuantizationType.INT):
+            return torch.int8
+
+        raise ValueError("Quantization type not supported by Cutlass")
+
+
+def check_24(tensor):
+    """Return whether each consecutive group of 4 values has >= 2 zeros."""
+    zeros_per_group = tensor.view(-1, 4).eq(0).sum(dim=1)
+    return zeros_per_group.ge(2).all().item()
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
index 9ad61a64e406c..61d1c911cd1ad 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
@@ -61,6 +61,10 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
                        params_dtype: torch.dtype, weight_loader: Callable,
                        **kwargs):
 
+        assert params_dtype == torch.float16, (
+            "float16 is required for marlin24 compressd models. Set dtype=torch.float16"  # noqa: E501
+        )
+
         pack_factor = 32 // self.quant_type.size_bits
         output_size_per_partition = sum(output_partition_sizes)
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index a74eaef5efdee..dfae4db71e546 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -30,7 +30,7 @@ def should_ignore_layer(layer_name: Optional[str],
     # in the safetensors checkpoint. So, we convert the name
     # from the fused version to unfused + check to make sure that
     # each shard of the fused layer has the same scheme.
-    if proj_name in FUSED_LAYER_NAME_MAPPING:
+    if proj_name in FUSED_LAYER_NAME_MAPPING and layer_name not in ignore:
         shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name]
 
         # Convert fused_name --> [shard_names]
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 97297970d9317..209f12c6dfec9 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -99,11 +99,13 @@ def apply(
         x: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
         use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
         topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
@@ -115,7 +117,9 @@ def apply(
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
 
         return fused_experts(x,
                              layer.w13_weight,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 978e727bc7cb3..7f779ac8d3b3e 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -6,6 +6,7 @@
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
+from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
@@ -14,6 +15,8 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    apply_w8a8_block_fp8_linear)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -22,7 +25,8 @@
     all_close_1d, apply_fp8_linear, convert_to_channelwise,
     cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize,
     requantize_with_max_scale)
-from vllm.model_executor.parameter import (ModelWeightParameter,
+from vllm.model_executor.parameter import (BlockQuantScaleParameter,
+                                           ModelWeightParameter,
                                            PerTensorScaleParameter)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
@@ -41,6 +45,7 @@ def __init__(
         is_checkpoint_fp8_serialized: bool = False,
         activation_scheme: str = "dynamic",
         ignored_layers: Optional[List[str]] = None,
+        weight_block_size: Optional[List[int]] = None,
     ) -> None:
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
         if is_checkpoint_fp8_serialized:
@@ -51,6 +56,20 @@ def __init__(
                 f"Unsupported activation scheme {activation_scheme}")
         self.activation_scheme = activation_scheme
         self.ignored_layers = ignored_layers or []
+        if weight_block_size is not None:
+            if not is_checkpoint_fp8_serialized:
+                raise ValueError(
+                    "The block-wise quantization only supports fp8-serialized "
+                    "checkpoint for now.")
+            if len(weight_block_size) != 2:
+                raise ValueError(
+                    "The quantization block size of weight must have 2 "
+                    f"dimensions, but got {len(weight_block_size)} dimensions")
+            if activation_scheme != "dynamic":
+                raise ValueError("The block-wise quantization only supports "
+                                 "dynamic activation scheme for now, but got "
+                                 f"{activation_scheme} activation scheme.")
+        self.weight_block_size = weight_block_size
 
     @classmethod
     def get_name(cls) -> str:
@@ -74,9 +93,12 @@ def from_config(cls, config: Dict[str, Any]) -> "Fp8Config":
         is_checkpoint_fp8_serialized = ("fp8" in quant_method)
         activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
         ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None)
+        weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"],
+                                                 None)
         return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
                    activation_scheme=activation_scheme,
-                   ignored_layers=ignored_layers)
+                   ignored_layers=ignored_layers,
+                   weight_block_size=weight_block_size)
 
     def get_quant_method(self, layer: torch.nn.Module,
                          prefix: str) -> Optional["QuantizeMethodBase"]:
@@ -123,6 +145,11 @@ def __init__(self, quant_config: Fp8Config):
         if current_platform.is_rocm():
             self.use_marlin = False
 
+        self.block_quant = self.quant_config.weight_block_size is not None
+        if self.block_quant:
+            # Marlin doesn't support block-wise fp8
+            self.use_marlin = False
+
     def create_weights(
         self,
         layer: torch.nn.Module,
@@ -133,10 +160,34 @@ def create_weights(
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
-        del input_size, output_size
         output_size_per_partition = sum(output_partition_sizes)
         weight_loader = extra_weight_attrs.get("weight_loader")
 
+        if self.block_quant:
+            tp_size = get_tensor_model_parallel_world_size()
+            assert self.quant_config.weight_block_size is not None
+            block_n, block_k = (
+                self.quant_config.weight_block_size[0],
+                self.quant_config.weight_block_size[1],
+            )
+            # Required by row parallel
+            if (tp_size > 1
+                    and input_size // input_size_per_partition == tp_size
+                    and input_size_per_partition % block_k != 0):
+                raise ValueError(
+                    f"Weight input_size_per_partition = "
+                    f"{input_size_per_partition} is not divisible by "
+                    f"weight quantization block_k = {block_k}.")
+            # Required by column parallel or enabling merged weights
+            if (tp_size > 1 and output_size // output_size_per_partition
+                    == tp_size) or len(output_partition_sizes) > 1:
+                for output_partition_size in output_partition_sizes:
+                    if output_partition_size % block_n != 0:
+                        raise ValueError(
+                            f"Weight output_partition_size = "
+                            f"{output_partition_size} is not divisible by "
+                            f"weight quantization block_n = {block_n}.")
+
         layer.logical_widths = output_partition_sizes
 
         layer.input_size_per_partition = input_size_per_partition
@@ -161,12 +212,29 @@ def create_weights(
         # Otherwise, wait until process_weights_after_loading.
         if self.quant_config.is_checkpoint_fp8_serialized:
             # WEIGHT SCALE
-            scale = PerTensorScaleParameter(data=torch.empty(
-                len(output_partition_sizes), dtype=torch.float32),
-                                            weight_loader=weight_loader)
-
-            scale[:] = torch.finfo(torch.float32).min
-            layer.register_parameter("weight_scale", scale)
+            if not self.block_quant:
+                scale = PerTensorScaleParameter(
+                    data=torch.empty(len(output_partition_sizes),
+                                     dtype=torch.float32),
+                    weight_loader=weight_loader,
+                )
+                scale[:] = torch.finfo(torch.float32).min
+                layer.register_parameter("weight_scale", scale)
+            else:
+                assert self.quant_config.activation_scheme == "dynamic"
+                scale = BlockQuantScaleParameter(
+                    data=torch.empty(
+                        (output_size_per_partition + block_n - 1) // block_n,
+                        (input_size_per_partition + block_k - 1) // block_k,
+                        dtype=torch.float32,
+                    ),
+                    input_dim=1,
+                    output_dim=0,
+                    weight_loader=weight_loader,
+                )
+                scale[:] = torch.finfo(torch.float32).min
+                # The weight_scale_inv name is intentional for deepseekv3
+                layer.register_parameter("weight_scale_inv", scale)
 
             # INPUT ACTIVATION SCALE
             if self.quant_config.activation_scheme == "static":
@@ -180,6 +248,9 @@ def create_weights(
                 layer.register_parameter("input_scale", None)
 
     def process_weights_after_loading(self, layer: Module) -> None:
+        # Block quant doesn't need to process weights after loading
+        if self.block_quant:
+            return
         layer.weight = torch.nn.Parameter(layer.weight.data,
                                           requires_grad=False)
         # If checkpoint not serialized fp8, quantize the weights.
@@ -266,6 +337,17 @@ def apply(self,
                 size_k=layer.input_size_per_partition,
                 bias=bias)
 
+        if self.block_quant:
+            assert self.quant_config.weight_block_size is not None
+            return apply_w8a8_block_fp8_linear(
+                input=x,
+                weight=layer.weight,
+                block_size=self.quant_config.weight_block_size,
+                weight_scale=layer.weight_scale_inv,
+                input_scale=layer.input_scale,
+                bias=bias,
+            )
+
         return apply_fp8_linear(
             input=x,
             weight=layer.weight,
@@ -291,6 +373,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
 
     def __init__(self, quant_config: Fp8Config):
         self.quant_config = quant_config
+        self.block_quant = self.quant_config.weight_block_size is not None
 
     def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
                        intermediate_size: int, params_dtype: torch.dtype,
@@ -298,6 +381,27 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
 
         if self.quant_config.is_checkpoint_fp8_serialized:
             params_dtype = torch.float8_e4m3fn
+        if self.block_quant:
+            assert self.quant_config.weight_block_size is not None
+            tp_size = get_tensor_model_parallel_world_size()
+            block_n, block_k = (
+                self.quant_config.weight_block_size[0],
+                self.quant_config.weight_block_size[1],
+            )
+            # NOTE: To ensure proper alignment of the block-wise quantization
+            # scales, the output_size of the weights for both the gate and up
+            # layers must be divisible by block_n.
+            # Required by column parallel or enabling merged weights
+            if intermediate_size % block_n != 0:
+                raise ValueError(
+                    f"The output_size of gate's and up's weight = "
+                    f"{intermediate_size} is not divisible by "
+                    f"weight quantization block_n = {block_n}.")
+            if (tp_size > 1 and intermediate_size % block_k != 0):
+                # Required by row parallel
+                raise ValueError(f"The input_size of down's weight = "
+                                 f"{intermediate_size} is not divisible by "
+                                 f"weight quantization block_k = {block_k}.")
 
         # WEIGHTS
         w13_weight = torch.nn.Parameter(torch.empty(num_experts,
@@ -317,21 +421,45 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
         # WEIGHT_SCALES
-        # Allocate 2 scales for w1 and w3 respectively.
-        # They will be combined to a single scale after weight loading.
-        w13_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
-                                                         2,
-                                                         dtype=torch.float32),
-                                              requires_grad=False)
-        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        if not self.block_quant:
+            # Allocate 2 scales for w1 and w3 respectively.
+            # They will be combined to a single scale after weight loading.
+            w13_weight_scale = torch.nn.Parameter(torch.ones(
+                num_experts, 2, dtype=torch.float32),
+                                                  requires_grad=False)
+            w2_weight_scale = torch.nn.Parameter(torch.ones(
+                num_experts, dtype=torch.float32),
+                                                 requires_grad=False)
+            layer.register_parameter("w13_weight_scale", w13_weight_scale)
+            layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        else:
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    2 * ((intermediate_size + block_n - 1) // block_n),
+                    (hidden_size + block_k - 1) // block_k,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    (hidden_size + block_n - 1) // block_n,
+                    (intermediate_size + block_k - 1) // block_k,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_weight_scale_inv", w13_weight_scale)
+            layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
+            assert self.quant_config.activation_scheme == "dynamic"
 
-        w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
-                                                        dtype=torch.float32),
-                                             requires_grad=False)
-        layer.register_parameter("w2_weight_scale", w2_weight_scale)
         # Add the quantization method used (per tensor/grouped/channel)
         # to ensure the weight scales are loaded in properly
         extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.
+             value} if self.block_quant else
             {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
         # If loading fp8 checkpoint, pass the weight loaders.
         # If loading an fp16 checkpoint, do not (we will quantize in
@@ -364,7 +492,9 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
             layer.w2_input_scale = None
 
     def process_weights_after_loading(self, layer: Module) -> None:
-
+        # Block quant doesn't need to process weights after loading
+        if self.block_quant:
+            return
         # If checkpoint is fp16, quantize in place.
         if not self.quant_config.is_checkpoint_fp8_serialized:
             # If rocm, use float8_e4m3fnuz as dtype
@@ -471,12 +601,13 @@ def apply(
         router_logits: torch.Tensor,
         top_k: int,
         renormalize: bool,
-        use_grouped_topk: bool,
+        use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-
         from vllm.model_executor.layers.fused_moe import fused_experts
 
         topk_weights, topk_ids = FusedMoE.select_experts(
@@ -487,19 +618,27 @@ def apply(
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
-
-        return fused_experts(x,
-                             layer.w13_weight,
-                             layer.w2_weight,
-                             topk_weights=topk_weights,
-                             topk_ids=topk_ids,
-                             inplace=True,
-                             use_fp8_w8a8=True,
-                             w1_scale=layer.w13_weight_scale,
-                             w2_scale=layer.w2_weight_scale,
-                             a1_scale=layer.w13_input_scale,
-                             a2_scale=layer.w2_input_scale)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias,
+        )
+
+        return fused_experts(
+            x,
+            layer.w13_weight,
+            layer.w2_weight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            inplace=True,
+            use_fp8_w8a8=True,
+            w1_scale=(layer.w13_weight_scale_inv
+                      if self.block_quant else layer.w13_weight_scale),
+            w2_scale=(layer.w2_weight_scale_inv
+                      if self.block_quant else layer.w2_weight_scale),
+            a1_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+            block_shape=self.quant_config.weight_block_size,
+        )
 
 
 class Fp8KVCacheMethod(BaseKVCacheMethod):
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index a3e58bf1b2a4c..a006d729cc627 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -532,11 +532,13 @@ def apply(
         x: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
         use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
         topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         # The input must currently be float16
         orig_dtype = x.dtype
@@ -550,7 +552,9 @@ def apply(
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
-            custom_routing_function=None)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
 
         return torch.ops.vllm.fused_marlin_moe(
             x,
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
new file mode 100644
index 0000000000000..f3c3e130e4161
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -0,0 +1,353 @@
+# Adapted from https://github.com/sgl-project/sglang/pull/2575
+from typing import List, Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+
+def apply_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    # View input as 2D matrix for fp8 methods
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(input_2d, block_size[1])
+    output = w8a8_block_fp8_matmul(q_input,
+                                   weight,
+                                   x_scale,
+                                   weight_scale,
+                                   block_size,
+                                   output_dtype=input.dtype)
+
+    if bias is not None:
+        output = output + bias
+    return output.to(dtype=input.dtype).view(*output_shape)
+
+
+def input_to_float8(
+    x: torch.Tensor,
+    dtype: torch.dtype = torch.float8_e4m3fn
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """This function quantizes input values to float8 values
+    with tensor-wise quantization."""
+    finfo = torch.finfo(dtype)
+    min_val, max_val = x.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
+    scale = finfo.max / amax
+    x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max)
+    return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
+
+
+def block_quant_to_tensor_quant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """This function converts block-wise quantization to tensor-wise
+    quantization. The inputs are block-wise quantization tensor `x_q_block`,
+    block-wise quantization scale and the block size.
+    The outputs are tensor-wise quantization tensor and tensor-wise
+    quantization scale. Note only float8 is supported for now.
+    """
+    block_n, block_k = block_size[0], block_size[1]
+    n, k = x_q_block.shape
+    n_tiles = (n + block_n - 1) // block_n
+    k_tiles = (k + block_k - 1) // block_k
+    assert n_tiles == x_s.shape[0]
+    assert k_tiles == x_s.shape[1]
+
+    x_dq_block = x_q_block.to(torch.float32)
+
+    x_dq_block_tiles = [[
+        x_dq_block[j * block_n:min((j + 1) * block_n, n),
+                   i * block_k:min((i + 1) * block_k, k), ]
+        for i in range(k_tiles)
+    ] for j in range(n_tiles)]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            x_dq_block_tiles[j][i][:, :] = x_dq_block_tiles[j][i] * x_s[j][i]
+
+    x_q_tensor, scale = input_to_float8(x_dq_block, dtype=x_q_block.dtype)
+    return x_q_tensor, scale
+
+
+@triton.jit
+def _per_token_group_quant_fp8(
+    # Pointers to inputs and output
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    # Stride of input
+    y_stride,
+    # Columns of input
+    N,
+    # Avoid division by zero
+    eps,
+    # Information for float8
+    fp8_min,
+    fp8_max,
+    # Meta-parameters
+    BLOCK: tl.constexpr,
+):
+    """A Triton-accelerated function to perform per-token-group
+    quantization on a tensor.
+    This function converts the tensor values into float8 values.
+    """
+    # Map the program id to the row of X and Y it should compute.
+    g_id = tl.program_id(0)
+    y_ptr += g_id * y_stride
+    y_q_ptr += g_id * y_stride
+    y_s_ptr += g_id
+
+    cols = tl.arange(0, BLOCK)  # N <= BLOCK
+    mask = cols < N
+
+    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+    # Quant
+    _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
+    y_s = _absmax / fp8_max
+    y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+    tl.store(y_q_ptr + cols, y_q, mask=mask)
+    tl.store(y_s_ptr, y_s)
+
+
+def per_token_group_quant_fp8(
+    x: torch.Tensor,
+    group_size: int,
+    eps: float = 1e-10,
+    dtype: torch.dtype = torch.float8_e4m3fn,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Function to perform per-token-group quantization on an input tensor `x`.
+    It converts the tensor values into signed float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+    Args:
+        x: The input tensor with ndim >= 2.
+        group_size: The group size used for quantization.
+        eps: The minimum value used to avoid dividing by zero.
+        dtype: The dtype of output tensor. Note that only `torch.float8_e4m3fn`
+        is supported for now.
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
+        scaling factor for quantization.
+    """
+    assert (x.shape[-1] % group_size == 0), (
+        f"the last dimension of `x` {x.shape[-1]} must be divisible "
+        f"by `group_size` {group_size}")
+    assert x.is_contiguous(), "`x` must be contiguous"
+
+    finfo = torch.finfo(dtype)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    M = x.numel() // group_size
+    N = group_size
+    x_s = torch.empty(
+        x.shape[:-1] + (x.shape[-1] // group_size, ),
+        device=x.device,
+        dtype=torch.float32,
+    )
+
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+    num_stages = 1
+    _per_token_group_quant_fp8[(M, )](
+        x,
+        x_q,
+        x_s,
+        group_size,
+        N,
+        eps,
+        fp8_min=fp8_min,
+        fp8_max=fp8_max,
+        BLOCK=BLOCK,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+
+    return x_q, x_s
+
+
+@triton.jit
+def _w8a8_block_fp8_matmul(
+    # Pointers to inputs and output
+    A,
+    B,
+    C,
+    As,
+    Bs,
+    # Shape for matmul
+    M,
+    N,
+    K,
+    # Block size for block-wise quantization
+    group_n,
+    group_k,
+    # Stride for inputs and output
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_As_m,
+    stride_As_k,
+    stride_Bs_k,
+    stride_Bs_n,
+    # Meta-parameters
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    """Triton-accelerated function used to perform linear operations (dot
+    product) on input tensors `A` and `B` with block-wise quantization, and
+    store the result in output tensor `C`.
+    """
+
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+
+    As_ptrs = As + offs_am * stride_As_m
+    offs_bsn = offs_bn // group_n
+    Bs_ptrs = Bs + offs_bsn * stride_Bs_n
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        a = tl.load(a_ptrs,
+                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
+                    other=0.0)
+        b = tl.load(b_ptrs,
+                    mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
+                    other=0.0)
+
+        k_start = k * BLOCK_SIZE_K
+        offs_ks = k_start // group_k
+        a_s = tl.load(As_ptrs + offs_ks * stride_As_k)
+        b_s = tl.load(Bs_ptrs + offs_ks * stride_Bs_k)
+
+        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    if C.dtype.element_ty == tl.bfloat16:
+        c = accumulator.to(tl.bfloat16)
+    elif C.dtype.element_ty == tl.float16:
+        c = accumulator.to(tl.float16)
+    else:
+        c = accumulator.to(tl.float32)
+
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+def w8a8_block_fp8_matmul(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    """This function performs matrix multiplication with block-wise
+    quantization.
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+    Args:
+        A: The input tensor, e.g., activation.
+        B: The input tensor, e.g., weight.
+        As: The per-token-group quantization scale for `A`.
+        Bs: The per-block quantization scale for `B`.
+        block_size: The block size for per-block quantization. It should
+        be 2-dim, e.g., [128, 128].
+        output_dtype: The dtype of the returned tensor.
+    Returns:
+        torch.Tensor: The result of matmul.
+    """
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+
+    assert A.shape[-1] == B.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
+    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+    M = A.numel() // A.shape[-1]
+
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    N, K = B.shape
+    assert triton.cdiv(N, block_n) == Bs.shape[0]
+    assert triton.cdiv(K, block_k) == Bs.shape[1]
+
+    C_shape = A.shape[:-1] + (N, )
+    C = A.new_empty(C_shape, dtype=output_dtype)
+
+    # TODO:
+    # BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N can be optimized.
+    # BLOCK_SIZE_K must be divisible by block_k
+    # BLOCK_SIZE_N and BLOCK_SIZE_M have no requirements
+    BLOCK_SIZE_M = 128
+    if M < BLOCK_SIZE_M:
+        BLOCK_SIZE_M = triton.next_power_of_2(M)
+        BLOCK_SIZE_M = max(BLOCK_SIZE_M, 16)
+    BLOCK_SIZE_K = block_k
+    assert block_k % BLOCK_SIZE_K == 0
+    BLOCK_SIZE_N = block_n
+
+    def grid(META):
+        return (triton.cdiv(M, META["BLOCK_SIZE_M"]) *
+                triton.cdiv(N, META["BLOCK_SIZE_N"]), )
+
+    _w8a8_block_fp8_matmul[grid](
+        A,
+        B,
+        C,
+        As,
+        Bs,
+        M,
+        N,
+        K,
+        block_n,
+        block_k,
+        A.stride(-2),
+        A.stride(-1),
+        B.stride(1),
+        B.stride(0),
+        C.stride(-2),
+        C.stride(-1),
+        As.stride(-2),
+        As.stride(-1),
+        Bs.stride(1),
+        Bs.stride(0),
+        BLOCK_SIZE_M=BLOCK_SIZE_M,
+        BLOCK_SIZE_N=BLOCK_SIZE_N,
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
+        GROUP_SIZE_M=8,
+    )
+
+    return C
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 4037bcb963b25..d89071f30a549 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -10,9 +10,18 @@
 TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32)
 
 
+def sparse_cutlass_supported() -> bool:
+    if not current_platform.is_cuda():
+        return False
+
+    capability_tuple = current_platform.get_device_capability()
+    capability = -1 if capability_tuple is None else capability_tuple.to_int()
+
+    return ops.cutlass_sparse_scaled_mm_supported(capability)
+
+
 def cutlass_fp8_supported() -> bool:
-    # cutlass is not supported on Rocm
-    if current_platform.is_rocm():
+    if not current_platform.is_cuda():
         return False
 
     capability_tuple = current_platform.get_device_capability()
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index 3ab0ba9e9f5c2..97a1b0c9603bd 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -1,6 +1,6 @@
 from functools import cached_property
 from importlib.util import find_spec
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, Optional, Tuple
 
 import torch
 import torch.jit
@@ -386,16 +386,12 @@ def _multinomial(
     if not seeded_seqs:
         q.exponential_(1.0)
     else:
-        non_seeded_indices: List[int] = []
         start = 0
         for idx in range(len(q) // k):
             end = start + k
             generator = seeded_seqs.get(idx)
-            if generator is None:
-                non_seeded_indices.extend(list(range(start, end)))
-            else:
-                q[start:end].exponential_(1.0, generator=generator)
+            # Note: generator may be None for non-seeded sequences
+            q[start:end].exponential_(1.0, generator=generator)
             start = end
-        q[non_seeded_indices].exponential_(1.0)
 
     return probs.div_(q).argmax(dim=1).view(-1, num_samples)
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index c10efefea5471..c2d12c466ba45 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -11,6 +11,7 @@
 import torch.nn as nn
 
 import vllm.envs as envs
+from vllm.model_executor.layers.utils import apply_penalties
 from vllm.model_executor.sampling_metadata import (SamplingMetadata,
                                                    SamplingTensors,
                                                    SequenceGroupToSample)
@@ -258,11 +259,11 @@ def forward(
 
         # Apply presence and frequency penalties.
         if do_penalties:
-            logits = _apply_penalties(logits, sampling_tensors.prompt_tokens,
-                                      sampling_tensors.output_tokens,
-                                      sampling_tensors.presence_penalties,
-                                      sampling_tensors.frequency_penalties,
-                                      sampling_tensors.repetition_penalties)
+            logits = apply_penalties(logits, sampling_tensors.prompt_tokens,
+                                     sampling_tensors.output_tokens,
+                                     sampling_tensors.presence_penalties,
+                                     sampling_tensors.frequency_penalties,
+                                     sampling_tensors.repetition_penalties)
 
         # Use float32 to apply temperature scaling.
         # Use in-place division to avoid creating a new tensor.
@@ -336,23 +337,6 @@ def _should_modify_greedy_probs_inplace(self) -> bool:
         return self.should_modify_greedy_probs_inplace
 
 
-def _get_bin_counts_and_mask(
-    tokens: torch.Tensor,
-    vocab_size: int,
-    num_seqs: int,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    # Compute the bin counts for the tokens.
-    # vocab_size + 1 for padding.
-    bin_counts = torch.zeros((num_seqs, vocab_size + 1),
-                             dtype=torch.long,
-                             device=tokens.device)
-    bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens))
-    bin_counts = bin_counts[:, :vocab_size]
-    mask = bin_counts > 0
-
-    return bin_counts, mask
-
-
 def _apply_min_tokens_penalty(
     logits: torch.Tensor,
     sampling_metadata: SamplingMetadata,
@@ -400,29 +384,6 @@ def _apply_min_tokens_penalty(
     return logits
 
 
-def _apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
-                     output_tokens_tensor: torch.Tensor,
-                     presence_penalties: torch.Tensor,
-                     frequency_penalties: torch.Tensor,
-                     repetition_penalties: torch.Tensor) -> torch.Tensor:
-    num_seqs, vocab_size = logits.shape
-    _, prompt_mask = _get_bin_counts_and_mask(prompt_tokens_tensor, vocab_size,
-                                              num_seqs)
-    output_bin_counts, output_mask = _get_bin_counts_and_mask(
-        output_tokens_tensor, vocab_size, num_seqs)
-
-    repetition_penalties = repetition_penalties[:, None].repeat(1, vocab_size)
-    repetition_penalties[~(prompt_mask | output_mask)] = 1.0
-    logits = torch.where(logits > 0, logits / repetition_penalties,
-                         logits * repetition_penalties)
-
-    # We follow the definition in OpenAI API.
-    # Refer to https://platform.openai.com/docs/api-reference/parameter-details
-    logits -= frequency_penalties.unsqueeze_(dim=1) * output_bin_counts
-    logits -= presence_penalties.unsqueeze_(dim=1) * output_mask
-    return logits
-
-
 def _apply_top_k_top_p(
     logits: torch.Tensor,
     p: torch.Tensor,
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
new file mode 100644
index 0000000000000..f6f34cd49d953
--- /dev/null
+++ b/vllm/model_executor/layers/utils.py
@@ -0,0 +1,57 @@
+"""Utility methods for model layers."""
+from typing import Tuple
+
+import torch
+
+
+def get_token_bin_counts_and_mask(
+    tokens: torch.Tensor,
+    vocab_size: int,
+    num_seqs: int,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Compute the bin counts for the tokens.
+    # vocab_size + 1 for padding.
+    bin_counts = torch.zeros((num_seqs, vocab_size + 1),
+                             dtype=torch.long,
+                             device=tokens.device)
+    bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens))
+    bin_counts = bin_counts[:, :vocab_size]
+    mask = bin_counts > 0
+
+    return bin_counts, mask
+
+
+def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
+                    output_tokens_tensor: torch.Tensor,
+                    presence_penalties: torch.Tensor,
+                    frequency_penalties: torch.Tensor,
+                    repetition_penalties: torch.Tensor) -> torch.Tensor:
+    """
+    Applies penalties in place to the logits tensor
+    logits : The input logits tensor of shape [num_seqs, vocab_size]
+    prompt_tokens_tensor: A tensor containing the prompt tokens. The prompts 
+        are padded to the maximum prompt length within the batch using 
+        `vocab_size` as the padding value. The value `vocab_size` is used 
+        for padding because it does not correspond to any valid token ID 
+        in the vocabulary.
+    output_tokens_tensor: The output tokens tensor.
+    presence_penalties: The presence penalties of shape (num_seqs, )
+    frequency_penalties: The frequency penalties of shape (num_seqs, )
+    repetition_penalties: The repetition penalties of shape (num_seqs, )
+    """
+    num_seqs, vocab_size = logits.shape
+    _, prompt_mask = get_token_bin_counts_and_mask(prompt_tokens_tensor,
+                                                   vocab_size, num_seqs)
+    output_bin_counts, output_mask = get_token_bin_counts_and_mask(
+        output_tokens_tensor, vocab_size, num_seqs)
+    repetition_penalties = repetition_penalties.unsqueeze_(dim=1).repeat(
+        1, vocab_size)
+    logits[logits > 0] /= torch.where(prompt_mask | output_mask,
+                                      repetition_penalties, 1.0)[logits > 0]
+    logits[logits <= 0] *= torch.where(prompt_mask | output_mask,
+                                       repetition_penalties, 1.0)[logits <= 0]
+    # We follow the definition in OpenAI API.
+    # Refer to https://platform.openai.com/docs/api-reference/parameter-details
+    logits -= frequency_penalties.unsqueeze_(dim=1) * output_bin_counts
+    logits -= presence_penalties.unsqueeze_(dim=1) * output_mask
+    return logits
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index fdc4c6305bd5e..f2d9293b31a83 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -45,9 +45,11 @@
     filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
     get_gguf_extra_tensor_names, gguf_quant_weights_iterator,
     initialize_dummy_weights, np_cache_weights_iterator, pt_weights_iterator,
-    safetensors_weights_iterator)
+    runai_safetensors_weights_iterator, safetensors_weights_iterator)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
+from vllm.transformers_utils.s3_utils import glob as s3_glob
+from vllm.transformers_utils.utils import is_s3
 from vllm.utils import is_pin_memory_available
 
 
@@ -1234,6 +1236,108 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
         return model
 
 
+class RunaiModelStreamerLoader(BaseModelLoader):
+    """
+    Model loader that streams safetensors files through the Runai Model
+    Streamer, reading them from the local FS or from an S3 bucket.
+    """
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        # Integer tuning knobs are handed to the streamer through env vars;
+        # missing or non-integer values are silently ignored.
+        extra_config = load_config.model_loader_extra_config or {}
+
+        concurrency = extra_config.get("concurrency")
+        if isinstance(concurrency, int):
+            os.environ["RUNAI_STREAMER_CONCURRENCY"] = str(concurrency)
+
+        memory_limit = extra_config.get("memory_limit")
+        if isinstance(memory_limit, int):
+            os.environ["RUNAI_STREAMER_MEMORY_LIMIT"] = str(memory_limit)
+
+        # The endpoint fallback does not depend on the extra config, so it
+        # runs unconditionally. An explicit RUNAI_STREAMER_S3_ENDPOINT wins.
+        runai_streamer_s3_endpoint = os.getenv('RUNAI_STREAMER_S3_ENDPOINT')
+        aws_endpoint_url = os.getenv('AWS_ENDPOINT_URL')
+        if (runai_streamer_s3_endpoint is None
+                and aws_endpoint_url is not None):
+            os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url
+
+    def _prepare_weights(self, model_name_or_path: str,
+                         revision: Optional[str]) -> List[str]:
+        """Return the safetensors files that make up the model.
+
+        Local dirs and S3 URIs are globbed in place; others come from HF."""
+        is_s3_path = is_s3(model_name_or_path)
+        is_local = os.path.isdir(model_name_or_path)
+        safetensors_pattern = "*.safetensors"
+        index_file = SAFE_WEIGHTS_INDEX_NAME
+
+        hf_folder = (model_name_or_path if
+                     (is_local or is_s3_path) else download_weights_from_hf(
+                         model_name_or_path,
+                         self.load_config.download_dir,
+                         [safetensors_pattern],
+                         revision,
+                         ignore_patterns=self.load_config.ignore_patterns,
+                     ))
+
+        if is_s3_path:
+            hf_weights_files = s3_glob(path=hf_folder,
+                                       allow_pattern=[safetensors_pattern])
+        else:
+            hf_weights_files = glob.glob(
+                os.path.join(hf_folder, safetensors_pattern))
+
+        if not is_local and not is_s3_path:
+            download_safetensors_index_file_from_hf(
+                model_name_or_path, index_file, self.load_config.download_dir,
+                revision)
+
+        if not hf_weights_files:
+            raise RuntimeError(
+                f"Cannot find any safetensors model weights with "
+                f"`{model_name_or_path}`")
+
+        return hf_weights_files
+
+    def _get_weights_iterator(
+        self, model_or_path: str, revision: Optional[str]
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        """Stream (name, tensor) pairs from the resolved safetensors files."""
+        hf_weights_files = self._prepare_weights(model_or_path, revision)
+        return runai_safetensors_weights_iterator(hf_weights_files)
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        """Download model if necessary"""
+        self._prepare_weights(model_config.model, model_config.revision)
+
+    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
+        """Perform streaming of the model to destination"""
+        device_config = vllm_config.device_config
+        model_config = vllm_config.model_config
+
+        target_device = torch.device(device_config.device)
+        with set_default_torch_dtype(model_config.dtype):
+            with target_device:
+                model = _initialize_model(vllm_config=vllm_config)
+
+            model_weights = model_config.model
+            if hasattr(model_config, "model_weights"):
+                model_weights = model_config.model_weights
+            model.load_weights(
+                self._get_weights_iterator(model_weights,
+                                           model_config.revision))
+
+            for _, module in model.named_modules():
+                quant_method = getattr(module, "quant_method", None)
+                if quant_method is not None:
+                    with device_loading_context(module, target_device):
+                        quant_method.process_weights_after_loading(module)
+        return model.eval()
+
+
 def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
     """Get a model loader based on the load format."""
 
@@ -1255,4 +1359,7 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
     if load_config.load_format == LoadFormat.GGUF:
         return GGUFModelLoader(load_config)
 
+    if load_config.load_format == LoadFormat.RUNAI_STREAMER:
+        return RunaiModelStreamerLoader(load_config)
+
     return DefaultModelLoader(load_config)
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 87f3fcb5cae00..8b929f299c8d8 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -19,9 +19,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
-from vllm.utils import FlexibleArgumentParser
-
-tensorizer_error_msg = None
+from vllm.utils import FlexibleArgumentParser, PlaceholderModule
 
 try:
     from tensorizer import (DecryptionParams, EncryptionParams,
@@ -34,8 +32,19 @@
         open_stream,
         mode=mode,
     ) for mode in ("rb", "wb+"))
-except ImportError as e:
-    tensorizer_error_msg = str(e)
+except ImportError:
+    tensorizer = PlaceholderModule("tensorizer")
+    DecryptionParams = tensorizer.placeholder_attr("DecryptionParams")
+    EncryptionParams = tensorizer.placeholder_attr("EncryptionParams")
+    TensorDeserializer = tensorizer.placeholder_attr("TensorDeserializer")
+    TensorSerializer = tensorizer.placeholder_attr("TensorSerializer")
+    open_stream = tensorizer.placeholder_attr("stream_io.open_stream")
+    convert_bytes = tensorizer.placeholder_attr("utils.convert_bytes")
+    get_mem_usage = tensorizer.placeholder_attr("utils.get_mem_usage")
+    no_init_or_tensor = tensorizer.placeholder_attr("utils.no_init_or_tensor")
+
+    _read_stream = tensorizer.placeholder_attr("_read_stream")
+    _write_stream = tensorizer.placeholder_attr("_write_stream")
 
 __all__ = [
     'EncryptionParams', 'DecryptionParams', 'TensorDeserializer',
@@ -267,12 +276,6 @@ class TensorizerAgent:
     """
 
     def __init__(self, tensorizer_config: TensorizerConfig, vllm_config):
-        if tensorizer_error_msg is not None:
-            raise ImportError(
-                "Tensorizer is not installed. Please install tensorizer "
-                "to use this feature with `pip install vllm[tensorizer]`. "
-                "Error message: {}".format(tensorizer_error_msg))
-
         self.tensorizer_config = tensorizer_config
         self.tensorizer_args = (
             self.tensorizer_config._construct_tensorizer_args())
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index f15e7176b3d50..44978a55e072d 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -7,7 +7,9 @@
 
 from vllm.config import ModelConfig
 from vllm.model_executor.models import ModelRegistry
-from vllm.model_executor.models.adapters import as_embedding_model
+from vllm.model_executor.models.adapters import (as_classification_model,
+                                                 as_embedding_model,
+                                                 as_reward_model)
 
 
 @contextlib.contextmanager
@@ -35,8 +37,12 @@ def get_model_architecture(
         architectures = ["QuantMixtralForCausalLM"]
 
     model_cls, arch = ModelRegistry.resolve_model_cls(architectures)
-    if model_config.runner_type == "pooling":
+    if model_config.task == "embed":
         model_cls = as_embedding_model(model_cls)
+    elif model_config.task == "classify":
+        model_cls = as_classification_model(model_cls)
+    elif model_config.task == "reward":
+        model_cls = as_reward_model(model_cls)
 
     return model_cls, arch
 
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 9488d54edf365..8aa0c98df70d2 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -25,7 +25,15 @@
                                                      get_quantization_config)
 from vllm.model_executor.layers.quantization.schema import QuantParamSchema
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once
+from vllm.utils import PlaceholderModule, print_warning_once
+
+try:
+    from runai_model_streamer import SafetensorsStreamer
+except ImportError:
+    runai_model_streamer = PlaceholderModule(
+        "runai_model_streamer")  # type: ignore[assignment]
+    SafetensorsStreamer = runai_model_streamer.placeholder_attr(
+        "SafetensorsStreamer")
 
 logger = init_logger(__name__)
 
@@ -410,6 +418,23 @@ def safetensors_weights_iterator(
                 yield name, param
 
 
+def runai_safetensors_weights_iterator(
+    hf_weights_files: List[str]
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Yield the tensors of each safetensors file via SafetensorsStreamer."""
+    # Only rank 0 (or a non-distributed run) shows the progress bar.
+    hide_progress = (torch.distributed.is_initialized()
+                     and torch.distributed.get_rank() != 0)
+    with SafetensorsStreamer() as streamer:
+        progress = tqdm(hf_weights_files,
+                        desc="Loading safetensors using Runai Model Streamer",
+                        disable=hide_progress,
+                        bar_format=_BAR_FORMAT)
+        for weights_path in progress:
+            streamer.stream_file(weights_path)
+            yield from streamer.get_tensors()
+
+
 def pt_weights_iterator(
     hf_weights_files: List[str]
 ) -> Generator[Tuple[str, torch.Tensor], None, None]:
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 9cc43ae9181b9..55e90b9d41950 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -1,29 +1,48 @@
 from collections.abc import Iterable
-from typing import Any, TypeVar
+from typing import TYPE_CHECKING, Any, Optional, TypeVar
 
 import torch
 import torch.nn as nn
 
 from .interfaces_base import VllmModelForPooling, is_pooling_model
 
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.pooler import PoolingType
+
 _T = TypeVar("_T", bound=type[nn.Module])
 
+_GENERATE_SUFFIXES = [
+    "ForCausalLM",
+    "ForConditionalGeneration",
+    "ChatModel",
+    "LMHeadModel",
+]
 
-def as_embedding_model(cls: _T) -> _T:
-    """Subclass an existing vLLM model to support embeddings."""
-    # Avoid modifying existing embedding models
-    if is_pooling_model(cls):
-        return cls
 
+def _get_pooling_model_name(orig_model_name: str, pooling_suffix: str) -> str:
+    """Replace the generative suffix(es) of a name with `pooling_suffix`."""
+    stem = orig_model_name
+    # Sequential stripping: every listed suffix is tried once, in order.
+    for suffix in _GENERATE_SUFFIXES:
+        stem = stem.removesuffix(suffix)
+    return stem + pooling_suffix
+
+
+def _create_pooling_model_cls(
+    orig_cls: _T,
+    *,
+    default_pooling_type: "PoolingType",
+    default_normalize: bool,
+    default_softmax: bool,
+) -> _T:
     # Lazy import
     from vllm.config import VllmConfig
-    from vllm.model_executor.layers.pooler import (Pooler, PoolerOutput,
-                                                   PoolingType)
+    from vllm.model_executor.layers.pooler import Pooler, PoolerOutput
     from vllm.model_executor.pooling_metadata import PoolingMetadata
 
     from .utils import AutoWeightsLoader, WeightsMapper
 
-    class ModelForEmbedding(cls, VllmModelForPooling):
+    class ModelForPooling(orig_cls, VllmModelForPooling):
 
         def __init__(
             self,
@@ -34,7 +53,7 @@ def __init__(
         ) -> None:
             super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs)
 
-            # These are not used in embedding models
+            # These are not used in pooling models
             for attr in ("lm_head", "logits_processor"):
                 if hasattr(self, attr):
                     delattr(self, attr)
@@ -46,9 +65,9 @@ def __init__(
             if not getattr(self, "_pooler", None):
                 self._pooler = Pooler.from_config_with_defaults(
                     pooler_config,
-                    pooling_type=PoolingType.LAST,
-                    normalize=True,
-                    softmax=False,
+                    pooling_type=default_pooling_type,
+                    normalize=default_normalize,
+                    softmax=default_softmax,
                 )
 
         def pooler(
@@ -82,17 +101,148 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
                     return
 
             # For most other models
-            if hasattr(cls, "load_weights"):
-                cls.load_weights(self, weights)  # type: ignore
+            if hasattr(orig_cls, "load_weights"):
+                orig_cls.load_weights(self, weights)  # type: ignore
             # Fallback
             else:
                 loader = AutoWeightsLoader(self)
                 loader.load_weights(weights)
 
-    ModelForEmbedding.__name__ = cls.__name__ \
-        .removesuffix("ForCausalLM") \
-        .removesuffix("ForConditionalGeneration") \
-        .removesuffix("ChatModel") \
-        .removesuffix("LMHeadModel") + "ForEmbedding"
+    return ModelForPooling  # type: ignore
+
+
+def as_embedding_model(cls: _T) -> _T:
+    """
+    Subclass an existing vLLM model to support embeddings.
+
+    By default, the embeddings of the whole prompt are extracted from the
+    normalized hidden state corresponding to the last token.
+
+    Note:
+        We assume that no extra layers are added to the original model;
+        please implement your own model if this is not the case.
+    """
+    # Classes that already are pooling models are returned unchanged
+    if is_pooling_model(cls):
+        return cls
+
+    # Lazy import
+    from vllm.model_executor.layers.pooler import PoolingType
+
+    ModelForEmbedding = _create_pooling_model_cls(
+        cls,
+        default_pooling_type=PoolingType.LAST,
+        default_normalize=True,
+        default_softmax=False,
+    )
+    ModelForEmbedding.__name__ = \
+        _get_pooling_model_name(cls.__name__, "ForEmbedding")
 
     return ModelForEmbedding  # type: ignore
+
+
+def as_classification_model(cls: _T) -> _T:
+    """
+    Subclass an existing vLLM model to support classification.
+
+    By default, the class probabilities are extracted from the softmaxed
+    hidden state corresponding to the last token.
+
+    Note:
+        We assume that the classification head is a single linear layer
+        stored as the attribute `score` of the top-level model;
+        please implement your own model if this is not the case.
+    """
+    # Classes that already are pooling models are returned unchanged
+    if is_pooling_model(cls):
+        return cls
+
+    # Lazy import
+    from vllm.attention import AttentionMetadata
+    from vllm.config import VllmConfig
+    from vllm.model_executor.layers.linear import RowParallelLinear
+    from vllm.model_executor.layers.pooler import PoolingType
+    from vllm.sequence import IntermediateTensors
+
+    from .utils import maybe_prefix
+
+    ModelForPooling = _create_pooling_model_cls(
+        cls,
+        default_pooling_type=PoolingType.LAST,
+        default_normalize=False,
+        default_softmax=True,
+    )
+
+    class ModelForClassification(ModelForPooling):
+
+        def __init__(
+            self,
+            *,
+            vllm_config: "VllmConfig",
+            prefix: str = "",
+            **kwargs: Any,
+        ) -> None:
+            super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs)
+
+            config = vllm_config.model_config.hf_config
+            quant_config = vllm_config.quant_config
+            # Head that maps hidden states to `num_labels` logits
+            self.score = RowParallelLinear(config.hidden_size,
+                                           config.num_labels,
+                                           quant_config=quant_config,
+                                           input_is_parallel=False,
+                                           bias=False,
+                                           prefix=maybe_prefix(
+                                               prefix, "score"))
+
+        def forward(
+            self,
+            input_ids: torch.Tensor,
+            positions: torch.Tensor,
+            kv_caches: list[torch.Tensor],
+            attn_metadata: AttentionMetadata,
+            intermediate_tensors: Optional[IntermediateTensors] = None,
+            inputs_embeds: Optional[torch.Tensor] = None,
+        ) -> torch.Tensor:
+            hidden_states = super().forward(input_ids, positions, kv_caches,
+                                            attn_metadata,
+                                            intermediate_tensors,
+                                            inputs_embeds)
+            logits, _ = self.score(hidden_states)
+            return logits
+
+    # Name the subclass after the original class (`...ForClassification`)
+    ModelForClassification.__name__ = \
+        _get_pooling_model_name(cls.__name__, "ForClassification")
+
+    return ModelForClassification  # type: ignore
+
+
+def as_reward_model(cls: _T) -> _T:
+    """
+    Derive a reward-model variant of an existing vLLM model class.
+
+    By default, the hidden states of all tokens are returned unchanged.
+
+    Note:
+        No extra layers are added on top of the original model;
+        please implement your own model if you need any.
+    """
+    # Classes that already are pooling models are returned as they are
+    if is_pooling_model(cls):
+        return cls
+
+    # Lazy import
+    from vllm.model_executor.layers.pooler import PoolingType
+
+    pooling_defaults = dict(
+        default_pooling_type=PoolingType.ALL,
+        default_normalize=False,
+        default_softmax=False,
+    )
+    reward_cls = _create_pooling_model_cls(cls, **pooling_defaults)
+
+    # Name the subclass after the original class (`...ForReward`)
+    reward_cls.__name__ = _get_pooling_model_name(cls.__name__, "ForReward")
+
+    return reward_cls  # type: ignore
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index dd4b0c75cb84d..9437ad9688422 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -521,6 +521,15 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     This model combines a vision tower, a multi-modal projector, and a language
     model to perform tasks that involve both image and text inputs.
     """
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "language_model.model": "language_model",
+            "language_model.lm_head": "lm_head",
+        },
+        orig_to_new_suffix={
+            "router.weight": "router_weight",
+        },
+    )
 
     def __init__(
         self,
@@ -662,15 +671,6 @@ def sample(
         return next_tokens
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        hf_to_vllm_mapper = WeightsMapper(
-            orig_to_new_prefix={
-                "language_model.model": "language_model",
-                "language_model.lm_head": "lm_head",
-            },
-            orig_to_new_suffix={
-                "router.weight": "router_weight",
-            },
-        )
 
         loader = AutoWeightsLoader(self)
-        loader.load_weights(weights, mapper=hf_to_vllm_mapper)
+        loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index 053d838432885..c1d47b1bc9bcd 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -409,6 +409,7 @@ class BertEmbeddingModel(nn.Module):
        model: An instance of BertModel used for forward operations.
        _pooler: An instance of Pooler used for pooling operations.
    """
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -441,8 +442,7 @@ def pooler(
         return self._pooler(hidden_states, pooling_metadata)
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
-        weights = hf_to_vllm_mapper.apply(weights)
+        weights = self.hf_to_vllm_mapper.apply(weights)
         weights = ((name, data) for name, data in weights
                    if not name.startswith("lm_head."))
         self.model.load_weights(weights)
diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py
new file mode 100644
index 0000000000000..333dc019b4d99
--- /dev/null
+++ b/vllm/model_executor/models/deepseek_v3.py
@@ -0,0 +1,650 @@
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only DeepseekV3 model."""
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.attention import Attention, AttentionMetadata
+from vllm.config import CacheConfig, VllmConfig
+from vllm.distributed import (get_pp_group,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_reduce)
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               MergedColumnParallelLinear,
+                                               ReplicatedLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import SupportsPP
+from .utils import (PPMissingLayer, is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
+
+
+class DeepseekV3MLP(nn.Module):
+    """Gated MLP: merged gate/up projection, SiLU-and-mul, down projection."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        # A single merged layer yields both the gate and the up projection.
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size, [intermediate_size, intermediate_size],
+            bias=False, quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj")
+        self.down_proj = RowParallelLinear(
+            intermediate_size, hidden_size,
+            bias=False, quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj")
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. "
+                             "Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        """Apply gate/up projection, activation and down projection to x."""
+        gate_up, _ = self.gate_up_proj(x)
+        output, _ = self.down_proj(self.act_fn(gate_up))
+        return output
+
+
+class DeepseekV3MoE(nn.Module):
+    """MoE block: routed experts plus optional shared experts."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.n_shared_experts = config.n_shared_experts
+        if self.tp_size > config.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.n_routed_experts}.")
+        if config.hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
+                             "Only silu is supported for now.")
+
+        self.gate = ReplicatedLinear(config.hidden_size,
+                                     config.n_routed_experts,
+                                     bias=False,
+                                     quant_config=None,
+                                     prefix=f"{prefix}.gate")
+        if config.topk_method == "noaux_tc":
+            self.gate.e_score_correction_bias = nn.Parameter(
+                torch.empty(config.n_routed_experts))
+        else:
+            self.gate.e_score_correction_bias = None
+
+        self.experts = FusedMoE(
+            num_experts=config.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            topk_group=config.topk_group,
+            prefix=f"{prefix}.experts",
+            scoring_func=config.scoring_func,
+            e_score_correction_bias=self.gate.e_score_correction_bias)
+
+        if config.n_shared_experts is not None:
+            intermediate_size = (config.moe_intermediate_size *
+                                 config.n_shared_experts)
+            self.shared_experts = DeepseekV3MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_experts",
+            )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Run routed experts, adding shared experts when configured."""
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        shared_output = (self.shared_experts(hidden_states)
+                         if self.n_shared_experts is not None else None)
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            router_logits=router_logits) * self.routed_scaling_factor
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
+    """Return 1.0 for `scale` <= 1 and `0.1 * mscale * ln(scale) + 1.0`
+    for larger values."""
+    import math
+    return 1.0 if scale <= 1 else 0.1 * mscale * math.log(scale) + 1.0
+
+
+class DeepseekV3Attention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int,
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.num_heads = num_heads
+        tp_size = get_tensor_model_parallel_world_size()
+        assert num_heads % tp_size == 0
+        self.num_local_heads = num_heads // tp_size
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        if self.q_lora_rank is not None:
+            self.q_a_proj = ReplicatedLinear(self.hidden_size,
+                                             self.q_lora_rank,
+                                             bias=False,
+                                             quant_config=quant_config,
+                                             prefix=f"{prefix}.q_a_proj")
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
+                                         eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(q_lora_rank,
+                                                 self.num_heads *
+                                                 self.qk_head_dim,
+                                                 bias=False,
+                                                 quant_config=quant_config,
+                                                 prefix=f"{prefix}.q_b_proj")
+        else:
+            self.q_proj = ColumnParallelLinear(self.hidden_size,
+                                               self.num_heads *
+                                               self.qk_head_dim,
+                                               bias=False,
+                                               quant_config=quant_config,
+                                               prefix=f"{prefix}.q_proj")
+
+        self.kv_a_proj_with_mqa = ReplicatedLinear(
+            self.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_a_proj_with_mqa")
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
+                                      eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_b_proj")
+        # O projection.
+        self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
+                                        self.hidden_size,
+                                        bias=False,
+                                        quant_config=quant_config,
+                                        prefix=f"{prefix}.o_proj")
+        rope_scaling["rope_type"] = 'deepseek_yarn'
+        self.rotary_emb = get_rope(qk_rope_head_dim,
+                                   rotary_dim=qk_rope_head_dim,
+                                   max_position=max_position_embeddings,
+                                   base=rope_theta,
+                                   rope_scaling=rope_scaling,
+                                   is_neox_style=False)
+
+        if rope_scaling:
+            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
+            scaling_factor = rope_scaling["factor"]
+            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+            self.scaling = self.scaling * mscale * mscale
+
+        # self.attn = Attention(self.num_heads,
+        #                       self.qk_head_dim,
+        #                       self.scaling,
+        #                       num_kv_heads=self.num_heads)
+
+        # TODO, support head_size 192
+        self.attn = Attention(self.num_local_heads,
+                              256,
+                              self.scaling,
+                              num_kv_heads=self.num_local_heads,
+                              cache_config=cache_config,
+                              quant_config=quant_config,
+                              prefix=f"{prefix}.attn")
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        """Build q/k/v, apply rotary embedding, attend, then run o_proj.
+
+        Each head is zero-padded to width 256 for `self.attn`; the output
+        is sliced back to `v_head_dim` before the output projection.
+        """
+        heads, padded = self.num_local_heads, 256
+        nope_dim, rope_dim = self.qk_nope_head_dim, self.qk_rope_head_dim
+        pad = torch.nn.functional.pad
+        # Queries: low-rank path (a_proj, norm, b_proj) or a direct q_proj.
+        if self.q_lora_rank is None:
+            q = self.q_proj(hidden_states)[0]
+        else:
+            q = self.q_a_layernorm(self.q_a_proj(hidden_states)[0])
+            q = self.q_b_proj(q)[0]
+        q = q.view(-1, heads, self.qk_head_dim)
+        _, q_pe = q.split([nope_dim, rope_dim], dim=-1)
+        # Keys/values: the first kv_lora_rank columns are normalised and
+        # expanded by kv_b_proj; the trailing columns feed the rotary part.
+        latent = self.kv_a_proj_with_mqa(hidden_states)[0]
+        kv_a, _ = latent.split([self.kv_lora_rank, rope_dim], dim=-1)
+        kv = self.kv_b_proj(self.kv_a_layernorm(kv_a.contiguous()))[0]
+        kv = kv.view(-1, heads, nope_dim + self.v_head_dim)
+        k_nope, v = kv.split([nope_dim, self.v_head_dim], dim=-1)
+        k_pe = latent.unsqueeze(1)[:, :, self.kv_lora_rank:]
+        q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+        # Write the rotated part back into q and assemble k with q's shape.
+        q[..., nope_dim:] = q_pe
+        k = torch.empty_like(q)
+        k[..., :nope_dim] = k_nope
+        k[..., nope_dim:] = k_pe
+        # Zero-pad the last dim to `padded` and flatten the head axis.
+        qk_pad = [0, padded - self.qk_head_dim]
+        q = pad(q, qk_pad, value=0).view(-1, heads * padded)
+        k = pad(k, qk_pad, value=0).view(-1, heads * padded)
+        v = pad(v, [0, padded - self.v_head_dim], value=0)
+        v = v.view(-1, heads * padded)
+        attn_out = self.attn(q, k, v, kv_cache, attn_metadata)
+        attn_out = attn_out.view(-1, heads, padded)[..., :self.v_head_dim]
+        output, _ = self.o_proj(attn_out.reshape(-1, heads * self.v_head_dim))
+        return output
+
+
+class DeepseekV3DecoderLayer(nn.Module):
+    """One decoder layer: self-attention plus a dense or MoE MLP."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        prefix: str,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        """Build the sub-modules of the layer named by `prefix`.
+
+        `prefix` must end in the integer layer index (".<idx>"); that index
+        decides whether the MLP is dense or a mixture of experts.
+        """
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        # `make_layers` passes a prefix that ends with the layer's index.
+        layer_idx = int(prefix.split(sep='.')[-1])
+        self.self_attn = DeepseekV3Attention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            qk_nope_head_dim=config.qk_nope_head_dim,
+            qk_rope_head_dim=config.qk_rope_head_dim,
+            v_head_dim=config.v_head_dim,
+            q_lora_rank=getattr(config, "q_lora_rank", None),
+            kv_lora_rank=config.kv_lora_rank,
+            rope_theta=getattr(config, "rope_theta", 10000),
+            rope_scaling=getattr(config, "rope_scaling", None),
+            max_position_embeddings=getattr(config,
+                                            "max_position_embeddings", 8192),
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+        # MoE only for layers at or past `first_k_dense_replace` whose index
+        # is a multiple of `moe_layer_freq`; every other layer is dense.
+        use_moe = (config.n_routed_experts is not None
+                   and layer_idx >= config.first_k_dense_replace
+                   and layer_idx % config.moe_layer_freq == 0)
+        if use_moe:
+            self.mlp = DeepseekV3MoE(config=config,
+                                     quant_config=quant_config,
+                                     prefix=f"{prefix}.mlp")
+        else:
+            self.mlp = DeepseekV3MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp")
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Run attention and the MLP; return `(hidden_states, residual)`."""
+        # Self Attention. With no incoming residual the raw input becomes the
+        # residual; otherwise the norm is called with both tensors.
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+        hidden_states = self.self_attn(positions=positions,
+                                       hidden_states=hidden_states,
+                                       kv_cache=kv_cache,
+                                       attn_metadata=attn_metadata)
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
+        return self.mlp(hidden_states), residual
+
+
+# TODO(simon): check whether we support torch compile for Deepseek V3
+# @support_torch_compile
+class DeepseekV3Model(nn.Module):
+    """Embedding, decoder-layer stack and final norm for one PP rank."""
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        hf_config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        self.padding_idx = hf_config.pad_token_id
+        self.vocab_size = hf_config.vocab_size
+
+        # Token embedding is only instantiated on the first rank of the group.
+        self.embed_tokens = (VocabParallelEmbedding(hf_config.vocab_size,
+                                                    hf_config.hidden_size)
+                             if get_pp_group().is_first_rank else
+                             PPMissingLayer())
+
+        def build_layer(prefix: str) -> DeepseekV3DecoderLayer:
+            return DeepseekV3DecoderLayer(hf_config, prefix,
+                                          cache_config=cache_config,
+                                          quant_config=quant_config)
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            hf_config.num_hidden_layers,
+            build_layer,
+            prefix=f"{prefix}.layers")
+
+        # The final norm is only instantiated on the last rank of the group.
+        self.norm = (RMSNorm(hf_config.hidden_size, eps=hf_config.rms_norm_eps)
+                     if get_pp_group().is_last_rank else PPMissingLayer())
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], hf_config.hidden_size))
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        """Embed `input_ids` with `embed_tokens`."""
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors],
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        """Run this rank's layers.
+
+        Returns normalised hidden states on the last rank and an
+        `IntermediateTensors` hand-off on every other rank.
+        """
+        if not get_pp_group().is_first_rank:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+        else:
+            residual = None
+            hidden_states = (inputs_embeds if inputs_embeds is not None else
+                             self.get_input_embeddings(input_ids))
+
+        # kv_caches is indexed relative to this rank's first layer.
+        for cache_idx, layer_idx in enumerate(
+                range(self.start_layer, self.end_layer)):
+            hidden_states, residual = self.layers[layer_idx](
+                positions, hidden_states, kv_caches[cache_idx], attn_metadata,
+                residual)
+
+        if get_pp_group().is_last_rank:
+            hidden_states, _ = self.norm(hidden_states, residual)
+            return hidden_states
+        return IntermediateTensors({
+            "hidden_states": hidden_states,
+            "residual": residual
+        })
+
+
+class DeepseekV3ForCausalLM(nn.Module, SupportsPP):
+    """DeepseekV3 decoder with an LM head, logits processor and sampler."""
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+        self.model = DeepseekV3Model(vllm_config=vllm_config,
+                                     prefix=maybe_prefix(prefix, "model"))
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = get_sampler()
+        # NOTE: this instance attribute shadows the method of the same name
+        # below, so instance lookups resolve to the inner model's callable.
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        """Delegate token embedding to the inner model."""
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        """Run the inner model and return its output unchanged."""
+        return self.model(input_ids, positions, kv_caches, attn_metadata,
+                          intermediate_tensors, inputs_embeds)
+
+    def compute_logits(
+            self, hidden_states: torch.Tensor,
+            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
+        """Turn hidden states into logits via `lm_head`."""
+        return self.logits_processor(self.lm_head, hidden_states,
+                                     sampling_metadata)
+
+    def sample(
+            self, logits: Optional[torch.Tensor],
+            sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]:
+        """Sample next tokens from `logits`."""
+        return self.sampler(logits, sampling_metadata)
+
+    def make_empty_intermediate_tensors(
+            self, batch_size: int, dtype: torch.dtype,
+            device: torch.device) -> IntermediateTensors:
+        """Return zeroed `hidden_states` and `residual` placeholders."""
+        shape = (batch_size, self.config.hidden_size)
+        return IntermediateTensors({
+            "hidden_states": torch.zeros(shape, dtype=dtype, device=device),
+            "residual": torch.zeros(shape, dtype=dtype, device=device),
+        })
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        """Load checkpoint tensors and return the names that were handled.
+
+        Each weight is tried as a stacked gate/up shard, then as a routed
+        expert weight, then as a plain parameter. A weight that maps to no
+        parameter raises KeyError.
+        """
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts)
+
+        # TODO(simon): support nextn predict layers
+        # Until then weights under `model.layers.<num_hidden_layers>` are
+        # skipped. The prefix is loop-invariant, so it is computed once.
+        nextn_prefix: Optional[str] = None
+        if self.config.num_nextn_predict_layers > 0:
+            assert self.config.num_nextn_predict_layers == 1
+            nextn_prefix = f"model.layers.{self.config.num_hidden_layers}"
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if nextn_prefix is not None and name.startswith(nextn_prefix):
+                continue
+
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # Expert weights (e.g. mlp.experts[0].gate_proj) are handled by
+                # expert_params_mapping below. Skip them BEFORE renaming, or
+                # the name would be rewritten twice (gate_gate_up_proj) and
+                # the load would break.
+                if (("mlp.experts." in name) and name not in params_dict):
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param, loaded_weight, name,
+                                  shard_id=shard_id, expert_id=expert_id)
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    if name not in params_dict:
+                        raise KeyError(
+                            f"Checkpoint weight {name!r} has no matching "
+                            "parameter in DeepseekV3ForCausalLM")
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 4664aa53ea092..f4530e4771960 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -31,11 +31,14 @@
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    get_compressed_tensors_cache_scale)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
@@ -326,6 +329,15 @@ def load_weights(self, weights: Iterable[Tuple[str,
         params_dict = dict(self.named_parameters())
         loaded_params: Set[str] = set()
         for name, loaded_weight in weights:
+            if scale_name := get_compressed_tensors_cache_scale(name):
+                # Loading kv cache scales for compressed-tensors quantization
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                loaded_weight = loaded_weight[0]
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
             for (param_name, shard_name, shard_id) in stacked_params_mapping:
                 if shard_name not in name:
                     continue
@@ -343,6 +355,10 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
                 if is_pp_missing_parameter(name, self):
                     continue
                 param = params_dict[name]
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 831db2ae52d74..91786db5ddc96 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -17,6 +17,7 @@
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer
+from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -24,8 +25,9 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
+from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import IntermediateTensors
+from vllm.sequence import IntermediateTensors, PoolerOutput
 from vllm.utils import LayerBlockType
 
 from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP
@@ -593,3 +595,35 @@ def _is_moe_layer(name: str):
             "experts",
             "router",
         ]])
+
+
+class JambaForSequenceClassification(JambaForCausalLM):
+    """Jamba with a linear `score` head whose logits are pooled."""
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        model_config = vllm_config.model_config
+        hf_config = model_config.hf_config
+        n_labels: int = hf_config.num_labels
+        # The head has a bias only if the config sets `score_bias`.
+        use_bias: bool = getattr(hf_config, 'score_bias', False)
+        self.score = nn.Linear(hf_config.hidden_size, n_labels, bias=use_bias)
+
+        # Defaults handed to the pooler factory: `PoolingType.LAST`,
+        # no normalisation and no softmax.
+        self._pooler = Pooler.from_config_with_defaults(
+            model_config.pooler_config, pooling_type=PoolingType.LAST,
+            normalize=False, softmax=False)
+
+    def pooler(self, hidden_states: torch.Tensor,
+               pooling_metadata: PoolingMetadata) -> Optional[PoolerOutput]:
+        """Score float32 hidden states, then pool the resulting logits."""
+        scores = self.score(hidden_states.float())
+        return self._pooler(scores, pooling_metadata)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load weights via the parent class, then cast `score` to float32."""
+        # TODO: The reward weights themselves have float32 accuracy data, we
+        # would like to load them in fp32 to get that extra precision.
+        super().load_weights(weights)
+        self.score = self.score.float()
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index a2e404cf43238..0662d90e79b92 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -133,8 +133,8 @@ def preprocess(__self, *args, **kwargs):
         hf_processor.__is_patched__ = True  # type: ignore
 
     def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]:
-        hf_processor = self.ctx.get_hf_processor()
-        assert isinstance(hf_processor, (LlavaProcessor, PixtralProcessor))
+        hf_processor = self.ctx.get_hf_processor(
+            (LlavaProcessor, PixtralProcessor))
 
         if isinstance(hf_processor, PixtralProcessor):
             self._patch_pixtral_processor(hf_processor)
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index a328b5a2aeea7..8938f62d0c494 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -464,24 +464,27 @@ def forward(
 class MolmoMLP(nn.Module):
     """Molmo's LLM mlp."""
 
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        input_dim: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-    ) -> None:
+    def __init__(self,
+                 config: PretrainedConfig,
+                 input_dim: Optional[int] = None,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 proj_name: str = "gate_up_proj") -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size // 2
 
-        # Feed-forward input projection.
-        self.gate_up_proj = MergedColumnParallelLinear(
-            input_dim or self.hidden_size,
-            [self.intermediate_size] * 2,
-            bias=False,
-            quant_config=quant_config,
-        )
-
+        # Molmo's LLM proj weights are already merged into the disk, while
+        # image_projector proj is separate. If the same proj_name were used, it
+        # would create ambiguity and make it difficult to support BNB and LoRA.
+        self.proj_name = proj_name
+        setattr(
+            self, proj_name,
+            MergedColumnParallelLinear(
+                input_dim or self.hidden_size,
+                [self.intermediate_size] * 2,
+                bias=False,
+                quant_config=quant_config,
+            ))
         # Activation function.
         self.act_fn = SiluAndMul()
 
@@ -497,7 +500,7 @@ def forward(
         self,
         x: torch.Tensor,
     ) -> torch.Tensor:
-        gate_up, _ = self.gate_up_proj(x)
+        gate_up, _ = getattr(self, self.proj_name)(x)
         x = self.act_fn(gate_up)
         x, _ = self.down_proj(x)
         return x
@@ -520,7 +523,9 @@ def __init__(
                                         prefix=f"{prefix}.self_attn")
 
         # MLP block.
-        self.mlp = MolmoMLP(config, quant_config=quant_config)
+        self.mlp = MolmoMLP(config,
+                            quant_config=quant_config,
+                            proj_name="gate_up_proj")
 
         # LayerNorm
         assert config.layer_norm_type == "rms"
@@ -616,6 +621,7 @@ def __init__(
             config,
             input_dim=vision_config.image_emb_dim,
             quant_config=quant_config,
+            proj_name="merged_linear",
         )
 
         image_dim = vision_config.image_emb_dim * len(self.vit_layers)
@@ -714,8 +720,8 @@ def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
+            ("merged_linear", "gate_proj", 0),
+            ("merged_linear", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
         loaded_params: Set[str] = set()
@@ -928,7 +934,11 @@ def image_input_mapper_for_molmo(
     data: object,
 ):
     if isinstance(data, list):
+        assert len(data) == 1, "Molmo supports only one image per prompt."
         data = data[0]
+
+    # Remove unused dummy PIL image
+    data.pop('raw_mm_data', None)
     return MultiModalKwargs(data)
 
 
@@ -974,6 +984,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
     dummy_imgdata = {
         "images": out["images"],
         "image_input_idx": out["image_input_idx"],
+        "raw_mm_data": dummy_image,
     }
     if "image_masks" in out:
         dummy_imgdata["image_masks"] = out["image_masks"]
@@ -1118,6 +1129,34 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
 @INPUT_REGISTRY.register_input_processor(input_processor_for_molmo)
 class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            # vision backbone mapping
+            "image_projector.w1.": "image_projector.gate_proj.",
+            "image_projector.w3.": "image_projector.up_proj.",
+            "image_projector.w2.": "image_projector.down_proj.",
+            # language backbone mapping
+            "att_proj": "self_attn.qkv_proj",
+            "attn_out": "self_attn.o_proj",
+            "q_norm": "self_attn.q_norm",
+            "k_norm": "self_attn.k_norm",
+            "ff_proj": "mlp.gate_up_proj",
+            "ff_out": "mlp.down_proj",
+            "attn_norm": "input_layernorm",
+            "ff_norm": "post_attention_layernorm",
+        },
+        orig_to_new_prefix={
+            # vision backbone mapping
+            "model.vision_backbone.": "vision_backbone.",
+            # language backbone mapping
+            "model.transformer.blocks.": "model.layers.",
+            "model.transformer.ln_f.": "model.norm.",
+            # lm_head is renamed to model.transformer.mlp.down_proj firstly,
+            # we need to run a second renaming for it
+            "model.transformer.mlp.down_proj.": "lm_head.",
+        },
+    )
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -1293,36 +1332,10 @@ def sample(
         return next_tokens
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        hf_to_vllm_mapper = WeightsMapper(
-            orig_to_new_substr={
-                # vision backbone mapping
-                "image_projector.w1.": "image_projector.gate_proj.",
-                "image_projector.w3.": "image_projector.up_proj.",
-                "image_projector.w2.": "image_projector.down_proj.",
-                # language backbone mapping
-                "att_proj": "self_attn.qkv_proj",
-                "attn_out": "self_attn.o_proj",
-                "q_norm": "self_attn.q_norm",
-                "k_norm": "self_attn.k_norm",
-                "ff_proj": "mlp.gate_up_proj",
-                "ff_out": "mlp.down_proj",
-                "attn_norm": "input_layernorm",
-                "ff_norm": "post_attention_layernorm",
-            },
-            orig_to_new_prefix={
-                # vision backbone mapping
-                "model.vision_backbone.": "vision_backbone.",
-                # language backbone mapping
-                "model.transformer.blocks.": "model.layers.",
-                "model.transformer.ln_f.": "model.norm.",
-                # lm_head is renamed to model.transformer.mlp.down_proj firstly,
-                # we need to run a second renaming for it
-                "model.transformer.mlp.down_proj.": "lm_head.",
-            },
-        )
+
         loader = AutoWeightsLoader(self)
         weights = _get_weights_with_merged_embedding(weights)
-        return loader.load_weights(weights, mapper=hf_to_vllm_mapper)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
 def _get_weights_with_merged_embedding(
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 7ab06768ae612..4e2e7f5761544 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -34,7 +34,6 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import NestedTensors
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataDict,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
 from vllm.sequence import IntermediateTensors
@@ -302,11 +301,18 @@ def add_image_newline(self, image_features_hd):
         return image_features_hd_newline
 
 
-def get_max_phi3v_image_tokens(ctx: InputContext) -> int:
-    processor = ctx.get_hf_processor()
-    image_processor = processor.image_processor  # type: ignore
+def get_max_phi3v_image_tokens(
+    ctx: InputContext,
+    *,
+    num_crops: Optional[int] = None,
+) -> int:
+    """Return the image token count at `MAX_IMAGE_FEATURE_SIZE_*` dims."""
+    mm_processor_kwargs = ({} if num_crops is None else
+                           {"num_crops": num_crops})
 
-    return image_processor.calc_num_image_tokens_from_image_size(
+    processor = ctx.get_hf_processor(**mm_processor_kwargs)
+
+    return processor.calc_num_image_tokens_from_image_size(
         width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
         height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
     )
@@ -323,20 +329,27 @@ def _get_hf_processor(
             return self.ctx.get_hf_processor(num_crops=num_crops)
         return self.ctx.get_hf_processor()
 
-    def _apply_hf_processor(
+    def _call_hf_processor(
         self,
+        hf_processor: ProcessorMixin,
         prompt: str,
-        mm_data: MultiModalDataDict,
+        processor_data: Mapping[str, object],
         mm_processor_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-        processed_outputs = super()._apply_hf_processor(
-            prompt, mm_data, mm_processor_kwargs)
+        processed_outputs = super()._call_hf_processor(
+            hf_processor,
+            prompt=prompt,
+            processor_data=processor_data,
+            mm_processor_kwargs=mm_processor_kwargs,
+        )
+
         # Phi3v processor has inserted -1, -2 etc as placeholder in prompt_ids,
         # which will cause OverflowError when decoding the prompt_ids.
         # Therefore, we need to do an early replacement here
         token_ids = processed_outputs['input_ids']
         token_ids[token_ids < 0] = _IMAGE_TOKEN_ID
         processed_outputs['input_ids'] = token_ids
+
         return processed_outputs
 
     def _get_prompt_replacements(
@@ -395,6 +408,13 @@ def _get_dummy_mm_inputs(
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens)
 @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor)
 class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "model.vision_embed_tokens.wte": "embed_tokens",
+            "model.vision_embed_tokens.": "vision_embed_tokens.",
+            "lm_head.": "language_model.lm_head.",
+            "model.": "language_model.model.",
+        })
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -603,17 +623,10 @@ def sample(
 
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
-        hf_to_vllm_mapper = WeightsMapper(
-            orig_to_new_prefix={
-                "model.vision_embed_tokens.wte": "embed_tokens",
-                "model.vision_embed_tokens.": "vision_embed_tokens.",
-                "lm_head.": "language_model.lm_head.",
-                "model.": "language_model.model.",
-            })
 
         loader = AutoWeightsLoader(self)
         autoloaded_weights = loader.load_weights(weights,
-                                                 mapper=hf_to_vllm_mapper)
+                                                 mapper=self.hf_to_vllm_mapper)
 
         # The HF config doesn't specify whether these are tied,
         # so we detect it this way
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index f05ea195e043d..f3d66c2313198 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -10,12 +10,12 @@
 from PIL import Image
 from transformers import PixtralVisionConfig
 from transformers.models.pixtral.image_processing_pixtral import (
-    _num_image_tokens)
+    _num_image_tokens as _get_pixtral_hf_num_image_tokens)
 from transformers.models.pixtral.modeling_pixtral import (
     PixtralRotaryEmbedding, apply_rotary_pos_emb, position_ids_in_meshgrid)
 
 from vllm.attention import AttentionMetadata
-from vllm.config import ModelConfig, VllmConfig
+from vllm.config import VllmConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
                          InputContext, token_inputs)
@@ -27,7 +27,6 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.utils import merge_multimodal_embeddings
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
@@ -35,11 +34,10 @@
                                    consecutive_placeholder_ranges,
                                    resolve_visual_encoder_outputs)
 from vllm.sequence import IntermediateTensors, SequenceData
-from vllm.transformers_utils.processor import cached_get_processor
-from vllm.utils import is_list_of
 
 from .interfaces import SupportsMultiModal, SupportsPP
-from .utils import init_vllm_registered_model, maybe_prefix
+from .utils import (init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings)
 
 try:
     from xformers import ops as xops
@@ -47,8 +45,12 @@
 except ImportError:
     USE_XFORMERS_OPS = False
 
-PIXTRAL_IMAGE_BREAK_ID = 12
-PIXTRAL_IMAGE_END_ID = 13
+# These token ids cannot be retrieved from model config
+# so we hardcode them here.
+PIXTRAL_12B_IMAGE_BREAK_ID = 12
+PIXTRAL_12B_IMAGE_END_ID = 13
+PIXTRAL_LARGE_IMAGE_BREAK_ID = 14
+PIXTRAL_LARGE_IMAGE_END_ID = 15
 
 
 def get_max_pixtral_image_tokens(ctx: InputContext):
@@ -120,8 +122,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
     for image_data in data_list:
         image = ImageChunk(image=image_data)
         encoding = tokenizer.instruct.mm_encoder(image)
-        image = torch.from_numpy(encoding.image).to(device="cuda",
-                                                    dtype=torch.float16)
+        image = torch.from_numpy(encoding.image).to(dtype=torch.float16)
         images.append(image)
         image_tokens_list.append(encoding.tokens)
 
@@ -239,8 +240,9 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
 
         # NOTE: Image embeddings are split into separate tensors for each image
         # by the indices of `[IMG_END]` token.
-        split_indices = torch.where(
-            image_tokens == PIXTRAL_IMAGE_END_ID)[0] + 1
+        image_end_condition = (image_tokens == PIXTRAL_12B_IMAGE_END_ID) | (
+            image_tokens == PIXTRAL_LARGE_IMAGE_END_ID)
+        split_indices = torch.where(image_end_condition)[0] + 1
         if len(split_indices) <= 1:
             # Do not split, return as tensor of shape [1, fs, hs]
             return image_embeds.unsqueeze(0)
@@ -262,8 +264,11 @@ def get_input_embeddings(
         if multimodal_embeddings is not None:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings, [
-                    self.vision_args.image_token_id, PIXTRAL_IMAGE_END_ID,
-                    PIXTRAL_IMAGE_BREAK_ID
+                    self.vision_args.image_token_id,
+                    PIXTRAL_12B_IMAGE_END_ID,
+                    PIXTRAL_12B_IMAGE_BREAK_ID,
+                    PIXTRAL_LARGE_IMAGE_BREAK_ID,
+                    PIXTRAL_LARGE_IMAGE_END_ID,
                 ])
         return inputs_embeds
 
@@ -699,37 +704,14 @@ def get_pixtral_hf_num_patches(*, image_size: int, patch_size: int) -> int:
     return grid_length * grid_length
 
 
-def get_max_pixtral_hf_image_feature_size(
-        hf_config: PixtralVisionConfig) -> int:
-    return get_pixtral_hf_num_patches(image_size=hf_config.image_size,
-                                      patch_size=hf_config.patch_size)
-
-
 def get_max_pixtral_hf_image_tokens(hf_config: PixtralVisionConfig) -> int:
-    return get_max_pixtral_hf_image_feature_size(hf_config)
+    grid_length = get_pixtral_hf_patch_grid_length(
+        image_size=hf_config.image_size,
+        patch_size=hf_config.patch_size,
+    )
 
-
-def dummy_seq_data_for_pixtral_hf(
-        hf_config: PixtralVisionConfig,
-        seq_len: int,
-        num_images: int,
-        *,
-        image_token_id: int,
-        image_feature_size_override: Optional[int] = None,
-        mm_key: str = "image"):
-    if image_feature_size_override is None:
-        image_feature_size = get_max_pixtral_hf_image_feature_size(hf_config)
-    else:
-        image_feature_size = image_feature_size_override
-
-    return SequenceData.from_prompt_token_counts(
-        (image_token_id, image_feature_size * num_images),
-        (0, seq_len - image_feature_size * num_images),
-    ), {
-        mm_key:
-        consecutive_placeholder_ranges(num_items=num_images,
-                                       item_size=image_feature_size)
-    }
+    # Consider the image_break_token
+    return (grid_length + 1) * grid_length
 
 
 def dummy_image_for_pixtral_hf(
@@ -763,116 +745,14 @@ def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig,
         image_width = int(numpy.ceil(image_width / ratio))
         image_height = int(numpy.ceil(image_height / ratio))
 
-    num_height_tokens, num_width_tokens = _num_image_tokens(
-        (image_height, image_width), (patch_height, patch_width))
+    num_height_tokens, num_width_tokens = _get_pixtral_hf_num_image_tokens(
+        (image_height, image_width),
+        (patch_height, patch_width),
+    )
 
     return num_width_tokens, num_height_tokens
 
 
-def input_processor_for_pixtral_hf(
-    model_config: ModelConfig,
-    hf_config: PixtralVisionConfig,
-    inputs: DecoderOnlyInputs,
-    *,
-    image_token_id: int,
-    image_feature_size_override: Optional[Union[int, List[int]]] = None,
-) -> DecoderOnlyInputs:
-    assert image_feature_size_override is None, (
-        "image_feature_size_override is not supported for Pixtral")
-
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
-
-    processor = cached_get_processor(model_config.model)
-
-    image_data = multi_modal_data["image"]
-    if isinstance(image_data, Image.Image):
-        image_data = [image_data]
-    elif not is_list_of(image_data, Image.Image):
-        raise TypeError(f"Invalid image type: {type(image_data)}")
-
-    new_prompt = inputs.get("prompt")
-    new_token_ids = inputs["prompt_token_ids"]
-
-    image_token = processor.image_token
-    image_break_token = processor.image_break_token
-    image_end_token = processor.image_end_token
-
-    # Update new_prompt if present
-    if new_prompt:
-        parts = new_prompt.split(image_token)
-        assert len(parts) - 1 == len(image_data)
-        new_parts = [parts[0]]  # Start with the part before any image tokens
-
-        for image, next_part in zip(image_data, parts[1:]):
-            w, h = image.size
-            (num_width_tokens,
-             num_height_tokens) = get_pixtral_hf_image_feature_size(
-                 hf_config, image_width=w, image_height=h)
-
-            replace_tokens = [image_token] * num_width_tokens + [
-                image_break_token
-            ]
-            replace_tokens = replace_tokens * num_height_tokens
-            replace_tokens[-1] = image_end_token
-
-            new_parts.append("".join(replace_tokens))
-            new_parts.append(next_part)
-
-        new_prompt = "".join(new_parts)
-
-    # Update new_token_ids
-    convert_tokens_to_ids = processor.tokenizer.convert_tokens_to_ids
-    image_token_id = convert_tokens_to_ids(image_token)
-    image_break_id = convert_tokens_to_ids(image_break_token)
-    image_end_id = convert_tokens_to_ids(image_end_token)
-    placeholder_token_id = -999
-    # Find all image token indices at once
-    placeholder_indices = [
-        idx for idx, token_id in enumerate(new_token_ids)
-        if token_id == image_token_id
-    ]
-    assert len(placeholder_indices) == len(image_data)
-    replace_tokens_list = []
-    for placeholder_idx, image in zip(placeholder_indices, image_data):
-        new_token_ids[placeholder_idx] = placeholder_token_id
-
-        w, h = image.size
-        (num_width_tokens,
-         num_height_tokens) = get_pixtral_hf_image_feature_size(hf_config,
-                                                                image_width=w,
-                                                                image_height=h)
-
-        replace_tokens = [image_token_id] * num_width_tokens + [image_break_id]
-        replace_tokens = replace_tokens * num_height_tokens
-        replace_tokens[-1] = image_end_id
-        replace_tokens_list.append(replace_tokens)
-
-    reverse_offsets: List[int] = []
-    # Backward iteration for replacement without affecting known indices
-    for placeholder_idx, replace_tokens in zip(reversed(placeholder_indices),
-                                               reversed(replace_tokens_list)):
-        reverse_offsets.append(
-            len(new_token_ids) - placeholder_idx + len(replace_tokens))
-        new_token_ids[placeholder_idx:placeholder_idx + 1] = replace_tokens
-
-    placeholder_ranges: List[PlaceholderRange] = []
-    for reverse_offset, replace_tokens in zip(reversed(reverse_offsets),
-                                              replace_tokens_list):
-        placeholder_ranges.append(
-            PlaceholderRange(
-                offset=len(new_token_ids) - reverse_offset,
-                length=len(replace_tokens),
-            ))
-
-    # NOTE: Create a defensive copy of the original inputs
-    return token_inputs(prompt_token_ids=new_token_ids,
-                        prompt=new_prompt,
-                        multi_modal_data=multi_modal_data,
-                        multi_modal_placeholders={"image": placeholder_ranges})
-
-
 class PixtralHFMLP(nn.Module):
 
     def __init__(
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 3ce4eb5869f21..88f4ea4352726 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -529,6 +529,8 @@ class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
     embedding_modules = {}
     embedding_padding_modules = []
 
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -543,8 +545,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.model = Qwen2Model(vllm_config=vllm_config,
                                 prefix=maybe_prefix(prefix, "model"))
 
-        # TODO: Replace this model class with for_embedding(Qwen2ForCausalLM),
-        # after changing the default pooling method
+        # TODO: Replace this model class with as_embedding_model(
+        # Qwen2ForCausalLM) after changing the default pooling method
         if pooler_config.pooling_type is None:
             logger.warning(
                 "This embedding model will default to last-token pooling in "
@@ -577,8 +579,7 @@ def pooler(
         return self._pooler(hidden_states, pooling_metadata)
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
-        weights = hf_to_vllm_mapper.apply(weights)
+        weights = self.hf_to_vllm_mapper.apply(weights)
         weights = ((name, data) for name, data in weights
                    if not name.startswith("lm_head."))
         self.model.load_weights(weights)
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 48a2d470414b9..6259166a7fc57 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -19,45 +19,43 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
-from functools import cached_property, lru_cache
-from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
-                    Union)
+from functools import cached_property
+from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple,
+                    TypedDict, Union)
 
-import librosa
 import numpy as np
 import torch
 import torch.nn as nn
-from transformers import Qwen2AudioEncoder
+from transformers import BatchFeature, ProcessorMixin
+from transformers.models.qwen2_audio import (Qwen2AudioConfig,
+                                             Qwen2AudioEncoder,
+                                             Qwen2AudioProcessor)
+from transformers.models.whisper import WhisperFeatureExtractor
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
-from vllm.logger import init_logger
+from vllm.inputs import InputContext
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import NestedTensors
-from vllm.multimodal.utils import consecutive_placeholder_ranges
-from vllm.sequence import IntermediateTensors, SequenceData
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
+from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
 
-logger = init_logger(__name__)
-
 
 # # === Audio Inputs === #
 class Qwen2AudioInputs(TypedDict):
     input_features: torch.Tensor
-    """Shape: 
-    `(num_audios, num_mel_bins, 3000)`
-    """
+    """Shape: `(num_audios, num_mel_bins, 3000)`"""
 
     feature_attention_mask: torch.Tensor
-    """Shape: `(num_audios, 3000)`
-    """
+    """Shape: `(num_audios, 3000)`"""
 
 
 # === Audio Encoder === #
@@ -74,187 +72,116 @@ def forward(self, audio_features):
         return hidden_states
 
 
-def dummy_data_for_qwen2_audio(ctx: InputContext, seq_len: int,
-                               mm_counts: Mapping[str, int]):
-    num_audios = mm_counts["audio"]
-    max_tokens_per_audio = get_max_qwen2_audio_audio_tokens(ctx)
-    max_llm_audio_tokens = max_tokens_per_audio * num_audios
-    if seq_len - max_llm_audio_tokens - 2 < 0:
-        raise RuntimeError(
-            f"Qwen2-Audio cannot process {num_audios} audios in a prompt, "
-            "please increase max_model_len or reduce audio limit by "
-            "--limit-mm-per-prompt.")
-
-    audio_token_index = ctx.model_config.hf_config.audio_token_index
-
-    dummy_seqdata = SequenceData.from_prompt_token_counts(
-        (audio_token_index, max_llm_audio_tokens),
-        (0, seq_len - max_llm_audio_tokens),
-    )
-    dummy_audio = np.full((max_llm_audio_tokens * 2 * 2 * 160, ), 0.)
-    return DummyData(
-        dummy_seqdata, {"audio": [(dummy_audio, 16000)] * num_audios}, {
-            "audio":
-            consecutive_placeholder_ranges(num_items=num_audios,
-                                           item_size=max_tokens_per_audio)
-        })
-
-
-def get_processor(
-    processor_name: str,
-    *args,
-    trust_remote_code: bool = False,
-    **kwargs,
-):
-    """Gets a processor for the given model name via HuggingFace.
-
-    Derived from `vllm.transformers_utils.image_processor.get_image_processor`.
-    """
-    # don't put this import at the top level
-    # it will call torch.cuda.device_count()
-    from transformers import AutoProcessor
-
-    try:
-        processor = AutoProcessor.from_pretrained(
-            processor_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            **kwargs)
-    except ValueError as e:
-        # If the error pertains to the processor class not existing or not
-        # currently being imported, suggest using the --trust-remote-code flag.
-        # Unlike AutoTokenizer, AutoProcessor does not separate such errors
-        if not trust_remote_code:
-            err_msg = (
-                "Failed to load the processor. If the processor is "
-                "a custom processor not yet available in the HuggingFace "
-                "transformers library, consider setting "
-                "`trust_remote_code=True` in LLM or using the "
-                "`--trust-remote-code` flag in the CLI.")
-            raise RuntimeError(err_msg) from e
-        else:
-            raise e
+# From Qwen2AudioEncoder._get_feat_extract_output_lengths
+def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor):
+    feat_lengths = (input_lengths - 1) // 2 + 1
+    output_lengths = (feat_lengths - 2) // 2 + 1
+    return feat_lengths, output_lengths
 
-    return processor
 
+def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int:
+    hf_config = ctx.get_hf_config(Qwen2AudioConfig)
+    max_source_position = hf_config.audio_config.max_source_positions
+    output_lengths = (max_source_position - 2) // 2 + 1
+    return output_lengths
 
-cached_get_processor = lru_cache(get_processor)
 
+class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
 
-def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor):
-    """
-    Computes the output length of the convolutional layers
-    and the output length of the audio encoder
-    """
-    input_lengths = (input_lengths - 1) // 2 + 1
-    output_lengths = (input_lengths - 2) // 2 + 1
-    return input_lengths, output_lengths
+    def _get_hf_processor(self) -> Qwen2AudioProcessor:
+        return self.ctx.get_hf_processor(Qwen2AudioProcessor)
 
+    def _get_feature_extractor(self) -> WhisperFeatureExtractor:
+        return self._get_hf_processor().feature_extractor  # type: ignore
 
-def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int:
-    max_source_position = (
-        ctx.model_config.hf_config.audio_config.max_source_positions)
-    output_lengths = (max_source_position - 2) // 2 + 1
-    return output_lengths
+    def _get_processor_data(
+        self,
+        mm_items: MultiModalDataItems,
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
+        # resample audio to the model's sampling rate
+        feature_extractor = self._get_feature_extractor()
+        mm_items.resample_audios(feature_extractor.sampling_rate)
 
+        return super()._get_processor_data(mm_items)
 
-def input_processor_for_qwen2_audio(
-        ctx: InputContext, inputs: DecoderOnlyInputs) -> DecoderOnlyInputs:
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "audio" not in multi_modal_data:
-        return inputs
-
-    audios = multi_modal_data["audio"]
-    if not isinstance(audios, list):
-        audios = [audios]
-
-    if len(audios) == 0:
-        return inputs
-
-    processor = cached_get_processor(ctx.model_config.model)
-    resampled_audios = [
-        librosa.resample(audio,
-                         orig_sr=sampling_rate,
-                         target_sr=processor.feature_extractor.sampling_rate)
-        for audio, sampling_rate in audios
-    ]
-    audio_input_lengths = np.array(
-        [min(3000, _.shape[0] // 160 + 1) for _ in resampled_audios])
-
-    audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths(
-        audio_input_lengths)
-
-    audio_token_index = ctx.model_config.hf_config.audio_token_index
-
-    input_ids = inputs['prompt_token_ids']
-
-    new_input_ids = []
-    audio_num = input_ids.count(audio_token_index)
-    assert len(audio_input_lengths) == audio_num, \
-        (f'The text input contains {audio_num} audio tokens, '
-         f'but {len(audio_input_lengths)} audios provided')
-    start = 0
-    for audio_idx in range(audio_num):
-        end = input_ids.index(audio_token_index, start)
-        new_input_ids.extend(input_ids[start:end])  # text part
-
-        new_input_ids.extend([audio_token_index] *
-                             audio_output_lengths[audio_idx])
-        start = end + 1
-    new_input_ids.extend(input_ids[start:])
-
-    return token_inputs(
-        prompt_token_ids=new_input_ids,
-        prompt=inputs.get("prompt"),
-        multi_modal_data=multi_modal_data,
-    )
-
-
-def input_mapper_for_qwen2_audio(
-    ctx: InputContext,
-    multi_modal_data: Union[np.ndarray, List[np.ndarray]],
-) -> MultiModalKwargs:
-    """Input mapper for Qwen2-Audio."""
-    if not isinstance(multi_modal_data, list):
-        multi_modal_data = [multi_modal_data]
-
-    if len(multi_modal_data) == 0:
-        return MultiModalKwargs()
-
-    processor = cached_get_processor(ctx.model_config.model)
-    audio_feature_extractor = processor.feature_extractor
-    if audio_feature_extractor is None:
-        raise RuntimeError(
-            "No HuggingFace audio_feature_extractor is available "
-            "to process the audio object")
-
-    try:
-        resampled_audios = [
-            librosa.resample(
-                audio,
-                orig_sr=sampling_rate,
-                target_sr=processor.feature_extractor.sampling_rate)
-            for audio, sampling_rate in multi_modal_data
+    def _call_hf_processor(
+        self,
+        hf_processor: ProcessorMixin,
+        prompt: str,
+        processor_data: Mapping[str, object],
+        mm_processor_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        """Call the HF processor, passing `audios` only when non-empty."""
+        processor_data = dict(processor_data)
+        audios = processor_data.pop("audios", [])
+
+        # NOTE: WhisperFeatureExtractor cannot handle empty list of audios
+        if audios:
+            processor_data["audios"] = audios
+
+            # The extractor's rate always wins (audios are resampled to it);
+            # unlike `dict(**kw, k=v)`, this tolerates a duplicate key.
+            mm_processor_kwargs = {
+                **mm_processor_kwargs,
+                "sampling_rate": self._get_feature_extractor().sampling_rate,
+            }
+
+        return super()._call_hf_processor(
+            hf_processor,
+            prompt=prompt,
+            processor_data=processor_data,
+            mm_processor_kwargs=mm_processor_kwargs,
+        )
+
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_inputs: BatchFeature,
+        mm_processor_kwargs: Mapping[str, object],
+    ) -> list[PromptReplacement]:
+        hf_config = self.ctx.get_hf_config(Qwen2AudioConfig)
+        placeholder = hf_config.audio_token_index
+
+        feature_attention_mask = hf_inputs.get("feature_attention_mask")
+        if feature_attention_mask is None:
+            audio_output_lengths = []
+        else:
+            _, audio_output_lengths = _get_feat_extract_output_lengths(
+                feature_attention_mask.sum(-1))
+
+        def get_replacement_qwen2_audio(item_idx: int):
+            return [placeholder] * audio_output_lengths[item_idx]
+
+        return [
+            PromptReplacement(
+                modality="audio",
+                target=[placeholder],
+                replacement=get_replacement_qwen2_audio,
+            )
         ]
-        batch_data = audio_feature_extractor(resampled_audios,
-                                             sampling_rate=16000,
-                                             return_attention_mask=True,
-                                             padding="max_length",
-                                             return_tensors="pt").data
-        batch_data["feature_attention_mask"] = batch_data.pop("attention_mask")
-    except Exception:
-        logger.error("Failed to process audio (%s)", multi_modal_data)
-        raise
-
-    return MultiModalKwargs(batch_data)
-
-
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_audio)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_audio)
-@MULTIMODAL_REGISTRY.register_input_mapper("audio",
-                                           input_mapper_for_qwen2_audio)
+
+    def _get_dummy_mm_inputs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        feature_extractor = self._get_feature_extractor()
+        sampling_rate = feature_extractor.sampling_rate
+        audio_len = feature_extractor.chunk_length * sampling_rate
+
+        audio_count = mm_counts["audio"]
+        audio = np.zeros(audio_len)
+        data = {"audio": [audio] * audio_count}
+
+        return ProcessorInputs(
+            prompt_text="<|AUDIO|>" * audio_count,
+            mm_data=data,
+            mm_processor_kwargs={},
+        )
+
+
 @MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
     "audio", get_max_qwen2_audio_audio_tokens)
+@MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor)
 class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
                                          SupportsPP):
 
@@ -289,9 +216,7 @@ def sampler(self):
 
         return get_sampler()
 
-    def _validate_and_reshape_mm_tensor(self,
-                                        mm_input: Union[torch.Tensor,
-                                                        List[torch.Tensor]],
+    def _validate_and_reshape_mm_tensor(self, mm_input: object,
                                         name: str) -> torch.Tensor:
         if not isinstance(mm_input, (torch.Tensor, list)):
             raise ValueError(f"Incorrect type of {name}. "
diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py
deleted file mode 100644
index dc5dabf6fc38b..0000000000000
--- a/vllm/model_executor/models/qwen2_cls.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Adapted from
-# https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
-# Copyright 2024 Kakao Corp. (Kanana-X Team)
-# Copyright 2024 The Qwen team.
-# Copyright 2023 The vLLM team.
-"""Inference-only Qwen2-Classification model compatible with HF weights."""
-from typing import Iterable, List, Optional, Set, Tuple
-
-import torch
-from torch import nn
-
-from vllm.attention import AttentionMetadata
-from vllm.config import VllmConfig
-from vllm.model_executor.layers.linear import RowParallelLinear
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
-from vllm.model_executor.models.qwen2 import Qwen2Model
-from vllm.model_executor.pooling_metadata import PoolingMetadata
-from vllm.sequence import IntermediateTensors, PoolerOutput
-
-from .interfaces import SupportsLoRA, SupportsPP
-from .utils import AutoWeightsLoader, maybe_prefix
-
-
-class Qwen2ForSequenceClassification(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
-
-    # LoRA specific attributes
-    supported_lora_modules = [
-        "qkv_proj",
-        "o_proj",
-        "gate_up_proj",
-        "down_proj",
-    ]
-    embedding_modules = {}
-    embedding_padding_modules = []
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
-        pooler_config = vllm_config.model_config.pooler_config
-
-        self.config = config
-        self.lora_config = lora_config
-
-        self.quant_config = quant_config
-        self.model = Qwen2Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
-
-        # hidden_states from Qwen2Model has been reduced,
-        # the input of score layer is not parallelized.
-        self.score = RowParallelLinear(config.hidden_size,
-                                       config.num_labels,
-                                       quant_config=quant_config,
-                                       input_is_parallel=False,
-                                       bias=False,
-                                       prefix=maybe_prefix(prefix, "score"))
-        self._pooler = Pooler.from_config_with_defaults(
-            pooler_config,
-            pooling_type=PoolingType.LAST,
-            normalize=False,
-            softmax=True)
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata, intermediate_tensors,
-                                   inputs_embeds)
-        logits, _ = self.score(hidden_states)
-        return logits
-
-    def pooler(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> Optional[PoolerOutput]:
-        return self._pooler(hidden_states, pooling_metadata)
-
-    def load_weights(self, weights: Iterable[Tuple[str,
-                                                   torch.Tensor]]) -> Set[str]:
-        loader = AutoWeightsLoader(self,
-                                   ignore_unexpected_prefixes=["lm_head."])
-        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index cfc90cdab01e4..fb97eb1916002 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -22,28 +22,26 @@
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
 from functools import cached_property, partial
-from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
-                    Optional, Set, Tuple, Type, TypedDict, Union)
+from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
+                    Tuple, Type, TypedDict, Union)
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
 from PIL import Image
-from transformers.image_utils import (get_image_size,
-                                      infer_channel_dimension_format,
-                                      to_numpy_array)
+from transformers import BatchFeature
+from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
+                                          Qwen2VLProcessor)
 from transformers.models.qwen2_vl.configuration_qwen2_vl import (
     Qwen2VLConfig, Qwen2VLVisionConfig)
-from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
-    make_batched_images, make_batched_videos, smart_resize)
+from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
+from vllm.inputs import InputContext
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.activation import QuickGELU
@@ -56,14 +54,14 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import cached_get_image_processor
-from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict,
-                                    MultiModalKwargs, NestedTensors)
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.multimodal.inputs import MultiModalDataDict, NestedTensors
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
 from vllm.platforms import _Backend
-from vllm.sequence import IntermediateTensors, SequenceData
+from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
-from vllm.transformers_utils.processor import cached_get_processor
+from vllm.utils import is_list_of
 
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend,
@@ -159,7 +157,7 @@ class Qwen2VisionMLP(nn.Module):
     def __init__(
         self,
         in_features: int,
-        hidden_features: int = None,
+        hidden_features: int,
         act_layer: Type[nn.Module] = QuickGELU,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -644,78 +642,8 @@ def load_weights(self, weights: Iterable[Tuple[str,
 # === Vision input helpers === #
 
 
-def get_mm_processor_kwargs(
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None) -> Dict[str, int]:
-    mm_processor_kwargs = {}
-    if min_pixels:
-        mm_processor_kwargs["min_pixels"] = min_pixels
-    if max_pixels:
-        mm_processor_kwargs["max_pixels"] = max_pixels
-    return mm_processor_kwargs
-
-
-def mm_input_mapper_for_qwen2_vl(
-    ctx: InputContext,
-    data: MultiModalData[object],
-    data_type_key: str,
-    *,
-    min_pixels: Optional[int] = None,
-    max_pixels: Optional[int] = None,
-) -> MultiModalKwargs:
-    """Input mapper for Qwen2-VL."""
-    if data_type_key == "image" and isinstance(data, dict):
-        return MultiModalKwargs({
-            "image_embeds": data.get("image_embeds"),
-            "image_grid_thw": data.get("image_grid_thw"),
-        })
-    if data_type_key == "video" and isinstance(data, dict):
-        return MultiModalKwargs({
-            "video_embeds": data.get("video_embeds"),
-            "video_grid_thw": data.get("video_grid_thw"),
-        })
-
-    model_config = ctx.model_config
-    # Handle mm processor kwargs; we pass these at creation time
-    # because preprocess() in transformers doesn't expose them
-    mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
-                                                  max_pixels=max_pixels)
-    image_processor = cached_get_image_processor(
-        model_config.model,
-        trust_remote_code=model_config.trust_remote_code,
-        **mm_processor_kwargs,
-    )
-    if image_processor is None:
-        raise RuntimeError("No HuggingFace processor is available "
-                           "to process the image object")
-
-    images = None
-    videos = None
-    if data_type_key == "image":
-        images = data
-    else:
-        assert data_type_key == "video"
-        videos = data
-
-    try:
-        batch_data = image_processor \
-            .preprocess(images=images, videos=videos, return_tensors="pt") \
-            .data
-    except Exception:
-        logger.error("Failed to process image (%s)", data)
-        raise
-
-    return MultiModalKwargs(batch_data)
-
-
-image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl,
-                                          data_type_key="image")
-video_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl,
-                                          data_type_key="video")
-
-
 def _get_vision_info(
-    image_processor,
+    vision_config: Qwen2VLVisionConfig,
     height: int,
     width: int,
     min_pixels: int,
@@ -726,12 +654,15 @@ def _get_vision_info(
 ):
     """Get information (resized height / width and number of vision tokens)
     of input image / video frame."""
+    patch_size = vision_config.patch_size
+    merge_size = vision_config.spatial_merge_size
+    temporal_patch_size = vision_config.temporal_patch_size
 
     if do_resize:
         resized_height, resized_width = smart_resize(
             height=height,
             width=width,
-            factor=image_processor.patch_size * image_processor.merge_size,
+            factor=patch_size * merge_size,
             min_pixels=min_pixels,
             max_pixels=max_pixels,
         )
@@ -742,54 +673,41 @@ def _get_vision_info(
         grid_t = mm_count
     else:
         assert data_type_key == "video"
-        grid_t = max(mm_count // image_processor.temporal_patch_size, 1)
+        grid_t = max(mm_count // temporal_patch_size, 1)
 
-    grid_h = resized_height // image_processor.patch_size
-    grid_w = resized_width // image_processor.patch_size
+    grid_h = resized_height // patch_size
+    grid_w = resized_width // patch_size
     vision_tokens = grid_t * grid_h * grid_w
-    llm_num_vision_tokens = (vision_tokens // image_processor.merge_size //
-                             image_processor.merge_size)
+    llm_num_vision_tokens = vision_tokens // (merge_size**2)
 
     return resized_height, resized_width, llm_num_vision_tokens
 
 
-def _get_max_image_info(
-    image_processor,
-    data_type_key: str = "image",
-    mm_count: int = 1,
-    min_pixels: Optional[int] = None,
-    max_pixels: Optional[int] = None,
-):
-    # Limit min / max pixels unless they're explicitly provided
-    if min_pixels is None:
-        min_pixels = max(image_processor.min_pixels, 28 * 28)
-    if max_pixels is None:
-        max_pixels = min(image_processor.max_pixels, 1280 * 28 * 28)
-
-    return _get_vision_info(
-        image_processor,
-        height=9999999,
-        width=9999999,
-        min_pixels=min_pixels,
-        max_pixels=max_pixels,
-        data_type_key=data_type_key,
-        mm_count=mm_count,
-    )
+def _get_image_processor(hf_processor: Qwen2VLProcessor):
+    image_processor = hf_processor.image_processor  # type: ignore
+    assert isinstance(image_processor, Qwen2VLImageProcessor)
+    return image_processor
 
 
 def get_max_qwen2_vl_mm_tokens(ctx: InputContext,
                                data_type_key: str,
                                *,
-                               min_pixels=None,
-                               max_pixels=None) -> int:
-    mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
-                                                  max_pixels=max_pixels)
-    image_processor = cached_get_image_processor(ctx.model_config.model,
-                                                 **mm_processor_kwargs)
-    max_resized_height, max_resized_width, max_llm_image_tokens = \
-        _get_max_image_info(image_processor, data_type_key=data_type_key,
-                            mm_count=1, min_pixels=min_pixels,
-                            max_pixels=max_pixels)
+                               min_pixels: Optional[int] = None,
+                               max_pixels: Optional[int] = None) -> int:
+    hf_config = ctx.get_hf_config(Qwen2VLConfig)
+    vision_config = hf_config.vision_config
+
+    hf_processor = ctx.get_hf_processor(Qwen2VLProcessor)
+    image_processor = _get_image_processor(hf_processor)
+
+    _, _, max_llm_image_tokens = _get_vision_info(
+        vision_config,
+        height=9999999,
+        width=9999999,
+        min_pixels=min_pixels or image_processor.min_pixels,
+        max_pixels=max_pixels or image_processor.max_pixels,
+        data_type_key=data_type_key,
+    )
     return max_llm_image_tokens
 
 
@@ -799,290 +717,166 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext,
                                         data_type_key="video")
 
 
-def dummy_data_for_qwen2_vl(
-    ctx: InputContext,
-    seq_len: int,
-    mm_counts: Mapping[str, int],
-    *,
-    min_pixels: Optional[int] = None,
-    max_pixels: Optional[int] = None
-) -> Tuple[SequenceData, Optional[MultiModalDataDict]]:
-    mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
-                                                  max_pixels=max_pixels)
-    image_processor = cached_get_image_processor(ctx.model_config.model,
-                                                 **mm_processor_kwargs)
-
-    num_images = mm_counts["image"]
-    max_resized_height, max_resized_width, max_llm_image_tokens = \
-        _get_max_image_info(image_processor, data_type_key="image",
-                            mm_count=num_images, min_pixels=min_pixels,
-                            max_pixels=max_pixels)
-    if seq_len - max_llm_image_tokens - 2 < 0:
-        raise RuntimeError(
-            f"Qwen2-VL cannot process {num_images} images in a prompt, "
-            "please increase max_model_len or reduce image limit by "
-            "--limit-mm-per-prompt.")
-
-    # Check video counts.
-    num_videos = mm_counts["video"]
-    max_resized_height, max_resized_width, max_llm_video_tokens = \
-        _get_max_image_info(image_processor, data_type_key="video",
-                            mm_count=num_videos, min_pixels=min_pixels,
-                            max_pixels=max_pixels)
-    if seq_len - max_llm_video_tokens - 2 < 0:
-        raise RuntimeError(
-            f"Qwen2-VL cannot process {num_videos} videos in a prompt, "
-            "please increase max_model_len or reduce video limit by "
-            "--limit-mm-per-prompt.")
-
-    hf_config = ctx.get_hf_config(Qwen2VLConfig)
-
-    dummy_seqdata = SequenceData.from_prompt_token_counts(
-        (hf_config.vision_start_token_id, 1),
-        (hf_config.image_token_id, max_llm_image_tokens),
-        (hf_config.vision_end_token_id, 1),
-        (0, seq_len - max_llm_image_tokens - 2),
-    )
-
-    dummy_image = Image.new("RGB", (max_resized_width, max_resized_height),
-                            color=0)
+class Qwen2VLMultiModalDataItems(MultiModalDataItems):
 
-    return DummyData(dummy_seqdata, {
-        "image":
-        dummy_image if num_images == 1 else [dummy_image] * num_images
-    })
+    @staticmethod
+    def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
+        """
+        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
+        """
+        multi_data = Qwen2VLMultiModalDataItems()
+
+        for k, v in data.items():
+            # TODO: Make a separate modality for embedding inputs
+            # to avoid confusion
+            # yapf: disable
+            if k == "video":
+                # Special case since even a single item can be a list
+                multi_data[k] = (  # type: ignore[index]
+                    v if (isinstance(v, (dict, torch.Tensor))  # type: ignore[assignment]
+                          or is_list_of(v, list)) else [v]
+                )
+            elif k in ("image", "audio"):
+                multi_data[k] = (  # type: ignore[index]
+                    v if isinstance(v, (dict, torch.Tensor, list)) else [v]
+                )
+            else:
+                multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
+            # yapf: enable
 
+        return multi_data
 
-def _get_llm_num_vision_tokens(
-    mm_inputs: list,
-    data_type_key: str,
-    image_processor,
-    min_pixels: int,
-    max_pixels: int,
-):
-    """Get number of vision tokens of multimodal inputs.
+    def get_item_counts(self) -> Mapping[str, int]:
+        return {  # one entry per key stored in this mapping
+            m: (  # dict value: count is the length of its "<m>_grid_thw"
+                len(items[f"{m}_grid_thw"])  # type: ignore
+                if isinstance(items, dict) else len(items))  # else: len()
+            for m, items in self.items()
+        }
 
-    This method is derived from `transformers.models.qwen2_vl.
-    image_processing_qwen2_vl.Qwen2VLImageProcessor._preprocess`.
-    """
-    image = to_numpy_array(mm_inputs[0])
-    input_data_format = infer_channel_dimension_format(image)
-    height, width = get_image_size(image, channel_dim=input_data_format)
-
-    _, _, llm_num_vision_tokens = _get_vision_info(
-        image_processor,
-        height=height,
-        width=width,
-        min_pixels=min_pixels,
-        max_pixels=max_pixels,
-        do_resize=image_processor.do_resize,
-        data_type_key=data_type_key,
-        mm_count=len(mm_inputs),
-    )
-    return llm_num_vision_tokens
 
+class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
 
-def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable,
-                       data_type_key: str, image_processor: Any,
-                       prompt_token_ids: List[int], min_pixels: Optional[int],
-                       max_pixels: Optional[int]) -> List[int]:
-    """
-    Expand pad tokens for multi-modal inputs (e.g., images or videos).
-
-    Args:
-        inputs (list): The multi-modal inputs (e.g., images or videos).
-        token_id (int): The token ID used to represent the multi-modal input.
-        make_batched_fn (Callable): A function to batch the inputs.
-        data_type_key (str): The type of the multi-modal input.
-        image_processor (Any): The image processor used to process the inputs.
-        prompt_token_ids (List[int]): The list of token IDs in the prompt.
-        min_pixels (int): min pixels to used for img processing
-        max_pixels (int): max pixels to be used for img processing
-
-    Returns:
-        List[int]: The list of token IDs for the multi-modal inputs.
-    """
-    indices = [
-        idx for idx, token in enumerate(prompt_token_ids) if token == token_id
-    ]
-    inputs = make_batched_fn(inputs)
-    assert len(indices) == len(inputs)
-
-    prompt_token_ids_with_data = []
-    for cnt, data in enumerate(inputs):
-        num_tokens = _get_llm_num_vision_tokens(
-            [data] if data_type_key == "image" else data,
-            data_type_key=data_type_key,
-            image_processor=image_processor,
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-        )
-        if cnt == 0:
-            end_idx = indices[cnt]
-            non_data_tokens = prompt_token_ids[:end_idx]
-        else:
-            non_data_tokens = prompt_token_ids[indices[cnt - 1] +
-                                               1:indices[cnt]]
-        prompt_token_ids_with_data.extend(non_data_tokens)
-        prompt_token_ids_with_data.extend(token_id for _ in range(num_tokens))
-    prompt_token_ids_with_data.extend(prompt_token_ids[indices[-1] + 1:])
-    return prompt_token_ids_with_data
-
-
-def input_processor_for_qwen2_vl(
-    ctx: InputContext,
-    inputs: DecoderOnlyInputs,
-    *,
-    min_pixels: Optional[int] = None,
-    max_pixels: Optional[int] = None,
-) -> DecoderOnlyInputs:
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None:
-        return inputs
-
-    image_inputs = multi_modal_data.get("image", None)
-    video_inputs = multi_modal_data.get("video", None)
-
-    processor = cached_get_processor(ctx.model_config.model)
-    image_processor = processor.image_processor
-    # Apply processor kwarg overrides for image processor options
-    min_pixels = min_pixels if min_pixels else image_processor.min_pixels
-    max_pixels = max_pixels if max_pixels else image_processor.max_pixels
-
-    model_config = ctx.model_config
-    hf_config = ctx.get_hf_config(Qwen2VLConfig)
+    def _get_mm_items(
+        self,
+        mm_data: MultiModalDataDict,
+    ) -> MultiModalDataItems:
+        return Qwen2VLMultiModalDataItems.from_dict(mm_data)
 
-    # To avoid redundant processing of vision objects (resize, rescale, etc.),
-    # we extract code of calculating number of vision tokens from
-    # `transformers.models.qwen2_vl.processing_qwen2_vl.Qwen2VLProcessor`.
-    #
-    # The following code is equivalent to:
-    #    prompt = inputs["prompt"]
-    #    inputs = processor(text=[prompt],
-    #                       images=image_inputs,
-    #                       videos=video_inputs,
-    #                       padding=True,
-    #                       return_tensors="pt")
-    #    prompt_token_ids = inputs["input_ids"][0].tolist()
-
-    tokenizer = cached_get_tokenizer(
-        model_config.tokenizer,
-        trust_remote_code=model_config.trust_remote_code)
-
-    prompt_token_ids = inputs["prompt_token_ids"]
-
-    # Expand image pad tokens.
-
-    if image_inputs is not None:
-        if isinstance(image_inputs, dict):
-            prompt_token_ids_with_image = []
-            image_indices = [
-                idx for idx, token in enumerate(prompt_token_ids)
-                if token == hf_config.image_token_id
-            ]
-
-            # ensure all image tokens have grid_thw
-            assert \
-                len(image_indices) == image_inputs["image_grid_thw"].size(0), \
-                "image token num does not match image_grid_thw.shape"
-
-            image_counter = 0
-            pad_token_counter = 0
-            for idx, token in enumerate(prompt_token_ids):
-                if idx in image_indices:
-                    grid_thw = image_inputs["image_grid_thw"][image_counter]
-                    grid_t, grid_h, grid_w = grid_thw
-                    num_pad_tokens = (grid_t * grid_h * grid_w //
-                                      image_processor.merge_size //
-                                      image_processor.merge_size)
-                    prompt_token_ids_with_image.extend([token] *
-                                                       num_pad_tokens)
-                    image_counter += 1
-                    pad_token_counter += num_pad_tokens
+    def _get_hf_processor(
+        self,
+        *,
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None,
+    ) -> Qwen2VLProcessor:
+        hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor)
+        image_processor = _get_image_processor(hf_processor)
+        # Overrides are written onto the image processor object itself.
+        if min_pixels:  # None and 0 both leave the current value untouched
+            image_processor.min_pixels = min_pixels
+        if max_pixels:
+            image_processor.max_pixels = max_pixels
+        if max_pixels or min_pixels:  # keep `size` consistent with both
+            image_processor.size = {
+                "min_pixels": image_processor.min_pixels,
+                "max_pixels": image_processor.max_pixels,
+            }
+        # NOTE(review): if ctx reuses this object, overrides persist -- confirm
+        return hf_processor
+
+    def _get_processor_data(
+        self,
+        mm_items: MultiModalDataItems,
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
+        processor_data = dict[str, Any]()
+        passthrough_data = dict[str, Any]()
+
+        for k, v in mm_items.items():
+            # TODO: Make a separate modality for embedding inputs
+            # to avoid confusion
+            if k in ("image", "video", "audio"):
+                if isinstance(v, dict):
+                    # Pass through embedding inputs (dict)
+                    passthrough_data.update(v)
+                elif isinstance(v, torch.Tensor) and v.ndim == 3:
+                    # Pass through embedding inputs (single)
+                    passthrough_data[f"{k}_embeds"] = [v]
+                elif (is_list_of(v, torch.Tensor) and len(v) > 0
+                      and v[0].ndim == 2):
+                    # Pass through embedding inputs (multi)
+                    passthrough_data[f"{k}_embeds"] = v
                 else:
-                    prompt_token_ids_with_image.append(token)
+                    # Map keys to plural form, e.g.: image -> images
+                    processor_data[f"{k}s"] = v
+            else:
+                processor_data[k] = v
 
-            # ensure all embeddings are used
-            assert \
-                pad_token_counter == image_inputs["image_embeds"].size(0), \
-                "image_embeds.shape does not match image_grid_thw"
+        return processor_data, passthrough_data
 
-            prompt_token_ids = prompt_token_ids_with_image
-        else:
-            prompt_token_ids = _expand_pad_tokens(image_inputs,
-                                                  hf_config.image_token_id,
-                                                  make_batched_images,
-                                                  "image",
-                                                  image_processor,
-                                                  prompt_token_ids,
-                                                  min_pixels=min_pixels,
-                                                  max_pixels=max_pixels)
-
-    if video_inputs is not None:
-        if isinstance(video_inputs, dict):
-            prompt_token_ids_with_video = []
-            video_indices = [
-                idx for idx, token in enumerate(prompt_token_ids)
-                if token == hf_config.video_token_id
-            ]
-
-            # ensure all video tokens have grid_thw
-            assert \
-                len(video_indices) == video_inputs["video_grid_thw"].size(0), \
-                "video token num does not match video_grid_thw.shape"
-
-            video_counter = 0
-            pad_token_counter = 0
-            for idx, token in enumerate(prompt_token_ids):
-                if idx in video_indices:
-                    grid_thw = video_inputs["video_grid_thw"][video_counter]
-                    grid_t, grid_h, grid_w = grid_thw
-                    num_pad_tokens = (grid_t * grid_h * grid_w //
-                                      image_processor.merge_size //
-                                      image_processor.merge_size)
-                    prompt_token_ids_with_video.extend([token] *
-                                                       num_pad_tokens)
-                    video_counter += 1
-                    pad_token_counter += num_pad_tokens
-                else:
-                    prompt_token_ids_with_video.append(token)
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_inputs: BatchFeature,
+        mm_processor_kwargs: Mapping[str, object],
+    ) -> list[PromptReplacement]:
+        """Build image/video placeholder expansions from ``*_grid_thw``."""
+        hf_processor = self._get_hf_processor()
+        image_processor = _get_image_processor(hf_processor)
+
+        # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
+        # image_token and video_token registered
+        placeholder = {
+            "image": hf_processor.image_token,
+            "video": hf_processor.video_token,
+        }
+        merge_length = image_processor.merge_size**2  # grid cells per token
+
+        def get_replacement(item_idx: int, modality: str) -> str:
+            grid_thw = hf_inputs[f"{modality}_grid_thw"][item_idx]
+            num_tokens = int(grid_thw.prod()) // merge_length  # as plain int
+            return placeholder[modality] * num_tokens
+
+        return [
+            PromptReplacement(
+                modality=modality,
+                target=placeholder[modality],
+                replacement=partial(get_replacement, modality=modality),
+            ) for modality in ("image", "video")
+        ]
 
-            # ensure all embeddings are used
-            assert \
-                pad_token_counter == video_inputs["video_embeds"].size(0), \
-                "video_embeds.shape does not match video_grid_thw"
+    def _get_dummy_mm_inputs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_images = mm_counts["image"]
+        hf_processor = self._get_hf_processor()
+        image_token: str = hf_processor.image_token
+        image_processor = _get_image_processor(hf_processor)
+
+        data = {}
+        resized_height, resized_width = smart_resize(
+            height=9999999,
+            width=9999999,
+            factor=image_processor.patch_size * image_processor.merge_size,
+            min_pixels=image_processor.min_pixels,
+            max_pixels=image_processor.max_pixels,
+        )
 
-            prompt_token_ids = prompt_token_ids_with_video
-        else:
-            prompt_token_ids = _expand_pad_tokens(video_inputs,
-                                                  hf_config.video_token_id,
-                                                  make_batched_videos,
-                                                  "video",
-                                                  image_processor,
-                                                  prompt_token_ids,
-                                                  min_pixels=min_pixels,
-                                                  max_pixels=max_pixels)
-
-    prompt = inputs.get("prompt")
-    if prompt is None:
-        prompt = tokenizer.decode(prompt_token_ids)
-
-    return token_inputs(
-        prompt_token_ids=prompt_token_ids,
-        prompt=prompt,
-        multi_modal_data=multi_modal_data,
-    )
+        dummy_image = Image.new("RGB", (resized_width, resized_height),
+                                color=0)
+        data["image"] = [dummy_image] * num_images
+
+        return ProcessorInputs(
+            prompt_text=image_token * num_images,
+            mm_data=data,
+            mm_processor_kwargs={},
+        )
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper(
-    image_input_mapper_for_qwen2_vl)
-@MULTIMODAL_REGISTRY.register_input_mapper("video",
-                                           video_input_mapper_for_qwen2_vl)
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_vl_image_tokens)
 @MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
     "video", get_max_qwen2_vl_video_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_vl)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_vl)
+@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor)
 class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                                       SupportsLoRA, SupportsPP):
     packed_modules_mapping = {
@@ -1107,10 +901,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
     ]
     embedding_modules = {}
     embedding_padding_modules = []
+    # To ensure correct weight loading and mapping.
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
+        "lm_head.": "language_model.lm_head.",
+        "model.": "language_model.model.",
+    })
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
-        config = vllm_config.model_config.hf_config
+        config: Qwen2VLConfig = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
@@ -1396,11 +1195,6 @@ def sample(
 
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
-        hf_to_vllm_mapper = WeightsMapper(
-            orig_to_new_prefix={
-                "lm_head.": "language_model.lm_head.",
-                "model.": "language_model.model.",
-            })
 
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights, mapper=hf_to_vllm_mapper)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 68a2467a813a1..feb33bb373c3e 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -20,11 +20,10 @@
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
-from .adapters import as_embedding_model
 from .interfaces import (has_inner_state, is_attention_free, is_hybrid,
                          supports_cross_encoding, supports_multimodal,
                          supports_pp)
-from .interfaces_base import is_pooling_model, is_text_generation_model
+from .interfaces_base import is_text_generation_model
 
 logger = init_logger(__name__)
 
@@ -46,6 +45,7 @@
     "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
     "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
     "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"),
+    "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"),
     "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"),
     "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
     "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
@@ -113,6 +113,7 @@
     "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"),
     "GlmForCausalLM": ("glm", "GlmForCausalLM"),
     "GritLM": ("gritlm", "GritLM"),
+    "JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"),  # noqa: E501
     "LlamaModel": ("llama", "LlamaForCausalLM"),
     **{
         # Multiple models share the same architecture, so we include them all
@@ -124,12 +125,13 @@
     "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"),
     "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"),
-    "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"),  # noqa: E501
     "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"),
     # [Multimodal]
     "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
+    # [Auto-converted (see adapters.py)]
+    "Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForCausalLM"),
 }
 
 _CROSS_ENCODER_MODELS = {
@@ -225,19 +227,10 @@ class _ModelInfo:
 
     @staticmethod
     def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo":
-        is_pooling_model_ = is_pooling_model(model)
-        if not is_pooling_model_:
-            try:
-                as_embedding_model(model)
-            except Exception:
-                pass
-            else:
-                is_pooling_model_ = True
-
         return _ModelInfo(
             architecture=model.__name__,
             is_text_generation_model=is_text_generation_model(model),
-            is_pooling_model=is_pooling_model_,
+            is_pooling_model=True,  # Can convert any model into a pooling model
             supports_cross_encoding=supports_cross_encoding(model),
             supports_multimodal=supports_multimodal(model),
             supports_pp=supports_pp(model),
diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py
index 39c9103527f01..28c37bb96612c 100644
--- a/vllm/model_executor/models/telechat2.py
+++ b/vllm/model_executor/models/telechat2.py
@@ -31,6 +31,19 @@
 
 class TeleChat2Model(LlamaModel):
 
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "transformer.": "model.",
+        },
+        orig_to_new_substr={
+            ".h.": ".layers.",
+            ".self_attention.": ".self_attn.",
+            ".word_embeddings.": ".embed_tokens.",
+            ".dense.": ".o_proj.",
+            ".ln_f.": ".norm.",
+        },
+    )
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         # 1. Initialize the LlamaModel with bias
         vllm_config.model_config.hf_config.bias = True
@@ -111,21 +124,9 @@ def _init_model(self, vllm_config: VllmConfig, prefix: str = ""):
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
 
-        hf_to_vllm_mapper = WeightsMapper(
-            orig_to_new_prefix={
-                "transformer.": "model.",
-            },
-            orig_to_new_substr={
-                ".h.": ".layers.",
-                ".self_attention.": ".self_attn.",
-                ".word_embeddings.": ".embed_tokens.",
-                ".dense.": ".o_proj.",
-                ".ln_f.": ".norm.",
-            },
-        )
         loader = AutoWeightsLoader(
             self,
             skip_prefixes=(["lm_head."]
                            if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights, mapper=hf_to_vllm_mapper)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index ebaa8a4c4f38a..509ad9e580ddf 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -3,7 +3,7 @@
 
 import math
 from functools import cached_property, lru_cache
-from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set,
+from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
                     Tuple, TypedDict, Union)
 
 import numpy as np
@@ -11,7 +11,7 @@
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import functional as F
-from transformers import BatchFeature
+from transformers import BatchFeature, ProcessorMixin
 from transformers.models.whisper import WhisperFeatureExtractor
 from transformers.models.whisper.modeling_whisper import WhisperEncoder
 
@@ -25,11 +25,11 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataDict,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
+from vllm.utils import is_list_of
 
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
@@ -61,8 +61,8 @@ def cached_feature_extractor(model_id: str) -> WhisperFeatureExtractor:
 
 
 def whisper_feature_extractor(ctx: InputContext) -> WhisperFeatureExtractor:
-    return cached_feature_extractor(
-        ctx.get_hf_config(UltravoxConfig).audio_model_id)
+    hf_config = ctx.get_hf_config(UltravoxConfig)
+    return cached_feature_extractor(hf_config.audio_model_id)
 
 
 def get_ultravox_max_audio_tokens(ctx: InputContext):
@@ -73,72 +73,71 @@ def get_ultravox_max_audio_tokens(ctx: InputContext):
 class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
 
     def _get_feature_extractor(self) -> WhisperFeatureExtractor:
-        return self._get_hf_processor().audio_processor.feature_extractor
+        hf_processor = self._get_hf_processor()
+        return hf_processor.audio_processor.feature_extractor  # type: ignore
 
-    def _resample_audio(
+    def _get_processor_data(
         self,
-        audio: np.ndarray,
-        sr: int,
-    ) -> Dict[str, Union[np.ndarray, int]]:
+        mm_items: MultiModalDataItems,
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
         # resample audio to the model's sampling rate
         feature_extractor = self._get_feature_extractor()
-        if sr != feature_extractor.sampling_rate:
-            try:
-                import librosa
-            except ImportError as exc:
-                raise ImportError(
-                    "Please install vllm[audio] for audio support.") from exc
-            audio = librosa.resample(audio,
-                                     orig_sr=sr,
-                                     target_sr=feature_extractor.sampling_rate)
-            sr = feature_extractor.sampling_rate
-        return {"audio": audio, "sampling_rate": sr}
-
-    def _apply_hf_processor(
+        mm_items.resample_audios(feature_extractor.sampling_rate)
+
+        return super()._get_processor_data(mm_items)
+
+    def _call_hf_processor(
         self,
+        hf_processor: ProcessorMixin,
         prompt: str,
-        mm_data: MultiModalDataDict,
+        processor_data: Mapping[str, object],
         mm_processor_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-        if not mm_data or not mm_data.get("audio", None):
-            return super()._apply_hf_processor(prompt, mm_data,
-                                               mm_processor_kwargs)
+        processor_data = dict(processor_data)
+        audios = processor_data.pop("audios", [])
+
+        if not audios:
+            return super()._call_hf_processor(
+                hf_processor,
+                prompt=prompt,
+                processor_data=processor_data,
+                mm_processor_kwargs=mm_processor_kwargs,
+            )
+
+        feature_extractor = self._get_feature_extractor()
+        mm_processor_kwargs = dict(
+            **mm_processor_kwargs,
+            sampling_rate=feature_extractor.sampling_rate,
+        )
 
-        audio_data = mm_data["audio"]
-        if not isinstance(audio_data, list):
-            audio_data = [audio_data]
+        # Already resampled by _get_processor_data
+        assert is_list_of(audios, np.ndarray)
 
         # Ultravox processor doesn't support multiple inputs,
         # therefore we need to input text and audio one by one
-        tokenizer = self._get_tokenizer()
         audio_features, audio_token_len = [], []
-        processed_inputs = {}
-        for audio, sr in audio_data:
-            data = self._resample_audio(audio, sr)
-            processed_inputs = super()._apply_hf_processor(
-                prompt, data, mm_processor_kwargs)
-            prompt = tokenizer.decode(processed_inputs["input_ids"][0],
-                                      skip_special_tokens=False)
-            audio_features.append(
-                processed_inputs.pop("audio_values").squeeze(0))
-            audio_token_len.append(
-                processed_inputs.pop("audio_token_len").item())
-
-        return dict(
-            **processed_inputs,
+        shared_outputs = {}
+        for audio in audios:
+            # NOTE: Ultravox processor accepts "audio" instead of "audios"
+            item_processor_data = dict(**processor_data, audio=audio)
+
+            item_outputs = super()._call_hf_processor(
+                hf_processor,
+                prompt=prompt,
+                processor_data=item_processor_data,
+                mm_processor_kwargs=mm_processor_kwargs,
+            )
+
+            audio_features.append(item_outputs.pop("audio_values")[0])
+            audio_token_len.append(item_outputs.pop("audio_token_len").item())
+            shared_outputs = item_outputs
+
+        combined_outputs = dict(
+            **shared_outputs,
             audio_features=audio_features,
             audio_token_len=audio_token_len,
         )
-
-    def _get_processor_data(
-        self,
-        mm_data: MultiModalDataDict,
-    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-        # Ultravox uses "audio" instead of "audios" as calling keyword
-        processor_data, passthrough_data = super()._get_processor_data(mm_data)
-        if "audios" in processor_data:
-            processor_data["audio"] = processor_data.pop("audios")
-        return processor_data, passthrough_data
+        return BatchFeature(combined_outputs)
 
     def _get_prompt_replacements(
         self,
@@ -147,7 +146,7 @@ def _get_prompt_replacements(
         mm_processor_kwargs: Mapping[str, object],
     ) -> list[PromptReplacement]:
         hf_processor = self._get_hf_processor()
-        placeholder = hf_processor.audio_token_replacement
+        placeholder = hf_processor.audio_token_replacement  # type: ignore
 
         def get_replacement_ultravox(item_idx: int):
             audio_token_len = hf_inputs["audio_token_len"][item_idx]
@@ -171,7 +170,7 @@ def _get_dummy_mm_inputs(
 
         audio_count = mm_counts["audio"]
         audio = np.zeros(audio_len)
-        data = {"audio": [(audio, sampling_rate)] * audio_count}
+        data = {"audio": [audio] * audio_count}
 
         return ProcessorInputs(
             prompt_text="<|audio|>" * audio_count,
@@ -303,6 +302,9 @@ def forward(
 @MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor)
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
 
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."})
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -495,9 +497,7 @@ def sample(
 
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
-        hf_to_vllm_mapper = WeightsMapper(
-            orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."})
 
         loader = AutoWeightsLoader(self,
                                    ignore_unexpected_prefixes=["audio_tower."])
-        return loader.load_weights(weights, mapper=hf_to_vllm_mapper)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index 7a6d7c90f34d5..02d22a5ca62c0 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -328,6 +328,15 @@ def adjust_shard_indexes_for_packing(self, shard_size, shard_offset):
             marlin_tile_size=self.marlin_tile_size)
 
 
+class BlockQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
+    """
+    Scale parameter for weights that use block-wise quantization.
+
+    Defines no members of its own; it only combines the column-parallel
+    and row-parallel parameter base classes.
+    """
+
+
 def permute_param_layout_(param: BasevLLMParameter, input_dim: int,
                           output_dim: int, **kwargs) -> BasevLLMParameter:
     """
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 928c31a2f2843..9255e062e4870 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -11,7 +11,7 @@
 dispatch data processing according to its modality and the target model.
 
 See also:
-    :ref:`input_processing_pipeline`
+    :ref:`input-processing-pipeline`
 """
 
 __all__ = [
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index 1a230602966d4..ed3bb82bf0aaa 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -1,8 +1,17 @@
+import numpy as np
+import numpy.typing as npt
+
 from vllm.inputs.registry import InputContext
+from vllm.utils import PlaceholderModule
 
 from .base import MultiModalPlugin
 from .inputs import AudioItem, MultiModalData, MultiModalKwargs
 
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+
 
 class AudioPlugin(MultiModalPlugin):
     """Plugin for audio data."""
@@ -21,3 +30,12 @@ def _default_input_mapper(
     def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
         raise NotImplementedError(
             "There is no default maximum multimodal tokens")
+
+
+def resample_audio(audio: npt.NDArray[np.floating], *, orig_sr: float,
+                   target_sr: float) -> npt.NDArray[np.floating]:
+    """Resample ``audio`` from ``orig_sr`` to ``target_sr`` using librosa."""
+    # Matching rates need no resampling, so optional librosa is not touched.
+    if orig_sr == target_sr:
+        return audio
+    return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index fe77a4635f7d8..1e5a46946c6c0 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -50,7 +50,7 @@ class MultiModalPlugin(ABC):
     (i.e., the modality of the data).
 
     See also:
-        :ref:`adding_multimodal_plugin`
+        :ref:`adding-multimodal-plugin`
     """
 
     def __init__(self) -> None:
@@ -94,8 +94,8 @@ def register_input_mapper(
         If `None` is provided, then the default input mapper is used instead.
 
         See also:
-            - :ref:`input_processing_pipeline`
-            - :ref:`enabling_multimodal_inputs`
+            - :ref:`input-processing-pipeline`
+            - :ref:`enabling-multimodal-inputs`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -130,8 +130,8 @@ def map_input(
             TypeError: If the data type is not supported.
 
         See also:
-            - :ref:`input_processing_pipeline`
-            - :ref:`enabling_multimodal_inputs`
+            - :ref:`input-processing-pipeline`
+            - :ref:`enabling-multimodal-inputs`
         """
 
         # Avoid circular import
@@ -190,7 +190,7 @@ def register_max_multimodal_tokens(
         If `None` is provided, then the default calculation is used instead.
 
         See also:
-            :ref:`enabling_multimodal_inputs`
+            :ref:`enabling-multimodal-inputs`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -222,7 +222,7 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
         The model is identified by ``model_config``.
 
         See also:
-            :ref:`enabling_multimodal_inputs`
+            :ref:`enabling-multimodal-inputs`
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 97bbce1ce1570..c705e1a3d1554 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -84,3 +84,15 @@ def _default_input_mapper(
 
     def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
         return 3000
+
+
+def rescale_image_size(image: Image.Image,
+                       size_factor: float,
+                       transpose: int = -1) -> Image.Image:
+    """Resize an image by a constant factor, then optionally transpose it."""
+    scaled_size = (int(image.width * size_factor),
+                   int(image.height * size_factor))
+    resized = image.resize(scaled_size)
+    if transpose < 0:
+        return resized
+    return resized.transpose(Image.Transpose(transpose))
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 229a8fbdf5831..9ecae2c1ca2bf 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -15,31 +15,32 @@
 # yapf: disable
 ImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
 """
-A :class:`transformers.image_utils.ImageInput` representing a single image,
-which can be passed to a HuggingFace :code:`ImageProcessor`.
+A :class:`transformers.image_utils.ImageInput` representing a single image
+item, which can be passed to a HuggingFace :code:`ImageProcessor`.
 """
 
 VideoItem: TypeAlias = Union[
-    List[Image],
+    list[Image],
     np.ndarray,
     torch.Tensor,
-    List[np.ndarray],
-    List[torch.Tensor],
+    list[np.ndarray],
+    list[torch.Tensor],
 ]
 """
-
-A :class:`transformers.image_utils.VideoInput` representing a single video,
-which can be passed to a HuggingFace :code:`VideoProcessor`.
+A :class:`transformers.image_utils.VideoInput` representing a single video
+item, which can be passed to a HuggingFace :code:`VideoProcessor`.
 """
 
 AudioItem: TypeAlias = Union[
     np.ndarray,
-    List[float],
-    Tuple[np.ndarray, float],  # DEPRECATED: Use mm_processor_kwargs instead
+    list[float],
+    # `(audio, sampling_rate)`: If the audio's sampling rate is different
+    # from that expected by the model, we need to resample it.
+    tuple[np.ndarray, float],
 ]
 """
-Represents a single audio that can be inputted to a HuggingFace
-:code:`AudioProcessor`.
+Represents a single audio
+item, which can be passed to a HuggingFace :code:`AudioProcessor`.
 """
 # yapf: enable
 
@@ -74,7 +75,7 @@ class MultiModalDataBuiltins(TypedDict, total=False):
     This dictionary also accepts modality keys defined outside
     :class:`MultiModalDataBuiltins` as long as a customized plugin
     is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-    Read more on that :ref:`here <adding_multimodal_plugin>`.
+    Read more on that :ref:`here <adding-multimodal-plugin>`.
 """
 
 
@@ -215,6 +216,9 @@ class MultiModalInputsV2(TypedDict):
     mm_kwargs: MultiModalKwargs
     """Keyword arguments to be directly passed to the model after batching."""
 
+    mm_hashes: NotRequired[List[str]]
+    """The hashes of the multi-modal data."""
+
     mm_placeholders: MultiModalPlaceholderDict
     """
     For each modality, information about the placeholder tokens in
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 339e193eefe20..6baf19d675d50 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -17,6 +17,7 @@
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.utils import flatten_2d_lists, full_groupby, is_list_of
 
+from .audio import resample_audio
 from .inputs import (AudioItem, ImageItem, MultiModalDataDict,
                      MultiModalInputsV2, MultiModalKwargs, PlaceholderRange,
                      VideoItem)
@@ -30,7 +31,7 @@
 @dataclass
 class PromptReplacement:
     modality: str
-    """The modality for which the replacement is made"""
+    """The modality for which the replacement is made."""
 
     target: _PromptSeq
     """The text or token sequence to find and replace."""
@@ -211,20 +212,54 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
     corresponds to a list.
     """
 
+    @staticmethod
+    def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
+        """
+        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
+
+        Values that are not recognized as multi-item inputs are wrapped in
+        a single-element list.
+        """
+        items = MultiModalDataItems()
+
+        for modality, value in data.items():
+            # TODO: Make a separate modality for embedding inputs
+            # to avoid confusion
+            if isinstance(value, torch.Tensor):
+                # Tensors are kept as-is only for the built-in modalities
+                keep_as_is = modality in ("video", "image", "audio")
+            elif modality == "video":
+                # Special case since even a single item can be a list
+                keep_as_is = is_list_of(value, list)
+            else:
+                keep_as_is = isinstance(value, list)
+
+            items[modality] = (  # type: ignore[index]
+                value if keep_as_is else [value])
+
+        return items
+
+    # NOTE: When a field (e.g. `images`) doesn't exist, directly appending to
+    # `self.images` doesn't update this dictionary, which may be confusing.
+    # We annotate the getter methods as `Sequence` to prevent others from
+    # trying to update the list in this way.
     @property
-    def image(self) -> list[ImageItem]:
-        return self["image"]
+    def images(self) -> Sequence[ImageItem]:
+        return self.get("image", [])
 
     @property
-    def video(self) -> list[VideoItem]:
-        return self["video"]
+    def videos(self) -> Sequence[VideoItem]:
+        return self.get("video", [])
 
     @property
-    def audio(self) -> list[AudioItem]:
-        return self["audio"]
+    def audios(self) -> Sequence[AudioItem]:
+        return self.get("audio", [])
+
+    def get_item_counts(self) -> Mapping[str, int]:
+        return {modality: len(self[modality]) for modality in self}
 
     def get_image_size(self, item_idx: int) -> ImageSize:
-        image = self.image[item_idx]
+        image = self.images[item_idx]
 
         if isinstance(image, Image):
             return ImageSize(*image.size)
@@ -234,25 +269,41 @@ def get_image_size(self, item_idx: int) -> ImageSize:
 
         assert_never(image)
 
+    def get_audio_with_sr(
+        self,
+        item_idx: int,
+        *,
+        default_sr: float,
+    ) -> tuple[np.ndarray, float]:
+        """Return item ``item_idx`` as ``(audio, sr)``, using ``default_sr``
+        when the item carries no sampling rate of its own."""
+        item = self.audios[item_idx]
+        if isinstance(item, tuple):
+            return item
+        if isinstance(item, list):
+            return np.array(item), default_sr
+        if not isinstance(item, np.ndarray):
+            assert_never(item)
+        return item, default_sr
+
+    def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None:
+        """
+        If :code:`drop_sr=True`, the audio items in this dictionary are updated
+        to be NumPy arrays which implicitly means that their sampling rate is
+        the same as the model's expected sampling rate; otherwise, they remain
+        as :code:`(audio, new_sr)` tuples.
+        """
+        if not self.audios:
+            return
 
-def to_multi_format(data: MultiModalDataDict) -> MultiModalDataItems:
-    """
-    Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
-    """
-    multi_data = MultiModalDataItems()
-
-    for k, v in data.items():
-        # yapf: disable
-        if k == "video":
-            # Special case since even a single item can be a list
-            multi_data[k] = v if is_list_of(v, list) else [v]  # type: ignore[index]
-        elif k in ("image", "audio"):
-            multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
-        else:
-            multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
-        # yapf: enable
+        new_audios = []
+        for item_idx in range(len(self.audios)):
+            audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr)
+            audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr)
+
+            new_audios.append(audio if drop_sr else (audio, new_sr))
 
-    return multi_data
+        self["audio"] = new_audios
 
 
 class _TokenMatch(NamedTuple):
@@ -567,6 +618,12 @@ def _get_hf_processor(self) -> ProcessorMixin:
     def _get_tokenizer(self) -> AnyTokenizer:
         return self.ctx.tokenizer
 
+    def _get_mm_items(self,
+                      mm_data: MultiModalDataDict) -> MultiModalDataItems:
+        """Convert ``mm_data`` via :meth:`MultiModalDataItems.from_dict`."""
+        items = MultiModalDataItems.from_dict(mm_data)
+        return items
+
     @abstractmethod
     def _get_prompt_replacements(
         self,
@@ -596,18 +653,20 @@ def _find_placeholders(
 
     def _get_processor_data(
         self,
-        mm_data: MultiModalDataDict,
-    ) -> BatchFeature:
+        mm_items: MultiModalDataItems,
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
         processor_data = dict[str, Any]()
         passthrough_data = dict[str, Any]()
-        for k, v in mm_data.items():
+
+        for k, v in mm_items.items():
             # TODO: Make a separate modality for embedding inputs
             # to avoid confusion
             if k in ("image", "video", "audio"):
                 if isinstance(v, torch.Tensor) and v.ndim == 3:
                     # Pass through embedding inputs (single)
                     passthrough_data[f"{k}_embeds"] = [v]
-                elif is_list_of(v, torch.Tensor) and v[0].ndim == 2:
+                elif (is_list_of(v, torch.Tensor) and len(v) > 0
+                      and v[0].ndim == 2):
                     # Pass through embedding inputs (multi)
                     passthrough_data[f"{k}_embeds"] = v
                 else:
@@ -615,40 +674,41 @@ def _get_processor_data(
                     processor_data[f"{k}s"] = v
             else:
                 processor_data[k] = v
+
         return processor_data, passthrough_data
 
+    def _call_hf_processor(
+        self,
+        hf_processor: ProcessorMixin,
+        prompt: str,
+        processor_data: Mapping[str, object],
+        mm_processor_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        """
+        Forward every argument, in order, to ``self.ctx.call_hf_processor``
+        and return its result.
+        """
+        call_args = (hf_processor, prompt, processor_data, mm_processor_kwargs)
+        return self.ctx.call_hf_processor(*call_args)
+
     def _apply_hf_processor(
         self,
         prompt: str,
-        mm_data: MultiModalDataDict,
+        mm_items: MultiModalDataItems,
         mm_processor_kwargs: Mapping[str, object],
     ) -> BatchFeature:
         # some mm_processor_kwargs may be used in processor initialization
         # instead of processor call
         hf_processor = self._get_hf_processor(**mm_processor_kwargs)
 
-        processor_data, passthrough_data = self._get_processor_data(mm_data)
+        processor_data, passthrough_data = self._get_processor_data(mm_items)
 
-        assert callable(hf_processor)
-        mm_processor_kwargs = self.ctx.resolve_hf_processor_call_kwargs(
+        hf_inputs = self._call_hf_processor(
             hf_processor,
-            mm_processor_kwargs,
+            prompt=prompt,
+            processor_data=processor_data,
+            mm_processor_kwargs=mm_processor_kwargs,
         )
-
-        try:
-            hf_inputs = hf_processor(
-                text=prompt,  # type: ignore
-                **processor_data,
-                **mm_processor_kwargs,
-                return_tensors="pt",
-            )
-        except Exception as exc:
-            data = dict(text=prompt, **processor_data)
-
-            raise RuntimeError(
-                f"Failed to apply {type(hf_processor).__name__} "
-                f"on data={data} with kwargs={mm_processor_kwargs}") from exc
-
         hf_inputs.update(passthrough_data)
 
         return hf_inputs
@@ -730,25 +790,25 @@ def apply(
         3. Extract information about the placeholder tokens from the
            processed token IDs.
         """
-        tokenizer = self._get_tokenizer()
+        mm_items = self._get_mm_items(mm_data)
 
-        hf_inputs = self._apply_hf_processor(prompt_text, mm_data,
+        hf_inputs = self._apply_hf_processor(prompt_text, mm_items,
                                              mm_processor_kwargs)
         prompt_ids, = hf_inputs.pop("input_ids").tolist()
         mm_kwargs = MultiModalKwargs(hf_inputs)
 
-        mm_items = to_multi_format(mm_data)
         prompt_repls = self._get_prompt_replacements(mm_items, hf_inputs,
                                                      mm_processor_kwargs)
         all_prompt_repls = self._bind_prompt_replacements(prompt_repls)
 
         # If HF processor already inserts placeholder tokens,
         # there is no need for us to insert them
-        mm_item_counts = {m: len(items) for m, items in mm_items.items()}
+        mm_item_counts = mm_items.get_item_counts()
         all_placeholders = self._find_placeholders(all_prompt_repls,
                                                    prompt_ids, mm_item_counts)
 
         if all_placeholders:
+            tokenizer = self._get_tokenizer()
             prompt_text = _decode(tokenizer, prompt_ids)
         else:
             (
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 6cd79d414c978..ded45a7184b5d 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -76,7 +76,7 @@ def register_plugin(self, plugin: MultiModalPlugin) -> None:
         Register a multi-modal plugin so it can be recognized by vLLM.
 
         See also:
-            :ref:`adding_multimodal_plugin`
+            :ref:`adding-multimodal-plugin`
         """
         data_type_key = plugin.get_data_key()
 
@@ -311,8 +311,8 @@ def register_processor(
         invoked to transform the data into a dictionary of model inputs.
 
         See also:
-            - :ref:`input_processing_pipeline`
-            - :ref:`enabling_multimodal_inputs`
+            - :ref:`input-processing-pipeline`
+            - :ref:`enabling-multimodal-inputs`
         """
 
         def wrapper(model_cls: N) -> N:
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index c898ca4e6573e..a49da2bdee972 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -2,7 +2,7 @@
 import os
 from functools import lru_cache
 from io import BytesIO
-from typing import Any, List, Optional, Tuple, TypeVar, Union
+from typing import List, Optional, Tuple, TypeVar, Union
 
 import numpy as np
 import numpy.typing as npt
@@ -13,9 +13,25 @@
 from vllm.connections import global_http_connection
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
+from vllm.utils import PlaceholderModule
 
 from .inputs import MultiModalDataDict, PlaceholderRange
 
+try:
+    import decord
+except ImportError:
+    decord = PlaceholderModule("decord")  # type: ignore[assignment]
+
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+
+try:
+    import soundfile
+except ImportError:
+    soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]
+
 logger = init_logger(__name__)
 
 cached_get_tokenizer = lru_cache(get_tokenizer)
@@ -125,19 +141,7 @@ async def async_fetch_image(image_url: str,
     return image.convert(image_mode)
 
 
-def _load_video_frames_from_bytes(b: bytes):
-    frame = Image.open(BytesIO(b))
-    return np.array(frame)
-
-
-def load_video_frames_from_base64(frame: Union[bytes, str]):
-    """Load frame from base64 format."""
-    return _load_video_frames_from_bytes(base64.b64decode(frame))
-
-
-def _load_video_from_bytes(b: bytes, num_frames: int = 32):
-    _, decord = try_import_video_packages()
-
+def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray:
     video_path = BytesIO(b)
     vr = decord.VideoReader(video_path, num_threads=1)
     total_frame_num = len(vr)
@@ -155,13 +159,17 @@ def _load_video_from_bytes(b: bytes, num_frames: int = 32):
     return frames
 
 
-def _load_video_from_data_url(video_url: str):
-    # Only split once and assume the second part is the base64 encoded image
-    frames_base64 = video_url.split(",")[1:]
-    return np.stack([
-        load_video_frames_from_base64(frame_base64)
-        for frame_base64 in frames_base64
-    ])
+def _load_video_from_data_url(video_url: str) -> npt.NDArray:
+    # Only split once and assume the second part is the base64 encoded video
+    _, video_base64 = video_url.split(",", 1)
+
+    if video_url.startswith("data:video/jpeg;"):
+        return np.stack([
+            np.array(load_image_from_base64(frame_base64))
+            for frame_base64 in video_base64.split(",")
+        ])
+
+    return load_video_from_base64(video_base64)
 
 
 def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray:
@@ -204,22 +212,10 @@ async def async_fetch_video(video_url: str,
     return video
 
 
-def try_import_audio_packages() -> Tuple[Any, Any]:
-    try:
-        import librosa
-        import soundfile
-    except ImportError as exc:
-        raise ImportError(
-            "Please install vllm[audio] for audio support.") from exc
-    return librosa, soundfile
-
-
 def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
     """
     Load audio from a URL.
     """
-    librosa, _ = try_import_audio_packages()
-
     if audio_url.startswith("http"):
         audio_bytes = global_http_connection.get_bytes(
             audio_url,
@@ -240,8 +236,6 @@ async def async_fetch_audio(
     """
     Asynchronously fetch audio from a URL.
     """
-    librosa, _ = try_import_audio_packages()
-
     if audio_url.startswith("http"):
         audio_bytes = await global_http_connection.async_get_bytes(
             audio_url,
@@ -300,8 +294,6 @@ def encode_audio_base64(
     sampling_rate: int,
 ) -> str:
     """Encode audio as base64."""
-    _, soundfile = try_import_audio_packages()
-
     buffered = BytesIO()
     soundfile.write(buffered, audio, sampling_rate, format="WAV")
 
@@ -330,61 +322,7 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
     return _load_image_from_bytes(base64.b64decode(image))
 
 
-def rescale_image_size(image: Image.Image,
-                       size_factor: float,
-                       transpose: int = -1) -> Image.Image:
-    """Rescale the dimensions of an image by a constant factor."""
-    new_width = int(image.width * size_factor)
-    new_height = int(image.height * size_factor)
-    image = image.resize((new_width, new_height))
-    if transpose >= 0:
-        image = image.transpose(Image.Transpose(transpose))
-    return image
-
-
-def try_import_video_packages() -> Any:
-    try:
-        import cv2
-        import decord
-    except ImportError as exc:
-        raise ImportError(
-            "Please install vllm[video] for video support.") from exc
-    return cv2, decord
-
-
-def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
-    cv2, _ = try_import_video_packages()
-
-    num_frames, _, _, channels = frames.shape
-    new_height, new_width = size
-    resized_frames = np.empty((num_frames, new_height, new_width, channels),
-                              dtype=frames.dtype)
-    for i, frame in enumerate(frames):
-        resized_frame = cv2.resize(frame, (new_width, new_height))
-        resized_frames[i] = resized_frame
-    return resized_frames
-
-
-def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
-    _, height, width, _ = frames.shape
-    new_height = int(height * size_factor)
-    new_width = int(width * size_factor)
-
-    return resize_video(frames, (new_height, new_width))
-
-
-def sample_frames_from_video(frames: npt.NDArray,
-                             num_frames: int) -> npt.NDArray:
-    total_frames = frames.shape[0]
-    if num_frames == -1:
-        return frames
-    else:
-        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
-        sampled_frames = frames[frame_indices, ...]
-        return sampled_frames
-
-
-def encode_video_base64(frames: npt.NDArray):
+def encode_video_base64(frames: npt.NDArray) -> str:
     base64_frames = []
     frames_list = [frames[i] for i in range(frames.shape[0])]
     for frame in frames_list:
@@ -393,6 +331,11 @@ def encode_video_base64(frames: npt.NDArray):
     return ",".join(base64_frames)
 
 
+def load_video_from_base64(video: Union[bytes, str]) -> npt.NDArray:
+    """Load video from base64 format."""
+    return _load_video_from_bytes(base64.b64decode(video))
+
+
 def resolve_visual_encoder_outputs(
     encoder_outputs: Union[torch.Tensor, list[torch.Tensor]],
     feature_sample_layers: Optional[list[int]],
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index ba9bf58a4a20c..c4be100562703 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,7 +1,9 @@
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Dict, Optional
 
+import cv2
 import numpy as np
+import numpy.typing as npt
 
 from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
@@ -75,3 +77,33 @@ def _default_input_mapper(
 
     def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
         return 4096
+
+
+def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
+    """Resize each frame of a 4-D frame array to ``size`` (height, width)."""
+    count, _, _, channels = frames.shape
+    target_h, target_w = size
+    out = np.empty((count, target_h, target_w, channels), dtype=frames.dtype)
+    # The size tuple handed to cv2.resize is ordered (width, height).
+    for idx, frame in enumerate(frames):
+        out[idx] = cv2.resize(frame, (target_w, target_h))
+    return out
+
+
+def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
+    """Resize all frames by ``size_factor``, truncating to whole pixels."""
+    _, old_height, old_width, _ = frames.shape
+    scaled = (int(old_height * size_factor), int(old_width * size_factor))
+    # resize_video takes its target size as (height, width).
+    return resize_video(frames, scaled)
+
+
+def sample_frames_from_video(frames: npt.NDArray,
+                             num_frames: int) -> npt.NDArray:
+    """Pick ``num_frames`` evenly spaced frames; ``-1`` returns all frames."""
+    if num_frames == -1:
+        return frames
+
+    last_index = frames.shape[0] - 1
+    picked = np.linspace(0, last_index, num_frames, dtype=int)
+    return frames[picked, ...]
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 2ecdf74ee59b3..b519c159b1531 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -355,7 +355,8 @@ def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput":
         pooled_data = seq_group.pooled_data
         assert pooled_data is not None
 
-        output = PoolingOutput(pooled_data)
+        data = pooled_data.to(dtype=torch.float32, device="cpu")
+        output = PoolingOutput(data)
         prompt_token_ids = seq_group.prompt_token_ids
         finished = seq_group.is_finished()
 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index d95a2b4cd5565..09bde9f065eaa 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -50,7 +50,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         import vllm.envs as envs
         from vllm.utils import GiB_bytes
         model_config = vllm_config.model_config
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
         # If the feature combo become valid
         if not model_config.enforce_eager:
             logger.warning(
diff --git a/vllm/scripts.py b/vllm/scripts.py
index a51c21cfa29e7..42e1c639eda10 100644
--- a/vllm/scripts.py
+++ b/vllm/scripts.py
@@ -165,7 +165,7 @@ def main():
         required=False,
         help="Read CLI options from a config file."
         "Must be a YAML with the following options:"
-        "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server"
+        "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference"
     )
     serve_parser = make_arg_parser(serve_parser)
     serve_parser.set_defaults(dispatch_function=serve)
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 2689802161987..de593113b938b 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -108,7 +108,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
     return spec_decode_worker
 
 
-# Reminder: Please update docs/source/usage/compatibility_matrix.rst
+# Reminder: Please update docs/source/usage/compatibility_matrix.md
 # If the feature combo become valid
 class SpecDecodeWorker(LoraNotSupportedWorkerBase):
     """Worker which implements speculative decoding.
diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py
new file mode 100644
index 0000000000000..6ae68161bbd97
--- /dev/null
+++ b/vllm/transformers_utils/s3_utils.py
@@ -0,0 +1,151 @@
+import fnmatch
+import os
+import shutil
+import signal
+import tempfile
+from pathlib import Path
+from typing import Optional
+
+from vllm.utils import PlaceholderModule
+
+try:
+    import boto3
+except ImportError:
+    boto3 = PlaceholderModule("boto3")  # type: ignore[assignment]
+
+
+def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
+    """Keep only the paths matching at least one fnmatch pattern."""
+    def allowed(candidate: str) -> bool:
+        return any(fnmatch.fnmatch(candidate, pat) for pat in patterns)
+    return [candidate for candidate in paths if allowed(candidate)]
+
+
+def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
+    """Drop every path matching at least one fnmatch pattern."""
+    def ignored(candidate: str) -> bool:
+        return any(fnmatch.fnmatch(candidate, pat) for pat in patterns)
+    return [candidate for candidate in paths if not ignored(candidate)]
+
+
+def glob(s3=None,
+         path: str = "",
+         allow_pattern: Optional[list[str]] = None) -> list[str]:
+    """
+    Return the full ``s3://`` names of files under an S3 path.
+
+    Args:
+        s3: S3 client to use; a boto3 client is created when None.
+        path: The S3 path to list from.
+        allow_pattern: A list of patterns of which files to keep.
+
+    Returns:
+        list[str]: Full S3 paths of the files allowed by the pattern.
+    """
+    client = boto3.client("s3") if s3 is None else s3
+    bucket, _, keys = list_files(client,
+                                 path=path,
+                                 allow_pattern=allow_pattern)
+    # Prefix each listed key with the scheme and bucket name.
+    return [f"s3://{bucket}/{key}" for key in keys]
+
+
+def list_files(
+        s3,
+        path: str,
+        allow_pattern: Optional[list[str]] = None,
+        ignore_pattern: Optional[list[str]] = None
+) -> tuple[str, str, list[str]]:
+    """
+    List files from S3 path and filter by pattern.
+
+    Args:
+        s3: S3 client to use.
+        path: The S3 path to list from.
+        allow_pattern: A list of patterns of which files to pull.
+        ignore_pattern: A list of patterns of which files not to pull.
+
+    Returns:
+        tuple[str, str, list[str]]: A tuple where:
+            - The first element is the bucket name
+            - The second element is the key prefix (the path with the
+              scheme and bucket name removed)
+            - The third element is the list of object keys that remain
+              after the allow/ignore patterns are applied
+    """
+    parts = path.removeprefix('s3://').split('/')
+    prefix = '/'.join(parts[1:])
+    bucket_name = parts[0]
+    # NOTE(review): single list_objects_v2 call; results are not paginated.
+    objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
+    paths = [obj['Key'] for obj in objects.get('Contents', [])]
+    # Drop keys ending in "/" before applying the caller's patterns.
+    paths = _filter_ignore(paths, ["*/"])
+    if allow_pattern is not None:
+        paths = _filter_allow(paths, allow_pattern)
+
+    if ignore_pattern is not None:
+        paths = _filter_ignore(paths, ignore_pattern)
+
+    return bucket_name, prefix, paths
+
+
+class S3Model:
+    """
+    A class representing a S3 model mirrored into a temporary directory.
+
+    Attributes:
+        s3: S3 client.
+        dir: The temporary created directory.
+
+    Methods:
+        pull_files(): Pull model from S3 to the temporary directory.
+    """
+
+    def __init__(self) -> None:
+        # Create the directory first so _close() always finds self.dir.
+        self.dir = tempfile.mkdtemp()
+        self.s3 = boto3.client('s3')
+        for sig in (signal.SIGINT, signal.SIGTERM):
+            signal.signal(sig, self._close_by_signal(signal.getsignal(sig)))
+
+    def __del__(self):
+        self._close()
+
+    def _close(self) -> None:
+        if os.path.exists(self.dir):
+            shutil.rmtree(self.dir)
+
+    def _close_by_signal(self, existing_handler=None):
+        # Build a handler that cleans up, then chains to the previous one.
+        def new_handler(signum, frame):
+            self._close()
+            # SIG_DFL / SIG_IGN / None are not callable; only chain functions.
+            if callable(existing_handler):
+                existing_handler(signum, frame)
+        return new_handler
+
+    def pull_files(self,
+                   s3_model_path: str = "",
+                   allow_pattern: Optional[list[str]] = None,
+                   ignore_pattern: Optional[list[str]] = None) -> None:
+        """
+        Pull files from S3 storage into the temporary directory.
+
+        Args:
+            s3_model_path: The S3 path of the model.
+            allow_pattern: A list of patterns of which files to pull.
+            ignore_pattern: A list of patterns of which files not to pull.
+
+        """
+        bucket_name, base_dir, files = list_files(self.s3, s3_model_path,
+                                                  allow_pattern,
+                                                  ignore_pattern)
+
+        for file in files:
+            # Make the key relative to the prefix and drop any leading "/"
+            # so it is always joined *inside* self.dir.
+            rel_path = file.removeprefix(base_dir).lstrip("/")
+            destination_file = os.path.join(self.dir, rel_path)
+            os.makedirs(Path(destination_file).parent, exist_ok=True)
+            self.s3.download_file(bucket_name, file, destination_file)
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 54f9f895fe541..e6701f4c4b835 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -132,7 +132,7 @@ def get_tokenizer(
     if is_from_mistral_org and tokenizer_mode != "mistral":
         warnings.warn(
             'It is strongly recommended to run mistral models with '
-            '`--tokenizer_mode "mistral"` to ensure correct '
+            '`--tokenizer-mode "mistral"` to ensure correct '
             'encoding and decoding.',
             FutureWarning,
             stacklevel=2)
diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
index 761b07f34d2f9..95a8f7098bbac 100644
--- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
@@ -22,7 +22,7 @@ def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
         self.max_input_length = max_input_length
         self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
         max_loras = tokenizer_config.get("max_loras", 0)
-        self.lora_tokenizers = LRUCache[AnyTokenizer](
+        self.lora_tokenizers = LRUCache[int, AnyTokenizer](
             capacity=max(max_loras, max_num_seqs) if enable_lora else 0)
 
     @classmethod
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 83b3c37d6f04c..17d722e3d88fe 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -314,12 +314,15 @@ def _token_to_id(t: str):
 
             if regular_tokens:
                 decoded_list.append(
-                    self.decode(regular_tokens))  # type: ignore
+                    self.tokenizer.decode(regular_tokens))  # type: ignore
 
             decoded = ''.join(decoded_list)
 
         return decoded
 
+    # WARN: Outlines logits processors can overwrite this method.
+    # See: guided_decoding/outlines_logits_processors.py::_adapt_tokenizer
+    # for more.
     def decode(self,
                ids: Union[List[int], int],
                skip_special_tokens: bool = True) -> str:
diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py
index 7a9041b04fbb9..10a09fb4f566c 100644
--- a/vllm/transformers_utils/utils.py
+++ b/vllm/transformers_utils/utils.py
@@ -3,6 +3,10 @@
 from typing import Union
 
 
+def is_s3(model_or_path: str) -> bool:
+    return model_or_path.lower().startswith('s3://')
+
+
 def check_gguf_file(model: Union[str, PathLike]) -> bool:
     """Check if the file is a GGUF model."""
     model = Path(model)
diff --git a/vllm/utils.py b/vllm/utils.py
index 38c7dea6d2d3d..3d198887021dc 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -6,10 +6,13 @@
 import enum
 import gc
 import getpass
+import importlib.metadata
 import importlib.util
 import inspect
 import ipaddress
 import os
+import re
+import resource
 import signal
 import socket
 import subprocess
@@ -21,14 +24,13 @@
 import warnings
 import weakref
 from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
-from collections import UserDict, defaultdict
+from collections import OrderedDict, UserDict, defaultdict
 from collections.abc import Iterable, Mapping
 from dataclasses import dataclass, field
 from functools import lru_cache, partial, wraps
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
                     Dict, Generator, Generic, Hashable, List, Literal,
-                    Optional, OrderedDict, Set, Tuple, Type, TypeVar, Union,
-                    overload)
+                    Optional, Tuple, Type, TypeVar, Union, overload)
 from uuid import uuid4
 
 import numpy as np
@@ -52,7 +54,7 @@
 
 # Exception strings for non-implemented encoder/decoder scenarios
 
-# Reminder: Please update docs/source/usage/compatibility_matrix.rst
+# Reminder: Please update docs/source/usage/compatibility_matrix.md
 # If the feature combo become valid
 
 STR_NOT_IMPL_ENC_DEC_SWA = \
@@ -154,10 +156,12 @@
 }
 
 P = ParamSpec('P')
-K = TypeVar("K")
 T = TypeVar("T")
 U = TypeVar("U")
 
+_K = TypeVar("_K", bound=Hashable)
+_V = TypeVar("_V")
+
 
 class _Sentinel:
     ...
@@ -190,50 +194,48 @@ def reset(self) -> None:
         self.counter = 0
 
 
-class LRUCache(Generic[T]):
+class LRUCache(Generic[_K, _V]):
 
-    def __init__(self, capacity: int):
-        self.cache: OrderedDict[Hashable, T] = OrderedDict()
-        self.pinned_items: Set[Hashable] = set()
+    def __init__(self, capacity: int) -> None:
+        self.cache = OrderedDict[_K, _V]()
+        self.pinned_items = set[_K]()
         self.capacity = capacity
 
-    def __contains__(self, key: Hashable) -> bool:
+    def __contains__(self, key: _K) -> bool:
         return key in self.cache
 
     def __len__(self) -> int:
         return len(self.cache)
 
-    def __getitem__(self, key: Hashable) -> T:
+    def __getitem__(self, key: _K) -> _V:
         value = self.cache[key]  # Raise KeyError if not exists
         self.cache.move_to_end(key)
         return value
 
-    def __setitem__(self, key: Hashable, value: T) -> None:
+    def __setitem__(self, key: _K, value: _V) -> None:
         self.put(key, value)
 
-    def __delitem__(self, key: Hashable) -> None:
+    def __delitem__(self, key: _K) -> None:
         self.pop(key)
 
-    def touch(self, key: Hashable) -> None:
+    def touch(self, key: _K) -> None:
         self.cache.move_to_end(key)
 
-    def get(self,
-            key: Hashable,
-            default_value: Optional[T] = None) -> Optional[T]:
-        value: Optional[T]
+    def get(self, key: _K, default: Optional[_V] = None) -> Optional[_V]:
+        value: Optional[_V]
         if key in self.cache:
             value = self.cache[key]
             self.cache.move_to_end(key)
         else:
-            value = default_value
+            value = default
         return value
 
-    def put(self, key: Hashable, value: T) -> None:
+    def put(self, key: _K, value: _V) -> None:
         self.cache[key] = value
         self.cache.move_to_end(key)
         self._remove_old_if_needed()
 
-    def pin(self, key: Hashable) -> None:
+    def pin(self, key: _K) -> None:
         """
         Pins a key in the cache preventing it from being
         evicted in the LRU order.
@@ -242,13 +244,13 @@ def pin(self, key: Hashable) -> None:
             raise ValueError(f"Cannot pin key: {key} not in cache.")
         self.pinned_items.add(key)
 
-    def _unpin(self, key: Hashable) -> None:
+    def _unpin(self, key: _K) -> None:
         self.pinned_items.remove(key)
 
-    def _on_remove(self, key: Hashable, value: Optional[T]):
+    def _on_remove(self, key: _K, value: Optional[_V]) -> None:
         pass
 
-    def remove_oldest(self, remove_pinned=False):
+    def remove_oldest(self, *, remove_pinned: bool = False) -> None:
         if not self.cache:
             return
 
@@ -262,17 +264,15 @@ def remove_oldest(self, remove_pinned=False):
                                    "cannot remove oldest from the cache.")
         else:
             lru_key = next(iter(self.cache))
-        self.pop(lru_key)
+        self.pop(lru_key)  # type: ignore
 
     def _remove_old_if_needed(self) -> None:
         while len(self.cache) > self.capacity:
             self.remove_oldest()
 
-    def pop(self,
-            key: Hashable,
-            default_value: Optional[T] = None) -> Optional[T]:
+    def pop(self, key: _K, default: Optional[_V] = None) -> Optional[_V]:
         run_on_remove = key in self.cache
-        value: Optional[T] = self.cache.pop(key, default_value)
+        value = self.cache.pop(key, default)
         # remove from pinned items
         if key in self.pinned_items:
             self._unpin(key)
@@ -280,7 +280,7 @@ def pop(self,
             self._on_remove(key, value)
         return value
 
-    def clear(self):
+    def clear(self) -> None:
         while len(self.cache) > 0:
             self.remove_oldest(remove_pinned=True)
         self.cache.clear()
@@ -775,7 +775,7 @@ def get_dtype_size(dtype: torch.dtype) -> int:
 # `collections` helpers
 def is_list_of(
     value: object,
-    typ: Type[T],
+    typ: Union[type[T], tuple[type[T], ...]],
     *,
     check: Literal["first", "all"] = "first",
 ) -> TypeIs[List[T]]:
@@ -843,10 +843,6 @@ def flatten_2d_lists(lists: List[List[T]]) -> List[T]:
     return [item for sublist in lists for item in sublist]
 
 
-_K = TypeVar("_K", bound=Hashable)
-_V = TypeVar("_V")
-
-
 def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]):
     """
     Unlike :class:`itertools.groupby`, groups are not broken by
@@ -1282,6 +1278,7 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
 def supports_kw(
     callable: Callable[..., object],
     kw_name: str,
+    *,
     requires_kw_only: bool = False,
     allow_var_kwargs: bool = True,
 ) -> bool:
@@ -1326,6 +1323,8 @@ def resolve_mm_processor_kwargs(
     init_kwargs: Optional[Mapping[str, object]],
     inference_kwargs: Optional[Mapping[str, object]],
     callable: Callable[..., object],
+    *,
+    requires_kw_only: bool = True,
     allow_var_kwargs: bool = False,
 ) -> Dict[str, Any]:
     """Applies filtering to eliminate invalid mm_processor_kwargs, i.e.,
@@ -1344,11 +1343,17 @@ def resolve_mm_processor_kwargs(
     runtime_mm_kwargs = get_allowed_kwarg_only_overrides(
         callable,
         overrides=inference_kwargs,
-        allow_var_kwargs=allow_var_kwargs)
+        requires_kw_only=requires_kw_only,
+        allow_var_kwargs=allow_var_kwargs,
+    )
 
     # Filter init time multimodal processor kwargs provided
     init_mm_kwargs = get_allowed_kwarg_only_overrides(
-        callable, overrides=init_kwargs, allow_var_kwargs=allow_var_kwargs)
+        callable,
+        overrides=init_kwargs,
+        requires_kw_only=requires_kw_only,
+        allow_var_kwargs=allow_var_kwargs,
+    )
 
     # Merge the final processor kwargs, prioritizing inference
     # time values over the initialization time values.
@@ -1359,6 +1364,8 @@ def resolve_mm_processor_kwargs(
 def get_allowed_kwarg_only_overrides(
     callable: Callable[..., object],
     overrides: Optional[Mapping[str, object]],
+    *,
+    requires_kw_only: bool = True,
     allow_var_kwargs: bool = False,
 ) -> Dict[str, Any]:
     """
@@ -1390,16 +1397,21 @@ def get_allowed_kwarg_only_overrides(
         for kwarg_name, val in overrides.items()
         if supports_kw(callable,
                        kwarg_name,
-                       requires_kw_only=True,
+                       requires_kw_only=requires_kw_only,
                        allow_var_kwargs=allow_var_kwargs)
     }
 
     # If anything is dropped, log a warning
     dropped_keys = overrides.keys() - filtered_overrides.keys()
     if dropped_keys:
-        logger.warning(
-            "The following intended overrides are not keyword-only args "
-            "and and will be dropped: %s", dropped_keys)
+        if requires_kw_only:
+            logger.warning(
+                "The following intended overrides are not keyword-only args "
+                "and will be dropped: %s", dropped_keys)
+        else:
+            logger.warning(
+                "The following intended overrides are not keyword args "
+                "and will be dropped: %s", dropped_keys)
 
     return filtered_overrides
 
@@ -1541,6 +1553,67 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
     return module
 
 
+@lru_cache(maxsize=None)
+def get_vllm_optional_dependencies():
+    """Map each extra declared in vLLM's metadata to its requirement names."""
+    metadata = importlib.metadata.metadata("vllm")
+    requirements = metadata.get_all("Requires-Dist", [])
+    extras = metadata.get_all("Provides-Extra", [])
+    return {
+        extra: [
+            re.split(r";|>=|<=|==", req)[0] for req in requirements
+            if req.endswith(f'extra == "{extra}"')
+        ]
+        for extra in extras
+    }
+
+
+@dataclass(frozen=True)
+class PlaceholderModule:
+    """
+    A placeholder object to use when a module does not exist.
+
+    This enables more informative errors when trying to access attributes
+    of a module that does not exist.
+    """
+    name: str
+
+    def placeholder_attr(self, attr_path: str):
+        return _PlaceholderModuleAttr(self, attr_path)
+
+    def __getattr__(self, key: str):
+        name = self.name
+        # Retry the real import so the raised error chains from its failure.
+        try:
+            importlib.import_module(self.name)
+        except ImportError as exc:
+            for extra, names in get_vllm_optional_dependencies().items():
+                if name in names:
+                    msg = f"Please install vllm[{extra}] for {extra} support"
+                    raise ImportError(msg) from exc
+            # Not listed under any extra: re-raise the import error as-is.
+            raise exc
+
+        raise AssertionError("PlaceholderModule should not be used "
+                             "when the original module can be imported")
+
+
+@dataclass(frozen=True)
+class _PlaceholderModuleAttr:
+    module: PlaceholderModule
+    attr_path: str
+
+    def placeholder_attr(self, attr_path: str):
+        return _PlaceholderModuleAttr(self.module,
+                                      f"{self.attr_path}.{attr_path}")
+
+    def __getattr__(self, key: str):
+        getattr(self.module, f"{self.attr_path}.{key}")
+        # The lookup above is expected to raise; reaching here is a bug.
+        raise AssertionError("PlaceholderModule should not be used "
+                             "when the original module can be imported")
+
+
 # create a library to hold the custom op
 vllm_lib = Library("vllm", "FRAGMENT")  # noqa
 
@@ -1568,8 +1641,18 @@ def direct_register_custom_op(
     library object. If you want to bind the operator to a different library,
     make sure the library object is alive when the operator is used.
     """
-    if is_in_doc_build() or not supports_custom_op():
+    if is_in_doc_build():
         return
+
+    if not supports_custom_op():
+        assert not current_platform.is_cuda_alike(), (
+            "cuda platform needs torch>=2.4 to support custom op, "
+            "chances are you are using an old version of pytorch "
+            "or a custom build of pytorch. It is recommended to "
+            "use vLLM in a fresh new environment and let it install "
+            "the required dependencies.")
+        return
+
     import torch.library
     if hasattr(torch.library, "infer_schema"):
         schema_str = torch.library.infer_schema(op_func,
@@ -1736,3 +1819,20 @@ def memory_profiling(
     result.non_torch_increase_in_bytes = current_cuda_memory_bytes - baseline_memory_in_bytes - weights_memory_in_bytes - diff.torch_memory_in_bytes  # noqa
     result.profile_time = diff.timestamp
     result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes  # noqa
+
+
+# Adapted from: https://github.com/sgl-project/sglang/blob/f46f394f4d4dbe4aae85403dec006199b34d2840/python/sglang/srt/utils.py#L630 # noqa: E501
+def set_ulimit(target_soft_limit=65535):
+    """Raise the soft RLIMIT_NOFILE to `target_soft_limit` if it is lower."""
+    resource_type = resource.RLIMIT_NOFILE
+    current_soft, current_hard = resource.getrlimit(resource_type)
+    if current_soft < target_soft_limit:
+        try:
+            resource.setrlimit(resource_type,
+                               (target_soft_limit, current_hard))
+        except ValueError as e:
+            logger.warning(
+                "Found ulimit of %s and failed to automatically increase "
+                "with error %s. This can cause fd limit errors like "
+                "`OSError: [Errno 24] Too many open files`. Consider "
+                "increasing with ulimit -n", current_soft, e)
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index aaa44c930e324..78efacccfa078 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -4,7 +4,9 @@
 from vllm.logger import init_logger
 from vllm.utils import cdiv
 from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
-                                         KVCacheBlock, hash_block_tokens,
+                                         KVCacheBlock,
+                                         generate_block_hash_extra_keys,
+                                         hash_block_tokens,
                                          hash_request_tokens)
 from vllm.v1.request import Request
 
@@ -83,10 +85,12 @@ def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
 
         computed_blocks = []
 
-        # TODO(rickyx): potentially we could cache this so we don't have to
-        # recompute it every time.
-        block_hashes = hash_request_tokens(self.block_size,
-                                           request.all_token_ids)
+        # The block hashes for the request may already be computed
+        # if the request was preempted and resumed.
+        if not request.kv_block_hashes:
+            request.set_kv_block_hashes(
+                hash_request_tokens(self.block_size, request))
+        block_hashes = request.kv_block_hashes
 
         for block_hash in block_hashes:
             # block_hashes is a chain of block hashes. If a block hash is not
@@ -197,23 +201,15 @@ def allocate_slots(
                 f"num_tokens must be greater than 0, got {num_tokens}")
 
         # Touch the computed blocks to make sure they won't be evicted.
-        num_evictable_computed_blocks = 0
         if self.enable_caching:
             self._touch(computed_blocks)
-
-            # If a computed block of a request is an eviction candidate (in the
-            # free queue and ref_cnt == 0), it cannot be counted as a free block
-            # when allocating this request.
-            num_evictable_computed_blocks = len(
-                [blk for blk in computed_blocks if blk.ref_cnt == 0])
         else:
             assert not computed_blocks, (
                 "Computed blocks should be empty when "
                 "prefix caching is disabled")
 
         num_required_blocks = cdiv(num_tokens, self.block_size)
-        if (num_required_blocks > self.free_block_queue.num_free_blocks -
-                num_evictable_computed_blocks):
+        if (num_required_blocks > self.free_block_queue.num_free_blocks):
             # Cannot allocate new blocks.
             return None
 
@@ -221,8 +217,7 @@ def allocate_slots(
         # preallocated blocks.
         num_new_blocks = min(
             num_required_blocks + self.num_preallocate_blocks,
-            self.free_block_queue.num_free_blocks -
-            num_evictable_computed_blocks,
+            self.free_block_queue.num_free_blocks,
             # Should not exceed the maximum number of blocks per request.
             # This is especially because the block table has the shape
             # [..., max_num_blocks_per_req].
@@ -242,14 +237,16 @@ def allocate_slots(
         num_computed_tokens = len(computed_blocks) * self.block_size
         num_full_blocks = (num_computed_tokens + num_tokens) // self.block_size
 
-        self._cache_full_blocks(
-            request=request,
-            blk_start_idx=len(computed_blocks),
-            # The new full blocks are the full blocks that are not computed.
-            full_blocks=self.req_to_blocks[request.request_id]
-            [len(computed_blocks):num_full_blocks],
-            prev_block=computed_blocks[-1] if computed_blocks else None,
-        )
+        new_full_blocks = self.req_to_blocks[
+            request.request_id][len(computed_blocks):num_full_blocks]
+        if new_full_blocks:
+            self._cache_full_blocks(
+                request=request,
+                blk_start_idx=len(computed_blocks),
+                # The new full blocks are the full blocks that are not computed.
+                full_blocks=new_full_blocks,
+                prev_block=computed_blocks[-1] if computed_blocks else None,
+            )
 
         return new_blocks
 
@@ -376,6 +373,8 @@ def _cache_full_blocks(
             full_blocks: The list of blocks to update hash metadata.
             prev_block: The previous block in the chain.
         """
+        num_cached_block_hashes = len(request.kv_block_hashes)
+
         # Update the new blocks with the block hashes through the chain.
         prev_block_hash_value = None
         if prev_block is not None:
@@ -387,17 +386,35 @@ def _cache_full_blocks(
         for i, blk in enumerate(full_blocks):
             blk_idx = blk_start_idx + i
 
-            block_tokens = request.all_token_ids[blk_idx *
-                                                 self.block_size:(blk_idx +
-                                                                  1) *
-                                                 self.block_size]
-            assert len(block_tokens) == self.block_size, (
-                f"Expected {self.block_size} tokens, got {len(block_tokens)} "
-                f"at {blk_idx}th block for request "
-                f"{request.request_id}({request})")
-
-            # Compute the hash of the current block.
-            block_hash = hash_block_tokens(prev_block_hash_value, block_tokens)
+            if blk_idx < num_cached_block_hashes:
+                # The block hash may already be computed in
+                # "get_computed_blocks" if the tokens are not generated by
+                # this request (either the prompt tokens or the previously
+                # generated tokens with preemption). In this case we simply
+                # reuse the block hash.
+                block_hash = request.kv_block_hashes[blk_idx]
+            else:
+                # Otherwise compute the block hash and cache it in the request
+                # in case it will be preempted in the future.
+                start_token_idx = blk_idx * self.block_size
+                end_token_idx = (blk_idx + 1) * self.block_size
+                block_tokens = request.all_token_ids[
+                    start_token_idx:end_token_idx]
+                assert len(block_tokens) == self.block_size, (
+                    f"Expected {self.block_size} tokens, got "
+                    f"{len(block_tokens)} at {blk_idx}th block for request "
+                    f"{request.request_id}({request})")
+
+                # Generate extra keys for multi-modal inputs. Note that since
+                # we reach this branch only when the block is completed with
+                # generated tokens, we only need to consider the last mm input.
+                extra_keys, _ = generate_block_hash_extra_keys(
+                    request, start_token_idx, end_token_idx, -1)
+
+                # Compute the hash of the current block.
+                block_hash = hash_block_tokens(prev_block_hash_value,
+                                               block_tokens, extra_keys)
+                request.append_kv_block_hashes(block_hash)
 
             # Update and added the full block to the cache.
             blk.block_hash = block_hash
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 0ba338aa5a3d2..9ddbff7c9a604 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1,20 +1,25 @@
 """KV-Cache Utilities."""
 from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import List, NamedTuple, Optional, Tuple
+from typing import Any, List, NamedTuple, Optional, Tuple
 
 from vllm.logger import init_logger
+from vllm.v1.request import Request
 
 logger = init_logger(__name__)
 
 
 class BlockHashType(NamedTuple):
-    """Hash value of a block and the token IDs in the block.
-    The reason we keep a tuple of token IDs is to make sure no hash
-    collision happens when the hash value is the same.
+    """Hash value of a block (int), the token IDs in the block, and extra keys.
+    The reason we keep a tuple of token IDs and extra keys is to make sure
+    no hash collision happens when the hash value is the same.
     """
+    # Hash value of the block in an integer.
     hash_value: int
+    # Token IDs in the block.
     token_ids: Tuple[int, ...]
+    # Extra keys for the block.
+    extra_keys: Optional[Any] = None
 
 
 @dataclass
@@ -159,8 +164,80 @@ def get_all_free_blocks(self) -> List[KVCacheBlock]:
         return ret
 
 
-def hash_block_tokens(parent_block_hash: Optional[int],
-                      curr_block_token_ids: Sequence[int]) -> BlockHashType:
+def generate_block_hash_extra_keys(
+        request: Request, start_token_idx: int, end_token_idx: int,
+        start_mm_idx: int) -> Tuple[Optional[Tuple[Any, ...]], int]:
+    """Generate extra keys for the block hash from multi-modal (mm) inputs.
+    Other request metadata (e.g., LoRA ID) is not handled here yet. Each key
+    is (mm_hash, start_offset): the hash of a mm input overlapping the block
+    and the offset inside that mm input at which the block starts.
+
+    Args:
+        request: The request; its mm_positions and mm_hashes are read.
+        start_token_idx: Index of the first token of the block.
+        end_token_idx: Index one past the last token of the block.
+        start_mm_idx: First mm input to check; negative counts from the end.
+
+    Returns:
+        A tuple (extra_keys, next_mm_idx). extra_keys is None if the request
+        has no mm inputs or the block starts past the last mm input.
+    """
+
+    mm_positions, mm_hashes = request.mm_positions, request.mm_hashes
+    if not mm_positions:
+        return None, start_mm_idx
+
+    if mm_positions and len(mm_positions) != len(mm_hashes):
+        raise ValueError(
+            "The number of multi-modal positions and hashes must match. This "
+            "is likely because you do not enable MM preprocessor hashing. "
+            "Please set disable_mm_preprocessor_cache=False.")
+
+    # Note that we assume mm_positions is sorted by offset.
+    # Skip the scan entirely when the block starts past the end of the last
+    # mm input. This usually happens in late prefill and in decoding.
+    if mm_positions[-1]["offset"] + mm_positions[-1][
+            "length"] < start_token_idx:
+        return None, start_mm_idx
+
+    # Negative start_mm_idx indexes from the end (-1 is the last mm input).
+    if start_mm_idx < 0:
+        assert -start_mm_idx <= len(mm_positions)
+        start_mm_idx = len(mm_positions) + start_mm_idx
+
+    extra_keys = []
+    curr_mm_idx = start_mm_idx
+    while mm_positions and curr_mm_idx < len(mm_positions):
+        assert mm_hashes[curr_mm_idx] is not None
+        offset = mm_positions[curr_mm_idx]["offset"]
+        length = mm_positions[curr_mm_idx]["length"]
+        if end_token_idx > offset:
+            if start_token_idx > offset + length:
+                # This block has passed the current mm input.
+                curr_mm_idx += 1
+                continue
+
+            # The block overlaps (or starts right at the end of) this mm input.
+            mm_start = max(0, start_token_idx - offset)
+            extra_keys.append((mm_hashes[curr_mm_idx], mm_start))
+            if end_token_idx >= offset + length:
+                # If this block contains the end of the current mm input,
+                # move to the next mm input as this block may also contain
+                # the next mm input.
+                curr_mm_idx += 1
+            else:
+                # Otherwise this block is done with mm inputs.
+                break
+        else:
+            # This block has not reached the current mm input.
+            break
+    return tuple(extra_keys), curr_mm_idx
+
+
+def hash_block_tokens(
+        parent_block_hash: Optional[int],
+        curr_block_token_ids: Sequence[int],
+        extra_keys: Optional[Tuple[Any, ...]] = None) -> BlockHashType:
     """Computes a hash value corresponding to the contents of a block and
     the contents of the preceding block(s). The hash value is used for
     prefix caching. We use LRU cache for this function to avoid recomputing
@@ -174,27 +251,39 @@ def hash_block_tokens(parent_block_hash: Optional[int],
             if this is the first block.
         curr_block_token_ids: A list of token ids in the current
             block. The current block is assumed to be full.
+        extra_keys: Extra keys for the block.
 
     Returns:
         The hash value of the block and the token ids in the block.
         The entire tuple is used as the hash key of the block.
     """
     return BlockHashType(hash((parent_block_hash, *curr_block_token_ids)),
-                         tuple(curr_block_token_ids))
+                         tuple(curr_block_token_ids), extra_keys)
 
 
 def hash_request_tokens(block_size: int,
-                        token_ids: Sequence[int]) -> List[BlockHashType]:
+                        request: Request) -> List[BlockHashType]:
     """Computes hash values of a chain of blocks given a sequence of
     token IDs. The hash value is used for prefix caching.
 
     Args:
         block_size: The size of each block.
-        token_ids: A sequence of token ids in the request.
+        request: The request object.
 
     Returns:
         The list of computed hash values.
     """
+    token_ids = request.all_token_ids
+    mm_positions, mm_hashes = request.mm_positions, request.mm_hashes
+    if mm_positions and len(mm_positions) != len(mm_hashes):
+        raise ValueError(
+            "The number of multi-modal positions and hashes must match.")
+
+    # TODO: Extend this to support other features such as LoRA.
+    need_extra_keys = bool(mm_positions)
+    extra_keys = None
+    curr_mm_idx = 0
+
     ret = []
     parent_block_hash_value = None
     for start in range(0, len(token_ids), block_size):
@@ -203,8 +292,14 @@ def hash_request_tokens(block_size: int,
         # Do not hash the block if it is not full.
         if len(block_token_ids) < block_size:
             break
+
+        # Add extra keys if the block is a multi-modal block.
+        if need_extra_keys:
+            extra_keys, curr_mm_idx = generate_block_hash_extra_keys(
+                request, start, end, curr_mm_idx)
+
         block_hash = hash_block_tokens(parent_block_hash_value,
-                                       block_token_ids)
+                                       block_token_ids, extra_keys)
         ret.append(block_hash)
         parent_block_hash_value = block_hash.hash_value
     return ret
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 178532e477dae..08e7c0fd4dc9b 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -516,6 +516,7 @@ class NewRequestData:
     prompt_token_ids: List[int]
     prompt: Optional[str]
     mm_inputs: List["MultiModalKwargs"]
+    mm_hashes: List[str]
     mm_positions: List["PlaceholderRange"]
     sampling_params: SamplingParams
     block_ids: List[int]
@@ -533,6 +534,7 @@ def from_request(
             prompt_token_ids=request.prompt_token_ids,
             prompt=request.prompt,
             mm_inputs=request.mm_inputs,
+            mm_hashes=request.mm_hashes,
             mm_positions=request.mm_positions,
             sampling_params=request.sampling_params,
             block_ids=block_ids,
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index b36de5f66917c..ba2b8377759d6 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -9,14 +9,13 @@
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.outputs import PoolingRequestOutput, RequestOutput
+from vllm.outputs import RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
-from vllm.v1.engine.async_stream import AsyncStream
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.detokenizer import Detokenizer
 from vllm.v1.engine.processor import Processor
@@ -54,15 +53,17 @@ def __init__(
             lora_config=vllm_config.lora_config)
         self.tokenizer.ping()
 
-        # Request streams (map of request_id -> AsyncStream).
-        self.request_streams: Dict[str, AsyncStream] = {}
-        # List of cancelled request ids to be aborted.
-        self.client_aborted_requests: List[str] = []
+        # Request streams (map of request_id -> queue).
+        self.rid_to_queue: Dict[str, asyncio.Queue] = {}
 
         # Processor (converts Inputs --> EngineCoreRequests).
-        self.processor = Processor(vllm_config.model_config,
-                                   vllm_config.lora_config, self.tokenizer,
-                                   input_registry)
+        self.processor = Processor(
+            model_config=vllm_config.model_config,
+            cache_config=vllm_config.cache_config,
+            lora_config=vllm_config.lora_config,
+            tokenizer=self.tokenizer,
+            input_registry=input_registry,
+        )
 
         # Detokenizer (converts EngineCoreOutputs --> RequestOutput).
         self.detokenizer = Detokenizer(
@@ -94,7 +95,7 @@ def from_engine_args(
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
         stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
-    ) -> "AsyncLLMEngine":
+    ) -> "AsyncLLM":
         """Create an AsyncLLM from the EngineArgs."""
 
         # Create the engine configs.
@@ -149,14 +150,13 @@ async def add_request(
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
+    ) -> asyncio.Queue[RequestOutput]:
         """Add new request to the AsyncLLM."""
 
-        if self.detokenizer.is_request_active(request_id):
-            raise ValueError(f"Request {request_id} already exists.")
-
-        # 1) Create a new AsyncStream for the request.
-        stream = self._add_request_to_streams(request_id)
+        # 1) Create a new output queue for the request.
+        if request_id in self.rid_to_queue:
+            raise ValueError(f"Request id {request_id} already running.")
+        self.rid_to_queue[request_id] = asyncio.Queue()
 
         # 2) Convert input --> DetokenizerRequest / EngineCoreRequest.
         detokenizer_req, engine_core_req = self.processor.process_inputs(
@@ -169,8 +169,10 @@ async def add_request(
         # 4) Add the EngineCoreRequest to EngineCore (separate process).
         await self.engine_core.add_request_async(engine_core_req)
 
-        # 5) Return the generator.
-        return stream.generator()
+        if self.log_requests:
+            logger.info("Added request %s.", request_id)
+
+        return self.rid_to_queue[request_id]
 
     # TODO: we should support multiple prompts in one call, as you
     # can do with LLM.generate. So that for multi-prompt completion
@@ -190,7 +192,7 @@ async def generate(
         """
         Main function called by the API server to kick off a request
             * 1) Making an AsyncStream corresponding to the Request.
-            # 2) Processing the Input.
+            * 2) Processing the Input.
             * 3) Adding the Request to the Detokenizer.
             * 4) Adding the Request to the EngineCore (separate process).
 
@@ -202,14 +204,15 @@ async def generate(
         returning the RequestOutput back to the caller.
         """
 
-        # We start the output_handler on the first call to generate() so that
-        # we can call __init__ before the event loop starts, which enables us
-        # to handle startup failure gracefully in the OpenAI server.
-        if self.output_handler is None:
-            self.output_handler = asyncio.create_task(
-                self._run_output_handler())
-
-        async for output in await self.add_request(
+        try:
+            # We start the output_handler on the first call to generate() so
+            # we can call __init__ before the event loop, which enables us
+            # to handle startup failure gracefully in the OpenAI server.
+            if self.output_handler is None:
+                self.output_handler = asyncio.create_task(
+                    self._run_output_handler())
+
+            q = await self.add_request(
                 request_id,
                 prompt,
                 sampling_params,
@@ -217,79 +220,42 @@ async def generate(
                 trace_headers=trace_headers,
                 prompt_adapter_request=prompt_adapter_request,
                 priority=priority,
-        ):
-            yield output
-
-    def _finish_stream(self, request_id: str):
-        stream = self.request_streams.pop(request_id, None)
-        if stream is not None:
-            stream.finish()
-
-    def _add_request_to_streams(
-        self,
-        request_id: str,
-    ) -> AsyncStream:
-
-        if request_id in self.request_streams:
-            raise ValueError(f"Request id {request_id} already running.")
-
-        # Avoid streams having circular ref to parent AsyncLLM object.
-        aborted_reqs = self.client_aborted_requests
-        stream = AsyncStream(request_id, aborted_reqs.append)
-        self.request_streams[request_id] = stream
-
-        if self.log_requests:
-            logger.info("Added request %s.", request_id)
+            )
 
-        return stream
-
-    async def _process_cancellations(self) -> None:
-        """
-        Process requests cancelled from user disconnecting.
-
-        When a client disconnects, AsyncStream._cancel() is called.
-        We passed a callback to AsyncStream(), which appends to 
-        self.client_aborted_requests.
-
-        As a result, if any requests are canceled from the user side
-        the request_id will show up in self.client_aborted_requests.
-        """
-
-        # Avoid streams having circular ref to parent AsyncLLM object.
-        if not self.client_aborted_requests:
-            return
-        reqs_to_abort = self.client_aborted_requests.copy()
-        self.client_aborted_requests.clear()
-
-        # Remove from Detokenizer.
-        self.detokenizer.abort_requests(reqs_to_abort)
-
-        # Remove from RequestStreams.
-        for request_id in reqs_to_abort:
-            if self.log_requests:
-                logger.info("User-cancelled request %s.", request_id)
-            self._finish_stream(request_id)
-
-        # Remove from EngineCore.
-        await self.engine_core.abort_requests_async(reqs_to_abort)
+            # The output_handler task pushes items into the queue.
+            # This task pulls from the queue and yields to caller.
+            while True:
+                # Note: drain queue without await if possible (avoids
+                # task switching under load which helps performance).
+                out = q.get_nowait() if q.qsize() > 0 else await q.get()
+
+                # Note: both Detokenizer and EngineCore handle their
+                # own request cleanup based on finished.
+                if out.finished:
+                    del self.rid_to_queue[request_id]
+                    yield out
+                    break
+
+                yield out
+
+        # If the request is disconnected by the client, the
+        # generate() task will be canceled. So, we abort the
+        # request if we end up here.
+        except asyncio.CancelledError:
+            await self.abort(request_id)
+            raise
 
     def _process_request_outputs(self, request_outputs: List[RequestOutput]):
-        """Process outputs by putting them into per-request AsyncStreams."""
+        """Process outputs by putting them into per-request queues."""
 
         for request_output in request_outputs:
             request_id = request_output.request_id
-            assert request_id in self.request_streams
-
-            # Each request in the API server pulls from the per-request stream.
-            stream = self.request_streams.get(request_id)
-            if stream is not None:
-                stream.put(request_output)
 
-                # If finished, remove from the tracker.
-                if request_output.finished:
-                    if self.log_requests:
-                        logger.info("Finished request %s.", request_id)
-                    self._finish_stream(request_id)
+            # Note: it is possible a request was aborted and removed from
+            # the state due to client cancellations, so if we encounter a
+            # request id not in the state, we skip.
+            if request_id in self.rid_to_queue:
+                self.rid_to_queue[request_id].put_nowait(request_output)
 
     async def _run_output_handler(self):
         """Background loop: pulls from EngineCore and pushes to AsyncStreams."""
@@ -302,24 +268,27 @@ async def _run_output_handler(self):
                 # 2) Detokenize based on the output.
                 request_outputs, reqs_to_abort = self.detokenizer.step(outputs)
 
-                # 3) Put the RequestOutputs into the per-request AsyncStreams.
+                # 3) Put the RequestOutputs into the per-request queues.
                 self._process_request_outputs(request_outputs)
 
                 # 4) Abort any requests that finished due to stop strings.
                 await self.engine_core.abort_requests_async(reqs_to_abort)
 
-                # 5) Abort any requests due to client cancellations.
-                await self._process_cancellations()
-
         except BaseException as e:
             logger.error(e)
             raise e
 
-    # TODO: can we eliminate these?
-
     async def abort(self, request_id: str) -> None:
-        # Note: Who Calls this? I dont think this is actually used.
-        raise ValueError("Not Supported on V1 yet.")
+        """Abort RequestId in self, detokenizer, and engine core."""
+
+        request_ids = [request_id]
+        await self.engine_core.abort_requests_async(request_ids)
+        self.detokenizer.abort_requests(request_ids)
+
+        # If a request finishes while we await then the request_id
+        # will be removed from the tracked queues before we get here.
+        if request_id in self.rid_to_queue:
+            del self.rid_to_queue[request_id]
 
     def encode(
         self,
@@ -382,7 +351,3 @@ def errored(self) -> bool:
     @property
     def dead_error(self) -> BaseException:
         return Exception()  # TODO: implement
-
-
-# Retain V0 name for backwards compatibility.
-AsyncLLMEngine = AsyncLLM
diff --git a/vllm/v1/engine/async_stream.py b/vllm/v1/engine/async_stream.py
deleted file mode 100644
index 35449238c3259..0000000000000
--- a/vllm/v1/engine/async_stream.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import asyncio
-from typing import Any, AsyncGenerator, Callable, Optional, Type, Union
-
-from vllm.outputs import PoolingRequestOutput, RequestOutput
-
-
-class AsyncStream:
-    """A stream of RequestOutputs or PoolingRequestOutputs for a request
-    that can be iterated over asynchronously via an async generator."""
-
-    STOP_ITERATION = Exception()  # Sentinel
-
-    def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None:
-        self.request_id = request_id
-        self._cancel = cancel
-        self._queue: asyncio.Queue = asyncio.Queue()
-        self._finished = False
-
-    def put(self, item: Union[RequestOutput, PoolingRequestOutput,
-                              Exception]) -> None:
-        if not self._finished:
-            self._queue.put_nowait(item)
-
-    def finish(
-        self,
-        exception: Optional[Union[BaseException, Type[BaseException]]] = None,
-    ) -> None:
-        if not self._finished:
-            self._finished = True
-            self._queue.put_nowait(exception if self._is_raisable(exception)
-                                   else AsyncStream.STOP_ITERATION)
-
-    async def generator(
-        self
-    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
-        finished = False
-        try:
-            while True:
-                result = await self._queue.get()
-                if self._is_raisable(result):
-                    finished = True
-                    if result == AsyncStream.STOP_ITERATION:
-                        return
-                    raise result
-                yield result
-        finally:
-            self._finished = True
-            if not finished:
-                self._cancel(self.request_id)
-
-    @staticmethod
-    def _is_raisable(value: Any):
-        return isinstance(value, BaseException) or \
-                (isinstance(value, type) and \
-                 issubclass(value, BaseException))
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 56d4dc67e4a0e..0aef61fc7f680 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -32,7 +32,7 @@
 
 POLLING_TIMEOUT_MS = 5000
 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000
-LOGGING_TIME_S = 5000
+LOGGING_TIME_S = POLLING_TIMEOUT_S
 
 
 class EngineCore:
@@ -65,7 +65,8 @@ def __init__(
 
         self._last_logging_time = time.time()
 
-        self.mm_input_mapper_server = MMInputMapperServer()
+        self.mm_input_mapper_server = MMInputMapperServer(
+            vllm_config.model_config)
 
     def _initialize_kv_caches(self,
                               cache_config: CacheConfig) -> Tuple[int, int]:
@@ -98,9 +99,8 @@ def add_request(self, request: EngineCoreRequest):
             # MM mapper, so anything that has a hash must have a HIT cache
             # entry here as well.
             assert request.mm_inputs is not None
-            request.mm_inputs, request.mm_hashes = (
-                self.mm_input_mapper_server.process_inputs(
-                    request.mm_inputs, request.mm_hashes))
+            request.mm_inputs = self.mm_input_mapper_server.process_inputs(
+                request.mm_inputs, request.mm_hashes)
 
         req = Request.from_engine_core_request(request)
 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 15dedbd0f9529..b58f62778ffe9 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -55,9 +55,12 @@ def __init__(
         self.tokenizer.ping()
 
         # Processor (convert Inputs --> EngineCoreRequests)
-        self.processor = Processor(vllm_config.model_config,
-                                   vllm_config.lora_config, self.tokenizer,
-                                   input_registry, mm_registry)
+        self.processor = Processor(model_config=vllm_config.model_config,
+                                   cache_config=vllm_config.cache_config,
+                                   lora_config=vllm_config.lora_config,
+                                   tokenizer=self.tokenizer,
+                                   input_registry=input_registry,
+                                   mm_registry=mm_registry)
 
         # Detokenizer (converts EngineCoreOutputs --> RequestOutput)
         self.detokenizer = Detokenizer(
@@ -107,7 +110,10 @@ def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]:
         executor_class: Type[Executor]
         distributed_executor_backend = (
             vllm_config.parallel_config.distributed_executor_backend)
-        if distributed_executor_backend == "mp":
+        if distributed_executor_backend == "ray":
+            from vllm.v1.executor.ray_executor import RayExecutor
+            executor_class = RayExecutor
+        elif distributed_executor_backend == "mp":
             from vllm.v1.executor.multiproc_executor import MultiprocExecutor
             executor_class = MultiprocExecutor
         else:
diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py
index 6cdeba6f3f71e..8bfc739b3dbbc 100644
--- a/vllm/v1/engine/mm_input_mapper.py
+++ b/vllm/v1/engine/mm_input_mapper.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional
 
 import PIL
 from blake3 import blake3
@@ -8,7 +8,7 @@
 from vllm.logger import init_logger
 from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
                              MultiModalKwargs, MultiModalRegistry)
-from vllm.v1.utils import LRUDictCache
+from vllm.utils import LRUCache
 
 logger = init_logger(__name__)
 
@@ -42,7 +42,9 @@ def __init__(
             model_config)
         self.mm_registry.init_mm_limits_per_prompt(model_config)
 
-        self.mm_cache = LRUDictCache[str, MultiModalKwargs](MM_CACHE_SIZE)
+        # Init cache
+        self.use_cache = not model_config.disable_mm_preprocessor_cache
+        self.mm_cache = LRUCache[str, MultiModalKwargs](MM_CACHE_SIZE)
 
         # DEBUG: Set to None to disable
         self.mm_debug_cache_hit_ratio_steps = None
@@ -61,7 +63,7 @@ def process_inputs(
         mm_hashes: Optional[List[str]],
         mm_processor_kwargs: Optional[Dict[str, Any]],
         precomputed_mm_inputs: Optional[List[MultiModalKwargs]],
-    ) -> Tuple[List[MultiModalKwargs], Optional[List[str]]]:
+    ) -> List[MultiModalKwargs]:
         if precomputed_mm_inputs is None:
             image_inputs = mm_data["image"]
             if not isinstance(image_inputs, list):
@@ -70,26 +72,21 @@ def process_inputs(
         else:
             num_inputs = len(precomputed_mm_inputs)
 
-        # Check if hash is enabled
-        use_hash = mm_hashes is not None
-        if use_hash:
+        # Sanity
+        if self.use_cache:
             assert mm_hashes is not None
-            assert num_inputs == len(
-                mm_hashes), "num_inputs = {} len(mm_hashes) = {}".format(
-                    num_inputs, len(mm_hashes))
+            assert num_inputs == len(mm_hashes)
 
         # Process each image input separately, so that later we can schedule
         # them in a fine-grained manner.
         # Apply caching (if enabled) and reuse precomputed inputs (if provided)
-        ret_hashes: Optional[List[str]] = [] if use_hash else None
         ret_inputs: List[MultiModalKwargs] = []
         for input_id in range(num_inputs):
             if self.mm_debug_cache_hit_ratio_steps is not None:
                 self.cache_hit_ratio(self.mm_debug_cache_hit_ratio_steps)
 
-            mm_hash = None
             mm_input = None
-            if use_hash:
+            if self.use_cache:
                 assert mm_hashes is not None
                 mm_hash = mm_hashes[input_id]
                 mm_input = self.mm_cache.get(mm_hash)
@@ -106,7 +103,7 @@ def process_inputs(
                         mm_processor_kwargs=mm_processor_kwargs,
                     )
 
-                if use_hash:
+                if self.use_cache:
                     # Add to cache
                     assert mm_hash is not None
                     self.mm_cache.put(mm_hash, mm_input)
@@ -114,19 +111,16 @@ def process_inputs(
                 self.mm_cache_hits += 1
                 mm_input = None  # Avoids sending mm_input to Server
 
-            if use_hash:
-                assert mm_hash is not None
-                assert ret_hashes is not None
-                ret_hashes.append(mm_hash)
             ret_inputs.append(mm_input)
 
-        return ret_inputs, ret_hashes
+        return ret_inputs
 
 
 class MMInputMapperServer:
 
-    def __init__(self, ):
-        self.mm_cache = LRUDictCache[str, MultiModalKwargs](MM_CACHE_SIZE)
+    def __init__(self, model_config):
+        self.use_cache = not model_config.disable_mm_preprocessor_cache
+        self.mm_cache = LRUCache[str, MultiModalKwargs](MM_CACHE_SIZE)
 
     def process_inputs(
         self,
@@ -135,6 +129,9 @@ def process_inputs(
     ) -> List[MultiModalKwargs]:
         assert len(mm_inputs) == len(mm_hashes)
 
+        if not self.use_cache:
+            return mm_inputs
+
         full_mm_inputs = []
         for mm_input, mm_hash in zip(mm_inputs, mm_hashes):
             assert mm_hash is not None
@@ -154,12 +151,45 @@ class MMHasher:
     def __init__(self):
         pass
 
-    def hash(self, prompt: PromptType) -> Optional[List[str]]:
+    def hash_dummy_mm_data(
+            self,
+            mm_data: Optional[MultiModalDataDict]) -> Optional[List[str]]:
+        """Hash user-defined dummy multimodal data used for profiling."""
+
+        if mm_data is None:
+            return None
+
+        image_inputs = mm_data['image']
+
+        # Temporary workaround for models (e.g., Molmo) that process
+        # multimodal data in the input processor, so image_inputs is a
+        # dict (MultiModalKwargs) rather than the raw input format. The raw
+        # image is then expected under `raw_mm_data`; note that pop()
+        # below removes that key from the caller's dict.
+        if isinstance(image_inputs, dict):
+            assert "raw_mm_data" in image_inputs and isinstance(
+                image_inputs["raw_mm_data"], PIL.Image.Image)
+            image_inputs = image_inputs.pop("raw_mm_data")
+
+        return self.hash_images(image_inputs)
+
+    def hash_prompt_mm_data(self, prompt: PromptType) -> Optional[List[str]]:
+        """Hash multimodal data in the user input prompt if they exist."""
+
         if "multi_modal_data" not in prompt:
             return None
 
         mm_data = prompt["multi_modal_data"]
+        if not mm_data:
+            # mm_data can be None or an empty dict.
+            return None
+
         image_inputs = mm_data["image"]
+
+        return self.hash_images(image_inputs)
+
+    def hash_images(self, image_inputs) -> Optional[List[str]]:
+        """Hash PIL image objects to strings."""
         if not isinstance(image_inputs, list):
             image_inputs = [image_inputs]
         assert len(image_inputs) > 0
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 679bf8e25e9ca..6ee8732bc902c 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -1,7 +1,7 @@
 import time
-from typing import Any, Dict, Mapping, Optional, Tuple, Union
+from typing import Mapping, Optional, Tuple, Union
 
-from vllm.config import LoRAConfig, ModelConfig
+from vllm.config import CacheConfig, LoRAConfig, ModelConfig
 from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
                          PromptType, SingletonInputsAdapter)
 from vllm.inputs.parse import is_encoder_decoder_inputs
@@ -12,7 +12,6 @@
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.config import try_get_generation_config
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest
 from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient
@@ -23,6 +22,7 @@ class Processor:
     def __init__(
         self,
         model_config: ModelConfig,
+        cache_config: CacheConfig,
         lora_config: Optional[LoRAConfig],
         tokenizer: BaseTokenizerGroup,
         input_registry: InputRegistry = INPUT_REGISTRY,
@@ -33,8 +33,8 @@ def __init__(
         self.lora_config = lora_config
         self.tokenizer = tokenizer
 
-        self.generation_config_fields = _load_generation_config_dict(
-            model_config)
+        self.generation_config_fields = model_config.try_get_generation_config(
+        )
         self.input_preprocessor = InputPreprocessor(model_config,
                                                     self.tokenizer,
                                                     mm_registry)
@@ -45,8 +45,9 @@ def __init__(
         self.mm_input_mapper_client = MMInputMapperClient(model_config)
 
         # Multi-modal hasher (for images)
-        self.mm_hasher = MMHasher(
-        ) if model_config.mm_cache_preprocessor else None
+        self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \
+            cache_config.enable_prefix_caching
+        self.mm_hasher = MMHasher()
 
     # TODO: run in an ThreadpoolExecutor or BackgroundProcess.
     # This ideally should releases the GIL, so we should not block the
@@ -77,8 +78,8 @@ def process_inputs(
 
         # Compute MM hashes (if enabled)
         mm_hashes = None
-        if self.mm_hasher is not None:
-            mm_hashes = self.mm_hasher.hash(prompt)
+        if self.use_hash:
+            mm_hashes = self.mm_hasher.hash_prompt_mm_data(prompt)
 
         # Process inputs.
         preprocessed_inputs = self.input_preprocessor.preprocess(
@@ -118,7 +119,7 @@ def process_inputs(
         # Apply MM mapper
         mm_inputs = None
         if len(decoder_inputs.multi_modal_data) > 0:
-            mm_inputs, mm_hashes = self.mm_input_mapper_client.process_inputs(
+            mm_inputs = self.mm_input_mapper_client.process_inputs(
                 decoder_inputs.multi_modal_data, mm_hashes,
                 decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs)
 
@@ -179,16 +180,3 @@ def _validate_model_inputs(self, inputs: ProcessorInputs):
             # TODO: Find out how many placeholder tokens are there so we can
             # check that chunked prefill does not truncate them
             # max_batch_len = self.scheduler_config.max_num_batched_tokens
-
-
-def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:
-    config = try_get_generation_config(
-        model_config.model,
-        trust_remote_code=model_config.trust_remote_code,
-        revision=model_config.revision,
-    )
-
-    if config is None:
-        return {}
-
-    return config.to_diff_dict()
diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py
new file mode 100644
index 0000000000000..79acc60001c99
--- /dev/null
+++ b/vllm/v1/executor/ray_executor.py
@@ -0,0 +1,342 @@
+import os
+from collections import defaultdict
+from itertools import islice, repeat
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+import vllm.envs as envs
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.executor.ray_utils import (RayWorkerWrapper,
+                                        initialize_ray_cluster, ray)
+from vllm.v1.outputs import ModelRunnerOutput
+
+if ray is not None:
+    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+
+if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup
+
+logger = init_logger(__name__)
+
+
+class RayExecutor(Executor):
+
+    def __init__(self, vllm_config: VllmConfig) -> None:
+        self.vllm_config = vllm_config
+        self.parallel_config = vllm_config.parallel_config
+        self.model_config = vllm_config.model_config
+        self.forward_dag: Optional[ray.dag.CompiledDAG] = None
+
+        # Disable Ray usage stats collection.
+        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
+        if ray_usage != "1":
+            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
+
+        initialize_ray_cluster(self.parallel_config)
+        placement_group = self.parallel_config.placement_group
+
+        # Create the parallel GPU workers.
+        self._init_workers_ray(placement_group)
+
+    def _init_workers_ray(self, placement_group: "PlacementGroup",
+                          **ray_remote_kwargs):
+        # A list of workers to run a model.
+        self.workers: List[RayWorkerWrapper] = []
+        if self.parallel_config.ray_workers_use_nsight:
+            ray_remote_kwargs = self._configure_ray_workers_use_nsight(
+                ray_remote_kwargs)
+
+        # Create the workers.
+        driver_ip = get_ip()
+        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
+            if not bundle.get("GPU", 0):
+                # Skip bundles that don't have GPUs,
+                # as each worker needs one GPU.
+                continue
+            scheduling_strategy = PlacementGroupSchedulingStrategy(
+                placement_group=placement_group,
+                placement_group_capture_child_tasks=True,
+                placement_group_bundle_index=bundle_id,
+            )
+
+            worker = ray.remote(
+                num_cpus=0,
+                num_gpus=1,
+                scheduling_strategy=scheduling_strategy,
+                **ray_remote_kwargs,
+            )(RayWorkerWrapper).remote(vllm_config=self.vllm_config)
+            self.workers.append(worker)
+
+        logger.debug("workers: %s", self.workers)
+        worker_ips = [
+            ray.get(worker.get_node_ip.remote())  # type: ignore[attr-defined]
+            for worker in self.workers
+        ]
+        ip_counts: Dict[str, int] = {}
+        for ip in worker_ips:
+            ip_counts[ip] = ip_counts.get(ip, 0) + 1
+
+        worker_to_ip = dict(zip(self.workers, worker_ips))
+
+        def sort_by_driver_then_worker_ip(worker):
+            """
+            Sort the workers based on 3 properties:
+            1. If the worker is on the same node as the driver (vllm engine),
+                it should be placed first.
+            2. Then, if the worker is on a node with fewer workers, it should
+                be placed first.
+            3. Finally, if the worker is on a node with smaller IP address, it
+                should be placed first. This is simply a tiebreaker to make
+                sure the workers are sorted in a deterministic way.
+            """
+            ip = worker_to_ip[worker]
+            return (ip != driver_ip, ip_counts[ip], ip)
+
+        # After sorting, the workers on the same node will be
+        # close to each other, and the workers on the driver
+        # node will be placed first.
+        self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
+
+        # Get the set of GPU IDs used on each node.
+        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids")
+
+        node_workers = defaultdict(list)  # node id -> list of worker ranks
+        node_gpus = defaultdict(list)  # node id -> list of gpu ids
+
+        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
+            node_workers[node_id].append(i)
+            # `gpu_ids` can be a list of strings or integers.
+            # convert them to integers for consistency.
+            # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
+            # string sorting is not sufficient.
+            # see https://github.com/vllm-project/vllm/issues/5590
+            gpu_ids = [int(x) for x in gpu_ids]
+            node_gpus[node_id].extend(gpu_ids)
+
+        for node_id, gpu_ids in node_gpus.items():
+            node_gpus[node_id] = sorted(gpu_ids)
+
+        all_ips = set(worker_ips)
+        n_ips = len(all_ips)
+        n_nodes = len(node_workers)
+
+        if n_nodes != n_ips:
+            raise RuntimeError(
+                f"Every node should have a unique IP address. Got {n_nodes}"
+                f" nodes with node ids {list(node_workers.keys())} and "
+                f"{n_ips} unique IP addresses {all_ips}. Please check your"
+                " network configuration. If you set `VLLM_HOST_IP` or "
+                "`HOST_IP` environment variable, make sure it is unique for"
+                " each node.")
+
+        # Set environment variables for the driver and workers.
+        all_args_to_update_environment_variables = [({
+            "CUDA_VISIBLE_DEVICES":
+            ",".join(map(str, node_gpus[node_id])),
+            "VLLM_TRACE_FUNCTION":
+            str(envs.VLLM_TRACE_FUNCTION),
+            "VLLM_USE_V1":
+            str(int(envs.VLLM_USE_V1)),
+            **({
+                "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND
+            } if envs.VLLM_ATTENTION_BACKEND is not None else {})
+        }, ) for (node_id, _) in worker_node_and_gpu_ids]
+
+        self._env_vars_for_all_workers = (
+            all_args_to_update_environment_variables)
+
+        self._run_workers("update_environment_variables",
+                          all_args=self._get_env_vars_to_be_updated())
+
+        if len(node_gpus) == 1:
+            # in single node case, we don't need to get the IP address.
+            # the loopback address is sufficient
+            # NOTE: a node may have several IP addresses, one for each
+            # network interface. `get_ip()` might return any of them,
+            # while they might not work for communication inside the node
+            # if the network setup is complicated. Using the loopback address
+            # solves this issue, as it always works for communication inside
+            # the node.
+            driver_ip = "127.0.0.1"
+        distributed_init_method = get_distributed_init_method(
+            driver_ip, get_open_port())
+
+        # Initialize the actual workers inside worker wrapper.
+        init_worker_all_kwargs = [
+            self._get_worker_kwargs(
+                local_rank=node_workers[node_id].index(rank),
+                rank=rank,
+                distributed_init_method=distributed_init_method,
+            ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
+        ]
+        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
+        self._run_workers("initialize")
+        self._run_workers("load_model")
+
+    def _configure_ray_workers_use_nsight(self,
+                                          ray_remote_kwargs) -> Dict[str, Any]:
+        # If nsight profiling is enabled, we need to set the profiling
+        # configuration for the ray workers as runtime env.
+        runtime_env = ray_remote_kwargs.setdefault("runtime_env", {})
+        runtime_env.update({
+            "nsight": {
+                "t": "cuda,cudnn,cublas",
+                "o": "'worker_process_%p'",
+                "cuda-graph-trace": "node",
+            }
+        })
+
+        return ray_remote_kwargs
+
+    def _get_env_vars_to_be_updated(self):
+        return self._env_vars_for_all_workers
+
+    def _get_worker_kwargs(
+            self,
+            local_rank: int = 0,
+            rank: int = 0,
+            distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Return worker init args for a given rank.
+        """
+        if distributed_init_method is None:
+            distributed_init_method = get_distributed_init_method(
+                get_ip(), get_open_port())
+        return dict(
+            vllm_config=self.vllm_config,
+            local_rank=local_rank,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+        )
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """
+        Determine the number of available KV blocks.
+        
+        This invokes `determine_num_available_blocks` on each worker and takes
+        the min of the results, guaranteeing that the selected cache sizes are
+        compatible with all workers.
+        
+        Returns:
+            - tuple[num_gpu_blocks, num_cpu_blocks]
+        """
+        # Get the maximum number of blocks that can be allocated on GPU and CPU.
+        num_blocks = self._run_workers("determine_num_available_blocks")
+
+        # Since we use a shared centralized controller, we take the minimum
+        # number of blocks across all workers to make sure all the memory
+        # operators can be applied to all workers.
+        num_gpu_blocks = min(b[0] for b in num_blocks)
+        num_cpu_blocks = min(b[1] for b in num_blocks)
+
+        return num_gpu_blocks, num_cpu_blocks
+
+    def initialize(self, num_gpu_blocks: int) -> None:
+        """
+        Initialize the KV cache in all workers.
+        """
+        # NOTE: This is logged in the executor because there can be >1 worker
+        # with other executors. We could log in the engine level, but work
+        # remains to abstract away the device for non-GPU configurations.
+        logger.info("# GPU blocks: %d", num_gpu_blocks)
+        self._run_workers("initialize_cache", num_gpu_blocks)
+        self._run_workers("compile_or_warm_up_model")
+
+    def _run_workers(
+        self,
+        method: str,
+        *args,
+        all_args: Optional[List[Tuple[Any, ...]]] = None,
+        all_kwargs: Optional[List[Dict[str, Any]]] = None,
+        **kwargs,
+    ) -> Any:
+        """
+        Runs the given method on all workers. Can be used in the following
+        ways:
+
+        Args:
+        - args/kwargs: All workers share the same args/kwargs
+        - all_args/all_kwargs: args/kwargs for each worker are specified
+          individually
+        """
+        count = len(self.workers)
+        all_worker_args = repeat(args, count) if all_args is None \
+            else islice(all_args, 0, None)
+        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
+            else islice(all_kwargs, 0, None)
+
+        ray_worker_refs = [
+            worker.execute_method.remote(  # type: ignore[attr-defined]
+                method, *worker_args, **worker_kwargs)
+            for (worker, worker_args, worker_kwargs
+                 ) in zip(self.workers, all_worker_args, all_worker_kwargs)
+        ]
+        return ray.get(ray_worker_refs)
+
+    def execute_model(
+        self,
+        scheduler_output,
+    ) -> ModelRunnerOutput:
+        # Compile the Ray DAG lazily, on the first call only.
+        if self.forward_dag is None:
+            self.forward_dag = self._compiled_ray_dag()
+        # Only the first worker (with rank 0) returns the execution result.
+        # Others return None.
+        return ray.get(self.forward_dag.execute(scheduler_output))[0]
+
+    def profile(self, is_start=True):
+        raise NotImplementedError
+
+    def shutdown(self):
+        if hasattr(self, "forward_dag") and self.forward_dag is not None:
+            self.forward_dag.teardown()
+            import ray
+            for worker in self.workers:
+                ray.kill(worker)
+            self.forward_dag = None
+
+    def check_health(self) -> None:
+        logger.debug("Called check_health.")
+
+    def _check_ray_compiled_graph_installation(self):
+        """Raise ValueError if Ray Compiled Graph requirements are unmet."""
+        import pkg_resources
+        from packaging import version
+
+        required_version = version.parse("2.39")
+        current_version = version.parse(
+            pkg_resources.get_distribution("ray").version)
+        if current_version < required_version:
+            raise ValueError(f"Ray version {required_version} is "
+                             f"required, but found {current_version}")
+
+        import importlib.util
+        raycg = importlib.util.find_spec("ray.experimental.compiled_dag_ref")
+        if raycg is None:
+            raise ValueError("Ray Compiled Graph is not installed. "
+                             "Run `pip install ray[adag]` to install it.")
+        cupy_spec = importlib.util.find_spec("cupy")
+        if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL:
+            raise ValueError(
+                "cupy is not installed but required since "
+                "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set. "
+                "Run `pip install ray[adag]` and check cupy installation.")
+
+    def _compiled_ray_dag(self):
+        assert self.parallel_config.use_ray
+        self._check_ray_compiled_graph_installation()
+        from ray.dag import InputNode, MultiOutputNode
+
+        with InputNode() as input_batches:
+            outputs = [
+                worker.execute_model.bind(  # type: ignore[attr-defined]
+                    input_batches) for worker in self.workers
+            ]
+            forward_dag = MultiOutputNode(outputs)
+
+        return forward_dag.experimental_compile()
+
+    def __del__(self):
+        self.shutdown()
diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
new file mode 100644
index 0000000000000..7733610e59c7f
--- /dev/null
+++ b/vllm/v1/executor/ray_utils.py
@@ -0,0 +1,271 @@
+import time
+from collections import defaultdict
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+
+from vllm.config import ParallelConfig
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils import get_ip
+from vllm.v1.outputs import ModelRunnerOutput
+from vllm.worker.worker_base import WorkerWrapperBase
+
+if TYPE_CHECKING:
+    from vllm.v1.core.scheduler import SchedulerOutput
+
+logger = init_logger(__name__)
+PG_WAIT_TIMEOUT = 60
+
+try:
+    import ray
+    from ray.util import placement_group_table
+    from ray.util.placement_group import PlacementGroup
+    try:
+        from ray._private.state import available_resources_per_node
+    except ImportError:
+        # Ray 2.9.x doesn't expose `available_resources_per_node`
+        from ray._private.state import state as _state
+        available_resources_per_node = _state._available_resources_per_node
+
+    class RayWorkerWrapper(WorkerWrapperBase):
+
+        def __init__(self, *args, **kwargs) -> None:
+            super().__init__(*args, **kwargs)
+            # The compiled DAG runs its main execution in a different
+            # thread, so cuda.set_device must be called on that thread.
+            # This flag indicates whether set_device has been called on
+            # that thread. It will be removed soon.
+            self.compiled_dag_cuda_device_set = False
+
+        def get_node_ip(self) -> str:
+            return get_ip()
+
+        def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
+            node_id = ray.get_runtime_context().get_node_id()
+            gpu_ids = ray.get_gpu_ids()
+            return node_id, gpu_ids
+
+        def setup_device_if_necessary(self):
+            # TODO(swang): This is needed right now because Ray CG executes
+            # on a background thread, so we need to reset torch's current
+            # device.
+            # We can remove this API after it is fixed in compiled graph.
+            import torch
+            assert self.worker is not None, "Worker is not initialized"
+            if not self.compiled_dag_cuda_device_set:
+                torch.cuda.set_device(self.worker.device)
+                self.compiled_dag_cuda_device_set = True
+
+        def execute_model(
+            self,
+            scheduler_output: "SchedulerOutput",
+        ) -> ModelRunnerOutput:
+            self.setup_device_if_necessary()
+            assert self.worker is not None, "Worker is not initialized"
+            output = self.worker.model_runner.execute_model(scheduler_output)
+            return output
+
+    ray_import_err = None
+
+except ImportError as e:
+    ray = None  # type: ignore
+    ray_import_err = e
+    RayWorkerWrapper = None  # type: ignore
+
+
+def ray_is_available() -> bool:
+    """Returns True if Ray is available."""
+    return ray is not None
+
+
+def assert_ray_available():
+    """
+    Raise an exception if Ray is not available.
+    """
+    if ray is None:
+        raise ValueError("Failed to import Ray, please install Ray with "
+                         "`pip install ray`.") from ray_import_err
+
+
+def _verify_bundles(placement_group: "PlacementGroup",
+                    parallel_config: ParallelConfig, device_str: str):
+    """
+    Verify a given placement group has bundles located in the right place.
+
+    There are 2 rules.
+    - Warn if all tensor parallel workers cannot fit in a single node.
+    - Fail if driver node is not included in a placement group.
+
+    Args:
+        placement_group: The placement group to verify.
+        parallel_config: The parallel configuration.
+        device_str: The required device.
+    """
+    assert ray.is_initialized(), (
+        "Ray is not initialized although distributed-executor-backend is ray.")
+    pg_data = placement_group_table(placement_group)
+    # bundle_idx -> node_id
+    bundle_to_node_ids = pg_data["bundles_to_node_id"]
+    # bundle_idx -> bundle (e.g., {"GPU": 1})
+    bundles = pg_data["bundles"]
+    # node_id -> List of bundle (e.g., {"GPU": 1})
+    node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list)
+
+    for bundle_idx, node_id in bundle_to_node_ids.items():
+        node_id_to_bundle[node_id].append(bundles[bundle_idx])
+    driver_node_id = ray.get_runtime_context().get_node_id()
+
+    if driver_node_id not in node_id_to_bundle:
+        raise RuntimeError(
+            f"driver node id {driver_node_id} is not included in a placement "
+            f"group {placement_group.id}. Node id -> bundles "
+            f"{node_id_to_bundle}. "
+            "You don't have enough GPUs available in a current node. Check "
+            "`ray status` to see if you have available GPUs in a node "
+            f"{driver_node_id} before starting an vLLM engine.")
+
+    for node_id, bundles in node_id_to_bundle.items():
+        if len(bundles) < parallel_config.tensor_parallel_size:
+            logger.warning(
+                "tensor_parallel_size=%d "
+                "is bigger than a reserved number of %ss (%d "
+                "%ss) in a node %s. Tensor parallel workers can be "
+                "spread out to 2+ nodes which can degrade the performance "
+                "unless you have fast interconnect across nodes, like "
+                "Infiniband. To resolve this issue, make sure you have more "
+                "than %d GPUs available at each node.",
+                parallel_config.tensor_parallel_size, device_str, len(bundles),
+                device_str, node_id, parallel_config.tensor_parallel_size)
+
+
+def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
+    """Wait until a placement group is ready.
+
+    It prints the informative log messages if the placement group is
+    not created within time.
+
+    """
+    # Wait until PG is ready - this will block until all
+    # requested resources are available, and will timeout
+    # if they cannot be provisioned.
+    placement_group_specs = current_placement_group.bundle_specs
+
+    s = time.time()
+    pg_ready_ref = current_placement_group.ready()
+    wait_interval = 10
+    while time.time() - s < PG_WAIT_TIMEOUT:
+        ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval)
+        if len(ready) > 0:
+            break
+
+        # Exponential backoff for warning print.
+        wait_interval *= 2
+        logger.info(
+            "Waiting for creating a placement group of specs for "
+            "%d seconds. specs=%s. Check "
+            "`ray status` to see if you have enough resources.",
+            int(time.time() - s), placement_group_specs)
+
+    try:
+        ray.get(pg_ready_ref, timeout=0)
+    except ray.exceptions.GetTimeoutError:
+        raise ValueError(
+            "Cannot provide a placement group of "
+            f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
+            "`ray status` to make sure the cluster has enough resources."
+        ) from None
+
+
+def initialize_ray_cluster(
+    parallel_config: ParallelConfig,
+    ray_address: Optional[str] = None,
+):
+    """Initialize the distributed cluster with Ray.
+
+    It connects to the Ray cluster and creates a placement group
+    for the workers, which includes the specification of the resources
+    for each distributed worker.
+
+    Args:
+        parallel_config: The configurations for parallel execution.
+        ray_address: The address of the Ray cluster. If None, uses
+            the default Ray cluster address.
+    """
+    assert_ray_available()
+
+    # Connect to a ray cluster.
+    if current_platform.is_rocm() or current_platform.is_xpu():
+        # Try to connect existing ray instance and create a new one if not found
+        try:
+            ray.init("auto")
+        except ConnectionError:
+            logger.warning(
+                "No existing RAY instance detected. "
+                "A new instance will be launched with current node resources.")
+            ray.init(address=ray_address,
+                     ignore_reinit_error=True,
+                     num_gpus=parallel_config.world_size)
+    else:
+        ray.init(address=ray_address, ignore_reinit_error=True)
+
+    if parallel_config.placement_group:
+        # Placement group is already set.
+        return
+
+    device_str = "GPU" if not current_platform.is_tpu() else "TPU"
+    # Create placement group for worker processes
+    current_placement_group = ray.util.get_current_placement_group()
+    if current_placement_group:
+        # We are in a placement group
+        bundles = current_placement_group.bundle_specs
+        # Verify that we can use the placement group.
+        device_bundles = 0
+        for bundle in bundles:
+            bundle_devices = bundle.get(device_str, 0)
+            if bundle_devices > 1:
+                raise ValueError(
+                    "Placement group bundle cannot have more than 1 "
+                    f"{device_str}.")
+            if bundle_devices:
+                device_bundles += 1
+        if parallel_config.world_size > device_bundles:
+            raise ValueError(
+                f"The number of required {device_str}s exceeds the total "
+                f"number of available {device_str}s in the placement group. "
+                f"Required number of devices: {parallel_config.world_size}. "
+                f"Total number of devices: {device_bundles}.")
+    else:
+        num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
+        if parallel_config.world_size > num_devices_in_cluster:
+            raise ValueError(
+                f"The number of required {device_str}s exceeds the total "
+                f"number of available {device_str}s in the cluster.")
+        # Create a new placement group
+        placement_group_specs: List[Dict[str, float]] = ([{
+            device_str: 1.0
+        } for _ in range(parallel_config.world_size)])
+
+        # vLLM engine is also a worker to execute model with an accelerator,
+        # so it requires to have the device in a current node. Check if
+        # the current node has at least one device.
+        current_ip = get_ip()
+        current_node_id = ray.get_runtime_context().get_node_id()
+        current_node_resource = available_resources_per_node()[current_node_id]
+        if current_node_resource.get(device_str, 0) < 1:
+            raise ValueError(
+                f"Current node has no {device_str} available. "
+                f"{current_node_resource=}. vLLM engine cannot start without "
+                f"{device_str}. Make sure you have at least 1 {device_str} "
+                f"available in a node {current_node_id=} {current_ip=}.")
+        # This way, at least one bundle is required to be created in the
+        # current node.
+        placement_group_specs[0][f"node:{current_ip}"] = 0.001
+
+        # By default, Ray packs resources as much as possible.
+        current_placement_group = ray.util.placement_group(
+            placement_group_specs, strategy="PACK")
+        _wait_until_pg_ready(current_placement_group)
+
+    assert current_placement_group is not None
+    _verify_bundles(current_placement_group, parallel_config, device_str)
+    # Set the placement group in the parallel config
+    parallel_config.placement_group = current_placement_group
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 1737d096e811d..f4783ae366ef0 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -1,5 +1,5 @@
 import enum
-from typing import List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 
 from vllm.inputs import DecoderOnlyInputs, SingletonInputsAdapter, token_inputs
 from vllm.lora.request import LoRARequest
@@ -9,6 +9,9 @@
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.utils import ConstantList
 
+if TYPE_CHECKING:
+    from vllm.v1.core.kv_cache_utils import BlockHashType
+
 
 class Request:
 
@@ -45,6 +48,7 @@ def __init__(
         self._all_token_ids: List[int] = self.prompt_token_ids.copy()
         self.num_computed_tokens = 0
 
+        # Multi-modal input metadata.
         mm_positions = self.inputs.multi_modal_placeholders
         if mm_positions:
             # FIXME(woosuk): Support other modalities.
@@ -56,6 +60,12 @@ def __init__(
         if self.inputs.multi_modal_inputs:
             self.mm_inputs = self.inputs.multi_modal_inputs
 
+        self.mm_hashes: List[str] = self.inputs.multi_modal_hashes
+
+        # Cache the computed kv block hashes of the request to avoid
+        # recomputing.
+        self._kv_block_hashes: List[BlockHashType] = []
+
     @classmethod
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
         return cls(
@@ -65,6 +75,7 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
                 prompt=request.prompt,
                 multi_modal_data=None,
                 multi_modal_inputs=request.mm_inputs,
+                multi_modal_hashes=request.mm_hashes,
                 multi_modal_placeholders=request.mm_placeholders,
                 mm_processor_kwargs=None,
             ),
@@ -121,6 +132,17 @@ def get_num_encoder_tokens(self, input_id: int) -> int:
         num_tokens = self.mm_positions[input_id]["length"]
         return num_tokens
 
+    @property
+    def kv_block_hashes(self) -> ConstantList["BlockHashType"]:
+        # Wrap in a ConstantList to prevent appending to the cache directly.
+        return ConstantList(self._kv_block_hashes)
+
+    def set_kv_block_hashes(self, value: List["BlockHashType"]) -> None:
+        self._kv_block_hashes = value  # Stored by reference; no copy.
+
+    def append_kv_block_hashes(self, block_hash: "BlockHashType") -> None:
+        self._kv_block_hashes.append(block_hash)  # Extends the cached list.
+
 
 class RequestStatus(enum.IntEnum):
     """Status of a request."""
diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py
index 9ef36f2e6b212..d60f7eb5d76f9 100644
--- a/vllm/v1/sample/metadata.py
+++ b/vllm/v1/sample/metadata.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Dict
+from typing import Dict, List, Optional, Set
 
 import torch
 
@@ -19,3 +19,13 @@ class SamplingMetadata:
     generators: Dict[int, torch.Generator]
 
     max_num_logprobs: int
+
+    no_penalties: bool
+    prompt_token_ids: Optional[torch.Tensor]
+    frequency_penalties: torch.Tensor
+    presence_penalties: torch.Tensor
+    repetition_penalties: torch.Tensor
+
+    output_token_ids: List[List[int]]
+    min_tokens: List[int]
+    stop_token_ids: List[Set[int]]
diff --git a/vllm/v1/sample/ops/__init__.py b/vllm/v1/sample/ops/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py
new file mode 100644
index 0000000000000..2796d049457d0
--- /dev/null
+++ b/vllm/v1/sample/ops/penalties.py
@@ -0,0 +1,59 @@
+from typing import List, Set, Tuple
+
+import torch
+
+from vllm.model_executor.layers.utils import apply_penalties
+from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+
+
+def apply_min_token_penalties(logits: torch.Tensor,
+                              output_token_ids: List[List[int]],
+                              stop_token_ids: List[Set[int]],
+                              min_tokens: List[int]) -> None:
+    """
+    Masks the stop tokens (logit set to -inf, in place) of every request
+    whose output is still shorter than its `min_tokens`.
+    """
+    masked: List[Tuple[int, int]] = [
+        (row, token_id) for row, required in enumerate(min_tokens)
+        if len(output_token_ids[row]) < required
+        for token_id in stop_token_ids[row]
+    ]
+    if masked:
+        logits[tuple(zip(*masked))] = -float("inf")
+
+
+def apply_all_penalties(
+    logits: torch.Tensor,
+    prompt_token_ids: torch.Tensor,
+    presence_penalties: torch.Tensor,
+    frequency_penalties: torch.Tensor,
+    repetition_penalties: torch.Tensor,
+    output_token_ids: List[List[int]],
+) -> torch.Tensor:
+    """
+    Pads the output token ids into a tensor on `logits.device`, then applies
+    the presence, frequency and repetition penalties via `apply_penalties`.
+    """
+    _, vocab_size = logits.shape
+    padded = _convert_to_tensors(output_token_ids, vocab_size, logits.device)
+    return apply_penalties(
+        logits, prompt_token_ids, padded, presence_penalties,
+        frequency_penalties, repetition_penalties)
+
+
+def _convert_to_tensors(output_token_ids: List[List[int]], vocab_size: int,
+                        device: torch.device) -> torch.Tensor:
+    """
+    Pads the per-request output token id lists into one int64 tensor that is
+    created on the CPU (pinned when `is_pin_memory_available()` says so) and
+    then copied to `device` with `non_blocking=True`.
+    """
+    # `vocab_size` is used as the pad value since we don't have a token_id
+    # of this value.
+    padded_cpu = make_tensor_with_pad(output_token_ids,
+                                      pad=vocab_size,
+                                      device="cpu",
+                                      dtype=torch.int64,
+                                      pin_memory=is_pin_memory_available())
+    return padded_cpu.to(device, non_blocking=True)
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
new file mode 100644
index 0000000000000..c088c3c129ca5
--- /dev/null
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -0,0 +1,201 @@
+from typing import Dict
+
+import torch
+import torch.nn as nn
+
+from vllm import envs
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+
+logger = init_logger(__name__)
+
+try:
+    import flashinfer.sampling
+    is_flashinfer_available = True
+except ImportError:
+    is_flashinfer_available = False
+
+
+class TopKTopPSampler(nn.Module):
+    """Samples token ids from logits with optional top-k / top-p filtering.
+
+    `__init__` binds `self.forward` to `forward_cuda` on CUDA when FlashInfer
+    is installed and not disabled, and to `forward_native` otherwise."""
+
+    def __init__(self):
+        super().__init__()
+        # Default to the PyTorch-native path; only CUDA can upgrade it.
+        self.forward = self.forward_native
+        # `Platform.is_cuda` is a method, so it must be called: the bare
+        # attribute is always truthy and would treat every platform as CUDA.
+        if not current_platform.is_cuda():
+            return
+        if not is_flashinfer_available:
+            logger.warning(
+                "FlashInfer is not available. Falling back to the PyTorch-"
+                "native implementation of top-p & top-k sampling. For the "
+                "best performance, please install FlashInfer.")
+        elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
+            # NOTE(woosuk): VLLM_USE_FLASHINFER_SAMPLER defaults to None,
+            # which V0 interprets as False (FlashInfer sampling is opt-in
+            # there) and V1 as True. Hence the `is not False` check rather
+            # than a plain truthiness test.
+            logger.info("Using FlashInfer for top-p & top-k sampling.")
+            self.forward = self.forward_cuda
+        else:
+            logger.warning(
+                "FlashInfer is available, but it is not enabled. "
+                "Falling back to the PyTorch-native implementation of "
+                "top-p & top-k sampling. For the best performance, "
+                "please set VLLM_USE_FLASHINFER_SAMPLER=1.")
+
+    def forward_native(
+        self,
+        logits: torch.Tensor,
+        generators: Dict[int, torch.Generator],
+        no_top_k: bool,
+        k: torch.Tensor,
+        no_top_p: bool,
+        p: torch.Tensor,
+    ) -> torch.Tensor:
+        """PyTorch-native implementation of top-k and top-p sampling."""
+        logits = apply_top_k_top_p(logits, no_top_k, k, no_top_p, p)
+        probs = logits.softmax(dim=-1, dtype=torch.float32)
+        return random_sample(probs, generators)
+
+    def forward_cuda(
+        self,
+        logits: torch.Tensor,
+        generators: Dict[int, torch.Generator],
+        no_top_k: bool,
+        k: torch.Tensor,
+        no_top_p: bool,
+        p: torch.Tensor,
+    ) -> torch.Tensor:
+        """More optimized implementation for top-k and top-p sampling."""
+        probs = logits.softmax(dim=-1, dtype=torch.float32)
+        if no_top_k and no_top_p:
+            # We prefer `random_sample` over `flashinfer_sample` when sorting is
+            # not needed. This is because `random_sample` does not require
+            # CPU-GPU synchronization while `flashinfer_sample` does.
+            return random_sample(probs, generators)
+        return flashinfer_sample(probs, no_top_k, k, no_top_p, p, generators)
+
+
+def apply_top_k_top_p(
+    logits: torch.Tensor,
+    no_top_k: bool,
+    k: torch.Tensor,
+    no_top_p: bool,
+    p: torch.Tensor,
+) -> torch.Tensor:
+    """Mask out (set to -inf) the logits outside the top-k / top-p sets.
+
+    Returns `logits` itself when both filters are disabled; otherwise returns
+    a new tensor laid out in the original column order. This function sorts
+    the logits tensor, which can be slow for large batches.
+    """
+    if no_top_k and no_top_p:
+        return logits
+    # Ascending sort: each row's largest logit ends up in the last column.
+    sorted_logits, sort_order = logits.sort(dim=-1, descending=False)
+
+    if not no_top_k:
+        # Column of the k-th largest logit in each ascending-sorted row.
+        kth_column = sorted_logits.size(1) - k.to(torch.long)
+        kth_value = sorted_logits.gather(1, kth_column.unsqueeze(dim=1))
+        # Drop everything strictly smaller than the k-th largest value.
+        sorted_logits.masked_fill_(sorted_logits < kth_value, -float("inf"))
+
+    if not no_top_p:
+        cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
+        # Drop the low-probability prefix whose mass is at most 1 - p.
+        below_top_p = cumulative_probs <= 1 - p.unsqueeze(dim=1)
+        # Always keep the most likely token of each row.
+        below_top_p[:, -1] = False
+        sorted_logits.masked_fill_(below_top_p, -float("inf"))
+
+    # Scatter the masked values back to their original vocabulary positions.
+    return sorted_logits.scatter(dim=-1, index=sort_order, src=sorted_logits)
+
+
+def random_sample(
+    probs: torch.Tensor,
+    generators: Dict[int, torch.Generator],
+) -> torch.Tensor:
+    """Draw one token id per row of `probs` (which is modified in place).
+
+    Each row is divided by exponential noise and the argmax is taken. We use
+    this instead of torch.multinomial because torch.multinomial causes
+    CPU-GPU synchronization.
+    """
+    noise = torch.empty_like(probs)
+    num_rows = probs.shape[0]
+    # NOTE(woosuk): Requests without their own seeds are the common case, so
+    # fill the whole batch at once first (skipped when the number of
+    # generators equals the batch size), then overwrite the seeded rows.
+    if len(generators) != num_rows:
+        noise.exponential_()
+    # TODO(woosuk): This can be slow because we handle each request
+    # one by one. Optimize this.
+    for row, generator in generators.items():
+        noise[row].exponential_(generator=generator)
+    return probs.div_(noise).argmax(dim=-1).view(-1)
+
+
+def flashinfer_sample(
+    probs: torch.Tensor,
+    no_top_k: bool,
+    k: torch.Tensor,
+    no_top_p: bool,
+    p: torch.Tensor,
+    generators: Dict[int, torch.Generator],
+) -> torch.Tensor:
+    """Sample from the probabilities using FlashInfer.
+
+    Statistically, this function is equivalent to the `random_sample` function.
+    However, it is faster because it relies on rejection sampling instead of
+    sorting the logits tensor.
+
+    NOTE: The outputs of this function do not necessarily match the outputs of
+    the `random_sample` function. It only guarantees that the outputs are
+    statistically equivalent.
+
+    NOTE: This function includes CPU-GPU synchronization, while `random_sample`
+    does not. Call this function at the end of the forward pass to minimize
+    the synchronization overhead.
+    """
+    assert not (no_top_k and no_top_p)  # At least one filter must be active.
+    max_top_k_round = 32  # Number of rows of uniform samples drawn.
+    batch_size = probs.shape[0]
+    uniform_samples = torch.empty((max_top_k_round, batch_size),
+                                  device=probs.device)
+    if len(generators) != batch_size:
+        uniform_samples.uniform_()
+    if generators:
+        for i, generator in generators.items():
+            uniform_samples[:, i].uniform_(generator=generator)
+
+    if no_top_k:
+        # Top-p only.
+        next_token_ids, success = flashinfer.sampling.top_p_sampling_from_probs(
+            probs, uniform_samples, p, deterministic=True)
+    elif no_top_p:
+        # Top-k only.
+        next_token_ids, success = flashinfer.sampling.top_k_sampling_from_probs(
+            probs, uniform_samples, k, deterministic=True)
+    else:
+        # Both top-k and top-p.
+        next_token_ids, success = (
+            flashinfer.sampling.top_k_top_p_sampling_from_probs(
+                probs, uniform_samples, k, p, deterministic=True))
+
+    # NOTE: CPU-GPU synchronization happens here. Fallback if any sample failed.
+    if not success.all():
+        if not no_top_k:
+            probs = flashinfer.sampling.top_k_renorm_prob(probs, k)
+        if not no_top_p:
+            probs = flashinfer.sampling.top_p_renorm_prob(probs, p)
+        next_token_ids = flashinfer.sampling.sampling_from_probs(
+            probs, uniform_samples[0], deterministic=True)
+    return next_token_ids.view(-1)
diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index d1a755be01ff7..7cd42ca211a22 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -1,42 +1,55 @@
 """A layer that samples the next tokens from the model's outputs."""
-from typing import Dict
+from typing import Tuple
 
 import torch
 import torch.nn as nn
 
 from vllm.v1.outputs import SamplerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.sample.ops.penalties import (apply_all_penalties,
+                                          apply_min_token_penalties)
+from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler
 
 _SAMPLING_EPS = 1e-5
 
 
 class Sampler(nn.Module):
 
+    def __init__(self):
+        super().__init__()
+        self.topk_topp_sampler = TopKTopPSampler()  # Invoked by `sample`.
+
     def forward(
         self,
         logits: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> SamplerOutput:
-        logits = self.apply_temperature(logits, sampling_metadata.temperature)
-        logits = self.apply_top_k_top_p(logits, sampling_metadata)
-
-        probs = self.get_probs(logits)
-        sampled = self.sample(probs, sampling_metadata)
-        # Use int32 to reduce the tensor size.
-        sampled = sampled.to(torch.int32)
-
-        if sampling_metadata.max_num_logprobs > 0:
-            logprobs = self.get_logprobs(logits)
-            # FIXME: Mask the sampled token_id, get topk logprobs,
-            # and concatenate the topk with the sampled token_id.
-            topk_logprobs, topk_indices = torch.topk(
-                logprobs, sampling_metadata.max_num_logprobs, dim=-1)
-            # Use int32 to reduce the tensor size.
-            topk_indices = topk_indices.to(torch.int32)
+        needs_logprobs = sampling_metadata.max_num_logprobs > 0
+        if needs_logprobs:
+            # NOTE(woosuk): Use the original logits (before any penalties or
+            # temperature scaling) for the top-k logprobs.
+            # This is different from the V0 sampler, which uses the logits that
+            # is used for sampling (after penalties and temperature scaling).
+            # NOTE: We compute logprobs first because the below ops may
+            # modify the logits tensor in-place (and we don't want to clone
+            # the logits tensor for memory efficiency).
+            topk_logprobs, topk_indices = self.get_topk_logprobs(
+                logits, sampling_metadata)
         else:
             topk_logprobs = None
             topk_indices = None
 
+        # Use float32 for the logits.
+        logits = logits.to(torch.float32)
+        # Apply penalties (e.g., min_tokens, freq_penalties).
+        logits = self.apply_penalties(logits, sampling_metadata)
+        # Apply temperature.
+        logits = self.apply_temperature(logits, sampling_metadata.temperature)
+        # Sample the next token.
+        sampled = self.sample(logits, sampling_metadata)
+        # Use int32 to reduce the tensor size.
+        sampled = sampled.to(torch.int32)
+
         # NOTE: CPU-GPU synchronization happens here.
         sampler_output = SamplerOutput(
             sampled_token_ids=sampled.tolist(),
@@ -52,71 +65,37 @@ def apply_temperature(
         logits: torch.Tensor,
         temp: torch.Tensor,
     ) -> torch.Tensor:
-        # Use float32 to apply temperature scaling.
-        logits = logits.to(torch.float32)
         # Avoid division by zero.
         temp = torch.where(temp < _SAMPLING_EPS, 1.0, temp)
         # Use in-place division to avoid creating a new tensor.
         logits.div_(temp.unsqueeze(dim=1))
         return logits
 
-    def apply_top_k_top_p(
+    def greedy_sample(self, logits: torch.Tensor) -> torch.Tensor:
+        return logits.argmax(dim=-1).view(-1)  # Flattened to 1-D.
+
+    def sample(
         self,
         logits: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> torch.Tensor:
-        return _apply_top_k_top_p(
+        assert not (sampling_metadata.all_greedy
+                    and sampling_metadata.all_random)
+        if sampling_metadata.all_greedy:
+            return self.greedy_sample(logits)
+
+        random_sampled = self.topk_topp_sampler(
             logits,
+            sampling_metadata.generators,
             sampling_metadata.no_top_k,
             sampling_metadata.top_k,
             sampling_metadata.no_top_p,
             sampling_metadata.top_p,
         )
-
-    def get_probs(self, logits: torch.Tensor) -> torch.Tensor:
-        return torch.softmax(logits, dim=-1, dtype=torch.float32)
-
-    def get_logprobs(self, logits: torch.Tensor) -> torch.Tensor:
-        return torch.log_softmax(logits, dim=-1, dtype=torch.float32)
-
-    def greedy_sample(self, probs: torch.Tensor) -> torch.Tensor:
-        return probs.argmax(dim=-1).view(-1)
-
-    def random_sample(
-        self,
-        probs: torch.Tensor,
-        generators: Dict[int, torch.Generator],
-    ) -> torch.Tensor:
-        q = torch.empty_like(probs)
-        # NOTE(woosuk): To batch-process the requests without their own seeds,
-        # which is the common case, we first assume that every request does
-        # not have its own seed. Then, we overwrite the values for the requests
-        # that have their own seeds.
-        if len(generators) != probs.shape[0]:
-            # This might still be done here unnecessarily if there are greedies
-            q.exponential_()
-        if generators:
-            # TODO(woosuk): This can be slow because we handle each request
-            # one by one. Optimize this.
-            for i, generator in generators.items():
-                q[i].exponential_(generator=generator)
-        return probs.div_(q).argmax(dim=-1).view(-1)
-
-    def sample(
-        self,
-        probs: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> torch.Tensor:
-        assert not (sampling_metadata.all_greedy
-                    and sampling_metadata.all_random)
-        if sampling_metadata.all_greedy:
-            return self.greedy_sample(probs)
         if sampling_metadata.all_random:
-            return self.random_sample(probs, sampling_metadata.generators)
+            return random_sampled
 
-        greedy_sampled = self.greedy_sample(probs)
-        random_sampled = self.random_sample(probs,
-                                            sampling_metadata.generators)
+        greedy_sampled = self.greedy_sample(logits)
         sampled = torch.where(
             sampling_metadata.temperature < _SAMPLING_EPS,
             greedy_sampled,
@@ -124,36 +103,34 @@ def sample(
         )
         return sampled
 
+    def get_topk_logprobs(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Return the top `max_num_logprobs` logprobs and their token ids."""
+        # FIXME: Mask the sampled token_id, get topk logprobs,
+        # and concatenate the topk with the sampled token_id.
+        num_top = sampling_metadata.max_num_logprobs
+        values, token_ids = torch.log_softmax(
+            logits, dim=-1, dtype=torch.float32).topk(num_top, dim=-1)
+        # Use int32 to reduce the tensor size.
+        return values, token_ids.to(torch.int32)
 
-# TODO(woosuk): Optimize this with a custom kernel.
-def _apply_top_k_top_p(
-    logits: torch.Tensor,
-    no_top_k: bool,
-    k: torch.Tensor,
-    no_top_p: bool,
-    p: torch.Tensor,
-) -> torch.Tensor:
-    if no_top_k and no_top_p:
+    def apply_penalties(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> torch.Tensor:
+        """Apply the min-tokens mask, then the other penalties if enabled."""
+        meta = sampling_metadata
+        # Always invoked; it masks stop tokens in place where required.
+        apply_min_token_penalties(logits, meta.output_token_ids,
+                                  meta.stop_token_ids, meta.min_tokens)
+        if not meta.no_penalties:
+            assert meta.prompt_token_ids is not None
+            logits = apply_all_penalties(
+                logits, meta.prompt_token_ids, meta.presence_penalties,
+                meta.frequency_penalties, meta.repetition_penalties,
+                meta.output_token_ids)
         return logits
-    logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
-
-    if not no_top_k:
-        # Apply top-k.
-        top_k_mask = logits_sort.size(1) - k.to(torch.long)
-        # Get all the top_k values.
-        top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1))
-        top_k_mask = logits_sort < top_k_mask
-        logits_sort.masked_fill_(top_k_mask, -float("inf"))
-
-    if not no_top_p:
-        # Apply top-p.
-        probs_sort = logits_sort.softmax(dim=-1)
-        probs_sum = probs_sort.cumsum(dim=-1)
-        top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1)
-        # at least one
-        top_p_mask[:, -1] = False
-        logits_sort.masked_fill_(top_p_mask, -float("inf"))
-
-    # Re-sort the probabilities.
-    logits = logits_sort.scatter(dim=-1, index=logits_idx, src=logits_sort)
-    return logits
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 5f327d7066830..e802c6439b740 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -1,4 +1,3 @@
-from collections import OrderedDict
 from collections.abc import Sequence
 from contextlib import contextmanager
 from typing import (Any, Generic, Iterator, List, Optional, TypeVar, Union,
@@ -102,27 +101,3 @@ def make_zmq_socket(
 
     finally:
         ctx.destroy(linger=0)
-
-
-K = TypeVar('K')
-V = TypeVar('V')
-
-
-class LRUDictCache(Generic[K, V]):
-
-    def __init__(self, size: int):
-        self.cache: OrderedDict[K, V] = OrderedDict()
-        self.size = size
-
-    def get(self, key: K, default=None) -> V:
-        if key not in self.cache:
-            return default
-
-        self.cache.move_to_end(key)
-        return self.cache[key]
-
-    def put(self, key: K, value: V):
-        self.cache[key] = value
-        self.cache.move_to_end(key)
-        if len(self.cache) > self.size:
-            self.cache.popitem(last=False)
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 5c113c74778df..6c4d300ec6efe 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -43,12 +43,14 @@ def __init__(
         max_num_blocks_per_req: int,
         device: torch.device,
         pin_memory: bool,
+        vocab_size: int,
     ):
         self.max_num_reqs = max_num_reqs
         self.max_model_len = max_model_len
         self.max_num_blocks_per_req = max_num_blocks_per_req
         self.device = device
         self.pin_memory = pin_memory
+        self.vocab_size = vocab_size
 
         self.req_ids: List[Optional[str]] = [None] * max_num_reqs
         self.req_id_to_index: Dict[str, int] = {}
@@ -63,6 +65,7 @@ def __init__(
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
         self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
+        self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
 
         # Attention-related.
         self.block_table = torch.zeros(
@@ -110,6 +113,50 @@ def __init__(
         self.top_k_cpu = self.top_k_cpu_tensor.numpy()
         self.top_k_reqs: Set[str] = set()
 
+        # Frequency penalty related data structures
+        self.frequency_penalties = torch.empty((max_num_reqs, ),
+                                               dtype=torch.float,
+                                               device=device)
+        self.frequency_penalties_cpu_tensor = torch.empty(
+            (max_num_reqs, ),
+            dtype=torch.float,
+            device="cpu",
+            pin_memory=pin_memory)
+        self.frequency_penalties_cpu = \
+            self.frequency_penalties_cpu_tensor.numpy()
+        self.frequency_penalties_reqs: Set[str] = set()
+
+        # Presence penalty related data structures
+        self.presence_penalties = torch.empty((max_num_reqs, ),
+                                              dtype=torch.float,
+                                              device=device)
+        self.presence_penalties_cpu_tensor = torch.empty((max_num_reqs, ),
+                                                         dtype=torch.float,
+                                                         device="cpu",
+                                                         pin_memory=pin_memory)
+        self.presence_penalties_cpu = \
+            self.presence_penalties_cpu_tensor.numpy()
+        self.presence_penalties_reqs: Set[str] = set()
+
+        # Repetition penalty related data structures
+        self.repetition_penalties = torch.empty((max_num_reqs, ),
+                                                dtype=torch.float,
+                                                device=device)
+        self.repetition_penalties_cpu_tensor = torch.empty(
+            (max_num_reqs, ),
+            dtype=torch.float,
+            device="cpu",
+            pin_memory=pin_memory)
+        self.repetition_penalties_cpu = \
+            self.repetition_penalties_cpu_tensor.numpy()
+        self.repetition_penalties_reqs: Set[str] = set()
+
+        self.min_tokens: List[int] = [0] * max_num_reqs
+        self.stop_token_ids: List[Set[int]] = [
+            set() for _ in range(max_num_reqs)
+        ]
+        self.prompt_token_ids: Optional[torch.Tensor] = None
+
         # req_index -> generator
         # NOTE(woosuk): The indices of the requests that do not have their own
         # generator should not be included in the dictionary.
@@ -133,6 +180,7 @@ def add_request(
 
         # Copy the prompt token ids and output token ids.
         num_prompt_tokens = len(request.prompt_token_ids)
+        self.num_prompt_tokens[req_index] = num_prompt_tokens
         self.token_ids_cpu[
             req_index, :num_prompt_tokens] = request.prompt_token_ids
         start_idx = num_prompt_tokens
@@ -157,6 +205,20 @@ def add_request(
         self.top_k_cpu[req_index] = sampling_params.top_k
         if sampling_params.top_k > 0:
             self.top_k_reqs.add(req_id)
+        self.frequency_penalties_cpu[req_index] = \
+            sampling_params.frequency_penalty
+        if sampling_params.frequency_penalty != 0.0:
+            self.frequency_penalties_reqs.add(req_id)
+        self.presence_penalties_cpu[req_index] = \
+            sampling_params.presence_penalty
+        if sampling_params.presence_penalty != 0.0:
+            self.presence_penalties_reqs.add(req_id)
+        self.repetition_penalties_cpu[req_index] = \
+            sampling_params.repetition_penalty
+        if sampling_params.repetition_penalty != 1.0:
+            self.repetition_penalties_reqs.add(req_id)
+        self.min_tokens[req_index] = sampling_params.min_tokens
+        self.stop_token_ids[req_index] = sampling_params.all_stop_token_ids
 
         # NOTE(woosuk): self.generators should not include the requests that
         # do not have their own generator.
@@ -179,6 +241,9 @@ def remove_request(self, req_id: str) -> Optional[int]:
         self.random_reqs.discard(req_id)
         self.top_p_reqs.discard(req_id)
         self.top_k_reqs.discard(req_id)
+        self.frequency_penalties_reqs.discard(req_id)
+        self.presence_penalties_reqs.discard(req_id)
+        self.repetition_penalties_reqs.discard(req_id)
         self.generators.pop(req_index, None)
         self.num_logprobs.pop(req_id, None)
         self.prompt_logprob_reqs.discard(req_id)
@@ -191,6 +256,9 @@ def clear(self) -> None:
         self.random_reqs.clear()
         self.top_p_reqs.clear()
         self.top_k_reqs.clear()
+        self.frequency_penalties_reqs.clear()
+        self.presence_penalties_reqs.clear()
+        self.repetition_penalties_reqs.clear()
         self.generators.clear()
         self.num_logprobs.clear()
         self.prompt_logprob_reqs.clear()
@@ -224,6 +292,8 @@ def condense(self, empty_req_indices: List[int]) -> None:
             # block_table_cpu.
             self.token_ids_cpu[empty_index] = self.token_ids_cpu[
                 last_req_index]
+            self.num_prompt_tokens[empty_index] = \
+                self.num_prompt_tokens[last_req_index]
             self.num_computed_tokens_cpu[
                 empty_index] = self.num_computed_tokens_cpu[last_req_index]
             self.block_table_cpu[empty_index] = self.block_table_cpu[
@@ -232,6 +302,15 @@ def condense(self, empty_req_indices: List[int]) -> None:
                 last_req_index]
             self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index]
             self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index]
+            self.frequency_penalties_cpu[empty_index] = \
+                self.frequency_penalties_cpu[last_req_index]
+            self.presence_penalties_cpu[empty_index] = \
+                self.presence_penalties_cpu[last_req_index]
+            self.repetition_penalties_cpu[empty_index] = \
+                self.repetition_penalties_cpu[last_req_index]
+            self.min_tokens[empty_index] = self.min_tokens[last_req_index]
+            self.stop_token_ids[empty_index] = \
+                self.stop_token_ids[last_req_index]
             generator = self.generators.pop(last_req_index, None)
             if generator is not None:
                 self.generators[empty_index] = generator
@@ -241,6 +320,7 @@ def condense(self, empty_req_indices: List[int]) -> None:
 
     def make_sampling_metadata(
         self,
+        req_id_output_token_ids: Dict[str, List[int]],
         skip_copy: bool = False,
     ) -> SamplingMetadata:
         if not skip_copy:
@@ -250,6 +330,37 @@ def make_sampling_metadata(
                 self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True)
             self.top_k[:self.num_reqs].copy_(
                 self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True)
+            if not self.no_penalties:
+                # Since syncing these tensors is expensive only copy them
+                # if necessary i.e. if there are requests which require
+                # penalties to be applied during sampling.
+                self.frequency_penalties[:self.num_reqs].copy_(
+                    self.frequency_penalties_cpu_tensor[:self.num_reqs],
+                    non_blocking=True)
+                self.presence_penalties[:self.num_reqs].copy_(
+                    self.presence_penalties_cpu_tensor[:self.num_reqs],
+                    non_blocking=True)
+                self.repetition_penalties[:self.num_reqs].copy_(
+                    self.repetition_penalties_cpu_tensor[:self.num_reqs],
+                    non_blocking=True)
+                # The prompt tokens are used only for applying penalties during
+                # the sampling process. Hence copy these tensors only when
+                # there are requests which need penalties to be applied.
+                self.prompt_token_ids = self._make_prompt_token_ids_tensor()
+
+        output_token_ids: List[List[int]] = []
+
+        for req_id in self.req_ids[:self.num_reqs]:
+            assert req_id is not None
+            # Currently we create a tensor for output_token_ids from scratch
+            # at each step. However, for the penalties computation what we
+            # need is stats about the token ids present in the output. These
+            # stats can be maintained incrementally instead of computing them
+            # from scratch at each step.
+            # TODO - Replace this with incremental update to output token
+            # statistics.
+            output_token_ids.append(req_id_output_token_ids[req_id])
+
         return SamplingMetadata(
             temperature=self.temperature[:self.num_reqs],
             all_greedy=self.all_greedy,
@@ -260,8 +371,33 @@ def make_sampling_metadata(
             no_top_k=self.no_top_k,
             generators=self.generators,
             max_num_logprobs=self.max_num_logprobs,
+            prompt_token_ids=self.prompt_token_ids,
+            frequency_penalties=self.frequency_penalties[:self.num_reqs],
+            presence_penalties=self.presence_penalties[:self.num_reqs],
+            repetition_penalties=self.repetition_penalties[:self.num_reqs],
+            output_token_ids=output_token_ids,
+            min_tokens=self.min_tokens[:self.num_reqs],
+            stop_token_ids=self.stop_token_ids[:self.num_reqs],
+            no_penalties=self.no_penalties,
         )
 
+    def _make_prompt_token_ids_tensor(self) -> torch.Tensor:
+        """Return a padded (num_reqs, max_prompt_len) prompt-id tensor."""
+        n = self.num_reqs
+        prompt_lens = self.num_prompt_tokens[:n]
+        width = prompt_lens.max()
+        staging = torch.empty((n, width),
+                              device="cpu",
+                              dtype=torch.int64,
+                              pin_memory=self.pin_memory)
+        staging_np = staging.numpy()
+        staging_np[:] = self.token_ids_cpu[:n, :width]
+        # Pad past each prompt's end with vocab_size, which is never a
+        # valid token id.
+        for row, prompt_len in enumerate(prompt_lens):
+            staging_np[row, prompt_len:] = self.vocab_size
+        return staging.to(device=self.device, non_blocking=True)
+
     @property
     def num_reqs(self) -> int:
         return len(self.req_id_to_index)
@@ -282,6 +418,12 @@ def no_top_p(self) -> bool:
     def no_top_k(self) -> bool:
         return len(self.top_k_reqs) == 0
 
+    @property
+    def no_penalties(self) -> bool:
+        return not (self.presence_penalties_reqs
+                    or self.frequency_penalties_reqs
+                    or self.repetition_penalties_reqs)
+
     @property
     def max_num_logprobs(self) -> int:
         return max(self.num_logprobs.values()) if self.num_logprobs else 0
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index c6fab5f05fcb3..509771b7e2e5a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -19,7 +19,7 @@
                         LayerBlockType, cdiv, is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
-from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
+from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@@ -79,8 +79,14 @@ def __init__(
         # Multi-modal data support
         self.input_registry = INPUT_REGISTRY
         self.mm_registry = MULTIMODAL_REGISTRY
-        # NOTE: mm_input_mapper is only used for memory profiling.
-        self.mm_input_mapper = MMInputMapperClient(self.model_config)
+
+        # NOTE: mm_input_mapper_client and mm_hasher are only used for memory
+        # profiling.
+        self.mm_input_mapper_client = MMInputMapperClient(self.model_config)
+        self.mm_hasher = MMHasher()
+        self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \
+            cache_config.enable_prefix_caching
+
         self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens  # noqa: E501
         self.encoder_cache_size = self.scheduler_config.encoder_cache_size
 
@@ -99,6 +105,7 @@ def __init__(
             max_num_blocks_per_req=self.max_num_blocks_per_req,
             device=self.device,
             pin_memory=self.pin_memory,
+            vocab_size=model_config.get_vocab_size(),
         )
 
         self.use_cuda_graph = (self.vllm_config.compilation_config.level
@@ -377,7 +384,12 @@ def _prepare_sampling(
                 or scheduler_output.scheduled_resumed_reqs):
             skip_copy = False
         # Create the sampling metadata.
-        sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy)
+        req_id_output_token_ids: Dict[str, List[int]] = \
+            {req_id: req.output_token_ids \
+                for req_id, req in self.requests.items()}
+
+        sampling_metadata = self.input_batch.make_sampling_metadata(
+            req_id_output_token_ids, skip_copy)
         return sampling_metadata
 
     def _execute_encoder(self, scheduler_output: "SchedulerOutput"):
@@ -628,11 +640,6 @@ def profile_run(self) -> None:
                 mm_registry=self.mm_registry,
             )
             dummy_mm_data = dummy_request_data.multi_modal_data
-            dummy_mm_kwargs, _ = self.mm_input_mapper.process_inputs(
-                mm_data=dummy_mm_data,
-                mm_hashes=None,
-                mm_processor_kwargs=None,
-                precomputed_mm_inputs=None)
 
             # NOTE: Currently model is profiled with a single non-text
             # modality even when it supports multiple.
@@ -648,8 +655,39 @@ def profile_run(self) -> None:
             # (e.g, multiple images) for a single request, therefore here we
             # always replicate first item by max_num_mm_items times since in V1
             # they are scheduled to be processed separately.
+
+            # When a model has a merged processor, its dummy data is already a
+            # batched `MultiModalKwargs`, so we need to "unbatch" it and take
+            # the first item in each batched tensor.
+            # TODO (ywang96): This is somewhat hacky. Refactor this to be
+            # consistent with the other case.
+            if isinstance(dummy_mm_data, MultiModalKwargs):
+                dummy_mm_kwargs = {
+                    k: v[0].unsqueeze(0)
+                    for k, v in dummy_mm_data.items()
+                }
+
+            # Case when models have dummy data explicitly defined as
+            # `MultiModalDataDict`, so they need to be processed through input
+            # mapper.
+            else:
+                # Compute MM hashes (if enabled)
+                mm_hashes = None
+                if self.use_hash:
+                    mm_hashes = self.mm_hasher.hash_dummy_mm_data(
+                        dummy_mm_data)
+
+                mm_kwargs_list = self.mm_input_mapper_client.process_inputs(
+                    mm_data=dummy_mm_data,
+                    mm_hashes=mm_hashes,
+                    mm_processor_kwargs=None,
+                    precomputed_mm_inputs=None)
+
+                # Take the first `MultiModalKwargs`
+                dummy_mm_kwargs = mm_kwargs_list[0]
+
             batched_dummy_mm_inputs = MultiModalKwargs.batch(
-                [dummy_mm_kwargs[0]] * max_num_mm_items)
+                [dummy_mm_kwargs] * max_num_mm_items)
             batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
                 batched_dummy_mm_inputs, device=self.device)
 
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 33491f700de10..0000b09bfaa36 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -202,7 +202,6 @@ def execute_model(
     ) -> ModelRunnerOutput:
         output = self.model_runner.execute_model(scheduler_output)
         return output if self.rank == 0 else None
-        return output
 
     def profile(self, is_start: bool = True):
         if self.profiler is None:
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 420aaf8a1b4cd..f1531e0fc0675 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -114,8 +114,7 @@ class ModelInputData:
         def __init__(self, use_mrope: bool):
             self.use_mrope = use_mrope
             self.input_tokens: List[int] = []
-            self.input_positions: Optional[
-                List[int]] = [] if not self.use_mrope else None
+            self.input_positions: List[int] = []
             self.token_type_ids: Optional[List[int]] = []
             self.seq_lens: List[int] = []
             self.query_lens: List[int] = []
@@ -130,9 +129,8 @@ def __init__(self, use_mrope: bool):
             self.multi_modal_placeholder_maps: Dict[
                 str, MultiModalPlaceholderMap] = defaultdict(
                     MultiModalPlaceholderMap)
-            self.input_mrope_positions: Optional[List[List[int]]] = [
-                [] for _ in range(3)
-            ] if self.use_mrope else None
+            self.input_mrope_positions: List[List[int]] = [[]
+                                                           for _ in range(3)]
 
     def __init__(self,
                  runner: "CPUModelRunner",
@@ -167,7 +165,8 @@ def build(self) -> ModelInputForCPU:
                                     device="cpu")
         input_positions = torch.tensor(
             input_data.input_positions
-            if not input_data.use_mrope else input_data.input_mrope_positions,
+            if not any(input_data.input_mrope_positions) else
+            input_data.input_mrope_positions,
             dtype=torch.long,
             device="cpu")
         token_type_ids = torch.tensor(input_data.token_type_ids,
@@ -236,7 +235,7 @@ def _compute_decode_input_tokens(self, data: ModelInputData,
             block_table = block_table[start_block:]
 
         # For MRotaryEmbedding
-        if data.input_positions is None:
+        if seq_data.mrope_position_delta is not None:
             next_pos = MRotaryEmbedding.get_next_input_positions(
                 seq_data.mrope_position_delta,
                 context_len,
@@ -309,8 +308,7 @@ def _compute_prompt_input_tokens(self, data: ModelInputData,
             data.slot_mapping.extend(slot_mapping)
 
         # The MROPE positions are prepared in _compute_multi_modal_input
-        if data.input_positions is not None:
-            data.input_positions.extend(token_positions)
+        data.input_positions.extend(token_positions)
 
         if data.token_type_ids is not None:
             data.token_type_ids.extend(token_types if token_types else [])
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 09758a5d9accf..b5dfebfce6f75 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -333,9 +333,8 @@ def execute_worker(
     def prepare_worker_input(
             self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
         assert execute_model_req is not None
-        virtual_engine = execute_model_req.virtual_engine
+        virtual_engine: int = execute_model_req.virtual_engine
         num_seq_groups: int = len(execute_model_req.seq_group_metadata_list)
-        blocks_to_copy = execute_model_req.blocks_to_copy
         blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
                                       device="cpu",
                                       dtype=torch.int64).view(-1, 2)
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 6ff98a8f1bab2..2b545d1b28bd2 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -13,6 +13,7 @@
 import torch
 import torch.distributed
 import torch.nn as nn
+from tqdm import tqdm
 
 import vllm.envs as envs
 from vllm.attention import AttentionMetadata, get_attn_backend
@@ -21,7 +22,8 @@
 from vllm.config import CompilationLevel, VllmConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.distributed import get_kv_transfer_group, get_pp_group
-from vllm.distributed.parallel_state import graph_capture
+from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
+                                             graph_capture)
 from vllm.forward_context import set_forward_context
 from vllm.inputs import INPUT_REGISTRY, InputRegistry
 from vllm.logger import init_logger
@@ -1413,8 +1415,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
         logger.info("Capturing cudagraphs for decoding. This may lead to "
                     "unexpected consequences if the model is not static. To "
                     "run the model in eager mode, set 'enforce_eager=True' or "
-                    "use '--enforce-eager' in the CLI.")
-        logger.info("If out-of-memory error occurs during cudagraph capture,"
+                    "use '--enforce-eager' in the CLI. "
+                    "If out-of-memory error occurs during cudagraph capture,"
                     " consider decreasing `gpu_memory_utilization` or "
                     "switching to eager mode. You can also reduce the "
                     "`max_num_seqs` as needed to decrease memory usage.")
@@ -1451,8 +1453,14 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
             # memory usage of CUDA graph.
             for virtual_engine in range(
                     self.parallel_config.pipeline_parallel_size):
-                for batch_size in \
-                    self.vllm_config.compilation_config.capture_sizes:
+                # Only rank 0 should print progress bar during capture
+                capture_sizes = (
+                    tqdm(
+                        self.vllm_config.compilation_config.capture_sizes,
+                        desc="Capturing CUDA graph shapes",
+                    ) if get_tensor_model_parallel_rank() == 0 else
+                    self.vllm_config.compilation_config.capture_sizes)
+                for batch_size in capture_sizes:
                     attn_metadata = (
                         self.attn_state.graph_capture_get_metadata_for_batch(
                             batch_size,
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index 18b03bf1bfb56..65d9bab0e2822 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -406,8 +406,9 @@ def _async_process_outputs(self, model_input: StatefulModelInput,
             if not cont:
                 break
 
-    def _final_process_outputs(self, model_input: StatefulModelInput,
-                               output_proc_callback: Optional[Callable]):
+    def _final_process_outputs(
+            self, model_input: StatefulModelInput,
+            output_proc_callback: Optional[Callable]) -> List[SamplerOutput]:
         assert model_input.frozen_model_input is not None
 
         has_async_callback = output_proc_callback is not None
@@ -594,8 +595,8 @@ def execute_model(
         # should be [SamplerOutput]
         return output
 
-    def _update_sampling_metadata(self, sampling_metadata, num_seqs,
-                                  num_queries):
+    def _update_sampling_metadata(self, sampling_metadata: SamplingMetadata,
+                                  num_seqs: Optional[int], num_queries: int):
 
         assert sampling_metadata.num_prompts == 0
         assert len(sampling_metadata.seq_groups) == num_queries
@@ -820,7 +821,7 @@ def _pythonize_sampler_output(
 
     for sgdx, (seq_group,
                sample_result) in enumerate(zip(seq_groups, samples_list)):
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
         # If the feature combo become valid
         # (Check for Guided Decoding)
         if seq_group.sampling_params.logits_processors:
@@ -850,13 +851,13 @@ def _pythonize_sampler_output(
         seq_ids = seq_group.seq_ids
         next_token_ids = sample_result
         parent_ids = [0]
+        seq_outputs: List[SequenceOutput]
 
         if cache is not None:
             completion_seq_group_output: CompletionSequenceGroupOutput = \
                 cache.cached_completion_seq_group_output.get_object()
             completion_seq_group_output.samples.clear()
-            seq_outputs: List[
-                SequenceOutput] = completion_seq_group_output.samples
+            seq_outputs = completion_seq_group_output.samples
         else:
             seq_outputs = []
 
diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py
index 1beae1e3884c5..f79b3773bcbd2 100644
--- a/vllm/worker/pooling_model_runner.py
+++ b/vllm/worker/pooling_model_runner.py
@@ -91,6 +91,10 @@ def execute_model(
         ]
 
         multi_modal_kwargs = model_input.multi_modal_kwargs or {}
+        seqlen_agnostic_kwargs = {
+            "finished_requests_ids": model_input.finished_requests_ids,
+            "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids,
+        } if self.has_inner_state else {}
         if (self.observability_config is not None
                 and self.observability_config.collect_model_forward_time):
             model_forward_start = torch.cuda.Event(enable_timing=True)
@@ -110,7 +114,8 @@ def execute_model(
                 intermediate_tensors=intermediate_tensors,
                 **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
                                              device=self.device),
-                **cross_enc_kwargs)
+                **cross_enc_kwargs,
+                **seqlen_agnostic_kwargs)
 
         if (self.observability_config is not None
                 and self.observability_config.collect_model_forward_time):
diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py
index 5f71ec0c14df8..8f2d343440d3e 100644
--- a/vllm/worker/utils.py
+++ b/vllm/worker/utils.py
@@ -13,7 +13,7 @@ def assert_enc_dec_mr_supported_scenario(
     a supported scenario.
     '''
 
-    # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+    # Reminder: Please update docs/source/usage/compatibility_matrix.md
     # If the feature combo become valid
 
     if enc_dec_mr.cache_config.enable_prefix_caching:
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 6d00102e0a324..3ac7fb8dfb766 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -452,7 +452,7 @@ def init_worker(self, *args, **kwargs):
         self.worker = worker_class(*args, **kwargs)
         assert self.worker is not None
 
-    def execute_method(self, method, *args, **kwargs):
+    def execute_method(self, method: str, *args, **kwargs):
         try:
             target = self if self.worker is None else self.worker
             executor = getattr(target, method)