Merge remote-tracking branch 'origin/main' into main_w8a8_fp8
HandH1998 committed Jan 26, 2025
2 parents 8c3dc13 + 27acf63 commit a1b582e
Showing 29 changed files with 543 additions and 185 deletions.
43 changes: 39 additions & 4 deletions .github/workflows/pr-test-sgl-kernel.yml
@@ -30,20 +30,55 @@ jobs:
          clangFormatVersion: 16
          style: file

  build-wheels:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.10']
        cuda-version: ['12.4']

    steps:
      - uses: actions/checkout@v4
        with:
          submodules: 'recursive'

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Build wheels for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
        run: |
          cd sgl-kernel
          chmod +x ./build.sh
          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
          path: sgl-kernel/dist/*

  unit-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    needs: build-wheels
    runs-on: 1-gpu-runner
    steps:
      - uses: actions/checkout@v4

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-*

      - name: Install
        run: |
          pip3 install torch==2.5.1 && pip3 install pytest && pip3 install vllm==0.6.4.post1
          pip3 uninstall sgl-kernel -y || true
          find . -name index.lock -delete
          cd sgl-kernel
          git submodule deinit --all --force && git submodule sync --recursive && git submodule update --init --force --recursive
          pip3 install .
          pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
          pip3 list | grep sgl-kernel

      - name: Run test
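
With this change, the unit-test job installs the wheel produced by build-wheels instead of compiling from source on the GPU runner. A quick smoke check along these lines (a sketch only; the actual tests live under sgl-kernel/tests) would confirm the extension imports cleanly before the test suite runs:

```python
import importlib

# Smoke check: the compiled extension should import cleanly after `pip3 install dist/*whl`.
mod = importlib.import_module("sgl_kernel")
print(mod.__file__)
```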
2 changes: 2 additions & 0 deletions .github/workflows/pr-test.yml
@@ -43,6 +43,8 @@ jobs:
      - name: Run test
        timeout-minutes: 10
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          cd test/lang
          python3 run_suite.py --suite per-commit
92 changes: 92 additions & 0 deletions .github/workflows/release-whl-kernel.yml
@@ -0,0 +1,92 @@
name: Release SGLang Kernel Wheel (cu118)

on:
  workflow_dispatch:
    inputs:
      tag_name:
        type: string
  push:
    branches:
      - main
    paths:
      - sgl-kernel/version.py

jobs:
  build-wheels:
    if: github.repository == 'sgl-project/sglang'
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.9', '3.10', '3.11', '3.12']
        cuda-version: ['11.8']

    steps:
      - uses: actions/checkout@v4
        with:
          submodules: 'recursive'

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Build wheels for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
        run: |
          cd sgl-kernel
          chmod +x ./build.sh
          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
          path: sgl-kernel/dist/*

  release:
    needs: build-wheels
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-*

      - name: Set tag name
        id: set_tag_name
        run: |
          if [ -z "${{ inputs.tag_name }}" ]; then
            TAG_NAME="v$(cat sgl-kernel/version.py | cut -d'"' -f2)"
            echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
          else
            echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT
          fi

      - name: Release
        uses: softprops/action-gh-release@v2
        with:
          tag_name: ${{ steps.set_tag_name.outputs.tag_name }}
          repository: sgl-project/whl
          token: ${{ secrets.WHL_TOKEN }}
          files: |
            sgl-kernel/dist/*

      - name: Clone wheel index
        run: git clone https://oauth2:${WHL_TOKEN}@github.com/sgl-project/whl.git sgl-whl
        env:
          WHL_TOKEN: ${{ secrets.WHL_TOKEN }}

      - name: Update wheel index
        run: python3 scripts/update_kernel_whl_index.py

      - name: Push wheel index
        run: |
          cd sgl-whl
          git config --local user.name "github-actions[bot]"
          git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
          git add -A
          git commit -m "update whl index"
          git push
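
When the workflow is triggered without a tag_name input, the release tag is derived from sgl-kernel/version.py. A minimal Python sketch of the same extraction that the shell step does with cut -d'"' -f2, assuming version.py contains a single double-quoted version string such as "0.0.2.post17":

```python
from pathlib import Path

def derive_tag(version_file: str = "sgl-kernel/version.py") -> str:
    # Take the first double-quoted string in version.py and prefix it with "v",
    # e.g. "0.0.2.post17" -> "v0.0.2.post17", mirroring the shell step above.
    return "v" + Path(version_file).read_text().split('"')[1]
```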
3 changes: 3 additions & 0 deletions .gitmodules
@@ -7,3 +7,6 @@
[submodule "sgl-kernel/3rdparty/flashinfer"]
	path = sgl-kernel/3rdparty/flashinfer
	url = https://github.com/flashinfer-ai/flashinfer.git
[submodule "sgl-kernel/3rdparty/turbomind"]
	path = sgl-kernel/3rdparty/turbomind
	url = https://github.com/InternLM/turbomind

@@ -1,6 +1,5 @@
import argparse
import itertools
import time

import torch
import triton
1 change: 1 addition & 0 deletions docs/references/supported_models.md
@@ -28,6 +28,7 @@
- XVERSE / XVERSE MoE
- SmolLM
- GLM-4
- Phi-3 / Phi-4
- Phi-3-Small
- IBM Granite 3

python/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py
@@ -3,11 +3,16 @@
import torch

from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs
from sglang.srt.utils import is_cuda_available
from sglang.srt.utils import get_compiler_backend

is_cuda = is_cuda_available()
if is_cuda:
    from sgl_kernel import sampling_scaling_penalties


@torch.compile(dynamic=True, backend=get_compiler_backend())
def apply_scaling_penalties(logits, scaling_penalties):
    logits[:] = torch.where(
        logits > 0,
        logits / scaling_penalties,
        logits * scaling_penalties,
    )


class BatchedRepetitionPenalizer(_BatchedPenalizer):
@@ -61,16 +66,7 @@ def _cumulate_output_tokens(self, output_ids: _TokenIDs):
        self.cumulated_repetition_penalties[mask] = self.repetition_penalties[mask]

    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
        if is_cuda:
            return sampling_scaling_penalties(
                logits, self.cumulated_repetition_penalties
            )
        else:
            return torch.where(
                logits > 0,
                logits / self.cumulated_repetition_penalties,
                logits * self.cumulated_repetition_penalties,
            )
        apply_scaling_penalties(logits, self.cumulated_repetition_penalties)

    def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
        self.repetition_penalties = self.repetition_penalties[indices_tensor_to_keep]
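
For reference, the new compiled helper applies the standard repetition-penalty rule in place: positive logits are divided by the penalty and non-positive logits are multiplied by it, so repeated tokens always become less likely. A minimal self-contained sketch of the same math on a toy tensor (values chosen purely for illustration):

```python
import torch

def apply_scaling_penalties_ref(logits: torch.Tensor, penalties: torch.Tensor) -> None:
    # Same rule as the compiled helper: shrink positive logits, push negative ones lower.
    logits[:] = torch.where(logits > 0, logits / penalties, logits * penalties)

logits = torch.tensor([[2.0, -1.0, 0.5]])
penalties = torch.tensor([[1.2, 1.2, 1.2]])  # penalty > 1 discourages repeated tokens
apply_scaling_penalties_ref(logits, penalties)
print(logits)  # tensor([[ 1.6667, -1.2000,  0.4167]])
```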
18 changes: 4 additions & 14 deletions python/sglang/srt/sampling/sampling_batch_info.py
@@ -7,14 +7,11 @@

import torch

from sglang.srt.utils import is_cuda_available

is_cuda = is_cuda_available()
if is_cuda:
    from sgl_kernel import sampling_scaling_penalties

import sglang.srt.sampling.penaltylib as penaltylib
from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
from sglang.srt.sampling.penaltylib.penalizers.repetition_penalty import (
    apply_scaling_penalties,
)

logger = logging.getLogger(__name__)

@@ -386,14 +383,7 @@ def apply_logits_bias(self, logits: torch.Tensor):

        # repetition
        if self.scaling_penalties is not None:
            if is_cuda:
                logits[:] = sampling_scaling_penalties(logits, self.scaling_penalties)
            else:
                logits[:] = torch.where(
                    logits > 0,
                    logits / self.scaling_penalties,
                    logits * self.scaling_penalties,
                )
            apply_scaling_penalties(logits, self.scaling_penalties)

        # Apply regex vocab_mask
        if self.vocab_mask is not None:
16 changes: 16 additions & 0 deletions scripts/update_kernel_whl_index.py
@@ -0,0 +1,16 @@
# Reference: https://github.com/flashinfer-ai/flashinfer/blob/v0.2.0/scripts/update_whl_index.py

import hashlib
import pathlib
import re

for path in sorted(pathlib.Path("sgl-kernel/dist").glob("*.whl")):
    with open(path, "rb") as f:
        sha256 = hashlib.sha256(f.read()).hexdigest()
    ver = re.findall(r"sgl_kernel-([0-9.]+(?:\.post[0-9]+)?)-", path.name)[0]
    index_dir = pathlib.Path(f"sgl-whl/cu118/sgl-kernel")
    index_dir.mkdir(exist_ok=True)
    base_url = "https://github.com/sgl-project/whl/releases/download"
    full_url = f"{base_url}/v{ver}/{path.name}#sha256={sha256}"
    with (index_dir / "index.html").open("a") as f:
        f.write(f'<a href="{full_url}">{path.name}</a><br>\n')
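
Each run appends one simple-index-style anchor per wheel to sgl-whl/cu118/sgl-kernel/index.html, presumably the same index that the cu118 install command in the README points at. As an illustration, the entry produced for a hypothetical wheel would look like this (the wheel name and hash below are made up):

```python
# Illustration only: the wheel name and sha256 are hypothetical.
name = "sgl_kernel-0.0.2.post17-cp310-cp310-linux_x86_64.whl"
ver = "0.0.2.post17"
sha256 = "0123abcd..."  # hexdigest of the wheel file
base_url = "https://github.com/sgl-project/whl/releases/download"
print(f'<a href="{base_url}/v{ver}/{name}#sha256={sha256}">{name}</a><br>')
```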
2 changes: 1 addition & 1 deletion sgl-kernel/3rdparty/flashinfer
Submodule flashinfer updated 44 files
+1 −0 .gitignore
+3 −3 CHANGELOG.md
+1 −1 CMakeLists.txt
+1 −1 README.md
+0 −14 aot_build_utils/generate.py
+12 −0 aot_build_utils/generate_batch_paged_decode_inst.py
+12 −0 aot_build_utils/generate_dispatch_inc.py
+13 −0 aot_build_utils/generate_single_decode_inst.py
+1 −1 cmake/config.cmake
+1 −1 cmake/modules/FindThrust.cmake
+0 −66 csrc/aot_default_additional_params.h
+3 −0 csrc/aot_extension_utils.h
+17 −15 csrc/batch_prefill_sm90_config.inc
+3 −3 csrc/flashinfer_ops.cu
+7 −7 csrc/flashinfer_ops_sm90.cu
+1 −1 csrc/pytorch_extension_utils.h
+16 −14 csrc/single_prefill_sm90_config.inc
+1 −1 docs/tutorials/kv_layout.rst
+1 −1 docs/tutorials/recursive_attention.rst
+1 −1 flashinfer/activation.py
+1 −1 flashinfer/cascade.py
+54 −50 flashinfer/decode.py
+2 −2 flashinfer/jit/__init__.py
+1 −1 flashinfer/jit/attention.py
+12 −6 flashinfer/jit/core.py
+41 −27 flashinfer/prefill.py
+10 −10 flashinfer/rope.py
+3 −3 flashinfer/sampling.py
+6 −5 flashinfer/sparse.py
+1 −1 flashinfer/triton/__init__.py
+1 −1 flashinfer/triton/utils.py
+4 −4 flashinfer/utils.py
+314 −0 format.sh
+1 −1 include/flashinfer/attention/cascade.cuh
+52 −0 pyproject.toml
+33 −5 setup.py
+1 −3 tests/alibi_reference.py
+0 −1 tests/test_alibi.py
+0 −1 tests/test_batch_decode_kernels.py
+0 −2 tests/test_decode_prefill_lse.py
+0 −1 tests/test_logits_cap.py
+1 −3 tests/test_mla_decode_kernel.py
+1 −1 tests/test_sampling.py
+2 −2 tests/test_shared_prefix_kernels.py
1 change: 1 addition & 0 deletions sgl-kernel/3rdparty/turbomind
Submodule turbomind added at 0c9d0c
7 changes: 5 additions & 2 deletions sgl-kernel/Makefile
@@ -1,4 +1,4 @@
.PHONY: tree ln submodule install build clean test format
.PHONY: tree ln submodule install build clean rebuild test format

tree:
	@tree --prune -I "__pycache__|*.egg-info|*.so|build|3rdparty|dist"

@@ -13,11 +13,14 @@ install: submodule
	@pip install -e .

build: submodule
	@export MAX_JOBS=$(nproc) && python3 setup.py bdist_wheel
	@rm -rf dist/* || true && export MAX_JOBS=$(nproc) && python3 setup.py bdist_wheel && pip3 install dist/*whl --force-reinstall --no-deps

clean:
	@rm -rf build dist *.egg-info

rebuild: clean submodule build
	@echo "Succeed to rebuild"

test:
	@find tests -name "test_*.py" | xargs -n 1 python3
16 changes: 15 additions & 1 deletion sgl-kernel/README.md
@@ -1,5 +1,19 @@
# SGL Kernel

Kernel Library for SGLang
[Kernel Library](https://github.com/sgl-project/sglang/tree/main/sgl-kernel) for SGLang

[![PyPI](https://img.shields.io/pypi/v/sgl-kernel)](https://pypi.org/project/sgl-kernel)

## Installation

For CUDA 11.8:

```bash
pip3 install sgl-kernel -i https://docs.sglang.ai/whl/cu118
```

For CUDA 12.1 or CUDA 12.4:

```bash
pip3 install sgl-kernel
```
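
To confirm which build actually landed in the environment, one option is a quick metadata check (a sketch using only the standard library; the distribution name matches pyproject.toml):

```python
from importlib.metadata import version

print(version("sgl-kernel"))  # e.g. "0.0.2.post17"
```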
8 changes: 7 additions & 1 deletion sgl-kernel/build.sh
@@ -4,6 +4,12 @@ PYTHON_VERSION=$1
CUDA_VERSION=$2
PYTHON_ROOT_PATH=/opt/python/cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}

if (( ${CUDA_VERSION%.*} < 12 )); then
    ENABLE_SM90A=0
else
    ENABLE_SM90A=1
fi

docker run --rm \
    -v "$(pwd)":/sgl-kernel \
    pytorch/manylinux-builder:cuda${CUDA_VERSION} \
@@ -13,7 +19,7 @@ docker run --rm \
    export CUDA_VERSION=${CUDA_VERSION} && \
    export SGL_KERNEL_ENABLE_BF16=1 && \
    export SGL_KERNEL_ENABLE_FP8=1 && \
    export SGL_KERNEL_ENABLE_SM90A=1 && \
    export SGL_KERNEL_ENABLE_SM90A=${ENABLE_SM90A} && \
    mkdir -p /usr/lib/x86_64-linux-gnu/ && \
    ln -s /usr/local/cuda-${CUDA_VERSION}/targets/x86_64-linux/lib/stubs/libcuda.so /usr/lib/x86_64-linux-gnu/libcuda.so && \
    cd /sgl-kernel && \
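
The new guard turns off SM90a (Hopper) kernels when the CUDA toolkit major version is below 12, since sm_90a targets require CUDA 12 or newer. A minimal Python sketch of the same check that build.sh performs with bash arithmetic on ${CUDA_VERSION%.*}:

```python
def enable_sm90a(cuda_version: str) -> int:
    # "11.8" -> 0, "12.4" -> 1, mirroring the major-version test in build.sh
    major = int(cuda_version.split(".")[0])
    return 0 if major < 12 else 1

assert enable_sm90a("11.8") == 0
assert enable_sm90a("12.4") == 1
```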
12 changes: 6 additions & 6 deletions sgl-kernel/developer_guide.md
@@ -19,25 +19,25 @@ Third-party libraries:
- [CCCL](https://github.com/NVIDIA/cccl)
- [CUTLASS](https://github.com/NVIDIA/cutlass)
- [FlashInfer](https://github.com/flashinfer-ai/flashinfer)
- [TurboMind](https://github.com/InternLM/turbomind)

### Kernel Development

Steps to add a new kernel:

1. Implement in [src/sgl-kernel/csrc/](https://github.com/sgl-project/sglang/tree/main/sgl-kernel/src/sgl-kernel/csrc)
2. Expose interface in [csrc/sgl_kernel_ops.cu](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu) with pybind11
3. Create Python wrapper in [src/sgl-kernel/ops/\_\_init\_\_.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/ops/__init__.py)
4. Expose Python interface in [src/sgl-kernel/\_\_init\_\_.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/__init__.py)
5. Update [setup.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/setup.py) to include new CUDA source
2. Expose interface in [src/sgl-kernel/include/sgl_kernel_ops.h](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/include/sgl_kernel_ops.h)
3. Create torch extension in [src/sgl-kernel/torch_extension.cc](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/torch_extension.cc)
4. Create Python wrapper in [src/sgl-kernel/ops/\_\_init\_\_.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/ops/__init__.py)
5. Expose Python interface in [src/sgl-kernel/\_\_init\_\_.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/__init__.py)
6. Update [setup.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/setup.py) to include new CUDA source

### Build & Install

Development build:

```bash
make build
pip3 install dist/*whl --force-reinstall --no-deps
# Or use: make install (runs pip install -e .)
```

### Testing & Benchmarking
4 changes: 2 additions & 2 deletions sgl-kernel/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "sgl-kernel"
version = "0.0.2.post16"
version = "0.0.2.post17"
description = "Kernel Library for SGLang"
readme = "README.md"
requires-python = ">=3.9"
@@ -17,7 +17,7 @@ classifiers = [
dependencies = []

[project.urls]
"Homepage" = "https://github.com/sgl-project/sglang"
"Homepage" = "https://github.com/sgl-project/sglang/tree/main/sgl-kernel"
"Bug Tracker" = "https://github.com/sgl-project/sglang/issues"

[tool.setuptools]