Support sanity checking weight consistency especially for RL #49528
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # PR test workflow: triggered by pushes to main, pull requests targeting main, and manual dispatch. | |
| name: PR Test | |
| on: | |
| push: | |
| branches: [main] | |
| pull_request: | |
| branches: [main] | |
| workflow_dispatch: | |
| inputs: | |
| # NOTE(review): no job visible in this file reads this input - confirm it is still consumed somewhere. | |
| version: | |
| description: "FlashInfer version" | |
| required: true | |
| type: choice | |
| default: "release" | |
| options: | |
| - "release" | |
| - "nightly" | |
| # One concurrency group per ref; a newer run in the same group cancels the run still in progress. | |
| concurrency: | |
| group: pr-test-${{ github.ref }} | |
| cancel-in-progress: true | |
| jobs: | |
| # =============================================== check changes ==================================================== | |
| # Classifies the changed paths into three components and exposes one 'true'/'false' style output per component. | |
| # Downstream jobs read these outputs to decide whether they need to run. | |
| check-changes: | |
| runs-on: ubuntu-latest | |
| outputs: | |
| main_package: ${{ steps.filter.outputs.main_package }} | |
| sgl_kernel: ${{ steps.filter.outputs.sgl_kernel }} | |
| multimodal_gen: ${{ steps.filter.outputs.multimodal_gen }} | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Detect file changes | |
| id: filter | |
| uses: dorny/paths-filter@v3 | |
| with: | |
| filters: | | |
| main_package: | |
| - "python/sglang/!(multimodal_gen)/**" | |
| - "python/*.toml" | |
| - "scripts/ci/**" | |
| - "test/**" | |
| - ".github/workflows/pr-test.yml" | |
| sgl_kernel: | |
| - "sgl-kernel/**" | |
| multimodal_gen: | |
| - "python/sglang/multimodal_gen/**" | |
| - "python/sglang/cli/**" | |
| - "python/*.toml" | |
| - ".github/workflows/pr-test.yml" | |
| - name: Show filter results in summary (table) | |
| run: | | |
| # Append a markdown table with the three filter results to the job summary. | |
| # The summary path is quoted so a path containing spaces or glob characters is not word-split. | |
| { | |
| echo "## Change Detection" | |
| echo "" | |
| echo "| Component | Changed |" | |
| echo "|----------------|---------|" | |
| echo "| main_package | ${{ steps.filter.outputs.main_package }} |" | |
| echo "| sgl_kernel | ${{ steps.filter.outputs.sgl_kernel }} |" | |
| echo "| multimodal_gen | ${{ steps.filter.outputs.multimodal_gen }} |" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| # =============================================== PR Gate ==================================================== | |
| # Calls the reusable pr-gate workflow when at least one of the three components changed. | |
| # Every test job below lists this job in its `needs`. | |
| call-gate: | |
| needs: check-changes | |
| if: | | |
| needs.check-changes.outputs.main_package == 'true' || | |
| needs.check-changes.outputs.sgl_kernel == 'true' || | |
| needs.check-changes.outputs.multimodal_gen == 'true' | |
| uses: ./.github/workflows/pr-gate.yml | |
| secrets: inherit | |
| # =============================================== sgl-kernel ==================================================== | |
| # Builds the sgl-kernel wheel (Python 3.10 / CUDA 12.9) when sgl-kernel sources changed and uploads it as | |
| # artifact wheel-python<ver>-cuda<ver> for the test jobs to download. | |
| sgl-kernel-build-wheels: | |
| needs: [check-changes, call-gate] | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| runs-on: x64-kernel-build-node | |
| strategy: | |
| matrix: | |
| include: | |
| - python-version: "3.10" | |
| cuda-version: "12.9" | |
| # Add back when CUDA 13.0 is supported on CI | |
| # - python-version: "3.10" | |
| # cuda-version: "13.0" | |
| name: Build Wheel | |
| steps: | |
| - name: Cleanup | |
| run: | | |
| # Only wipe the workspace when it is an existing directory: with an empty or unset | |
| # GITHUB_WORKSPACE the unguarded, unquoted form would expand to `sudo rm -rf /*`. | |
| # Same guarded form as the arm build job. | |
| if [ -d "$GITHUB_WORKSPACE" ]; then | |
| sudo rm -rf "$GITHUB_WORKSPACE"/* || true | |
| else | |
| echo "$GITHUB_WORKSPACE does not exist, nothing to clean" | |
| fi | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: "recursive" | |
| - name: Set up Python ${{ matrix.python-version }} | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: ${{ matrix.python-version }} | |
| - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }} | |
| run: | | |
| cd sgl-kernel | |
| ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}" | |
| env: | |
| USE_CCACHE: 1 | |
| - name: Upload artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }} | |
| path: sgl-kernel/dist/* | |
| # Arm counterpart of the wheel build: same steps on the arm build node, uploaded with an -aarch64 suffix. | |
| sgl-kernel-build-wheels-arm: | |
| needs: [check-changes, call-gate] | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| runs-on: arm-kernel-build-node | |
| strategy: | |
| matrix: | |
| include: | |
| - python-version: "3.10" | |
| cuda-version: "12.9" | |
| name: Build Wheel Arm | |
| steps: | |
| - name: Cleanup | |
| run: | | |
| # Wipe the workspace contents only when the directory exists; a failed removal is ignored. | |
| if [ -d "$GITHUB_WORKSPACE" ]; then | |
| sudo rm -rf "$GITHUB_WORKSPACE"/* || true | |
| else | |
| echo "$GITHUB_WORKSPACE does not exist, nothing to clean" | |
| fi | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: "recursive" | |
| - name: Set up Python ${{ matrix.python-version }} | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: ${{ matrix.python-version }} | |
| - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }} | |
| run: | | |
| cd sgl-kernel | |
| ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}" | |
| env: | |
| USE_CCACHE: 1 | |
| - name: Upload artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}-aarch64 | |
| path: sgl-kernel/dist/* | |
| # Runs the sgl-kernel pytest suite on a 1-GPU runner using the wheel artifact from sgl-kernel-build-wheels. | |
| sgl-kernel-unit-test: | |
| needs: [check-changes, call-gate, sgl-kernel-build-wheels] | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| runs-on: 1-gpu-runner | |
| env: | |
| RUNNER_LABELS: 1-gpu-runner | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Cleanup | |
| run: | | |
| # List, then empty, sgl-kernel/dist before the download step fills it; both commands are best-effort. | |
| ls -alh sgl-kernel/dist || true | |
| rm -rf sgl-kernel/dist/* || true | |
| - name: Download artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| - name: Run test | |
| timeout-minutes: 30 | |
| run: | | |
| cd sgl-kernel | |
| pytest tests/ | |
| # Runs test/srt/test_mla_deepseek_v3.py on a 1-GPU runner using the freshly built sgl-kernel wheel. | |
| sgl-kernel-mla-test: | |
| needs: [check-changes, call-gate, sgl-kernel-build-wheels] | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| runs-on: 1-gpu-runner | |
| env: | |
| RUNNER_LABELS: 1-gpu-runner | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Cleanup | |
| run: | | |
| # List, then empty, sgl-kernel/dist before the download step fills it; both commands are best-effort. | |
| ls -alh sgl-kernel/dist || true | |
| rm -rf sgl-kernel/dist/* || true | |
| - name: Download artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| - name: Run test | |
| timeout-minutes: 30 | |
| run: | | |
| cd test/srt | |
| python3 test_mla_deepseek_v3.py | |
| # Smoke-runs every sgl-kernel/benchmark/bench_*.py script with CI=true on a 1-GPU runner. | |
| sgl-kernel-benchmark-test: | |
| needs: [check-changes, call-gate, sgl-kernel-build-wheels] | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| runs-on: 1-gpu-runner | |
| env: | |
| CI: true | |
| RUNNER_LABELS: 1-gpu-runner | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Cleanup | |
| run: | | |
| # List, then empty, sgl-kernel/dist before the download step fills it; both commands are best-effort. | |
| ls -alh sgl-kernel/dist || true | |
| rm -rf sgl-kernel/dist/* || true | |
| - name: Download artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| - name: Run benchmark tests | |
| timeout-minutes: 45 | |
| run: | | |
| cd sgl-kernel/benchmark | |
| echo "Running sgl-kernel benchmark tests in CI mode..." | |
| echo "CI environment variable: $CI" | |
| echo "GITHUB_ACTIONS environment variable: $GITHUB_ACTIONS" | |
| # Each script gets at most 60 seconds; a timeout or failure only prints a warning, so this step does not fail on it. | |
| for bench_file in bench_*.py; do | |
| echo "Testing $bench_file..." | |
| timeout 60 python3 "$bench_file" || echo "Warning: $bench_file timed out or failed, continuing..." | |
| echo "Completed $bench_file" | |
| echo "---" | |
| done | |
| echo "All benchmark tests completed!" | |
| # sgl-kernel-b200-test: | |
| # needs: [check-changes, sgl-kernel-build-wheels] | |
| # if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| # runs-on: 4-gpu-b200 | |
| # env: | |
| # RUNNER_LABELS: 4-gpu-b200 | |
| # steps: | |
| # - uses: actions/checkout@v4 | |
| # - name: Cleanup | |
| # run: | | |
| # ls -alh sgl-kernel/dist || true | |
| # rm -rf sgl-kernel/dist/* || true | |
| # - name: Download artifacts | |
| # uses: actions/download-artifact@v4 | |
| # with: | |
| # path: sgl-kernel/dist/ | |
| # merge-multiple: true | |
| # pattern: wheel-python3.10-cuda12.9 | |
| # - name: Install dependencies | |
| # run: | | |
| # CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh | |
| # - name: Run sgl-kernel unit tests on B200 | |
| # timeout-minutes: 30 | |
| # run: | | |
| # cd sgl-kernel | |
| # pytest tests/ | |
| # Adding a single CUDA13 smoke test to verify that the kernel builds and runs | |
| # TODO: Add back this test when it can pass on CI | |
| # cuda13-kernel-smoke-test: | |
| # needs: [check-changes, sgl-kernel-build-wheels] | |
| # if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| # runs-on: x64-cu13-kernel-tests | |
| # steps: | |
| # - uses: actions/checkout@v4 | |
| # - name: Cleanup | |
| # run: | | |
| # ls -alh sgl-kernel/dist || true | |
| # rm -rf sgl-kernel/dist/* || true | |
| # - name: Download CUDA 13.0 artifacts | |
| # uses: actions/download-artifact@v4 | |
| # with: | |
| # path: sgl-kernel/dist/ | |
| # merge-multiple: true | |
| # pattern: wheel-python3.10-cuda13.0 | |
| # - name: Install dependencies | |
| # run: | | |
| # CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| # - name: Run kernel unit tests | |
| # timeout-minutes: 30 | |
| # run: | | |
| # cd sgl-kernel | |
| # pytest tests/ | |
| # =============================================== primary ==================================================== | |
| # First test stage: runs the stage-a-test-1 CUDA suite plus the default CPU suite on a 1-GPU runner. | |
| # The always()/!failure()/!cancelled() condition lets the job run when the wheel build was skipped, | |
| # but not after a needed job failed or was cancelled. | |
| stage-a-test-1: | |
| needs: [check-changes, call-gate, sgl-kernel-build-wheels] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 1-gpu-runner | |
| env: | |
| RUNNER_LABELS: 1-gpu-runner | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| # The custom wheel is only downloaded when sgl-kernel changed. | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| - name: Run test | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/ | |
| python3 run_suite.py --hw cuda --suite stage-a-test-1 | |
| # temporarily put backend-independent cpu tests here | |
| python3 run_suite.py --hw cpu --suite default | |
| # Runs the multimodal_gen "1-gpu" suite in two partitions when multimodal_gen paths changed. | |
| multimodal-gen-test-1-gpu: | |
| needs: [check-changes, call-gate, sgl-kernel-build-wheels] | |
| if: (always() && !failure() && !cancelled()) && needs.check-changes.outputs.multimodal_gen == 'true' | |
| runs-on: 1-gpu-runner | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| part: [0, 1] | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Cleanup | |
| run: | | |
| # List, then empty, sgl-kernel/dist; both commands are best-effort. | |
| ls -alh sgl-kernel/dist || true | |
| rm -rf sgl-kernel/dist/* || true | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh diffusion | |
| - name: Run diffusion server tests | |
| timeout-minutes: 60 | |
| run: | | |
| cd python | |
| # --total-partitions matches the number of entries in matrix.part above. | |
| python3 sglang/multimodal_gen/test/run_suite.py \ | |
| --suite 1-gpu \ | |
| --partition-id ${{ matrix.part }} \ | |
| --total-partitions 2 | |
| # Runs the multimodal_gen "2-gpu" suite in two partitions on a 2-GPU runner when multimodal_gen paths changed. | |
| multimodal-gen-test-2-gpu: | |
| needs: [check-changes, call-gate, sgl-kernel-build-wheels] | |
| if: (always() && !failure() && !cancelled()) && needs.check-changes.outputs.multimodal_gen == 'true' | |
| runs-on: 2-gpu-runner | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| part: [0, 1] | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Cleanup | |
| run: | | |
| # List, then empty, sgl-kernel/dist; both commands are best-effort. | |
| ls -alh sgl-kernel/dist || true | |
| rm -rf sgl-kernel/dist/* || true | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh diffusion | |
| - name: Run diffusion server tests | |
| timeout-minutes: 60 | |
| run: | | |
| cd python | |
| # --total-partitions matches the number of entries in matrix.part above. | |
| python3 sglang/multimodal_gen/test/run_suite.py \ | |
| --suite 2-gpu \ | |
| --partition-id ${{ matrix.part }} \ | |
| --total-partitions 2 | |
| # Runs the quantization_test suite on a 1-GPU runner after stage-a-test-1; installs bitsandbytes on top of the common dependencies. | |
| quantization-test: | |
| needs: [check-changes, call-gate, stage-a-test-1] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 1-gpu-runner | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| pip install "bitsandbytes>=0.44.0" | |
| - name: Run test | |
| timeout-minutes: 30 | |
| run: | | |
| cd test/srt | |
| python3 run_suite.py --suite quantization_test | |
| # Runs the per-commit-1-gpu suite split into 15 partitions, at most 5 of them in parallel. | |
| unit-test-backend-1-gpu: | |
| needs: [check-changes, call-gate, stage-a-test-1] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 1-gpu-runner | |
| env: | |
| RUNNER_LABELS: 1-gpu-runner | |
| strategy: | |
| fail-fast: false | |
| max-parallel: 5 | |
| matrix: | |
| part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| - name: Run test | |
| timeout-minutes: 30 | |
| run: | | |
| cd test/srt | |
| # --auto-partition-size must stay equal to the number of entries in matrix.part (15). | |
| python3 run_suite.py --suite per-commit-1-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 15 | |
| # Runs the per-commit-2-gpu suite in two partitions on a 2-GPU runner after the 1-GPU backend tests. | |
| unit-test-backend-2-gpu: | |
| needs: [check-changes, call-gate, unit-test-backend-1-gpu] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 2-gpu-runner | |
| env: | |
| RUNNER_LABELS: 2-gpu-runner | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| part: [0, 1] | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| - name: Run test | |
| timeout-minutes: 30 | |
| run: | | |
| cd test/srt | |
| # --auto-partition-size must stay equal to the number of entries in matrix.part (2). | |
| python3 run_suite.py --suite per-commit-2-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 | |
| # Runs the per-commit-4-gpu suite in two partitions on the 4-gpu-h100 runner after the 2-GPU backend tests. | |
| unit-test-backend-4-gpu: | |
| needs: [check-changes, call-gate, unit-test-backend-2-gpu] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 4-gpu-h100 | |
| env: | |
| RUNNER_LABELS: 4-gpu-h100 | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| part: [0, 1] | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| - name: Run test | |
| timeout-minutes: 20 | |
| run: | | |
| cd test/srt | |
| # --auto-partition-size must stay equal to the number of entries in matrix.part (2). | |
| python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 | |
| # Runs the per-commit-8-gpu-h200 suite in three partitions on the 8-gpu-h200 runner after the 2-GPU backend tests. | |
| unit-test-backend-8-gpu-h200: | |
| needs: [check-changes, call-gate, unit-test-backend-2-gpu] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 8-gpu-h200 | |
| env: | |
| RUNNER_LABELS: 8-gpu-h200 | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| part: [0, 1, 2] | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| - name: Run test | |
| timeout-minutes: 20 | |
| run: | | |
| cd test/srt | |
| # --auto-partition-size must stay equal to the number of entries in matrix.part (3). | |
| python3 run_suite.py --suite per-commit-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 | |
| # Runs the per-commit-8-gpu-h20 suite in two partitions on the 8-gpu-h20 runner after the 2-GPU backend tests. | |
| unit-test-backend-8-gpu-h20: | |
| needs: [check-changes, call-gate, unit-test-backend-2-gpu] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 8-gpu-h20 | |
| env: | |
| # NOTE(review): presumably the RDMA device list read by the tests on this runner - confirm against the consumer. | |
| SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4" | |
| RUNNER_LABELS: 8-gpu-h20 | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| part: [0, 1] | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| - name: Run test | |
| timeout-minutes: 20 | |
| run: | | |
| cd test/srt | |
| # --auto-partition-size must stay equal to the number of entries in matrix.part (2). | |
| python3 run_suite.py --suite per-commit-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 | |
| # 1-GPU performance benchmarks, part 1: single-batch latency, online latency, offline throughput, EAGLE and LoRA cases. | |
| # Each benchmark is its own step with a 10-minute timeout and runs one or two unittest test methods. | |
| performance-test-1-gpu-part-1: | |
| needs: [check-changes, call-gate, stage-a-test-1] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 1-gpu-runner | |
| env: | |
| RUNNER_LABELS: 1-gpu-runner | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| - name: Benchmark single latency | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small | |
| python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default | |
| - name: Benchmark online latency | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default | |
| - name: Benchmark offline throughput | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default | |
| - name: Benchmark offline throughput (Non-streaming, small batch size) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size | |
| - name: Benchmark online latency (EAGLE) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle | |
| - name: Benchmark online latency (LoRA) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates | |
| # 1-GPU performance benchmarks, part 2: offline throughput variants (no radix cache, Triton backend, FP8) and VLM cases. | |
| # Each benchmark is its own step with a 10-minute timeout and runs one unittest test method. | |
| performance-test-1-gpu-part-2: | |
| needs: [check-changes, call-gate, stage-a-test-1] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 1-gpu-runner | |
| env: | |
| RUNNER_LABELS: 1-gpu-runner | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| - name: Benchmark offline throughput (w/o RadixAttention) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache | |
| - name: Benchmark offline throughput (w/ Triton) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend | |
| - name: Benchmark offline throughput (w/ FP8) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8 | |
| - name: Benchmark VLM offline throughput | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput | |
| - name: Benchmark VLM online latency | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency | |
| # 1-GPU performance benchmarks, part 3: score API and embeddings API latency/throughput, including batch-size scaling. | |
| # Each benchmark is its own step with a 10-minute timeout and runs one unittest test method. | |
| performance-test-1-gpu-part-3: | |
| needs: [check-changes, call-gate, stage-a-test-1] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 1-gpu-runner | |
| env: | |
| RUNNER_LABELS: 1-gpu-runner | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| - name: Benchmark Scores online latency and throughput | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_latency_throughput | |
| - name: Benchmark Scores online latency and throughput (batch size scaling) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_batch_scaling | |
| - name: Benchmark Embeddings online latency and throughput | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_embeddings_api_latency_throughput | |
| - name: Benchmark Embeddings online latency and throughput (batch size scaling) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_embeddings_api_batch_scaling | |
| # 2-GPU performance benchmarks (TP=2 and PP=2 cases), run after the 2-GPU backend unit tests. | |
| # Each benchmark is its own step with a 10-minute timeout and runs one unittest test method. | |
| performance-test-2-gpu: | |
| needs: [check-changes, call-gate, unit-test-backend-2-gpu] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 2-gpu-runner | |
| env: | |
| RUNNER_LABELS: 2-gpu-runner | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| - name: Benchmark single latency (TP=2) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 | |
| - name: Benchmark single latency + torch.compile (TP=2) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1 | |
| - name: Benchmark offline throughput (TP=2) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default | |
| - name: Benchmark offline throughput (w/o RadixAttention) (TP=2) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache | |
| - name: Benchmark offline PP decode throughput (PP=2) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode | |
| - name: Benchmark offline PP prefill throughput (PP=2) | |
| timeout-minutes: 10 | |
| run: | | |
| cd test/srt | |
| python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill | |
| # Runs test_eval_accuracy_large.py on a 1-GPU runner after stage-a-test-1. | |
| accuracy-test-1-gpu: | |
| needs: [check-changes, call-gate, stage-a-test-1] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 1-gpu-runner | |
| env: | |
| RUNNER_LABELS: 1-gpu-runner | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| # Install the human-eval package from a fresh clone in editable mode. | |
| git clone https://github.com/merrymercy/human-eval.git | |
| cd human-eval | |
| pip install -e . | |
| - name: Evaluate accuracy | |
| timeout-minutes: 20 | |
| run: | | |
| cd test/srt | |
| python3 test_eval_accuracy_large.py | |
| # Runs test_moe_eval_accuracy_large.py on a 2-GPU runner after the 1-GPU accuracy test. | |
| accuracy-test-2-gpu: | |
| needs: [check-changes, call-gate, accuracy-test-1-gpu] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 2-gpu-runner | |
| env: | |
| RUNNER_LABELS: 2-gpu-runner | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh | |
| # Install the human-eval package from a fresh clone in editable mode. | |
| git clone https://github.com/merrymercy/human-eval.git | |
| cd human-eval | |
| pip install -e . | |
| - name: Evaluate accuracy (TP=2) | |
| timeout-minutes: 20 | |
| run: | | |
| cd test/srt | |
| python3 test_moe_eval_accuracy_large.py | |
| # Runs the per-commit-4-gpu-deepep suite on the 4-gpu-h100 runner; dependencies come from ci_install_deepep.sh instead of the common installer. | |
| unit-test-deepep-4-gpu: | |
| needs: [check-changes, call-gate, unit-test-backend-2-gpu] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 4-gpu-h100 | |
| env: | |
| RUNNER_LABELS: 4-gpu-h100 | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_deepep.sh | |
| - name: Run test | |
| timeout-minutes: 20 | |
| run: | | |
| cd test/srt | |
| python3 run_suite.py --suite per-commit-4-gpu-deepep | |
| # Runs the per-commit-8-gpu-h200-deepep suite on the 8-gpu-h200 runner; dependencies come from ci_install_deepep.sh. | |
| unit-test-deepep-8-gpu: | |
| needs: [check-changes, call-gate, unit-test-backend-2-gpu] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 8-gpu-h200 | |
| env: | |
| RUNNER_LABELS: 8-gpu-h200 | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_deepep.sh | |
| - name: Run test | |
| timeout-minutes: 20 | |
| run: | | |
| cd test/srt | |
| python3 run_suite.py --suite per-commit-8-gpu-h200-deepep | |
| # Runs the per-commit-4-gpu-b200 suite in two partitions on the 4-gpu-b200 runner, installing with IS_BLACKWELL=1. | |
| unit-test-backend-4-gpu-b200: | |
| needs: [check-changes, call-gate, unit-test-backend-2-gpu] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 4-gpu-b200 | |
| env: | |
| RUNNER_LABELS: 4-gpu-b200 | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| part: [0, 1] | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| # Pinned to v4 like every other download step in this workflow (the wheel is uploaded with upload-artifact@v4). | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh | |
| - name: Run test | |
| timeout-minutes: 30 | |
| run: | | |
| cd test/srt | |
| # --auto-partition-size must stay equal to the number of entries in matrix.part (2). | |
| python3 run_suite.py --suite per-commit-4-gpu-b200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 | |
| # Runs the per-commit-4-gpu-gb200 suite as a single partition on the 4-gpu-gb200 runner. | |
| # Uses the aarch64 wheel from sgl-kernel-build-wheels-arm and installs with IS_BLACKWELL=1 GRACE_BLACKWELL=1. | |
| # No `strategy` block: the job has no matrix, so a fail-fast setting would have nothing to act on. | |
| unit-test-backend-4-gpu-gb200: | |
| needs: [check-changes, call-gate, unit-test-backend-2-gpu, sgl-kernel-build-wheels-arm] | |
| if: always() && !failure() && !cancelled() && | |
| ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) | |
| runs-on: 4-gpu-gb200 | |
| env: | |
| RUNNER_LABELS: 4-gpu-gb200 | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download artifacts | |
| if: needs.check-changes.outputs.sgl_kernel == 'true' | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: sgl-kernel/dist/ | |
| merge-multiple: true | |
| pattern: wheel-python3.10-cuda12.9-aarch64 | |
| - name: Install dependencies | |
| run: | | |
| CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 GRACE_BLACKWELL=1 bash scripts/ci/ci_install_deepep.sh | |
| - name: Run test | |
| timeout-minutes: 45 | |
| run: | | |
| cd test/srt | |
| python3 run_suite.py --suite per-commit-4-gpu-gb200 --auto-partition-id 0 --auto-partition-size 1 --timeout-per-file 3600 | |
| # Final gate: always runs, and fails when any job listed in `needs` ended in failure or cancellation. | |
| # Skipped jobs count as passing. | |
| # sgl-kernel-build-wheels-arm is listed explicitly: when it fails, its only dependent (the gb200 job) is | |
| # merely skipped, so without this entry an arm build failure would not fail the gate. | |
| pr-test-finish: | |
| needs: | |
| [ | |
| call-gate, | |
| check-changes, | |
| sgl-kernel-build-wheels, | |
| sgl-kernel-build-wheels-arm, | |
| sgl-kernel-unit-test, | |
| sgl-kernel-mla-test, | |
| sgl-kernel-benchmark-test, | |
| multimodal-gen-test-1-gpu, | |
| multimodal-gen-test-2-gpu, | |
| stage-a-test-1, | |
| quantization-test, | |
| unit-test-backend-1-gpu, | |
| unit-test-backend-2-gpu, | |
| unit-test-backend-4-gpu, | |
| unit-test-backend-8-gpu-h20, | |
| unit-test-backend-8-gpu-h200, | |
| performance-test-1-gpu-part-1, | |
| performance-test-1-gpu-part-2, | |
| performance-test-1-gpu-part-3, | |
| performance-test-2-gpu, | |
| accuracy-test-1-gpu, | |
| accuracy-test-2-gpu, | |
| unit-test-deepep-4-gpu, | |
| unit-test-deepep-8-gpu, | |
| unit-test-backend-4-gpu-b200, | |
| unit-test-backend-4-gpu-gb200, | |
| ] | |
| if: always() | |
| runs-on: ubuntu-latest | |
| steps: | |
| - name: Check all dependent job statuses | |
| env: | |
| # Hand the 'needs' context to the script through the environment rather than pasting it into the | |
| # script text, so a quote character inside any job output cannot break or alter the shell code. | |
| NEEDS_JSON: ${{ toJson(needs) }} | |
| run: | | |
| # Get a list of all job names from the JSON keys | |
| job_names=$(jq -r 'keys_unsorted[]' <<<"$NEEDS_JSON") | |
| for job in $job_names; do | |
| # For each job, extract its result | |
| result=$(jq -r --arg j "$job" '.[$j].result' <<<"$NEEDS_JSON") | |
| # Print the job name and its result | |
| echo "$job: $result" | |
| # Check for failure or cancellation and exit if found | |
| if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then | |
| echo "The above jobs failed." | |
| exit 1 | |
| fi | |
| done | |
| # If the loop completes, no job failed or was cancelled | |
| echo "All jobs completed successfully" | |
| exit 0 | |