Consolidate nightly CI onto nightly-test-nvidia.yml.

Summary of this patch:
  * bot-bump-kernel-version-to-sglang.yml and bot-bump-sglang-version.yml:
    the run-nightly-tests job now calls nightly-test-nvidia.yml instead of
    nightly-test.yml (same `ref` input, same `secrets: inherit`).
  * nightly-test-nvidia.yml: the MiniMax-M2 nightly performance test and its
    trace-publish step, previously commented out, are re-enabled in two
    places: once with GPU_CONFIG "8-gpu-h200" and once with GPU_CONFIG
    "8-gpu-b200" (the latter runs the test with IS_BLACKWELL=1).
  * nightly-test.yml is deleted.

NOTE(review): both callers pass `with: ref:` and `secrets: inherit`; confirm
that nightly-test-nvidia.yml declares a workflow_call trigger with a `ref`
input. That part of the file is not visible in this patch.
NOTE(review): line breaks and YAML indentation inside the hunks were
reconstructed from the hunk line counts and the usual GitHub Actions layout;
run `git apply --check` against the target tree before applying.

diff --git a/.github/workflows/bot-bump-kernel-version-to-sglang.yml b/.github/workflows/bot-bump-kernel-version-to-sglang.yml
index 6a46c2c7edb..1621c924179 100644
--- a/.github/workflows/bot-bump-kernel-version-to-sglang.yml
+++ b/.github/workflows/bot-bump-kernel-version-to-sglang.yml
@@ -62,7 +62,7 @@ jobs:
   run-nightly-tests:
     needs: bump-kernel-version-to-sglang
     if: needs.bump-kernel-version-to-sglang.outputs.needs_sync == 'true'
-    uses: ./.github/workflows/nightly-test.yml
+    uses: ./.github/workflows/nightly-test-nvidia.yml
     with:
       ref: ${{ needs.bump-kernel-version-to-sglang.outputs.branch_name }}
     secrets: inherit
diff --git a/.github/workflows/bot-bump-sglang-version.yml b/.github/workflows/bot-bump-sglang-version.yml
index 4131397f12e..cbfd8e761a5 100644
--- a/.github/workflows/bot-bump-sglang-version.yml
+++ b/.github/workflows/bot-bump-sglang-version.yml
@@ -55,7 +55,7 @@ jobs:
 
   run-nightly-tests:
     needs: bump-sglang-version
-    uses: ./.github/workflows/nightly-test.yml
+    uses: ./.github/workflows/nightly-test-nvidia.yml
     with:
       ref: ${{ needs.bump-sglang-version.outputs.branch_name }}
     secrets: inherit
diff --git a/.github/workflows/nightly-test-nvidia.yml b/.github/workflows/nightly-test-nvidia.yml
index 68f533c8489..0e9e0c937d8 100644
--- a/.github/workflows/nightly-test-nvidia.yml
+++ b/.github/workflows/nightly-test-nvidia.yml
@@ -130,26 +130,24 @@ jobs:
         run: |
           python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_glm_4_6
 
-      # MiniMax-M2 test temporarily disabled due to compatibility issues
-      # See MINIMAX_M2_ISSUES.md for details
-      # - name: Run MiniMax-M2 nightly performance test
-      #   timeout-minutes: 180
-      #   env:
-      #     TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
-      #     PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
-      #     GPU_CONFIG: "8-gpu-h200"
-      #   run: |
-      #     rm -rf test/performance_profiles_minimax_m2/
-      #     cd test
-      #     python3 nightly/test_minimax_m2_perf.py
-
-      # - name: Publish MiniMax-M2 traces to storage repo
-      #   env:
-      #     GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
-      #     GITHUB_RUN_ID: ${{ github.run_id }}
-      #     GITHUB_RUN_NUMBER: ${{ github.run_number }}
-      #   run: |
-      #     python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_minimax_m2
+      - name: Run MiniMax-M2 nightly performance test
+        timeout-minutes: 180
+        env:
+          TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
+          PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
+          GPU_CONFIG: "8-gpu-h200"
+        run: |
+          rm -rf test/performance_profiles_minimax_m2/
+          cd test
+          python3 nightly/test_minimax_m2_perf.py
+
+      - name: Publish MiniMax-M2 traces to storage repo
+        env:
+          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+          GITHUB_RUN_NUMBER: ${{ github.run_number }}
+        run: |
+          python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_minimax_m2
 
   # General tests - 8 GPU H20
   nightly-test-general-8-gpu-h20:
@@ -460,26 +458,24 @@ jobs:
         run: |
           python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_glm_4_6
 
-      # MiniMax-M2 test temporarily disabled due to compatibility issues
-      # See MINIMAX_M2_ISSUES.md for details
-      # - name: Run MiniMax-M2 nightly performance test
-      #   timeout-minutes: 180
-      #   env:
-      #     TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
-      #     PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
-      #     GPU_CONFIG: "8-gpu-b200"
-      #   run: |
-      #     rm -rf test/performance_profiles_minimax_m2/
-      #     cd test
-      #     IS_BLACKWELL=1 python3 nightly/test_minimax_m2_perf.py
-
-      # - name: Publish MiniMax-M2 traces to storage repo
-      #   env:
-      #     GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
-      #     GITHUB_RUN_ID: ${{ github.run_id }}
-      #     GITHUB_RUN_NUMBER: ${{ github.run_number }}
-      #   run: |
-      #     python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_minimax_m2
+      - name: Run MiniMax-M2 nightly performance test
+        timeout-minutes: 180
+        env:
+          TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
+          PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
+          GPU_CONFIG: "8-gpu-b200"
+        run: |
+          rm -rf test/performance_profiles_minimax_m2/
+          cd test
+          IS_BLACKWELL=1 python3 nightly/test_minimax_m2_perf.py
+
+      - name: Publish MiniMax-M2 traces to storage repo
+        env:
+          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+          GITHUB_RUN_NUMBER: ${{ github.run_number }}
+        run: |
+          python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_minimax_m2
 
   # Final check job
   check-all-jobs:
diff --git a/.github/workflows/nightly-test.yml b/.github/workflows/nightly-test.yml
deleted file mode 100644
index 0ae6097d9b6..00000000000
--- a/.github/workflows/nightly-test.yml
+++ /dev/null
@@ -1,256 +0,0 @@
-name: Nightly Test
-
-on:
-  schedule:
-    - cron: '0 0 * * *'
-  push:
-    branches:
-      - main
-    paths:
-      - "python/sglang/version.py"
-  workflow_dispatch:
-  workflow_call:
-    inputs:
-      ref:
-        description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
-        required: false
-        type: string
-        default: ''
-
-concurrency:
-  group: nightly-test-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  nightly-test-eval-text-models:
-    if: github.repository == 'sgl-project/sglang'
-    runs-on: 2-gpu-runner
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.ref || github.ref }}
-
-      - name: Install dependencies
-        run: |
-          bash scripts/ci/ci_install_dependency.sh
-
-      - name: Run eval test for text models
-        timeout-minutes: 120
-        run: |
-          cd test/srt
-          python3 nightly/test_text_models_gsm8k_eval.py
-
-  nightly-test-perf-text-models:
-    if: github.repository == 'sgl-project/sglang'
-    runs-on: 2-gpu-runner
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.ref || github.ref }}
-
-      - name: Install dependencies
-        run: |
-          bash scripts/ci/ci_install_dependency.sh
-
-      - name: Run performance test for text models
-        timeout-minutes: 180
-        env:
-          TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
-          PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
-        run: |
-          cd test/srt
-          rm -rf performance_profiles_text_models/
-          python3 nightly/test_text_models_perf.py
-
-      - name: Publish traces to storage repo
-        env:
-          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
-          GITHUB_RUN_ID: ${{ github.run_id }}
-          GITHUB_RUN_NUMBER: ${{ github.run_number }}
-        run: |
-          python3 scripts/ci/publish_traces.py --traces-dir test/srt/performance_profiles_text_models
-
-  nightly-test-eval-vlms:
-    if: github.repository == 'sgl-project/sglang'
-    runs-on: 2-gpu-runner
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.ref || github.ref }}
-
-      - name: Install dependencies
-        run: |
-          bash scripts/ci/ci_install_dependency.sh
-
-      - name: Run eval test for VLM models (fixed MMMU-100)
-        timeout-minutes: 240
-        run: |
-          cd test/srt
-          python3 nightly/test_vlms_mmmu_eval.py
-
-  nightly-test-perf-vlms:
-    if: github.repository == 'sgl-project/sglang'
-    runs-on: 2-gpu-runner
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.ref || github.ref }}
-
-      - name: Install dependencies
-        run: |
-          bash scripts/ci/ci_install_dependency.sh
-
-      - name: Run perf test for VLM models (MMMU)
-        timeout-minutes: 240
-        env:
-          TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
-          PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
-        run: |
-          cd test/srt
-          rm -rf performance_profiles_vlms/
-          python3 nightly/test_vlms_perf.py
-
-      - name: Publish traces to storage repo
-        env:
-          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
-          GITHUB_RUN_ID: ${{ github.run_id }}
-          GITHUB_RUN_NUMBER: ${{ github.run_number }}
-        run: |
-          python3 scripts/ci/publish_traces.py --traces-dir test/srt/performance_profiles_vlms
-
-  nightly-test-1-gpu:
-    if: github.repository == 'sgl-project/sglang'
-    runs-on: 1-gpu-runner
-
-    env:
-      RUNNER_LABELS: 1-gpu-runner
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.ref || github.ref }}
-
-      - name: Install dependencies
-        run: |
-          bash scripts/ci/ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 60
-        run: |
-          cd test/srt
-          python3 run_suite.py --suite nightly-1-gpu --continue-on-error
-
-  nightly-test-4-gpu:
-    if: github.repository == 'sgl-project/sglang'
-    runs-on: 4-gpu-h100
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.ref || github.ref }}
-
-      - name: Install dependencies
-        run: |
-          bash scripts/ci/ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 30
-        run: |
-          cd test/srt
-          python3 run_suite.py --suite nightly-4-gpu --continue-on-error
-
-  nightly-test-8-gpu-h200:
-    if: github.repository == 'sgl-project/sglang'
-    runs-on: 8-gpu-h200
-    env:
-      RUNNER_LABELS: 8-gpu-h200
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.ref || github.ref }}
-
-      - name: Install dependencies
-        run: |
-          bash scripts/ci/ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 30
-        run: |
-          cd test/srt
-          python3 run_suite.py --suite nightly-8-gpu-h200 --continue-on-error
-
-  nightly-test-8-gpu-h20:
-    if: github.repository == 'sgl-project/sglang'
-    runs-on: 8-gpu-h20
-    env:
-      SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4"
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.ref || github.ref }}
-
-      - name: Install dependencies
-        run: |
-          bash scripts/ci/ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 30
-        run: |
-          cd test/srt
-          python3 run_suite.py --suite nightly-8-gpu-h20 --continue-on-error
-
-  nightly-test-8-gpu-b200:
-    if: github.repository == 'sgl-project/sglang'
-    runs-on: 8-gpu-b200
-    env:
-      RUNNER_LABELS: 8-gpu-b200
-    strategy:
-      fail-fast: false
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.ref || github.ref }}
-
-      - name: Install dependencies
-        run: |
-          IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 45
-        run: |
-          cd test/srt
-          python3 run_suite.py --suite nightly-8-gpu-b200 --auto-partition-id 0 --auto-partition-size 1 --timeout-per-file 3600
-
-  check-all-jobs:
-    if: always() && (github.repository == 'sgl-project/sglang' || github.event_name == 'workflow_dispatch')
-    needs:
-      - nightly-test-eval-text-models
-      - nightly-test-perf-text-models
-      - nightly-test-eval-vlms
-      - nightly-test-perf-vlms
-      - nightly-test-1-gpu
-      - nightly-test-4-gpu
-      - nightly-test-8-gpu-h200
-      - nightly-test-8-gpu-h20
-      - nightly-test-8-gpu-b200
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check if any job failed
-        run: |
-          # Now that continue-on-error is removed, failures will be properly reported
-          if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then
-            echo "One or more nightly test jobs failed"
-            exit 1
-          fi
-          if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then
-            echo "One or more nightly test jobs were cancelled"
-            exit 1
-          fi
-          echo "All nightly test jobs passed"