Skip to content

Commit 80dfb82

Browse files
authored
feat: slurm jobs added fp4 and 8k1k (#4747)
1 parent 3fea2e1 commit 80dfb82

File tree

11 files changed

+859
-203
lines changed

11 files changed

+859
-203
lines changed

examples/backends/sglang/launch/disagg.sh

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -49,31 +49,36 @@ OTEL_SERVICE_NAME=dynamo-frontend \
4949
python3 -m dynamo.frontend &
5050
DYNAMO_PID=$!
5151

52+
# NOTE: with dp size > 1, SGLang fails with "AssertionError: Prefill round robin balance is required when dp size > 1" unless the prefill instance is launched with `--load-balance-method round_robin` and the decode server is launched with `--prefill-round-robin-balance`.
53+
5254
# run prefill worker
5355
OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL:-8081} \
5456
python3 -m dynamo.sglang \
55-
--model-path Qwen/Qwen3-0.6B \
56-
--served-model-name Qwen/Qwen3-0.6B \
57+
--model-path silence09/DeepSeek-R1-Small-2layers \
58+
--served-model-name silence09/DeepSeek-R1-Small-2layers \
5759
--page-size 16 \
58-
--tp 1 \
60+
--tp 2 --dp-size 2 --enable-dp-attention \
61+
--load-balance-method round_robin \
5962
--trust-remote-code \
6063
--disaggregation-mode prefill \
6164
--disaggregation-bootstrap-port 12345 \
6265
--host 0.0.0.0 \
66+
--port 40000 \
6367
--disaggregation-transfer-backend nixl \
64-
--enable-metrics &
68+
--enable-metrics --log-level debug &
6569
PREFILL_PID=$!
6670

6771
# run decode worker
6872
OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE:-8082} \
69-
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
70-
--model-path Qwen/Qwen3-0.6B \
71-
--served-model-name Qwen/Qwen3-0.6B \
73+
CUDA_VISIBLE_DEVICES=2,3 python3 -m dynamo.sglang \
74+
--model-path silence09/DeepSeek-R1-Small-2layers \
75+
--served-model-name silence09/DeepSeek-R1-Small-2layers \
7276
--page-size 16 \
73-
--tp 1 \
77+
--prefill-round-robin-balance \
78+
--tp 2 --dp-size 2 --enable-dp-attention \
7479
--trust-remote-code \
7580
--disaggregation-mode decode \
7681
--disaggregation-bootstrap-port 12345 \
7782
--host 0.0.0.0 \
7883
--disaggregation-transfer-backend nixl \
79-
--enable-metrics
84+
--enable-metrics --log-level debug
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
#!/bin/bash
2+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
# Function to print usage
6+
# Print the command-line synopsis with examples to stdout, then terminate
# the script with exit status 1. Takes no arguments; never returns.
print_usage() {
  printf '%s\n' \
    "Usage: $0 <mode>" \
    " mode: prefill or decode" \
    "" \
    "Examples:" \
    " $0 prefill" \
    " $0 decode"
  exit 1
}
15+
16+
# Exactly one positional argument (the mode) is accepted; anything else
# reports the count and falls through to print_usage, which exits.
if (( $# != 1 )); then
  echo "Error: Expected 1 argument, got $#"
  print_usage
fi

# The sole argument selects which worker to launch.
mode=$1

# Only the two supported modes pass; any other value is rejected.
case "$mode" in
  prefill | decode) ;;
  *)
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'"
    print_usage
    ;;
esac

echo "Mode: $mode"
echo "Command: dynamo"
33+
34+
# Verify that every required environment variable is set and non-empty.
# Names are checked in the order listed; the first missing one is reported
# and the script exits with status 1.
for required_var in \
  HOST_IP_MACHINE \
  PORT \
  TOTAL_GPUS \
  RANK \
  TOTAL_NODES \
  USE_INIT_LOCATIONS \
  RUN_IN_CI; do
  # ${!required_var} is indirect expansion: the value of the variable
  # whose name is stored in required_var.
  if [[ -z "${!required_var}" ]]; then
    echo "Error: ${required_var} environment variable is not set"
    exit 1
  fi
done
69+
70+
# Construct command based on mode
#
# Both modes run `python3 -m dynamo.sglang` with the same per-process
# environment overrides and the same optional trailing flags; only the
# worker flag list differs. The shared pieces are defined once below and the
# per-mode flag lists are kept in full so each mode can be read on its own.

# Install the Dynamo wheels staged under /configs, but only when RUN_IN_CI
# is "true" (compared case-insensitively).
install_ci_wheels() {
  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi
}

# Populate the global array `extra_args` with optional trailing flags.
# Globals: DUMP_CONFIG_PATH (read), extra_args (written)
# An array is used instead of a flat string so that a DUMP_CONFIG_PATH
# containing whitespace or glob characters is passed as a single argument.
build_extra_args() {
  extra_args=()
  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    extra_args+=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi
}

# Environment overrides handed to the worker process only (via `env`); they
# are not exported into this shell.
worker_env=(
  PYTHONUNBUFFERED=1
  DYN_SKIP_SGLANG_LOG_FORMATTING=1
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000
  SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000
  MC_FORCE_MNNVL=1
  NCCL_MNNVL_ENABLE=1
  NCCL_CUMEM_ENABLE=1
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True
  SGLANG_ENABLE_JIT_DEEPGEMM=false
  SGLANG_ENABLE_FLASHINFER_GEMM=true
)

if [ "$mode" = "prefill" ]; then
  # Trace every command from here on so the job log shows the exact launch.
  set -x
  install_ci_wheels
  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

  build_extra_args

  env "${worker_env[@]}" \
    python3 -m dynamo.sglang \
    --disaggregation-mode prefill \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --trust-remote-code \
    --disable-radix-cache \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization modelopt_fp4 \
    --moe-runner-backend flashinfer_trtllm \
    --stream-interval 10 \
    --watchdog-timeout 1000000 \
    --context-length 2200 \
    --mem-fraction-static 0.95 \
    --max-total-tokens 8192 \
    --chunked-prefill-size 8192 \
    --cuda-graph-max-bs 256 \
    --max-running-requests 512 \
    --scheduler-recv-interval 10 \
    --enable-symm-mem \
    --moe-dense-tp-size 1 \
    --load-balance-method round_robin \
    --disaggregation-bootstrap-port 30001 \
    --data-parallel-size 1 \
    --tensor-parallel-size "$TOTAL_GPUS" \
    --expert-parallel-size 1 \
    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
    --nnodes "$TOTAL_NODES" \
    --node-rank "$RANK" \
    --host 0.0.0.0 "${extra_args[@]}"

elif [ "$mode" = "decode" ]; then
  # Trace every command from here on so the job log shows the exact launch.
  set -x
  install_ci_wheels
  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

  build_extra_args

  env "${worker_env[@]}" \
    python3 -m dynamo.sglang \
    --disaggregation-mode decode \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --prefill-round-robin-balance \
    --trust-remote-code \
    --disable-radix-cache \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization modelopt_fp4 \
    --moe-runner-backend flashinfer_trtllm \
    --disaggregation-bootstrap-port 30001 \
    --stream-interval 10 \
    --watchdog-timeout 1000000 \
    --context-length 2200 \
    --mem-fraction-static 0.95 \
    --chunked-prefill-size 8192 \
    --cuda-graph-max-bs 256 \
    --scheduler-recv-interval 10 \
    --enable-symm-mem \
    --moe-dense-tp-size 1 \
    --tensor-parallel-size "$TOTAL_GPUS" \
    --expert-parallel-size 1 \
    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
    --nnodes "$TOTAL_NODES" \
    --node-rank "$RANK" \
    --host 0.0.0.0 "${extra_args[@]}"
fi

0 commit comments

Comments
 (0)