#! /bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

#######################################
# Print the command-line usage summary and terminate the script.
# Arguments: none (reads $0 for the script name shown in the text)
# Outputs:   usage text on stderr
# Returns:   does not return; exits the shell with status 1
#######################################
print_usage() {
  # Both call sites in this script invoke this right after reporting an
  # argument error, so the text goes to stderr instead of the stdout log.
  printf '%s\n' \
    " Usage: $0 <mode>" \
    " mode: prefill or decode" \
    " " \
    " Examples:" \
    " $0 prefill" \
    " $0 decode" >&2
  exit 1
}

# Validate the command line: exactly one positional argument is accepted.
if (( $# != 1 )); then
  echo " Error: Expected 1 argument, got $# " >&2
  print_usage
fi

# The single argument selects which worker role this invocation launches.
mode=$1

# Accept only the two supported roles. The comparison must be against the
# bare words: any padding inside the quotes makes every value look invalid.
case "$mode" in
  prefill | decode) ;;
  *)
    echo " Error: mode must be 'prefill' or 'decode', got '$mode'" >&2
    print_usage
    ;;
esac

echo " Mode: $mode "
echo " Command: dynamo"

# Fail fast when any required environment variable is unset or empty.
# Variables are checked in the listed order and the script exits with status 1
# on the first one that is missing. USE_INIT_LOCATIONS is required here even
# though nothing else in this script reads it.
for required_var in \
  HOST_IP_MACHINE \
  PORT \
  TOTAL_GPUS \
  RANK \
  TOTAL_NODES \
  USE_INIT_LOCATIONS \
  RUN_IN_CI; do
  # ${!required_var} is indirect expansion: the value of the variable whose
  # name is stored in required_var. The value is tested without any padding,
  # otherwise the string is never empty and the check can never fire.
  if [[ -z "${!required_var}" ]]; then
    echo " Error: ${required_var} environment variable is not set" >&2
    exit 1
  fi
done

# Launch the dynamo.sglang worker for the selected mode.
#
# Reads:  mode (prefill|decode), RUN_IN_CI, TOTAL_GPUS, HOST_IP_MACHINE, PORT,
#         TOTAL_NODES, RANK, and the optional DUMP_CONFIG_PATH.
# Exit:   status of the python3 worker process (or 1 for an unknown mode).
#
# The two modes share the same environment and most flags, so the command is
# written once; only the flags in mode_args differ between them.
case "$mode" in
  prefill)
    mode_args=(
      --max-total-tokens 8192
      --max-running-requests 512
      --load-balance-method round_robin
      --data-parallel-size 1
    )
    ;;
  decode)
    mode_args=(--prefill-round-robin-balance)
    ;;
  *)
    # Normally unreachable because the mode is validated earlier; kept so this
    # section never silently does nothing.
    echo " Error: mode must be 'prefill' or 'decode', got '$mode'" >&2
    exit 1
    ;;
esac

# Trace everything from here on so the exact launch command appears in the log.
set -x

# In CI the runtime wheels are installed from /configs before launching.
# ${RUN_IN_CI,,} lower-cases the value so TRUE/True/true all match.
if [[ "${RUN_IN_CI,,}" == "true" ]]; then
  python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
  python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
fi
export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

# Optional flags are collected in an array so a DUMP_CONFIG_PATH containing
# spaces stays a single argument, and nothing is appended when it is unset.
extra_args=()
if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
  extra_args+=(--dump-config-to "${DUMP_CONFIG_PATH}")
fi

# The VAR=value prefixes apply to this one python3 process only.
# NOTE(review): mode_args are appended after the shared flags, which assumes
# the CLI parses named options order-independently — confirm.
PYTHONUNBUFFERED=1 \
DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
MC_FORCE_MNNVL=1 \
NCCL_MNNVL_ENABLE=1 \
NCCL_CUMEM_ENABLE=1 \
SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
SGLANG_ENABLE_JIT_DEEPGEMM=false \
SGLANG_ENABLE_FLASHINFER_GEMM=true \
python3 -m dynamo.sglang \
  --disaggregation-mode "$mode" \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --model-path /model/ \
  --trust-remote-code \
  --disable-radix-cache \
  --kv-cache-dtype fp8_e4m3 \
  --attention-backend trtllm_mla \
  --quantization modelopt_fp4 \
  --moe-runner-backend flashinfer_trtllm \
  --disaggregation-bootstrap-port 30001 \
  --stream-interval 10 \
  --watchdog-timeout 1000000 \
  --context-length 2200 \
  --mem-fraction-static 0.95 \
  --chunked-prefill-size 8192 \
  --cuda-graph-max-bs 256 \
  --scheduler-recv-interval 10 \
  --enable-symm-mem \
  --moe-dense-tp-size 1 \
  --tensor-parallel-size "$TOTAL_GPUS" \
  --expert-parallel-size 1 \
  --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
  --nnodes "$TOTAL_NODES" \
  --node-rank "$RANK" \
  --host 0.0.0.0 \
  "${mode_args[@]}" \
  "${extra_args[@]}"