Skip to content

Commit e41b80d

Browse files
committed
Fixes for gb300
Signed-off-by: jthomson04 <[email protected]>
1 parent bb6486a commit e41b80d

File tree

3 files changed

+10
-10
lines changed

3 files changed

+10
-10
lines changed

components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@ MULTI_ROUND="${MULTI_ROUND:-8}"
 
 # set MOUNT_DIR
 MOUNT_DIR="${MOUNT_DIR:-${PWD}}"
-CONTAINER_NAME=disaggr-test
-
+CONTAINER_NAME=disaggr-test-$(date +%s)-$$
 
 STREAMING=true
 CTX_GPU_FRAC=0.85
@@ -35,6 +34,7 @@ image=${21}
 isl=${22}
 osl=${23}
 benchmark_kind=${24}
+ntasks_per_node=${25}
 
 CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-$((${isl} + ${osl} + 512))}
 

@@ -179,7 +179,9 @@ for ((i=1; i<=DECODE_COUNT; i++)); do
 --nodes ${num_gen_nodes} \
 --ntasks $gen_tp_size \
 --oversubscribe \
+--gpus-per-node $ntasks_per_node \
 --overlap \
+-e UCX_NET_DEVICES,TRTLLM_UCX_INTERFACE \
 bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' $gen_enable_attention_dp &> ${full_logdir}/output_decode_worker_${i}.log &
 echo "$!" >> "$PID_FILE"
 done
@@ -203,8 +205,10 @@ for ((i=1; i<=PREFILL_COUNT; i++)); do
 --mpi=pmix --overlap -w ${nodes[node_idx]} \
 --oversubscribe \
 --overlap \
---ntasks $(( ctx_tp_size < 4 ? ctx_tp_size : 4 )) \
+--ntasks $(( ctx_tp_size < ntasks_per_node ? ctx_tp_size : ntasks_per_node )) \
+--gpus-per-node $ntasks_per_node \
 --nodes 1 \
+-e UCX_NET_DEVICES,TRTLLM_UCX_INTERFACE \
 bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' $ctx_enable_attention_dp &> ${full_logdir}/output_prefill_worker_${i}.log &
 prefill_pids+=($!)
 echo "$!" >> "$PID_FILE"

components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,6 @@ if [[ "${model_path,,}" != *r1* ]]; then
 export OVERRIDE_QUANT_ALGO=W4A8_MXFP4_MXFP8
 fi
 
-# NOTE: Set (or unset) these depending on what cluster you're using
-export TRTLLM_UCX_INTERFACE=enP6p9s0np0
-export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_3:1,mlx5_4:1,enP6p9s0np0
-
 trtllm-llmapi-launch python3 -m dynamo.trtllm \
 --model-path ${model_path} \
 --served-model-name ${model_name} \

components/backends/trtllm/performance_sweeps/submit_disagg.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,11 @@ run_single() {
 # TODO: expose kind to the command line
 local kind="dynamo_disagg"
 
-gen_nodes=$(((gen_tp_size + 3)/4 * gen_num))
+gen_nodes=$(((gen_tp_size + NTASKS_PER_NODE - 1)/NTASKS_PER_NODE * gen_num))
 total_nodes=$((ctx_num + gen_nodes))
-total_tasks=$((total_nodes * 4))
+total_tasks=$((total_nodes * NTASKS_PER_NODE))
 set -x
-sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL} ${BENCHMARK_KIND}
+sbatch --nodes=${total_nodes} --gpus-per-node ${NTASKS_PER_NODE} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL} ${BENCHMARK_KIND} ${NTASKS_PER_NODE}
 set +x
 }
 

0 commit comments

Comments
 (0)