ai-dynamo · jthomson04 · Nov 26, 2025 · Nov 27, 2025 · Nov 27, 2025 · Nov 28, 2025
diff --git a/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm b/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm
@@ -5,34 +5,38 @@ MULTI_ROUND="${MULTI_ROUND:-8}"
 
 # set MOUNT_DIR
 MOUNT_DIR="${MOUNT_DIR:-${PWD}}"
-CONTAINER_NAME=disaggr-test
-
+CONTAINER_NAME=disaggr-test-$(date +%s)-$$  # suffix = `date +%s` output plus this shell's PID; presumably keeps concurrent submissions from sharing a container name
 
 STREAMING=true
 CTX_GPU_FRAC=0.85
-CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-4608}
 
 num_ctx_servers=$1
 ctx_tp_size=$2
-ctx_batch_size=$3
-ctx_max_num_tokens=$4
-ctx_enable_attention_dp=$5
-num_gen_servers=$6
-gen_tp_size=$7
-gen_batch_size=$8
-gen_max_num_tokens=$9
-gen_enable_attention_dp=${10}
-gen_gpu_memory_fraction=${11}
-eplb_num_slots=${12}
-mtp_size=${13}
-concurrency_list=${14}
-gen_nodes=${15}
-kind=${16}
-model_path=${17}
-served_model_name=${18}
-image=${19}
-isl=${20}
-osl=${21}
+ctx_ep_size=$3  # new positional argument; forwarded below as --ctx_ep_size
+ctx_enable_attention_dp=$4  # echoed for logging and passed as the last argument to the prefill worker script
+ctx_batch_size=$5
+ctx_max_num_tokens=$6
+num_gen_servers=$7
+gen_tp_size=$8  # multiplied with num_gen_servers to get gen_gpus; also the --ntasks value of each decode srun
+gen_ep_size=$9  # new positional argument; forwarded below as --gen_ep_size and embedded in the log sub-directory name
+gen_batch_size=${10}
+gen_max_num_tokens=${11}
+gen_enable_attention_dp=${12}  # "false" switches the log sub-directory to the tp-prefixed name; also passed to the decode worker script
+gen_gpu_memory_fraction=${13}
+eplb_num_slots=${14}
+mtp_size=${15}
+concurrency_list=${16}  # handed to bench.sh as a single quoted argument
+gen_nodes=${17}
+kind=${18}
+model_path=${19}
+served_model_name=${20}
+image=${21}
+isl=${22}  # used for ctx_max_seq_len, gen_max_seq_len and the cache-transceiver default below
+osl=${23}  # used for gen_max_seq_len and the cache-transceiver default below
+benchmark_kind=${24}  # passed through to bench.sh, which accepts only 'sa' or 'aiperf'
+ntasks_per_node=${25}  # value of srun --gpus-per-node and the upper bound on prefill --ntasks
+
+CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-$((${isl} + ${osl} + 512))}  # an already-set environment value wins; otherwise isl + osl + 512
 
 ctx_max_seq_len=$((${isl} + 203))
 gen_max_seq_len=$((${isl} + ${osl} + 203))
@@ -44,7 +48,7 @@ set_clock_cmd="bash ${SCRIPTS_DIR}/set_clock.sh"
 mkdir -p ${LOG_DIR}
 echo "trying to submit job"
 
-sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_dep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
+sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_dp${gen_tp_size}_ep${gen_ep_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}  # default (dp-prefixed) name; overwritten further down when gen_enable_attention_dp is "false"
 
 echo "concurrency_list: ${concurrency_list}"
 
@@ -53,11 +57,8 @@ gen_gpus=$((num_gen_servers * gen_tp_size))
 
 echo "enable_attention_dp: ${ctx_enable_attention_dp}, ${gen_enable_attention_dp}, gpu_memory_fraction: ${gen_gpu_memory_fraction}"
 
-enable_pdl=false
 if [ "${gen_enable_attention_dp}" = "false" ]; then
-    enable_pdl=true
-    echo "enable_pdl: ${enable_pdl}"
-    sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_tep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
+    sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_tp${gen_tp_size}_ep${gen_ep_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}  # gen attention DP is off: same layout, but tagged tp instead of dp
 fi
 
 full_logdir=${sub_dir}
@@ -84,13 +85,15 @@ srun -l --container-name=${CONTAINER_NAME} \
             --model ${model_path} \
             --num_ctx_servers ${num_ctx_servers} \
             --ctx_tp_size ${ctx_tp_size} \
+            --ctx_ep_size ${ctx_ep_size} \
             --ctx_batch_size ${ctx_batch_size} \
             --ctx_max_num_tokens ${ctx_max_num_tokens} \
             --ctx_max_seq_len ${ctx_max_seq_len} \
             --ctx_free_gpu_memory_fraction ${CTX_GPU_FRAC} \
             --cache_transceiver_max_num_tokens ${CACHE_TRANSCEIVER_MAX_NUM_TOKENS} \
             --num_gen_servers ${num_gen_servers} \
             --gen_tp_size ${gen_tp_size} \
+            --gen_ep_size ${gen_ep_size} \
             --gen_batch_size ${gen_batch_size} \
             --gen_max_num_tokens ${gen_max_num_tokens} \
             --gen_max_seq_len ${gen_max_seq_len} \
@@ -176,8 +179,10 @@ for ((i=1; i<=DECODE_COUNT; i++)); do
       --nodes ${num_gen_nodes} \
       --ntasks $gen_tp_size \
       --oversubscribe \
+      --gpus-per-node $ntasks_per_node \
       --overlap \
-      bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_decode_worker_${i}.log &
+      -e UCX_NET_DEVICES,TRTLLM_UCX_INTERFACE \
+      bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' $gen_enable_attention_dp &> ${full_logdir}/output_decode_worker_${i}.log &
   echo "$!" >> "$PID_FILE"
 done
 
@@ -200,9 +205,11 @@ for ((i=1; i<=PREFILL_COUNT; i++)); do
         --mpi=pmix --overlap -w ${nodes[node_idx]} \
         --oversubscribe \
         --overlap \
-        --ntasks 4 \
+        --ntasks $(( ctx_tp_size < ntasks_per_node ? ctx_tp_size : ntasks_per_node )) \
+        --gpus-per-node $ntasks_per_node \
         --nodes 1 \
-        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_prefill_worker_${i}.log &
+        -e UCX_NET_DEVICES,TRTLLM_UCX_INTERFACE \
+        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' $ctx_enable_attention_dp &> ${full_logdir}/output_prefill_worker_${i}.log &
   prefill_pids+=($!)
   echo "$!" >> "$PID_FILE"
 done
@@ -214,7 +221,7 @@ srun -l --container-name=${CONTAINER_NAME} \
         --container-env HEAD_NODE_IP,HEAD_NODE,SCRIPTS_DIR \
         --mpi=pmix --overlap -N 1 -n 1 \
 	    -w ${nodes[0]} \
-        bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} > ${full_logdir}/bench.log 2>&1
+        bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} ${benchmark_kind} > ${full_logdir}/bench.log 2>&1
 
 
 # Cleanup will be handled by the EXIT trap
diff --git a/components/backends/trtllm/performance_sweeps/scripts/bench b/components/backends/trtllm/performance_sweeps/scripts/bench
diff --git a/components/backends/trtllm/performance_sweeps/scripts/bench.sh b/components/backends/trtllm/performance_sweeps/scripts/bench.sh
@@ -37,10 +37,11 @@ model_path=${9}
 isl=${10}
 osl=${11}
 kind=${12}
+benchmark_kind=${13}  # selects the benchmark client; compared against 'sa' and 'aiperf' after the parameter echo
 
-if [ "$#" -ne 12 ]; then
-    echo "Error: Expected 12 arguments, got $#"
-    echo "Usage: $0 <model> <multi_round> <num_gen_servers> <concurrency_list> <streaming> <log_path> <prefill_gpus> <decode_gpus> <model_path> <isl> <osl> <kind>"
+if [ "$#" -ne 13 ]; then  # the count must match the 13 positional parameters read above
+    echo "Error: Expected 13 arguments, got $#"
+    echo "Usage: $0 <model> <multi_round> <num_gen_servers> <concurrency_list> <streaming> <log_path> <prefill_gpus> <decode_gpus> <model_path> <isl> <osl> <kind> <benchmark_kind>"
     exit 1
 fi
 
@@ -58,8 +59,12 @@ echo "  model_path: $model_path"
 echo "  isl: $isl"
 echo "  osl: $osl"
 echo "  kind: $kind"
+echo "  benchmark_kind: $benchmark_kind"
 
-
+if ! ( [[ "$benchmark_kind" == "sa" || "$benchmark_kind" == "aiperf" ]] ); then
+    echo "Invalid benchmark kind! Expected 'sa' or 'aiperf'"
+    exit 0
+fi
 
 # check process id is not 0
 if [[ ${SLURM_PROCID} != "0" ]]; then
@@ -112,13 +117,13 @@ for ((i=1; i<=50; i++)); do
     # https://github.com/ai-dynamo/dynamo/pull/2683
     if [[ "$http_code" == "200" ]] && echo "$body" | grep -q '"status":"healthy"' && echo "$body" | grep -q '"endpoints":\[[^]]*"dyn://dynamo.tensorrt_llm.generate"'; then
         if [[ "$kind" == *disagg* ]]; then
-            if echo "$body" | grep -q '"tensorrt_llm_next"'; then
+            if echo "$body" | grep -q '"dyn://dynamo.prefill.generate"'; then
                 echo "Health check succeeded on attempt $i"
                 echo "$body"
                 failed=false
                 break
             else
-                echo "Attempt $i: tensorrt_llm_next key not found in etcd."
+                echo "Attempt $i: prefill generate endpoint not found in etcd."
             fi
         else
             echo "Health check succeeded on attempt $i"
@@ -150,7 +155,9 @@ curl -v  -w "%{http_code}" "${hostname}:${port}/v1/chat/completions" \
   "max_tokens": 30
 }'
 
-python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
+# Manual warmup is only needed for 'sa'; aiperf issues its own warmup requests (--warmup-request-count below)
+if [[ "$benchmark_kind" == "sa" ]]; then
+    python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
         --served-model-name ${model} \
         --model ${model_path} \
         --dataset-name random \
@@ -166,6 +173,7 @@ python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
         --max-concurrency "1" \
         --host ${hostname} \
         --port ${port}
+fi
 
 mkdir -p ${log_path}/results
 echo "Starting benchmark..."
@@ -175,27 +183,55 @@ for concurrency in ${concurrency_list}; do
     num_prompts=$((concurrency * multi_round))
     echo "Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts"
     mkdir -p ${log_path}/concurrency_${concurrency}
-
-    python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
-        --served-model-name ${model} \
-        --model ${model_path} \
-        --dataset-name random \
-        --num-prompts "$num_prompts" \
-        --random-input-len ${isl} \
-        --random-output-len ${osl} \
-        --random-range-ratio 0.8 \
-        --use-chat-template \
-        --ignore-eos \
-        --use-chat-template \
-        --backend "dynamo" \
-        --endpoint "/v1/completions" \
-        --percentile-metrics ttft,tpot,itl,e2el \
-        --max-concurrency "$concurrency" \
-        --host ${hostname} \
-        --port ${port} \
-        --save-result \
-        --result-dir "${log_path}/results" \
-        --result-filename "results_concurrency_${original_concurrency}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}.json"
+
+    if [[ "$benchmark_kind" == "sa" ]]; then  # 'sa': drive the load with scripts/bench/benchmark_serving.py
+        python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
+            --served-model-name ${model} \
+            --model ${model_path} \
+            --dataset-name random \
+            --num-prompts "$num_prompts" \
+            --random-input-len ${isl} \
+            --random-output-len ${osl} \
+            --random-range-ratio 0.8 \
+            --use-chat-template \
+            --ignore-eos \
+            --use-chat-template \
+            --backend "dynamo" \
+            --endpoint "/v1/completions" \
+            --percentile-metrics ttft,tpot,itl,e2el \
+            --max-concurrency "$concurrency" \
+            --host ${hostname} \
+            --port ${port} \
+            --save-result \
+            --result-dir "${log_path}/results" \
+            --result-filename "results_concurrency_${original_concurrency}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}.json"
+    else  # any value other than 'sa'/'aiperf' already caused an exit earlier, so this branch is the aiperf client
+        aiperf profile \
+    	    --model ${model} \
+    	    --tokenizer ${model_path} \
+    	    --endpoint-type completions \
+    	    --endpoint /v1/completions \
+    	    --streaming \
+    	    --url ${hostname}:${port} \
+    	    --synthetic-input-tokens-mean ${isl} \
+    	    --synthetic-input-tokens-stddev 0 \
+    	    --output-tokens-mean ${osl} \
+    	    --output-tokens-stddev 0 \
+    	    --extra-inputs max_tokens:${osl} \
+    	    --extra-inputs min_tokens:${osl} \
+    	    --extra-inputs ignore_eos:true \
+	    --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
+    	    --concurrency $concurrency \
+    	    --request-count $num_prompts \
+    	    --warmup-request-count $(($concurrency*2)) \
+	    --num-dataset-entries ${num_prompts} \
+    	    --random-seed 100 \
+    	    --artifact-dir "${log_path}/results/concurrency_${original_concurrency}" \
+    	    --ui simple \
+	    -v \
+    	    -H 'Authorization: Bearer NOT USED' \
+    	    -H 'Accept: text/event-stream'    
+    fi
 
     echo "Benchmark with concurrency ${concurrency} done"
 done