port 0.7.0 fixes + aiperf/sa bench support

jthomson04 · jthomson04 · commit bb6486a3bcdb · 2025-12-08T10:37:06.000-08:00
Signed-off-by: jthomson04 <jwillthomson19@gmail.com>
diff --git a/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm b/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm
@@ -20,7 +20,7 @@ ctx_max_num_tokens=$6
 num_gen_servers=$7
 gen_tp_size=$8
 gen_ep_size=$9
-gen_batch_size=$10
+gen_batch_size=${10}
 gen_max_num_tokens=${11}
 gen_enable_attention_dp=${12}
 gen_gpu_memory_fraction=${13}
@@ -34,6 +34,7 @@ served_model_name=${20}
 image=${21}
 isl=${22}
 osl=${23}
+benchmark_kind=${24}
 
 CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-$((${isl} + ${osl} + 512))}
 
@@ -179,7 +180,7 @@ for ((i=1; i<=DECODE_COUNT; i++)); do
       --ntasks $gen_tp_size \
       --oversubscribe \
       --overlap \
-      bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_decode_worker_${i}.log &
+      bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' $gen_enable_attention_dp &> ${full_logdir}/output_decode_worker_${i}.log &
   echo "$!" >> "$PID_FILE"
 done
 
@@ -204,7 +205,7 @@ for ((i=1; i<=PREFILL_COUNT; i++)); do
         --overlap \
         --ntasks $(( ctx_tp_size < 4 ? ctx_tp_size : 4 )) \
         --nodes 1 \
-        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_prefill_worker_${i}.log &
+        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' $ctx_enable_attention_dp &> ${full_logdir}/output_prefill_worker_${i}.log &
   prefill_pids+=($!)
   echo "$!" >> "$PID_FILE"
 done
@@ -216,7 +217,7 @@ srun -l --container-name=${CONTAINER_NAME} \
         --container-env HEAD_NODE_IP,HEAD_NODE,SCRIPTS_DIR \
         --mpi=pmix --overlap -N 1 -n 1 \
 	    -w ${nodes[0]} \
-        bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} > ${full_logdir}/bench.log 2>&1
+        bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} ${benchmark_kind} > ${full_logdir}/bench.log 2>&1
 
 
 # Cleanup will be handled by the EXIT trap
diff --git a/components/backends/trtllm/performance_sweeps/scripts/bench.sh b/components/backends/trtllm/performance_sweeps/scripts/bench.sh
@@ -37,10 +37,11 @@ model_path=${9}
 isl=${10}
 osl=${11}
 kind=${12}
+benchmark_kind=${13}
 
-if [ "$#" -ne 12 ]; then
-    echo "Error: Expected 12 arguments, got $#"
-    echo "Usage: $0 <model> <multi_round> <num_gen_servers> <concurrency_list> <streaming> <log_path> <prefill_gpus> <decode_gpus> <model_path> <isl> <osl> <kind>"
+if [ "$#" -ne 13 ]; then
+    echo "Error: Expected 13 arguments, got $#"
+    echo "Usage: $0 <model> <multi_round> <num_gen_servers> <concurrency_list> <streaming> <log_path> <prefill_gpus> <decode_gpus> <model_path> <isl> <osl> <kind> <benchmark_kind>"
     exit 1
 fi
 
@@ -58,8 +59,12 @@ echo "  model_path: $model_path"
 echo "  isl: $isl"
 echo "  osl: $osl"
 echo "  kind: $kind"
+echo "  benchmark_kind: $benchmark_kind"
 
-
+if [[ "$benchmark_kind" != "sa" && "$benchmark_kind" != "aiperf" ]]; then  # only 'sa' and 'aiperf' are handled below
+    echo "Invalid benchmark kind! Expected 'sa' or 'aiperf'"
+    exit 1  # non-zero, like the argument-count check, so the failure is visible to the caller
+fi
 
 # check process id is not 0
 if [[ ${SLURM_PROCID} != "0" ]]; then
@@ -112,13 +117,13 @@ for ((i=1; i<=50; i++)); do
     # https://github.com/ai-dynamo/dynamo/pull/2683
     if [[ "$http_code" == "200" ]] && echo "$body" | grep -q '"status":"healthy"' && echo "$body" | grep -q '"endpoints":\[[^]]*"dyn://dynamo.tensorrt_llm.generate"'; then
         if [[ "$kind" == *disagg* ]]; then
-            if echo "$body" | grep -q '"tensorrt_llm_next"'; then
+            if echo "$body" | grep -q '"dyn://dynamo.prefill.generate"'; then
                 echo "Health check succeeded on attempt $i"
                 echo "$body"
                 failed=false
                 break
             else
-                echo "Attempt $i: tensorrt_llm_next key not found in etcd."
+                echo "Attempt $i: prefill generate endpoint not found in etcd."
             fi
         else
             echo "Health check succeeded on attempt $i"
@@ -150,7 +155,9 @@ curl -v  -w "%{http_code}" "${hostname}:${port}/v1/chat/completions" \
   "max_tokens": 30
 }'
 
-python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
+# aiperf already does a warmup
+if [[ "$benchmark_kind" == "sa" ]]; then
+    python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
         --served-model-name ${model} \
         --model ${model_path} \
         --dataset-name random \
@@ -166,6 +173,7 @@ python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
         --max-concurrency "1" \
         --host ${hostname} \
         --port ${port}
+fi
 
 mkdir -p ${log_path}/results
 echo "Starting benchmark..."
@@ -175,7 +183,55 @@ for concurrency in ${concurrency_list}; do
     num_prompts=$((concurrency * multi_round))
     echo "Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts"
     mkdir -p ${log_path}/concurrency_${concurrency}
-
+    
+    if [[ "$benchmark_kind" == "sa" ]]; then
+        python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
+            --served-model-name ${model} \
+            --model ${model_path} \
+            --dataset-name random \
+            --num-prompts "$num_prompts" \
+            --random-input-len ${isl} \
+            --random-output-len ${osl} \
+            --random-range-ratio 0.8 \
+            --use-chat-template \
+            --ignore-eos \
+            --use-chat-template \
+            --backend "dynamo" \
+            --endpoint "/v1/completions" \
+            --percentile-metrics ttft,tpot,itl,e2el \
+            --max-concurrency "$concurrency" \
+            --host ${hostname} \
+            --port ${port} \
+            --save-result \
+            --result-dir "${log_path}/results" \
+            --result-filename "results_concurrency_${original_concurrency}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}.json"
+    else
+        aiperf profile \
+            --model ${model} \
+            --tokenizer ${model_path} \
+            --endpoint-type completions \
+            --endpoint /v1/completions \
+            --streaming \
+            --url ${hostname}:${port} \
+            --synthetic-input-tokens-mean ${isl} \
+            --synthetic-input-tokens-stddev 0 \
+            --output-tokens-mean ${osl} \
+            --output-tokens-stddev 0 \
+            --extra-inputs max_tokens:${osl} \
+            --extra-inputs min_tokens:${osl} \
+            --extra-inputs ignore_eos:true \
+            --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
+            --concurrency $concurrency \
+            --request-count $num_prompts \
+            --warmup-request-count $((concurrency * 2)) \
+            --num-dataset-entries ${num_prompts} \
+            --random-seed 100 \
+            --artifact-dir "${log_path}/results/concurrency_${original_concurrency}" \
+            --ui simple \
+            -v \
+            -H 'Authorization: Bearer NOT USED' \
+            -H 'Accept: text/event-stream'
+    fi
     python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
         --served-model-name ${model} \
         --model ${model_path} \
diff --git a/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh b/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
@@ -7,8 +7,10 @@ ctx_gpus=$2
 model_name=$3
 model_path=$4
 disaggregation_mode=$5
+is_dep=$6
+
 unset UCX_TLS
-echo "config_file: ${config_file}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}"
+echo "config_file: ${config_file}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}, is_dep: ${is_dep}"
 
 # Read configuration values from the YAML config file
 if [ ! -f "${config_file}" ]; then
@@ -39,16 +41,15 @@ echo "  max_batch_size: ${max_batch_size}"
 echo "  max_seq_len: ${max_seq_len}"
 
 export TLLM_LOG_LEVEL=INFO
-# NOTE: This var is default behavior in recent trtllm commits, and can
-# be removed. Keeping it here in case the script is ran with older commits.
-export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1
-# NOTE: This var was replaced with an LLM API / yaml engine config field
-# "moe_backend.use_low_precision_combine: true" in recent trtllm commits, and
-# can be removed. Keeping it here in case the script is ran with older commits.
-export TRTLLM_MOE_USE_LOW_PRECISION_COMBINE=1
-# TODO: Is there ever a case where we don't want this enabled?
 export TRTLLM_ENABLE_PDL=1
 
+if [ "$is_dep" = "true" ]; then
+    echo "Using DEP. Setting env vars."
+    export TRTLLM_MOE_ALLTOALL_BACKEND="mnnvlthroughput"
+    export TRTLLM_FORCE_ALLTOALL_METHOD="MNNVL"
+    export TRTLLM_MOE_A2A_WORKSPACE_MB="2048"
+fi 
+
 if [[ "${model_path,,}" != *r1* ]]; then
     echo "Inferred gpt-oss style model. Setting OVERRIDE_QUANT_ALGO to W4A8_MXFP4_MXFP8"
     export OVERRIDE_QUANT_ALGO=W4A8_MXFP4_MXFP8
diff --git a/components/backends/trtllm/performance_sweeps/submit_disagg.sh b/components/backends/trtllm/performance_sweeps/submit_disagg.sh
@@ -22,6 +22,7 @@ NTASKS_PER_NODE="${NTASKS_PER_NODE:-4}"
 
 ISL="${ISL:-8150}"
 OSL="${OSL:-1024}"
+BENCHMARK_KIND="${BENCHMARK_KIND:-sa}"
 
 # Build slurm_args step-by-step with validation and defaults
 slurm_args="--time=04:00:00"
@@ -98,7 +99,7 @@ run_single() {
     total_nodes=$((ctx_num + gen_nodes))
     total_tasks=$((total_nodes * 4))
     set -x
-    sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
+    sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL} ${BENCHMARK_KIND}
     set +x
 }