Skip to content

Commit bb6486a

Browse files
committed
port 0.7.0 fixes + aiperf/sa bench support
Signed-off-by: jthomson04 <[email protected]>
1 parent 5477ae5 commit bb6486a

File tree

4 files changed

+81
-22
lines changed

4 files changed

+81
-22
lines changed

components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ ctx_max_num_tokens=$6
2020
num_gen_servers=$7
2121
gen_tp_size=$8
2222
gen_ep_size=$9
23-
gen_batch_size=$10
23+
gen_batch_size=${10}
2424
gen_max_num_tokens=${11}
2525
gen_enable_attention_dp=${12}
2626
gen_gpu_memory_fraction=${13}
@@ -34,6 +34,7 @@ served_model_name=${20}
3434
image=${21}
3535
isl=${22}
3636
osl=${23}
37+
benchmark_kind=${24}
3738

3839
CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-$((${isl} + ${osl} + 512))}
3940

@@ -179,7 +180,7 @@ for ((i=1; i<=DECODE_COUNT; i++)); do
179180
--ntasks $gen_tp_size \
180181
--oversubscribe \
181182
--overlap \
182-
bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_decode_worker_${i}.log &
183+
bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' $gen_enable_attention_dp &> ${full_logdir}/output_decode_worker_${i}.log &
183184
echo "$!" >> "$PID_FILE"
184185
done
185186

@@ -204,7 +205,7 @@ for ((i=1; i<=PREFILL_COUNT; i++)); do
204205
--overlap \
205206
--ntasks $(( ctx_tp_size < 4 ? ctx_tp_size : 4 )) \
206207
--nodes 1 \
207-
bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_prefill_worker_${i}.log &
208+
bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' $ctx_enable_attention_dp &> ${full_logdir}/output_prefill_worker_${i}.log &
208209
prefill_pids+=($!)
209210
echo "$!" >> "$PID_FILE"
210211
done
@@ -216,7 +217,7 @@ srun -l --container-name=${CONTAINER_NAME} \
216217
--container-env HEAD_NODE_IP,HEAD_NODE,SCRIPTS_DIR \
217218
--mpi=pmix --overlap -N 1 -n 1 \
218219
-w ${nodes[0]} \
219-
bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} > ${full_logdir}/bench.log 2>&1
220+
bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} ${benchmark_kind} > ${full_logdir}/bench.log 2>&1
220221

221222

222223
# Cleanup will be handled by the EXIT trap

components/backends/trtllm/performance_sweeps/scripts/bench.sh

Lines changed: 64 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,11 @@ model_path=${9}
3737
isl=${10}
3838
osl=${11}
3939
kind=${12}
40+
benchmark_kind=${13}
4041

41-
if [ "$#" -ne 12 ]; then
42-
echo "Error: Expected 12 arguments, got $#"
43-
echo "Usage: $0 <model> <multi_round> <num_gen_servers> <concurrency_list> <streaming> <log_path> <prefill_gpus> <decode_gpus> <model_path> <isl> <osl> <kind>"
42+
if [ "$#" -ne 13 ]; then
43+
echo "Error: Expected 13 arguments, got $#"
44+
echo "Usage: $0 <model> <multi_round> <num_gen_servers> <concurrency_list> <streaming> <log_path> <prefill_gpus> <decode_gpus> <model_path> <isl> <osl> <kind> <benchmark_kind>"
4445
exit 1
4546
fi
4647

@@ -58,8 +59,12 @@ echo " model_path: $model_path"
5859
echo " isl: $isl"
5960
echo " osl: $osl"
6061
echo " kind: $kind"
62+
echo " benchmark_kind: $benchmark_kind"
6163

62-
64+
# Reject unsupported benchmark kinds before doing any work. Only "sa" and
# "aiperf" are handled by the benchmark dispatch further down this script.
if [[ "$benchmark_kind" != "sa" && "$benchmark_kind" != "aiperf" ]]; then
  echo "Invalid benchmark kind! Expected 'sa' or 'aiperf'"
  # Invalid input is a failure: exit non-zero so the caller does not record
  # a successful run that produced no benchmark results.
  exit 1
fi
6368

6469
# check process id is not 0
6570
if [[ ${SLURM_PROCID} != "0" ]]; then
@@ -112,13 +117,13 @@ for ((i=1; i<=50; i++)); do
112117
# https://github.com/ai-dynamo/dynamo/pull/2683
113118
if [[ "$http_code" == "200" ]] && echo "$body" | grep -q '"status":"healthy"' && echo "$body" | grep -q '"endpoints":\[[^]]*"dyn://dynamo.tensorrt_llm.generate"'; then
114119
if [[ "$kind" == *disagg* ]]; then
115-
if echo "$body" | grep -q '"tensorrt_llm_next"'; then
120+
if echo "$body" | grep -q '"dyn://dynamo.prefill.generate"'; then
116121
echo "Health check succeeded on attempt $i"
117122
echo "$body"
118123
failed=false
119124
break
120125
else
121-
echo "Attempt $i: tensorrt_llm_next key not found in etcd."
126+
echo "Attempt $i: prefill generate endpoint not found in etcd."
122127
fi
123128
else
124129
echo "Health check succeeded on attempt $i"
@@ -150,7 +155,9 @@ curl -v -w "%{http_code}" "${hostname}:${port}/v1/chat/completions" \
150155
"max_tokens": 30
151156
}'
152157

153-
python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
158+
# aiperf performs its own warmup (via --warmup-request-count), so this manual warmup pass is only needed for the "sa" benchmark
159+
if [[ "$benchmark_kind" == "sa" ]]; then
160+
python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
154161
--served-model-name ${model} \
155162
--model ${model_path} \
156163
--dataset-name random \
@@ -166,6 +173,7 @@ python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
166173
--max-concurrency "1" \
167174
--host ${hostname} \
168175
--port ${port}
176+
fi
169177

170178
mkdir -p ${log_path}/results
171179
echo "Starting benchmark..."
@@ -175,7 +183,55 @@ for concurrency in ${concurrency_list}; do
175183
num_prompts=$((concurrency * multi_round))
176184
echo "Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts"
177185
mkdir -p ${log_path}/concurrency_${concurrency}
178-
186+
187+
# Run one benchmark pass at the current concurrency with the selected client:
#   "sa"      -> scripts/bench/benchmark_serving.py (results saved as JSON
#                under ${log_path}/results)
#   otherwise -> aiperf profile (artifacts under
#                ${log_path}/results/concurrency_<n>)
# All expansions are quoted so paths or model names containing whitespace or
# glob characters are passed through as single arguments.
#
# NOTE(review): original_concurrency and total_gpus are not assigned in the
# visible part of this script -- confirm they are set earlier in the loop.
# NOTE(review): the statement that follows this block is another
# benchmark_serving.py invocation; confirm it is not a leftover that re-runs
# the "sa" client regardless of benchmark_kind.
if [[ "$benchmark_kind" == "sa" ]]; then
  python3 "${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py" \
    --served-model-name "${model}" \
    --model "${model_path}" \
    --dataset-name random \
    --num-prompts "${num_prompts}" \
    --random-input-len "${isl}" \
    --random-output-len "${osl}" \
    --random-range-ratio 0.8 \
    --use-chat-template \
    --ignore-eos \
    --backend "dynamo" \
    --endpoint "/v1/completions" \
    --percentile-metrics ttft,tpot,itl,e2el \
    --max-concurrency "${concurrency}" \
    --host "${hostname}" \
    --port "${port}" \
    --save-result \
    --result-dir "${log_path}/results" \
    --result-filename "results_concurrency_${original_concurrency}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}.json"
else
  # Input/output lengths are pinned (stddev 0, min_tokens == max_tokens,
  # ignore_eos) so every request uses exactly isl/osl tokens.
  aiperf profile \
    --model "${model}" \
    --tokenizer "${model_path}" \
    --endpoint-type completions \
    --endpoint /v1/completions \
    --streaming \
    --url "${hostname}:${port}" \
    --synthetic-input-tokens-mean "${isl}" \
    --synthetic-input-tokens-stddev 0 \
    --output-tokens-mean "${osl}" \
    --output-tokens-stddev 0 \
    --extra-inputs "max_tokens:${osl}" \
    --extra-inputs "min_tokens:${osl}" \
    --extra-inputs ignore_eos:true \
    --extra-inputs '{"nvext":{"ignore_eos":true}}' \
    --concurrency "${concurrency}" \
    --request-count "${num_prompts}" \
    --warmup-request-count "$((concurrency * 2))" \
    --num-dataset-entries "${num_prompts}" \
    --random-seed 100 \
    --artifact-dir "${log_path}/results/concurrency_${original_concurrency}" \
    --ui simple \
    -v \
    -H 'Authorization: Bearer NOT USED' \
    -H 'Accept: text/event-stream'
fi
179235
python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
180236
--served-model-name ${model} \
181237
--model ${model_path} \

components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@ ctx_gpus=$2
77
model_name=$3
88
model_path=$4
99
disaggregation_mode=$5
10+
is_dep=$6
11+
1012
unset UCX_TLS
11-
echo "config_file: ${config_file}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}"
13+
echo "config_file: ${config_file}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}, is_dep: ${is_dep}"
1214

1315
# Read configuration values from the YAML config file
1416
if [ ! -f "${config_file}" ]; then
@@ -39,16 +41,15 @@ echo " max_batch_size: ${max_batch_size}"
3941
echo " max_seq_len: ${max_seq_len}"
4042

4143
export TLLM_LOG_LEVEL=INFO
42-
# NOTE: This var is default behavior in recent trtllm commits, and can
43-
# be removed. Keeping it here in case the script is ran with older commits.
44-
export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1
45-
# NOTE: This var was replaced with an LLM API / yaml engine config field
46-
# "moe_backend.use_low_precision_combine: true" in recent trtllm commits, and
47-
# can be removed. Keeping it here in case the script is ran with older commits.
48-
export TRTLLM_MOE_USE_LOW_PRECISION_COMBINE=1
49-
# TODO: Is there ever a case where we don't want this enabled?
5044
export TRTLLM_ENABLE_PDL=1
5145

46+
# DEP mode: when is_dep is exactly "true", export the TRTLLM all-to-all
# settings below; any other value leaves the environment untouched.
if [[ "${is_dep}" == "true" ]]; then
  printf '%s\n' "Using DEP. Setting env vars."
  export \
    TRTLLM_MOE_ALLTOALL_BACKEND="mnnvlthroughput" \
    TRTLLM_FORCE_ALLTOALL_METHOD="MNNVL" \
    TRTLLM_MOE_A2A_WORKSPACE_MB="2048"
fi
52+
5253
if [[ "${model_path,,}" != *r1* ]]; then
5354
echo "Inferred gpt-oss style model. Setting OVERRIDE_QUANT_ALGO to W4A8_MXFP4_MXFP8"
5455
export OVERRIDE_QUANT_ALGO=W4A8_MXFP4_MXFP8

components/backends/trtllm/performance_sweeps/submit_disagg.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ NTASKS_PER_NODE="${NTASKS_PER_NODE:-4}"
2222

2323
ISL="${ISL:-8150}"
2424
OSL="${OSL:-1024}"
25+
BENCHMARK_KIND="${BENCHMARK_KIND:-sa}"
2526

2627
# Build slurm_args step-by-step with validation and defaults
2728
slurm_args="--time=04:00:00"
@@ -98,7 +99,7 @@ run_single() {
9899
total_nodes=$((ctx_num + gen_nodes))
99100
total_tasks=$((total_nodes * 4))
100101
set -x
101-
sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
102+
sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL} ${BENCHMARK_KIND}
102103
set +x
103104
}
104105

0 commit comments

Comments
 (0)