@@ -37,10 +37,11 @@ model_path=${9}
3737isl=${10}
3838osl=${11}
3939kind=${12}
# 13th positional argument: which benchmark driver to run. It is compared
# against 'sa' and 'aiperf' further down in this script.
benchmark_kind=${13}

# The script takes exactly 13 positional arguments; anything else is a usage
# error. Only the 13-argument form is kept here: the superseded 12-argument
# check must not coexist with it, or every invocation would be rejected.
if [[ "$#" -ne 13 ]]; then
    echo "Error: Expected 13 arguments, got $#"
    echo "Usage: $0 <model> <multi_round> <num_gen_servers> <concurrency_list> <streaming> <log_path> <prefill_gpus> <decode_gpus> <model_path> <isl> <osl> <kind> <benchmark_kind>"
    exit 1
fi
4647
@@ -58,8 +59,12 @@ echo " model_path: $model_path"
5859echo " isl: $isl "
5960echo " osl: $osl "
6061echo " kind: $kind "
echo "benchmark_kind: $benchmark_kind"

# Reject unsupported benchmark drivers before any work is done.
# Exit non-zero so the caller sees the failure: exiting 0 here would report
# success even though no benchmark was run.
if [[ "$benchmark_kind" != "sa" && "$benchmark_kind" != "aiperf" ]]; then
    echo "Invalid benchmark kind! Expected 'sa' or 'aiperf'"
    exit 1
fi
6368
6469# check process id is not 0
6570if [[ ${SLURM_PROCID} != " 0" ]]; then
@@ -112,13 +117,13 @@ for ((i=1; i<=50; i++)); do
112117 # https://github.com/ai-dynamo/dynamo/pull/2683
113118 if [[ " $http_code " == " 200" ]] && echo " $body " | grep -q ' "status":"healthy"' && echo " $body " | grep -q ' "endpoints":\[[^]]*"dyn://dynamo.tensorrt_llm.generate"' ; then
114119 if [[ " $kind " == * disagg* ]]; then
# Disaggregated mode: the health response must also list the prefill
# generate endpoint before the deployment is treated as ready.
# NOTE(review): pattern reconstructed to mirror the quoted
# "dyn://dynamo.tensorrt_llm.generate" endpoint match above — confirm the
# exact endpoint string against the health payload.
if echo "$body" | grep -q '"dyn://dynamo.prefill.generate"'; then
    echo "Health check succeeded on attempt $i"
    echo "$body"
    failed=false
    break
else
    echo "Attempt $i: prefill generate endpoint not found in etcd."
fi
123128 else
124129 echo " Health check succeeded on attempt $i "
@@ -150,7 +155,9 @@ curl -v -w "%{http_code}" "${hostname}:${port}/v1/chat/completions" \
150155 "max_tokens": 30
151156}'
152157
153- python3 ${SCRIPTS_DIR} /scripts/bench/benchmark_serving.py \
158+ # aiperf already does a warmup
159+ if [[ " $benchmark_kind " == " sa" ]]; then
160+ python3 ${SCRIPTS_DIR} /scripts/bench/benchmark_serving.py \
154161 --served-model-name ${model} \
155162 --model ${model_path} \
156163 --dataset-name random \
@@ -166,6 +173,7 @@ python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
166173 --max-concurrency " 1" \
167174 --host ${hostname} \
168175 --port ${port}
176+ fi
169177
170178mkdir -p ${log_path} /results
171179echo " Starting benchmark..."
@@ -175,7 +183,55 @@ for concurrency in ${concurrency_list}; do
175183 num_prompts=$(( concurrency * multi_round))
176184 echo " Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts"
177185 mkdir -p ${log_path} /concurrency_${concurrency}
178-
186+
# Run the load test for the current concurrency level with the selected
# driver: 'sa' uses the bundled benchmark_serving.py, anything else (already
# validated to be 'aiperf') uses `aiperf profile`.
#
# original_concurrency and total_gpus are not set in this block.
# NOTE(review): presumably assigned earlier in the script/loop — confirm.
# NOTE(review): the unconditional benchmark_serving.py call that follows this
# block looks like a duplicate of the 'sa' branch (and would also run in
# aiperf mode) — confirm whether it should be removed.
if [[ "$benchmark_kind" == "sa" ]]; then
    python3 "${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py" \
        --served-model-name "${model}" \
        --model "${model_path}" \
        --dataset-name random \
        --num-prompts "${num_prompts}" \
        --random-input-len "${isl}" \
        --random-output-len "${osl}" \
        --random-range-ratio 0.8 \
        --use-chat-template \
        --ignore-eos \
        --backend "dynamo" \
        --endpoint "/v1/completions" \
        --percentile-metrics ttft,tpot,itl,e2el \
        --max-concurrency "${concurrency}" \
        --host "${hostname}" \
        --port "${port}" \
        --save-result \
        --result-dir "${log_path}/results" \
        --result-filename "results_concurrency_${original_concurrency}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}.json"
else
    # Input/output lengths are pinned (stddev 0, min_tokens == max_tokens,
    # ignore_eos) and aiperf is given its own warmup of 2x concurrency requests.
    aiperf profile \
        --model "${model}" \
        --tokenizer "${model_path}" \
        --endpoint-type completions \
        --endpoint /v1/completions \
        --streaming \
        --url "${hostname}:${port}" \
        --synthetic-input-tokens-mean "${isl}" \
        --synthetic-input-tokens-stddev 0 \
        --output-tokens-mean "${osl}" \
        --output-tokens-stddev 0 \
        --extra-inputs "max_tokens:${osl}" \
        --extra-inputs "min_tokens:${osl}" \
        --extra-inputs ignore_eos:true \
        --extra-inputs '{"nvext":{"ignore_eos":true}}' \
        --concurrency "${concurrency}" \
        --request-count "${num_prompts}" \
        --warmup-request-count "$(( concurrency * 2 ))" \
        --num-dataset-entries "${num_prompts}" \
        --random-seed 100 \
        --artifact-dir "${log_path}/results/concurrency_${original_concurrency}" \
        --ui simple \
        -v \
        -H 'Authorization: Bearer NOT USED' \
        -H 'Accept: text/event-stream'
fi
179235 python3 ${SCRIPTS_DIR} /scripts/bench/benchmark_serving.py \
180236 --served-model-name ${model} \
181237 --model ${model_path} \
0 commit comments