Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,34 +5,38 @@ MULTI_ROUND="${MULTI_ROUND:-8}"

# set MOUNT_DIR
MOUNT_DIR="${MOUNT_DIR:-${PWD}}"
CONTAINER_NAME=disaggr-test

CONTAINER_NAME=disaggr-test-$(date +%s)-$$

STREAMING=true
CTX_GPU_FRAC=0.85
CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-4608}

num_ctx_servers=$1
ctx_tp_size=$2
ctx_batch_size=$3
ctx_max_num_tokens=$4
ctx_enable_attention_dp=$5
num_gen_servers=$6
gen_tp_size=$7
gen_batch_size=$8
gen_max_num_tokens=$9
gen_enable_attention_dp=${10}
gen_gpu_memory_fraction=${11}
eplb_num_slots=${12}
mtp_size=${13}
concurrency_list=${14}
gen_nodes=${15}
kind=${16}
model_path=${17}
served_model_name=${18}
image=${19}
isl=${20}
osl=${21}
ctx_ep_size=$3
ctx_enable_attention_dp=$4
ctx_batch_size=$5
ctx_max_num_tokens=$6
num_gen_servers=$7
gen_tp_size=$8
gen_ep_size=$9
gen_batch_size=${10}
gen_max_num_tokens=${11}
gen_enable_attention_dp=${12}
gen_gpu_memory_fraction=${13}
eplb_num_slots=${14}
mtp_size=${15}
concurrency_list=${16}
gen_nodes=${17}
kind=${18}
model_path=${19}
served_model_name=${20}
image=${21}
isl=${22}
osl=${23}
benchmark_kind=${24}
ntasks_per_node=${25}

CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-$((${isl} + ${osl} + 512))}

ctx_max_seq_len=$((${isl} + 203))
gen_max_seq_len=$((${isl} + ${osl} + 203))
Expand All @@ -44,7 +48,7 @@ set_clock_cmd="bash ${SCRIPTS_DIR}/set_clock.sh"
mkdir -p ${LOG_DIR}
echo "trying to submit job"

sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_dep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_dp${gen_tp_size}_ep${gen_ep_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}

echo "concurrency_list: ${concurrency_list}"

Expand All @@ -53,11 +57,8 @@ gen_gpus=$((num_gen_servers * gen_tp_size))

echo "enable_attention_dp: ${ctx_enable_attention_dp}, ${gen_enable_attention_dp}, gpu_memory_fraction: ${gen_gpu_memory_fraction}"

enable_pdl=false
if [ "${gen_enable_attention_dp}" = "false" ]; then
enable_pdl=true
echo "enable_pdl: ${enable_pdl}"
sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_tep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_tp${gen_tp_size}_ep${gen_ep_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
fi

full_logdir=${sub_dir}
Expand All @@ -84,13 +85,15 @@ srun -l --container-name=${CONTAINER_NAME} \
--model ${model_path} \
--num_ctx_servers ${num_ctx_servers} \
--ctx_tp_size ${ctx_tp_size} \
--ctx_ep_size ${ctx_ep_size} \
--ctx_batch_size ${ctx_batch_size} \
--ctx_max_num_tokens ${ctx_max_num_tokens} \
--ctx_max_seq_len ${ctx_max_seq_len} \
--ctx_free_gpu_memory_fraction ${CTX_GPU_FRAC} \
--cache_transceiver_max_num_tokens ${CACHE_TRANSCEIVER_MAX_NUM_TOKENS} \
--num_gen_servers ${num_gen_servers} \
--gen_tp_size ${gen_tp_size} \
--gen_ep_size ${gen_ep_size} \
--gen_batch_size ${gen_batch_size} \
--gen_max_num_tokens ${gen_max_num_tokens} \
--gen_max_seq_len ${gen_max_seq_len} \
Expand Down Expand Up @@ -176,8 +179,10 @@ for ((i=1; i<=DECODE_COUNT; i++)); do
--nodes ${num_gen_nodes} \
--ntasks $gen_tp_size \
--oversubscribe \
--gpus-per-node $ntasks_per_node \
--overlap \
bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_decode_worker_${i}.log &
-e UCX_NET_DEVICES,TRTLLM_UCX_INTERFACE \
bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' $gen_enable_attention_dp &> ${full_logdir}/output_decode_worker_${i}.log &
echo "$!" >> "$PID_FILE"
done

Expand All @@ -200,9 +205,11 @@ for ((i=1; i<=PREFILL_COUNT; i++)); do
--mpi=pmix --overlap -w ${nodes[node_idx]} \
--oversubscribe \
--overlap \
--ntasks 4 \
--ntasks $(( ctx_tp_size < ntasks_per_node ? ctx_tp_size : ntasks_per_node )) \
--gpus-per-node $ntasks_per_node \
--nodes 1 \
bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_prefill_worker_${i}.log &
-e UCX_NET_DEVICES,TRTLLM_UCX_INTERFACE \
bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' $ctx_enable_attention_dp &> ${full_logdir}/output_prefill_worker_${i}.log &
prefill_pids+=($!)
echo "$!" >> "$PID_FILE"
done
Expand All @@ -214,7 +221,7 @@ srun -l --container-name=${CONTAINER_NAME} \
--container-env HEAD_NODE_IP,HEAD_NODE,SCRIPTS_DIR \
--mpi=pmix --overlap -N 1 -n 1 \
-w ${nodes[0]} \
bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} > ${full_logdir}/bench.log 2>&1
bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} ${benchmark_kind} > ${full_logdir}/bench.log 2>&1


# Cleanup will be handled by the EXIT trap
Submodule bench updated 1 files
+9 −3 backend_request_func.py
92 changes: 64 additions & 28 deletions components/backends/trtllm/performance_sweeps/scripts/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,11 @@ model_path=${9}
isl=${10}
osl=${11}
kind=${12}
benchmark_kind=${13}

if [ "$#" -ne 12 ]; then
echo "Error: Expected 12 arguments, got $#"
echo "Usage: $0 <model> <multi_round> <num_gen_servers> <concurrency_list> <streaming> <log_path> <prefill_gpus> <decode_gpus> <model_path> <isl> <osl> <kind>"
if [ "$#" -ne 13 ]; then
echo "Error: Expected 13 arguments, got $#"
echo "Usage: $0 <model> <multi_round> <num_gen_servers> <concurrency_list> <streaming> <log_path> <prefill_gpus> <decode_gpus> <model_path> <isl> <osl> <kind> <benchmark_kind>"
exit 1
fi

Expand All @@ -58,8 +59,12 @@ echo " model_path: $model_path"
echo " isl: $isl"
echo " osl: $osl"
echo " kind: $kind"
echo " benchmark_kind: $benchmark_kind"


# Validate the benchmark driver selection before doing any work.
# Only 'sa' and 'aiperf' are accepted; anything else is a caller error.
if [[ "$benchmark_kind" != "sa" && "$benchmark_kind" != "aiperf" ]]; then
  echo "Invalid benchmark kind! Expected 'sa' or 'aiperf'"
  # Exit non-zero so the launching job step reports the misconfiguration
  # as a failure instead of a silently "successful" run with no results.
  exit 1
fi

# check process id is not 0
if [[ ${SLURM_PROCID} != "0" ]]; then
Expand Down Expand Up @@ -112,13 +117,13 @@ for ((i=1; i<=50; i++)); do
# https://github.com/ai-dynamo/dynamo/pull/2683
if [[ "$http_code" == "200" ]] && echo "$body" | grep -q '"status":"healthy"' && echo "$body" | grep -q '"endpoints":\[[^]]*"dyn://dynamo.tensorrt_llm.generate"'; then
if [[ "$kind" == *disagg* ]]; then
if echo "$body" | grep -q '"tensorrt_llm_next"'; then
if echo "$body" | grep -q '"dyn://dynamo.prefill.generate"'; then
echo "Health check succeeded on attempt $i"
echo "$body"
failed=false
break
else
echo "Attempt $i: tensorrt_llm_next key not found in etcd."
echo "Attempt $i: prefill generate endpoint not found in etcd."
fi
else
echo "Health check succeeded on attempt $i"
Expand Down Expand Up @@ -150,7 +155,9 @@ curl -v -w "%{http_code}" "${hostname}:${port}/v1/chat/completions" \
"max_tokens": 30
}'

python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
# aiperf already does a warmup
if [[ "$benchmark_kind" == "sa" ]]; then
python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
--served-model-name ${model} \
--model ${model_path} \
--dataset-name random \
Expand All @@ -166,6 +173,7 @@ python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
--max-concurrency "1" \
--host ${hostname} \
--port ${port}
fi

mkdir -p ${log_path}/results
echo "Starting benchmark..."
Expand All @@ -175,27 +183,55 @@ for concurrency in ${concurrency_list}; do
num_prompts=$((concurrency * multi_round))
echo "Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts"
mkdir -p ${log_path}/concurrency_${concurrency}

python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
--served-model-name ${model} \
--model ${model_path} \
--dataset-name random \
--num-prompts "$num_prompts" \
--random-input-len ${isl} \
--random-output-len ${osl} \
--random-range-ratio 0.8 \
--use-chat-template \
--ignore-eos \
--use-chat-template \
--backend "dynamo" \
--endpoint "/v1/completions" \
--percentile-metrics ttft,tpot,itl,e2el \
--max-concurrency "$concurrency" \
--host ${hostname} \
--port ${port} \
--save-result \
--result-dir "${log_path}/results" \
--result-filename "results_concurrency_${original_concurrency}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}.json"

# Run one measurement pass at the current concurrency with the selected driver.
# All expansions are quoted so paths or names containing whitespace or glob
# characters are passed through as single arguments.
if [[ "$benchmark_kind" == "sa" ]]; then
  # benchmark_serving.py path: results are saved as one JSON file under
  # ${log_path}/results, named after the concurrency and GPU counts.
  python3 "${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py" \
    --served-model-name "${model}" \
    --model "${model_path}" \
    --dataset-name random \
    --num-prompts "${num_prompts}" \
    --random-input-len "${isl}" \
    --random-output-len "${osl}" \
    --random-range-ratio 0.8 \
    --use-chat-template \
    --ignore-eos \
    --backend "dynamo" \
    --endpoint "/v1/completions" \
    --percentile-metrics ttft,tpot,itl,e2el \
    --max-concurrency "${concurrency}" \
    --host "${hostname}" \
    --port "${port}" \
    --save-result \
    --result-dir "${log_path}/results" \
    --result-filename "results_concurrency_${original_concurrency}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}.json"
else
  # aiperf path: fixed-length synthetic prompts (stddev 0) with the output
  # length pinned to ${osl} via max_tokens/min_tokens and ignore_eos.
  # Artifacts go to a per-concurrency directory under ${log_path}/results.
  aiperf profile \
    --model "${model}" \
    --tokenizer "${model_path}" \
    --endpoint-type completions \
    --endpoint /v1/completions \
    --streaming \
    --url "${hostname}:${port}" \
    --synthetic-input-tokens-mean "${isl}" \
    --synthetic-input-tokens-stddev 0 \
    --output-tokens-mean "${osl}" \
    --output-tokens-stddev 0 \
    --extra-inputs "max_tokens:${osl}" \
    --extra-inputs "min_tokens:${osl}" \
    --extra-inputs ignore_eos:true \
    --extra-inputs '{"nvext":{"ignore_eos":true}}' \
    --concurrency "${concurrency}" \
    --request-count "${num_prompts}" \
    --warmup-request-count "$((concurrency * 2))" \
    --num-dataset-entries "${num_prompts}" \
    --random-seed 100 \
    --artifact-dir "${log_path}/results/concurrency_${original_concurrency}" \
    --ui simple \
    -v \
    -H 'Authorization: Bearer NOT USED' \
    -H 'Accept: text/event-stream'
fi

echo "Benchmark with concurrency ${concurrency} done"
done
Expand Down
Loading
Loading