diff --git a/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm b/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm index b5650738a7..9a30b711cc 100755 --- a/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm +++ b/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm @@ -5,34 +5,38 @@ MULTI_ROUND="${MULTI_ROUND:-8}" # set MOUNT_DIR MOUNT_DIR="${MOUNT_DIR:-${PWD}}" -CONTAINER_NAME=disaggr-test - +CONTAINER_NAME=disaggr-test-$(date +%s)-$$ STREAMING=true CTX_GPU_FRAC=0.85 -CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-4608} num_ctx_servers=$1 ctx_tp_size=$2 -ctx_batch_size=$3 -ctx_max_num_tokens=$4 -ctx_enable_attention_dp=$5 -num_gen_servers=$6 -gen_tp_size=$7 -gen_batch_size=$8 -gen_max_num_tokens=$9 -gen_enable_attention_dp=${10} -gen_gpu_memory_fraction=${11} -eplb_num_slots=${12} -mtp_size=${13} -concurrency_list=${14} -gen_nodes=${15} -kind=${16} -model_path=${17} -served_model_name=${18} -image=${19} -isl=${20} -osl=${21} +ctx_ep_size=$3 +ctx_enable_attention_dp=$4 +ctx_batch_size=$5 +ctx_max_num_tokens=$6 +num_gen_servers=$7 +gen_tp_size=$8 +gen_ep_size=$9 +gen_batch_size=${10} +gen_max_num_tokens=${11} +gen_enable_attention_dp=${12} +gen_gpu_memory_fraction=${13} +eplb_num_slots=${14} +mtp_size=${15} +concurrency_list=${16} +gen_nodes=${17} +kind=${18} +model_path=${19} +served_model_name=${20} +image=${21} +isl=${22} +osl=${23} +benchmark_kind=${24} +ntasks_per_node=${25} + +CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-$((${isl} + ${osl} + 512))} ctx_max_seq_len=$((${isl} + 203)) gen_max_seq_len=$((${isl} + ${osl} + 203)) @@ -44,7 +48,7 @@ set_clock_cmd="bash ${SCRIPTS_DIR}/set_clock.sh" mkdir -p ${LOG_DIR} echo "trying to submit job" -sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_dep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size} +sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_dp${gen_tp_size}_ep${gen_ep_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size} echo "concurrency_list: ${concurrency_list}" @@ -53,11 +57,8 @@ gen_gpus=$((num_gen_servers * gen_tp_size)) echo "enable_attention_dp: ${ctx_enable_attention_dp}, ${gen_enable_attention_dp}, gpu_memory_fraction: ${gen_gpu_memory_fraction}" -enable_pdl=false if [ "${gen_enable_attention_dp}" = "false" ]; then - enable_pdl=true - echo "enable_pdl: ${enable_pdl}" - sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_tep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size} + sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_tp${gen_tp_size}_ep${gen_ep_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size} fi full_logdir=${sub_dir} @@ -84,6 +85,7 @@ srun -l --container-name=${CONTAINER_NAME} \ --model ${model_path} \ --num_ctx_servers ${num_ctx_servers} \ --ctx_tp_size ${ctx_tp_size} \ + --ctx_ep_size ${ctx_ep_size} \ --ctx_batch_size ${ctx_batch_size} \ --ctx_max_num_tokens ${ctx_max_num_tokens} \ --ctx_max_seq_len ${ctx_max_seq_len} \ @@ -91,6 +93,7 @@ srun -l --container-name=${CONTAINER_NAME} \ --cache_transceiver_max_num_tokens ${CACHE_TRANSCEIVER_MAX_NUM_TOKENS} \ --num_gen_servers ${num_gen_servers} \ --gen_tp_size ${gen_tp_size} \ + --gen_ep_size ${gen_ep_size} \ --gen_batch_size ${gen_batch_size} \ --gen_max_num_tokens ${gen_max_num_tokens} \ --gen_max_seq_len ${gen_max_seq_len} \ @@ -176,8 +179,10 @@ for ((i=1; i<=DECODE_COUNT; i++)); do --nodes ${num_gen_nodes} \ --ntasks $gen_tp_size \ --oversubscribe \ + --gpus-per-node $ntasks_per_node \ --overlap \ - bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_decode_worker_${i}.log & + -e UCX_NET_DEVICES,TRTLLM_UCX_INTERFACE \ + bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' $gen_enable_attention_dp &> ${full_logdir}/output_decode_worker_${i}.log & echo "$!" >> "$PID_FILE" done @@ -200,9 +205,11 @@ for ((i=1; i<=PREFILL_COUNT; i++)); do --mpi=pmix --overlap -w ${nodes[node_idx]} \ --oversubscribe \ --overlap \ - --ntasks 4 \ + --ntasks $(( ctx_tp_size < ntasks_per_node ? ctx_tp_size : ntasks_per_node )) \ + --gpus-per-node $ntasks_per_node \ --nodes 1 \ - bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_prefill_worker_${i}.log & + -e UCX_NET_DEVICES,TRTLLM_UCX_INTERFACE \ + bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' $ctx_enable_attention_dp &> ${full_logdir}/output_prefill_worker_${i}.log & prefill_pids+=($!) echo "$!" >> "$PID_FILE" done @@ -214,7 +221,7 @@ srun -l --container-name=${CONTAINER_NAME} \ --container-env HEAD_NODE_IP,HEAD_NODE,SCRIPTS_DIR \ --mpi=pmix --overlap -N 1 -n 1 \ -w ${nodes[0]} \ - bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} > ${full_logdir}/bench.log 2>&1 + bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} ${benchmark_kind} > ${full_logdir}/bench.log 2>&1 # Cleanup will be handled by the EXIT trap diff --git a/components/backends/trtllm/performance_sweeps/scripts/bench b/components/backends/trtllm/performance_sweeps/scripts/bench index f3ea022a57..2a35585f0c 160000 --- a/components/backends/trtllm/performance_sweeps/scripts/bench +++ b/components/backends/trtllm/performance_sweeps/scripts/bench @@ -1 +1 @@ -Subproject commit f3ea022a5780de5d0babc5fffa53634e2023d28f +Subproject commit 2a35585f0cb2c98d18934088c867f1ba52d373b4 diff --git a/components/backends/trtllm/performance_sweeps/scripts/bench.sh b/components/backends/trtllm/performance_sweeps/scripts/bench.sh index ee9c08d632..581cf654d9 100755 --- a/components/backends/trtllm/performance_sweeps/scripts/bench.sh +++ b/components/backends/trtllm/performance_sweeps/scripts/bench.sh @@ -37,10 +37,11 @@ model_path=${9} isl=${10} osl=${11} kind=${12} +benchmark_kind=${13} -if [ "$#" -ne 12 ]; then - echo "Error: Expected 12 arguments, got $#" - echo "Usage: $0 " +if [ "$#" -ne 13 ]; then + echo "Error: Expected 13 arguments, got $#" + echo "Usage: $0 " exit 1 fi @@ -58,8 +59,12 @@ echo " model_path: $model_path" echo " isl: $isl" echo " osl: $osl" echo " kind: $kind" +echo " benchmark_kind: $benchmark_kind" - +if ! ( [[ "$benchmark_kind" == "sa" || "$benchmark_kind" == "aiperf" ]] ); then + echo "Invalid benchmark kind! Expected 'sa' or 'aiperf'" + exit 0 +fi # check process id is not 0 if [[ ${SLURM_PROCID} != "0" ]]; then @@ -112,13 +117,13 @@ for ((i=1; i<=50; i++)); do # https://github.com/ai-dynamo/dynamo/pull/2683 if [[ "$http_code" == "200" ]] && echo "$body" | grep -q '"status":"healthy"' && echo "$body" | grep -q '"endpoints":\[[^]]*"dyn://dynamo.tensorrt_llm.generate"'; then if [[ "$kind" == *disagg* ]]; then - if echo "$body" | grep -q '"tensorrt_llm_next"'; then + if echo "$body" | grep -q '"dyn://dynamo.prefill.generate"'; then echo "Health check succeeded on attempt $i" echo "$body" failed=false break else - echo "Attempt $i: tensorrt_llm_next key not found in etcd." + echo "Attempt $i: prefill generate endpoint not found in etcd." fi else echo "Health check succeeded on attempt $i" @@ -150,7 +155,9 @@ curl -v -w "%{http_code}" "${hostname}:${port}/v1/chat/completions" \ "max_tokens": 30 }' -python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \ +# aiperf already does a warmup +if [[ "$benchmark_kind" == "sa" ]]; then + python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \ --served-model-name ${model} \ --model ${model_path} \ --dataset-name random \ @@ -166,6 +173,7 @@ python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \ --max-concurrency "1" \ --host ${hostname} \ --port ${port} +fi mkdir -p ${log_path}/results echo "Starting benchmark..." @@ -175,27 +183,55 @@ for concurrency in ${concurrency_list}; do num_prompts=$((concurrency * multi_round)) echo "Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts" mkdir -p ${log_path}/concurrency_${concurrency} - - python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \ - --served-model-name ${model} \ - --model ${model_path} \ - --dataset-name random \ - --num-prompts "$num_prompts" \ - --random-input-len ${isl} \ - --random-output-len ${osl} \ - --random-range-ratio 0.8 \ - --use-chat-template \ - --ignore-eos \ - --use-chat-template \ - --backend "dynamo" \ - --endpoint "/v1/completions" \ - --percentile-metrics ttft,tpot,itl,e2el \ - --max-concurrency "$concurrency" \ - --host ${hostname} \ - --port ${port} \ - --save-result \ - --result-dir "${log_path}/results" \ - --result-filename "results_concurrency_${original_concurrency}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}.json" + + if [[ "$benchmark_kind" == "sa" ]]; then + python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \ + --served-model-name ${model} \ + --model ${model_path} \ + --dataset-name random \ + --num-prompts "$num_prompts" \ + --random-input-len ${isl} \ + --random-output-len ${osl} \ + --random-range-ratio 0.8 \ + --use-chat-template \ + --ignore-eos \ + --use-chat-template \ + --backend "dynamo" \ + --endpoint "/v1/completions" \ + --percentile-metrics ttft,tpot,itl,e2el \ + --max-concurrency "$concurrency" \ + --host ${hostname} \ + --port ${port} \ + --save-result \ + --result-dir "${log_path}/results" \ + --result-filename "results_concurrency_${original_concurrency}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}.json" + else + aiperf profile \ + --model ${model} \ + --tokenizer ${model_path} \ + --endpoint-type completions \ + --endpoint /v1/completions \ + --streaming \ + --url ${hostname}:${port} \ + --synthetic-input-tokens-mean ${isl} \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean ${osl} \ + --output-tokens-stddev 0 \ + --extra-inputs max_tokens:${osl} \ + --extra-inputs min_tokens:${osl} \ + --extra-inputs ignore_eos:true \ + --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ + --concurrency $concurrency \ + --request-count $num_prompts \ + --warmup-request-count $(($concurrency*2)) \ + --num-dataset-entries ${num_prompts} \ + --random-seed 100 \ + --artifact-dir "${log_path}/results/concurrency_${original_concurrency}" \ + --ui simple \ + -v \ + -H 'Authorization: Bearer NOT USED' \ + -H 'Accept: text/event-stream' + fi echo "Benchmark with concurrency ${concurrency} done" done diff --git a/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py b/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py index 8c256e82dd..06d2e9a24f 100644 --- a/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py +++ b/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py @@ -2,12 +2,262 @@ # SPDX-License-Identifier: Apache-2.0 import argparse +from enum import Enum import os import re from typing import Any, Dict, List import yaml +class ModelType(Enum): + """ + Model type. + """ + GPT_OSS = "gpt_oss" + DSR1 = "dsr1" + +def get_model_type(model_path: str) -> str: + if "r1" in model_path.lower(): + print("Inferring DSR1-type model") + return ModelType.DSR1 + else: + print("Inferring GPT-oss-type model") + return ModelType.GPT_OSS + +def generate_dsr1_config( + config_path: str, + decode_config_path: str, + instance_config_path: str, + args: argparse.Namespace +): + gen_cuda_graph_batch_sizes = [ + 1, + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 384, + 512, + 768, + 1024, + 2048, + args.gen_batch_size, + ] + + gen_moe_backend = "CUTLASS" + if args.gen_tp_size >= 16 and args.gen_enable_attention_dp: + gen_moe_backend = "WIDEEP" + if not args.gen_enable_attention_dp: + gen_moe_backend = "TRTLLM" + + prefill_config: Dict[str, Any] = { + "max_batch_size": args.ctx_batch_size, + "max_num_tokens": args.ctx_max_num_tokens, + "max_seq_len": args.ctx_max_seq_len, + "tensor_parallel_size": args.ctx_tp_size, + "moe_expert_parallel_size": args.ctx_ep_size, + "enable_attention_dp": args.ctx_enable_attention_dp, + "pipeline_parallel_size": 1, + "cuda_graph_config": None, + "print_iter_log": True, + "disable_overlap_scheduler": True, + "kv_cache_config": { + "enable_block_reuse": False, + "free_gpu_memory_fraction": args.ctx_free_gpu_memory_fraction, + "dtype": "fp8", + }, + "cache_transceiver_config": { + "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens, + "backend": "UCX", + }, + } + + decode_config: Dict[str, Any] = { + "tensor_parallel_size": args.gen_tp_size, + "moe_expert_parallel_size": args.gen_tp_size, + "enable_attention_dp": args.gen_enable_attention_dp, + "pipeline_parallel_size": 1, + "max_batch_size": args.gen_batch_size, + "max_num_tokens": args.gen_max_num_tokens, + "max_seq_len": args.gen_max_seq_len, + "cuda_graph_config": { + "enable_padding": True, + "batch_sizes": gen_cuda_graph_batch_sizes, + }, + "print_iter_log": True, + "kv_cache_config": { + "enable_block_reuse": False, + "free_gpu_memory_fraction": args.gen_gpu_memory_fraction, + "dtype": "fp8", + }, + "moe_config": { + "backend": gen_moe_backend, + "use_low_precision_moe_combine": True, + }, + "cache_transceiver_config": { + "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens, + "backend": "UCX", + }, + "stream_interval": 100, + } + + if args.gen_tp_size == 8 and not args.gen_enable_attention_dp: + decode_config["allreduce_strategy"] = "MNNVL" + + if args.eplb_num_slots > 0: + moe_load_balancer_file = os.path.join( + os.path.dirname(config_path), "moe_load_balancer.yaml" + ) + # Ensure the directory exists before writing the file + os.makedirs(os.path.dirname(moe_load_balancer_file), exist_ok=True) + moe_load_balancer_config = { + "num_slots": args.eplb_num_slots, + "layer_updates_per_iter": 1, + } + with open(moe_load_balancer_file, "w") as f: + yaml.dump( + moe_load_balancer_config, f, default_flow_style=False, sort_keys=False + ) + decode_config["moe_config"]["load_balancer"] = moe_load_balancer_file + + if args.mtp_size > 0: + prefill_config["speculative_config"] = { + "decoding_type": "MTP", + "num_nextn_predict_layers": args.mtp_size, + } + decode_config["speculative_config"] = { + "decoding_type": "MTP", + "num_nextn_predict_layers": args.mtp_size, + } + + return prefill_config, decode_config + +def generate_gpt_oss_config( + config_path: str, + decode_config_path: str, + instance_config_path: str, + args: argparse.Namespace +): + gen_cuda_graph_batch_sizes = [ + 1, + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 384, + 512, + 768, + 1024, + 2048, + args.gen_batch_size, + ] + + gen_moe_backend = "TRTLLM" + + prefill_config: Dict[str, Any] = { + "max_batch_size": args.ctx_batch_size, + "max_num_tokens": args.ctx_max_num_tokens, + "max_seq_len": args.ctx_max_seq_len, + "tensor_parallel_size": args.ctx_tp_size, + "moe_expert_parallel_size": args.ctx_ep_size, + "enable_attention_dp": args.ctx_enable_attention_dp, + "pipeline_parallel_size": 1, + "cuda_graph_config": None, + "print_iter_log": True, + "disable_overlap_scheduler": True, + "kv_cache_config": { + "enable_block_reuse": False, + "free_gpu_memory_fraction": args.ctx_free_gpu_memory_fraction, + "dtype": "fp8", + }, + "cuda_graph_config": { + "enable_padding": True, + "max_batch_size": 30, + }, + "num_postprocess_workers": 4, + "cache_transceiver_config": { + "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens, + "backend": "UCX", + }, + "moe_config": { + "backend": "TRTLLM" + } + } + + decode_config: Dict[str, Any] = { + "allreduce_strategy": "AUTO", + "attention_dp_config": { + "enable_balance": True + }, + "disable_overlap_scheduler": False, + "tensor_parallel_size": args.gen_tp_size, + "moe_expert_parallel_size": args.gen_ep_size, + "enable_attention_dp": args.gen_enable_attention_dp, + "pipeline_parallel_size": 1, + "max_batch_size": args.gen_batch_size, + "max_num_tokens": args.gen_max_num_tokens, + "max_seq_len": args.gen_max_seq_len, + "cuda_graph_config": { + "enable_padding": True, + "batch_sizes": gen_cuda_graph_batch_sizes, + }, + "print_iter_log": True, + "kv_cache_config": { + "enable_block_reuse": False, + "free_gpu_memory_fraction": args.gen_gpu_memory_fraction, + "dtype": "fp8", + }, + "moe_config": { + "backend": gen_moe_backend, + }, + "cache_transceiver_config": { + "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens, + "backend": "UCX", + }, + "stream_interval": 20, + "num_postprocess_workers": 4 + } + + if args.eplb_num_slots > 0: + moe_load_balancer_file = os.path.join( + os.path.dirname(config_path), "moe_load_balancer.yaml" + ) + # Ensure the directory exists before writing the file + os.makedirs(os.path.dirname(moe_load_balancer_file), exist_ok=True) + moe_load_balancer_config = { + "num_slots": args.eplb_num_slots, + "layer_updates_per_iter": 1, + } + with open(moe_load_balancer_file, "w") as f: + yaml.dump( + moe_load_balancer_config, f, default_flow_style=False, sort_keys=False + ) + decode_config["moe_config"]["load_balancer"] = moe_load_balancer_file + + if args.mtp_size > 0: + prefill_config["speculative_config"] = { + "decoding_type": "MTP", + "num_nextn_predict_layers": args.mtp_size, + } + decode_config["speculative_config"] = { + "decoding_type": "MTP", + "num_nextn_predict_layers": args.mtp_size, + } + + return prefill_config, decode_config + +CONFIG_MAPPING = { + ModelType.GPT_OSS: generate_gpt_oss_config, + ModelType.DSR1: generate_dsr1_config, +} def process_node_and_task() -> tuple[int, List[str], List[str]]: """ @@ -144,6 +394,7 @@ def gen_config_file( ctx_enable_attention_dp: bool, num_gen_servers: int, gen_tp_size: int, + gen_ep_size: int, gen_batch_size: int, gen_max_num_tokens: int, gen_max_seq_len: int, @@ -153,7 +404,7 @@ def gen_config_file( mtp_size: int = 0, worker_start_port: int = 8001, server_port: int = 8000, - cache_transceiver_max_num_tokens: int = 4608, + cache_transceiver_max_num_tokens: int = 9216, ) -> None: """ Generate configuration YAML file for disaggregated inference. @@ -170,6 +421,7 @@ def gen_config_file( ctx_enable_attention_dp: Enable attention DP for context servers num_gen_servers: Number of generation servers gen_tp_size: Tensor parallel size for generation servers + gen_ep_size: Expert parallel size for generation servers gen_batch_size: Batch size for generation servers gen_max_num_tokens: Max number of tokens for generation servers gen_enable_attention_dp: Enable attention DP for generation servers @@ -178,109 +430,15 @@ def gen_config_file( worker_start_port: Start port for workers server_port: Server port """ - gen_cuda_graph_batch_sizes = [ - 1, - 2, - 4, - 8, - 16, - 32, - 64, - 128, - 256, - 384, - 512, - 768, - 1024, - 2048, - gen_batch_size, - ] - - gen_moe_backend = "CUTLASS" - if gen_tp_size >= 16 and gen_enable_attention_dp: - gen_moe_backend = "WIDEEP" - if not gen_enable_attention_dp: - gen_moe_backend = "TRTLLM" - - prefill_config: Dict[str, Any] = { - "max_batch_size": ctx_batch_size, - "max_num_tokens": ctx_max_num_tokens, - "max_seq_len": ctx_max_seq_len, - "tensor_parallel_size": ctx_tp_size, - "moe_expert_parallel_size": ctx_tp_size, - "enable_attention_dp": ctx_enable_attention_dp, - "pipeline_parallel_size": 1, - "cuda_graph_config": None, - "print_iter_log": True, - "disable_overlap_scheduler": True, - "kv_cache_config": { - "enable_block_reuse": False, - "free_gpu_memory_fraction": ctx_free_gpu_memory_fraction, - "dtype": "fp8", - }, - "cache_transceiver_config": { - "max_tokens_in_buffer": cache_transceiver_max_num_tokens, - "backend": "DEFAULT", - }, - } - - decode_config: Dict[str, Any] = { - "tensor_parallel_size": gen_tp_size, - "moe_expert_parallel_size": gen_tp_size, - "enable_attention_dp": gen_enable_attention_dp, - "pipeline_parallel_size": 1, - "max_batch_size": gen_batch_size, - "max_num_tokens": gen_max_num_tokens, - "max_seq_len": gen_max_seq_len, - "cuda_graph_config": { - "enable_padding": True, - "batch_sizes": gen_cuda_graph_batch_sizes, - }, - "print_iter_log": True, - "kv_cache_config": { - "enable_block_reuse": False, - "free_gpu_memory_fraction": gen_gpu_memory_fraction, - "dtype": "fp8", - }, - "moe_config": { - "backend": gen_moe_backend, - "use_low_precision_moe_combine": True, - }, - "cache_transceiver_config": { - "max_tokens_in_buffer": cache_transceiver_max_num_tokens, - "backend": "DEFAULT", - }, - "stream_interval": 20, - } - - if gen_tp_size == 8 and not gen_enable_attention_dp: - decode_config["allreduce_strategy"] = "MNNVL" - if eplb_num_slots > 0: - moe_load_balancer_file = os.path.join( - os.path.dirname(config_path), "moe_load_balancer.yaml" - ) - # Ensure the directory exists before writing the file - os.makedirs(os.path.dirname(moe_load_balancer_file), exist_ok=True) - moe_load_balancer_config = { - "num_slots": eplb_num_slots, - "layer_updates_per_iter": 1, - } - with open(moe_load_balancer_file, "w") as f: - yaml.dump( - moe_load_balancer_config, f, default_flow_style=False, sort_keys=False - ) - decode_config["moe_config"]["load_balancer"] = moe_load_balancer_file + model_type = get_model_type(model_path) - if mtp_size > 0: - prefill_config["speculative_config"] = { - "decoding_type": "MTP", - "num_nextn_predict_layers": mtp_size, - } - decode_config["speculative_config"] = { - "decoding_type": "MTP", - "num_nextn_predict_layers": mtp_size, - } + prefill_config, decode_config = CONFIG_MAPPING[model_type]( + config_path, + decode_config_path, + instance_config_path, + args + ) counts = {"prefill_count": num_ctx_servers, "decode_count": num_gen_servers} @@ -309,6 +467,12 @@ def gen_config_file( required=True, help="Tensor parallel size for context servers", ) + parser.add_argument( + "--ctx_ep_size", + type=int, + required=True, + help="Expert parallel size for context servers", + ) parser.add_argument( "--ctx_batch_size", type=int, @@ -351,6 +515,12 @@ def gen_config_file( required=True, help="Tensor parallel size for generation servers", ) + parser.add_argument( + "--gen_ep_size", + type=int, + required=True, + help="Expert parallel size for generation servers", + ) parser.add_argument( "--gen_batch_size", type=int, diff --git a/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh b/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh index 305fd157ec..3ca750119b 100755 --- a/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh +++ b/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh @@ -3,13 +3,14 @@ # SPDX-License-Identifier: Apache-2.0 config_file=$1 -enable_pdl=$2 -ctx_gpus=$3 -model_name=$4 -model_path=$5 -disaggregation_mode=$6 +ctx_gpus=$2 +model_name=$3 +model_path=$4 +disaggregation_mode=$5 +is_dep=$6 + unset UCX_TLS -echo "config_file: ${config_file}, enable_pdl: ${enable_pdl}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}" +echo "config_file: ${config_file}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}, is_dep: ${is_dep}" # Read configuration values from the YAML config file if [ ! -f "${config_file}" ]; then @@ -40,21 +41,22 @@ echo " max_batch_size: ${max_batch_size}" echo " max_seq_len: ${max_seq_len}" export TLLM_LOG_LEVEL=INFO -# NOTE: This var is default behavior in recent trtllm commits, and can -# be removed. Keeping it here in case the script is ran with older commits. -export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1 -# NOTE: This var was replaced with an LLM API / yaml engine config field -# "moe_backend.use_low_precision_combine: true" in recent trtllm commits, and -# can be removed. Keeping it here in case the script is ran with older commits. -export TRTLLM_MOE_USE_LOW_PRECISION_COMBINE=1 +export TRTLLM_ENABLE_PDL=1 -if [ "${enable_pdl}" = "true" ]; then - export TRTLLM_ENABLE_PDL=1 -fi +export TRTLLM_SERVER_DISABLE_GC=1 +export TRTLLM_WORKER_DISABLE_GC=1 +export NCCL_GRAPH_MIXING_SUPPORT=0 -# NOTE: Set (or unset) these depending on what cluster you're using -export TRTLLM_UCX_INTERFACE=enP6p9s0np0 -export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_3:1,mlx5_4:1,enP6p9s0np0 +if [[ "${model_path,,}" != *r1* ]]; then + echo "Inferred gpt-oss style model. Setting OVERRIDE_QUANT_ALGO to W4A8_MXFP4_MXFP8" + export OVERRIDE_QUANT_ALGO=W4A8_MXFP4_MXFP8 + if [ "$is_dep" = "true" ]; then + echo "Using DEP with gpt-oss. Setting env vars." + export TRTLLM_MOE_ALLTOALL_BACKEND="mnnvlthroughput" + export TRTLLM_FORCE_ALLTOALL_METHOD="MNNVL" + export TRTLLM_MOE_A2A_WORKSPACE_MB="2048" + fi +fi trtllm-llmapi-launch python3 -m dynamo.trtllm \ --model-path ${model_path} \ diff --git a/components/backends/trtllm/performance_sweeps/submit_disagg.sh b/components/backends/trtllm/performance_sweeps/submit_disagg.sh index 5300cc9c27..a43cbb7c67 100755 --- a/components/backends/trtllm/performance_sweeps/submit_disagg.sh +++ b/components/backends/trtllm/performance_sweeps/submit_disagg.sh @@ -22,6 +22,7 @@ NTASKS_PER_NODE="${NTASKS_PER_NODE:-4}" ISL="${ISL:-8150}" OSL="${OSL:-1024}" +BENCHMARK_KIND="${BENCHMARK_KIND:-sa}" # Build slurm_args step-by-step with validation and defaults slurm_args="--time=04:00:00" @@ -77,196 +78,31 @@ usage() { # Run single task run_single() { local ctx_num=$1 - local gen_num=$2 - local gen_tp_size=$3 - local gen_batch_size=$4 - local gen_max_num_tokens=$5 - local gen_enable_attention_dp=$6 - local gen_gpu_memory_fraction=$7 - local gen_mtp_size=$8 - local gen_eplb_num_slots=$9 - local gen_concurrency_list=${10} + local ctx_tp_size=$2 + local ctx_ep_size=$3 + local ctx_enable_attention_dp=$4 + local gen_num=$5 + local gen_tp_size=$6 + local gen_ep_size=$7 + local gen_batch_size=$8 + local gen_max_num_tokens=$9 + local gen_enable_attention_dp=${10} + local gen_gpu_memory_fraction=${11} + local gen_eplb_num_slots=${12} + local gen_mtp_size=${13} + local gen_concurrency_list=${14} # TODO: expose kind to the command line local kind="dynamo_disagg" - gen_nodes=$(((gen_tp_size + 3)/4 * gen_num)) + gen_nodes=$(((gen_tp_size + NTASKS_PER_NODE - 1)/NTASKS_PER_NODE * gen_num)) total_nodes=$((ctx_num + gen_nodes)) - total_tasks=$((total_nodes * 4)) + total_tasks=$((total_nodes * NTASKS_PER_NODE)) set -x - if (( ISL == OSL )); then - sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 4 4608 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL} - else - sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 1 8448 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL} - fi + sbatch --nodes=${total_nodes} --gpus-per-node ${NTASKS_PER_NODE} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL} ${BENCHMARK_KIND} ${NTASKS_PER_NODE} set +x } -# MTP0 Configuration (gen_mtp_size=0) -run_4_gpus_mtp0() { - echo "Running 4 GPUs MTP0 combinations..." - if (( ISL == OSL )); then - run_single 1 5 4 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 128 192" - run_single 1 5 4 64 64 true "0.85" 0 0 "256 384" - run_single 1 4 4 128 128 true "0.85" 0 0 "512 768" - run_single 2 5 4 256 256 true "0.85" 0 0 "1024 1536" - run_single 1 2 4 512 512 true "0.85" 0 0 "2048 3072" - run_single 2 3 4 768 768 true "0.85" 0 0 "3072 4096" - else - run_single 1 5 4 16 16 false "0.9" 0 0 "1 2 4 8 16 24" - run_single 1 4 4 32 32 false "0.9" 0 0 "32 48" - run_single 2 5 4 64 64 false "0.9" 0 0 "64 96" - run_single 1 2 4 128 128 false "0.9" 0 0 "128 192" - run_single 1 1 4 64 64 true "0.8" 0 0 "256 384" - run_single 3 2 4 128 128 true "0.8" 0 0 "512 768" - fi -} - -run_8_gpus_mtp0() { - echo "Running 8 GPUs MTP0 combinations..." - if (( ISL == OSL )); then - run_single 1 4 8 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 128 192 256" - run_single 1 4 8 32 32 true "0.8" 0 0 "256 384" - run_single 1 3 8 64 64 true "0.8" 0 0 "512 768" - run_single 1 2 8 128 128 true "0.8" 0 0 "1024 1536" - run_single 1 1 8 256 256 true "0.8" 0 0 "2048 3072" - run_single 1 1 8 512 512 true "0.8" 0 0 "4096 6144" - run_single 3 2 8 768 768 true "0.8" 0 0 "6144 8192" - run_single 3 2 8 1024 1024 true "0.8" 0 0 "8192 12288" - else - run_single 1 4 8 16 16 false "0.9" 0 0 "1 2 4 8 16 24" - run_single 1 3 8 32 32 false "0.9" 0 0 "32 48" - run_single 1 2 8 64 64 false "0.9" 0 0 "64 96" - run_single 1 1 8 128 128 false "0.9" 0 0 "128 192" - run_single 3 2 8 32 32 true "0.8" 0 0 "256 384" - run_single 5 2 8 64 64 true "0.8" 0 0 "512 768" - run_single 4 1 8 128 128 true "0.8" 0 0 "1024 1536" - run_single 5 1 8 256 256 true "0.8" 0 0 "2048 3072" - fi -} - -run_16_gpus_mtp0() { - echo "Running 16 GPUs MTP0 combinations..." - if (( ISL == OSL )); then - run_single 1 1 16 64 64 true "0.75" 0 0 "16 32 64 128 256 512 1024 1536" - run_single 2 1 16 128 128 true "0.75" 0 256 "2048 3072" - run_single 2 1 16 256 256 true "0.75" 0 256 "4096 6144" - run_single 3 1 16 512 512 true "0.75" 0 256 "8192 12288" - run_single 3 1 16 768 768 true "0.75" 0 256 "12288 16384" - run_single 3 1 16 1024 1024 true "0.75" 0 288 "16384 20480" - else - run_single 1 1 16 8 8 true "0.8" 0 0 "16 32 64 128 192" # 5 - run_single 2 1 16 16 16 true "0.8" 0 0 "256 384" # 6 - run_single 3 1 16 32 32 true "0.8" 0 0 "512 768" # 7 - run_single 6 1 16 64 64 true "0.8" 0 0 "1024 1536" # 10 - run_single 8 1 16 128 128 true "0.8" 0 256 "2048 3072" # 12 - run_single 10 1 16 256 256 true "0.8" 0 256 "4096 6144" # 14 - fi -} - -run_32_gpus_mtp0() { - echo "Running 32 GPUs MTP0 combinations..." - if (( ISL == OSL )); then - run_single 1 1 32 32 32 true "0.7" 0 0 "32 64 128 256 512 1024 1536" - run_single 2 1 32 64 64 true "0.7" 0 256 "2048 3072" - run_single 3 1 32 128 128 true "0.7" 0 288 "4096 6144" - run_single 4 1 32 256 256 true "0.7" 0 288 "8192 12288" - run_single 5 1 32 512 512 true "0.7" 0 288 "16384 20480" - else - run_single 1 1 32 4 4 true "0.7" 0 0 "32 64 128 192" # 9 - run_single 2 1 32 8 8 true "0.7" 0 0 "256 384" # 10 - run_single 4 1 32 16 16 true "0.7" 0 0 "512 768" # 12 - run_single 7 1 32 32 32 true "0.7" 0 0 "1024 1536" # 15 - fi -} - -# MTP Configuration (gen_mtp_size=1,2,3) -run_4_gpus_mtp() { - echo "Running 4 GPUs MTP combinations..." - if (( ISL == OSL )); then - run_single 1 5 4 32 128 false "0.9" 3 0 "1 2 4 8 16 32 48" - run_single 1 5 4 32 128 true "0.9" 3 0 "64 128 192" - run_single 1 4 4 64 256 true "0.9" 3 0 "256 384" - run_single 1 3 4 128 512 true "0.9" 3 0 "512 768" - run_single 1 2 4 256 768 true "0.9" 2 0 "1024 1536" - run_single 2 3 4 512 1024 true "0.9" 1 0 "2048 3072" - run_single 1 1 4 768 1536 true "0.9" 1 0 "3072 4096" - else - run_single 1 5 4 8 32 false "0.9" 3 0 "1 2 4 8 12" - run_single 1 4 4 16 64 false "0.9" 3 0 "16 24" - run_single 1 3 4 32 128 false "0.9" 3 0 "32 48" - run_single 2 3 4 16 64 true "0.8" 3 0 "64 96" - run_single 1 1 4 32 128 true "0.8" 3 0 "128 192" - run_single 2 1 4 64 256 true "0.8" 2 0 "256 384" - run_single 5 2 4 128 512 true "0.8" 1 0 "512 768" - fi -} - -run_8_gpus_mtp() { - echo "Running 8 GPUs MTP combinations..." - if (( ISL == OSL )); then - run_single 1 4 8 32 128 false "0.9" 3 0 "1 2 4 8 16 32 48" - run_single 1 4 8 16 64 true "0.8" 3 0 "64 128 192" - run_single 1 3 8 32 128 true "0.8" 3 0 "256 384" - run_single 1 2 8 64 256 true "0.8" 3 0 "512 768" - run_single 1 1 8 128 512 true "0.8" 3 0 "1024 1536" - run_single 1 1 8 256 512 true "0.8" 1 0 "2048 3072" - run_single 3 2 8 512 1024 true "0.8" 1 0 "4096 6144" - run_single 3 2 8 768 1536 true "0.8" 1 0 "6144 8192" - run_single 3 2 8 1024 2048 true "0.8" 1 0 "8192 12288" - else - run_single 1 4 8 8 32 false "0.9" 3 0 "1 2 4 8 12" - run_single 1 3 8 16 64 false "0.9" 3 0 "16 24" - run_single 1 2 8 32 128 false "0.9" 3 0 "32 48" - run_single 1 1 8 8 32 true "0.8" 3 0 "64 96" - run_single 3 2 8 16 64 true "0.8" 3 0 "128 192" - run_single 5 2 8 32 128 true "0.8" 3 0 "256 384" - run_single 7 2 8 64 256 true "0.8" 2 0 "512 768" - run_single 5 1 8 128 256 true "0.8" 1 0 "1024 1536" - run_single 6 1 8 256 512 true "0.8" 1 0 "2048 3072" - fi -} - -run_16_gpus_mtp() { - echo "Running 16 GPUs MTP combinations..." - if (( ISL == OSL )); then - run_single 1 1 16 32 128 true "0.7" 3 0 "16 32 64 128 256 512 768" - run_single 1 1 16 64 256 true "0.7" 3 256 "1024 1536" - run_single 2 1 16 128 256 true "0.7" 1 288 "2048 3072" - run_single 2 1 16 256 512 true "0.7" 1 288 "4096 6144" - run_single 3 1 16 512 1024 true "0.7" 1 288 "8192 12288" - run_single 3 1 16 768 1536 true "0.7" 1 288 "12288 16384" - run_single 3 1 16 1024 1024 true "0.75" 0 288 "16384 20480" - else - run_single 1 1 16 4 16 true "0.8" 3 0 "16 32 64 96" # 5 - run_single 2 1 16 8 32 true "0.8" 3 0 "128 192" # 6 - run_single 4 1 16 16 64 true "0.8" 3 0 "256 384" # 8 - run_single 6 1 16 32 128 true "0.8" 3 0 "512 768" # 10 - run_single 8 1 16 64 256 true "0.8" 2 256 "1024 1536" # 13 - run_single 10 1 16 128 256 true "0.8" 1 256 "2048 3072" # 15 - run_single 12 1 16 256 512 true "0.8" 1 256 "4096 6144" # 16 - fi - -} - -run_32_gpus_mtp() { - echo "Running 32 GPUs MTP combinations..." - if (( ISL == OSL )); then - run_single 1 1 32 16 64 true "0.6" 3 0 "32 64 128 256 512 768" - run_single 2 1 32 32 128 true "0.6" 3 288 "1024 1536" - run_single 3 1 32 64 256 true "0.6" 3 288 "2048 3072" - run_single 3 1 32 128 256 true "0.6" 1 288 "4096 6144" - run_single 4 1 32 256 512 true "0.6" 1 288 "8192 12288" - run_single 5 1 32 512 1024 true "0.6" 1 288 "16384 20480" - else - run_single 1 1 32 1 4 true "0.7" 3 0 "32 48" # 9 - run_single 2 1 32 2 8 true "0.7" 3 0 "64 96" # 10 - run_single 3 1 32 4 16 true "0.7" 3 0 "128 192" # 11 - run_single 5 1 32 8 32 true "0.7" 3 0 "256 384" # 13 - run_single 8 1 32 16 64 true "0.7" 3 256 "512 768" # 16 - fi -} - # Main function main() { local mtp_mode=$1 @@ -279,139 +115,75 @@ main() { fi case $mode in - "all") - echo "Running all GPU configurations for $mtp_mode mode..." - if [[ "$mtp_mode" == "mtp=off" ]]; then - run_4_gpus_mtp0 - run_8_gpus_mtp0 - run_16_gpus_mtp0 - run_32_gpus_mtp0 - else - run_4_gpus_mtp - run_8_gpus_mtp - run_16_gpus_mtp - run_32_gpus_mtp - fi - ;; - "pareto") - # 1k/1k - export ISL=1024 - export OSL=1024 - export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608 - - if [[ "$mtp_mode" == "mtp=off" ]]; then - # 1k/1k mtp=off - run_single 1 4 8 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 141" - run_single 1 1 32 32 32 true "0.7" 0 0 "1075" - run_single 1 1 16 64 64 true "0.75" 0 0 "1075" - run_single 2 1 16 256 256 true "0.75" 0 0 "2048 4300" - run_single 1 1 8 512 512 true "0.8" 0 0 "4300" - - else - # 1k/1k mtp=on - run_single 1 4 8 32 128 false "0.9" 3 0 "1 2 4 8 16 36" - run_single 1 1 16 64 256 true "0.7" 3 0 "512 1075" - run_single 2 1 16 128 256 true "0.7" 1 0 "2150" - run_single 1 1 32 16 64 true "0.6" 3 0 "512" - run_single 1 1 8 256 512 true "0.8" 1 0 "2252" - fi - - # 8k/1k - export ISL=8192 - export OSL=1024 - export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448 - - if [[ "$mtp_mode" == "mtp=off" ]]; then - # 8k/1k mtp=off - run_single 1 3 8 32 32 false "0.9" 0 0 "1 2 4 8 16 34" - run_single 4 1 32 16 16 true "0.7" 0 0 "256 538" - run_single 7 1 32 32 32 true "0.7" 0 0 "1075" # remove if need 5 cofigs - run_single 6 1 16 64 64 true "0.75" 0 0 "1075" - run_single 8 1 16 128 128 true "0.75" 0 0 "2150" - run_single 5 1 8 256 256 true "0.8" 0 0 "2150" - else - # 8k/1k mtp=on - run_single 1 3 8 16 64 false "0.9" 3 0 "1 2 4 8 18" - run_single 5 1 32 8 32 true "0.7" 3 0 "128 269" - run_single 8 1 32 16 64 true "0.7" 3 0 "538" - run_single 6 1 16 32 128 true "0.75" 3 0 "538" # remove if need 5 configs - run_single 8 1 16 64 256 true "0.75" 2 0 "1075" - run_single 5 1 8 128 256 true "0.8" 1 0 "1075" # remove if need 5 configs - run_single 6 1 8 256 512 true "0.8" 1 0 "2150" - fi - ;; - "4GPU") - echo "Running 4 GPUs combinations for $mtp_mode mode..." - if [[ "$mtp_mode" == "mtp=off" ]]; then - run_4_gpus_mtp0 - else - run_4_gpus_mtp - fi - ;; - "8GPU") - echo "Running 8 GPUs combinations for $mtp_mode mode..." - if [[ "$mtp_mode" == "mtp=off" ]]; then - run_8_gpus_mtp0 - else - run_8_gpus_mtp - fi - ;; - "16GPU") - echo "Running 16 GPUs combinations for $mtp_mode mode..." - if [[ "$mtp_mode" == "mtp=off" ]]; then - run_16_gpus_mtp0 - else - run_16_gpus_mtp - fi - ;; - "32GPU") - echo "Running 32 GPUs combinations for $mtp_mode mode..." - if [[ "$mtp_mode" == "mtp=off" ]]; then - run_32_gpus_mtp0 - else - run_32_gpus_mtp - fi - ;; "tep") - if [ $# -ne 11 ]; then - echo "Error: TEP mode requires 11 additional parameters (including mtp_mode)" + if [ $# -ne 14 ]; then + echo "Error: TEP mode requires 14 additional parameters (including mtp_mode)" usage fi local ctx_num=$3 - local gen_num=$4 - local gen_tp_size=$5 - local gen_batch_size=$6 - local gen_max_num_tokens=$7 - local gen_gpu_memory_fraction=$8 - local gen_mtp_size=$9 - local gen_eplb_num_slots=${10} - local gen_concurrency_list=${11} - - echo "Running TEP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\"" + local ctx_tp_size=$4 + local ctx_ep_size=$5 + local ctx_enable_attention_dp=$6 + local gen_num=$7 + local gen_tp_size=$8 + local gen_batch_size=$9 + local gen_max_num_tokens=${10} + local gen_gpu_memory_fraction=${11} + local gen_mtp_size=${12} + local gen_eplb_num_slots=${13} + local gen_concurrency_list=${14} + + echo "Running TEP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_ep_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\"" # TEP mode: Use false to disable attention dp - run_single $ctx_num $gen_num $gen_tp_size $gen_batch_size $gen_max_num_tokens false $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list" + run_single $ctx_num $ctx_tp_size $ctx_ep_size $ctx_enable_attention_dp $gen_num $gen_tp_size $gen_tp_size $gen_batch_size $gen_max_num_tokens false $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list" ;; "dep") - if [ $# -ne 11 ]; then - echo "Error: DEP mode requires 11 additional parameters (including mtp_mode)" + if [ $# -ne 14 ]; then + echo "Error: DEP mode requires 14 additional parameters (including mtp_mode)" usage fi local ctx_num=$3 - local gen_num=$4 - local gen_tp_size=$5 - local gen_batch_size=$6 - local gen_max_num_tokens=$7 - local gen_gpu_memory_fraction=$8 - local gen_mtp_size=$9 - local gen_eplb_num_slots=${10} - local gen_concurrency_list=${11} - - echo "Running DEP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\"" + local ctx_tp_size=$4 + local ctx_ep_size=$5 + local ctx_enable_attention_dp=$6 + local gen_num=$7 + local gen_tp_size=$8 + local gen_batch_size=$9 + local gen_max_num_tokens=${10} + local gen_gpu_memory_fraction=${11} + local gen_mtp_size=${12} + local gen_eplb_num_slots=${13} + local gen_concurrency_list=${14} + + echo "Running DEP mode ($mtp_mode) with ctx_num=$ctx_num, ctx_tp_size=$ctx_tp_size, ctx_enable_attention_dp=$ctx_enable_attention_dp, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_ep_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\"" + + run_single $ctx_num $ctx_tp_size $ctx_ep_size $ctx_enable_attention_dp $gen_num $gen_tp_size $gen_tp_size $gen_batch_size $gen_max_num_tokens true $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list" + ;; + "tp") + if [ $# -ne 14 ]; then + echo "Error: TP mode requires 14 additional parameters (including mtp_mode)" + usage + fi - run_single $ctx_num $gen_num $gen_tp_size $gen_batch_size $gen_max_num_tokens true $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list" + local ctx_num=$3 + local ctx_tp_size=$4 + local ctx_ep_size=$5 + local ctx_enable_attention_dp=$6 + local gen_num=$7 + local gen_tp_size=$8 + local gen_batch_size=$9 + local gen_max_num_tokens=${10} + local gen_gpu_memory_fraction=${11} + local gen_mtp_size=${12} + local gen_eplb_num_slots=${13} + local gen_concurrency_list=${14} + + echo "Running TP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_ep_size=1, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\"" + + run_single $ctx_num $ctx_tp_size $ctx_ep_size $ctx_enable_attention_dp $gen_num $gen_tp_size 1 $gen_batch_size $gen_max_num_tokens false $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list" ;; *) echo "Error: Unknown mode '$mode'"