diff --git a/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm b/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm
index b5650738a7..9a30b711cc 100755
--- a/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm
+++ b/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm
@@ -5,34 +5,38 @@ MULTI_ROUND="${MULTI_ROUND:-8}"
 
 # set MOUNT_DIR
 MOUNT_DIR="${MOUNT_DIR:-${PWD}}"
-CONTAINER_NAME=disaggr-test
-
+CONTAINER_NAME=disaggr-test-$(date +%s)-$$
 
 STREAMING=true
 CTX_GPU_FRAC=0.85
-CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-4608}
 
 num_ctx_servers=$1
 ctx_tp_size=$2
-ctx_batch_size=$3
-ctx_max_num_tokens=$4
-ctx_enable_attention_dp=$5
-num_gen_servers=$6
-gen_tp_size=$7
-gen_batch_size=$8
-gen_max_num_tokens=$9
-gen_enable_attention_dp=${10}
-gen_gpu_memory_fraction=${11}
-eplb_num_slots=${12}
-mtp_size=${13}
-concurrency_list=${14}
-gen_nodes=${15}
-kind=${16}
-model_path=${17}
-served_model_name=${18}
-image=${19}
-isl=${20}
-osl=${21}
+ctx_ep_size=$3
+ctx_enable_attention_dp=$4
+ctx_batch_size=$5
+ctx_max_num_tokens=$6
+num_gen_servers=$7
+gen_tp_size=$8
+gen_ep_size=$9
+gen_batch_size=${10}
+gen_max_num_tokens=${11}
+gen_enable_attention_dp=${12}
+gen_gpu_memory_fraction=${13}
+eplb_num_slots=${14}
+mtp_size=${15}
+concurrency_list=${16}
+gen_nodes=${17}
+kind=${18}
+model_path=${19}
+served_model_name=${20}
+image=${21}
+isl=${22}
+osl=${23}
+benchmark_kind=${24}
+ntasks_per_node=${25}
+
+CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-$((${isl} + ${osl} + 512))}
 
 ctx_max_seq_len=$((${isl} + 203))
 gen_max_seq_len=$((${isl} + ${osl} + 203))
@@ -44,7 +48,7 @@ set_clock_cmd="bash ${SCRIPTS_DIR}/set_clock.sh"
 mkdir -p ${LOG_DIR}
 echo "trying to submit job"
 
-sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_dep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
+sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_dp${gen_tp_size}_ep${gen_ep_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
 
 echo "concurrency_list: ${concurrency_list}"
 
@@ -53,11 +57,8 @@ gen_gpus=$((num_gen_servers * gen_tp_size))
 
 echo "enable_attention_dp: ${ctx_enable_attention_dp}, ${gen_enable_attention_dp}, gpu_memory_fraction: ${gen_gpu_memory_fraction}"
 
-enable_pdl=false
 if [ "${gen_enable_attention_dp}" = "false" ]; then
-    enable_pdl=true
-    echo "enable_pdl: ${enable_pdl}"
-    sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_tep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
+    sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_tp${gen_tp_size}_ep${gen_ep_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
 fi
 
 full_logdir=${sub_dir}
@@ -84,6 +85,7 @@ srun -l --container-name=${CONTAINER_NAME} \
             --model ${model_path} \
             --num_ctx_servers ${num_ctx_servers} \
             --ctx_tp_size ${ctx_tp_size} \
+            --ctx_ep_size ${ctx_ep_size} \
             --ctx_batch_size ${ctx_batch_size} \
             --ctx_max_num_tokens ${ctx_max_num_tokens} \
             --ctx_max_seq_len ${ctx_max_seq_len} \
@@ -91,6 +93,7 @@ srun -l --container-name=${CONTAINER_NAME} \
             --cache_transceiver_max_num_tokens ${CACHE_TRANSCEIVER_MAX_NUM_TOKENS} \
             --num_gen_servers ${num_gen_servers} \
             --gen_tp_size ${gen_tp_size} \
+            --gen_ep_size ${gen_ep_size} \
             --gen_batch_size ${gen_batch_size} \
             --gen_max_num_tokens ${gen_max_num_tokens} \
             --gen_max_seq_len ${gen_max_seq_len} \
@@ -176,8 +179,10 @@ for ((i=1; i<=DECODE_COUNT; i++)); do
       --nodes ${num_gen_nodes} \
       --ntasks $gen_tp_size \
       --oversubscribe \
+      --gpus-per-node $ntasks_per_node \
       --overlap \
-      bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_decode_worker_${i}.log &
+      -e UCX_NET_DEVICES,TRTLLM_UCX_INTERFACE \
+      bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' $gen_enable_attention_dp &> ${full_logdir}/output_decode_worker_${i}.log &
   echo "$!" >> "$PID_FILE"
 done
 
@@ -200,9 +205,11 @@ for ((i=1; i<=PREFILL_COUNT; i++)); do
         --mpi=pmix --overlap -w ${nodes[node_idx]} \
         --oversubscribe \
         --overlap \
-        --ntasks 4 \
+        --ntasks $(( ctx_tp_size < ntasks_per_node ? ctx_tp_size : ntasks_per_node )) \
+        --gpus-per-node $ntasks_per_node \
         --nodes 1 \
-        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_prefill_worker_${i}.log &
+        -e UCX_NET_DEVICES,TRTLLM_UCX_INTERFACE \
+        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' $ctx_enable_attention_dp &> ${full_logdir}/output_prefill_worker_${i}.log &
   prefill_pids+=($!)
   echo "$!" >> "$PID_FILE"
 done
@@ -214,7 +221,7 @@ srun -l --container-name=${CONTAINER_NAME} \
         --container-env HEAD_NODE_IP,HEAD_NODE,SCRIPTS_DIR \
         --mpi=pmix --overlap -N 1 -n 1 \
 	    -w ${nodes[0]} \
-        bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} > ${full_logdir}/bench.log 2>&1
+        bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} ${benchmark_kind} > ${full_logdir}/bench.log 2>&1
 
 
 # Cleanup will be handled by the EXIT trap
diff --git a/components/backends/trtllm/performance_sweeps/scripts/bench b/components/backends/trtllm/performance_sweeps/scripts/bench
index f3ea022a57..2a35585f0c 160000
--- a/components/backends/trtllm/performance_sweeps/scripts/bench
+++ b/components/backends/trtllm/performance_sweeps/scripts/bench
@@ -1 +1 @@
-Subproject commit f3ea022a5780de5d0babc5fffa53634e2023d28f
+Subproject commit 2a35585f0cb2c98d18934088c867f1ba52d373b4
diff --git a/components/backends/trtllm/performance_sweeps/scripts/bench.sh b/components/backends/trtllm/performance_sweeps/scripts/bench.sh
index ee9c08d632..581cf654d9 100755
--- a/components/backends/trtllm/performance_sweeps/scripts/bench.sh
+++ b/components/backends/trtllm/performance_sweeps/scripts/bench.sh
@@ -37,10 +37,11 @@ model_path=${9}
 isl=${10}
 osl=${11}
 kind=${12}
+benchmark_kind=${13}
 
-if [ "$#" -ne 12 ]; then
-    echo "Error: Expected 12 arguments, got $#"
-    echo "Usage: $0 <model> <multi_round> <num_gen_servers> <concurrency_list> <streaming> <log_path> <prefill_gpus> <decode_gpus> <model_path> <isl> <osl> <kind>"
+if [ "$#" -ne 13 ]; then
+    echo "Error: Expected 13 arguments, got $#"
+    echo "Usage: $0 <model> <multi_round> <num_gen_servers> <concurrency_list> <streaming> <log_path> <prefill_gpus> <decode_gpus> <model_path> <isl> <osl> <kind> <benchmark_kind>"
     exit 1
 fi
 
@@ -58,8 +59,12 @@ echo "  model_path: $model_path"
 echo "  isl: $isl"
 echo "  osl: $osl"
 echo "  kind: $kind"
+echo "  benchmark_kind: $benchmark_kind"
 
-
+if [[ "$benchmark_kind" != "sa" && "$benchmark_kind" != "aiperf" ]]; then
+    echo "Invalid benchmark kind! Expected 'sa' or 'aiperf'"
+    exit 1  # fail the step: an unsupported kind must not look like a successful run
+fi
 
 # check process id is not 0
 if [[ ${SLURM_PROCID} != "0" ]]; then
@@ -112,13 +117,13 @@ for ((i=1; i<=50; i++)); do
     # https://github.com/ai-dynamo/dynamo/pull/2683
     if [[ "$http_code" == "200" ]] && echo "$body" | grep -q '"status":"healthy"' && echo "$body" | grep -q '"endpoints":\[[^]]*"dyn://dynamo.tensorrt_llm.generate"'; then
         if [[ "$kind" == *disagg* ]]; then
-            if echo "$body" | grep -q '"tensorrt_llm_next"'; then
+            if echo "$body" | grep -q '"dyn://dynamo.prefill.generate"'; then
                 echo "Health check succeeded on attempt $i"
                 echo "$body"
                 failed=false
                 break
             else
-                echo "Attempt $i: tensorrt_llm_next key not found in etcd."
+                echo "Attempt $i: prefill generate endpoint not found in etcd."
             fi
         else
             echo "Health check succeeded on attempt $i"
@@ -150,7 +155,9 @@ curl -v  -w "%{http_code}" "${hostname}:${port}/v1/chat/completions" \
   "max_tokens": 30
 }'
 
-python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
+# aiperf already does a warmup
+if [[ "$benchmark_kind" == "sa" ]]; then
+    python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
         --served-model-name ${model} \
         --model ${model_path} \
         --dataset-name random \
@@ -166,6 +173,7 @@ python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
         --max-concurrency "1" \
         --host ${hostname} \
         --port ${port}
+fi
 
 mkdir -p ${log_path}/results
 echo "Starting benchmark..."
@@ -175,27 +183,55 @@ for concurrency in ${concurrency_list}; do
     num_prompts=$((concurrency * multi_round))
     echo "Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts"
     mkdir -p ${log_path}/concurrency_${concurrency}
-
-    python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
-        --served-model-name ${model} \
-        --model ${model_path} \
-        --dataset-name random \
-        --num-prompts "$num_prompts" \
-        --random-input-len ${isl} \
-        --random-output-len ${osl} \
-        --random-range-ratio 0.8 \
-        --use-chat-template \
-        --ignore-eos \
-        --use-chat-template \
-        --backend "dynamo" \
-        --endpoint "/v1/completions" \
-        --percentile-metrics ttft,tpot,itl,e2el \
-        --max-concurrency "$concurrency" \
-        --host ${hostname} \
-        --port ${port} \
-        --save-result \
-        --result-dir "${log_path}/results" \
-        --result-filename "results_concurrency_${original_concurrency}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}.json"
+    
+    if [[ "$benchmark_kind" == "sa" ]]; then
+        python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
+            --served-model-name ${model} \
+            --model ${model_path} \
+            --dataset-name random \
+            --num-prompts "$num_prompts" \
+            --random-input-len ${isl} \
+            --random-output-len ${osl} \
+            --random-range-ratio 0.8 \
+            --use-chat-template \
+            --ignore-eos \
+            --use-chat-template \
+            --backend "dynamo" \
+            --endpoint "/v1/completions" \
+            --percentile-metrics ttft,tpot,itl,e2el \
+            --max-concurrency "$concurrency" \
+            --host ${hostname} \
+            --port ${port} \
+            --save-result \
+            --result-dir "${log_path}/results" \
+            --result-filename "results_concurrency_${original_concurrency}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}.json"
+    else
+        aiperf profile \
+            --model ${model} \
+            --tokenizer ${model_path} \
+            --endpoint-type completions \
+            --endpoint /v1/completions \
+            --streaming \
+            --url ${hostname}:${port} \
+            --synthetic-input-tokens-mean ${isl} \
+            --synthetic-input-tokens-stddev 0 \
+            --output-tokens-mean ${osl} \
+            --output-tokens-stddev 0 \
+            --extra-inputs max_tokens:${osl} \
+            --extra-inputs min_tokens:${osl} \
+            --extra-inputs ignore_eos:true \
+            --extra-inputs '{"nvext":{"ignore_eos":true}}' \
+            --concurrency ${concurrency} \
+            --request-count ${num_prompts} \
+            --warmup-request-count $((concurrency * 2)) \
+            --num-dataset-entries ${num_prompts} \
+            --random-seed 100 \
+            --artifact-dir "${log_path}/results/concurrency_${original_concurrency}" \
+            --ui simple \
+            -v \
+            -H 'Authorization: Bearer NOT USED' \
+            -H 'Accept: text/event-stream'
+    fi
 
     echo "Benchmark with concurrency ${concurrency} done"
 done
diff --git a/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py b/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py
index 8c256e82dd..06d2e9a24f 100644
--- a/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py
+++ b/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py
@@ -2,12 +2,262 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import argparse
+from enum import Enum
 import os
 import re
 from typing import Any, Dict, List
 
 import yaml
 
+class ModelType(Enum):
+    """
+    Model type.
+    """
+    GPT_OSS = "gpt_oss"
+    DSR1 = "dsr1"
+    
+def get_model_type(model_path: str) -> ModelType:  # model family is inferred from the path text only
+    if "r1" in model_path.lower():  # case-insensitive substring match
+        print("Inferring DSR1-type model")
+        return ModelType.DSR1
+    else:
+        print("Inferring GPT-oss-type model")
+        return ModelType.GPT_OSS
+
+def generate_dsr1_config(    
+    config_path: str,
+    decode_config_path: str,
+    instance_config_path: str,
+    args: argparse.Namespace
+):
+    gen_cuda_graph_batch_sizes = [
+        1,
+        2,
+        4,
+        8,
+        16,
+        32,
+        64,
+        128,
+        256,
+        384,
+        512,
+        768,
+        1024,
+        2048,
+        args.gen_batch_size,
+    ]
+
+    gen_moe_backend = "CUTLASS"
+    if args.gen_tp_size >= 16 and args.gen_enable_attention_dp:
+        gen_moe_backend = "WIDEEP"
+    if not args.gen_enable_attention_dp:
+        gen_moe_backend = "TRTLLM"
+
+    prefill_config: Dict[str, Any] = {
+        "max_batch_size": args.ctx_batch_size,
+        "max_num_tokens": args.ctx_max_num_tokens,
+        "max_seq_len": args.ctx_max_seq_len,
+        "tensor_parallel_size": args.ctx_tp_size,
+        "moe_expert_parallel_size": args.ctx_ep_size,
+        "enable_attention_dp": args.ctx_enable_attention_dp,
+        "pipeline_parallel_size": 1,
+        "cuda_graph_config": None,
+        "print_iter_log": True,
+        "disable_overlap_scheduler": True,
+        "kv_cache_config": {
+            "enable_block_reuse": False,
+            "free_gpu_memory_fraction": args.ctx_free_gpu_memory_fraction,
+            "dtype": "fp8",
+        },
+        "cache_transceiver_config": {
+            "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens,
+            "backend": "UCX",
+        },
+    }
+
+    decode_config: Dict[str, Any] = {
+        "tensor_parallel_size": args.gen_tp_size,
+        "moe_expert_parallel_size": args.gen_ep_size,  # honor --gen_ep_size, as prefill does with --ctx_ep_size
+        "enable_attention_dp": args.gen_enable_attention_dp,
+        "pipeline_parallel_size": 1,
+        "max_batch_size": args.gen_batch_size,
+        "max_num_tokens": args.gen_max_num_tokens,
+        "max_seq_len": args.gen_max_seq_len,
+        "cuda_graph_config": {
+            "enable_padding": True,
+            "batch_sizes": gen_cuda_graph_batch_sizes,
+        },
+        "print_iter_log": True,
+        "kv_cache_config": {
+            "enable_block_reuse": False,
+            "free_gpu_memory_fraction": args.gen_gpu_memory_fraction,
+            "dtype": "fp8",
+        },
+        "moe_config": {
+            "backend": gen_moe_backend,
+            "use_low_precision_moe_combine": True,
+        },
+        "cache_transceiver_config": {
+            "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens,
+            "backend": "UCX",
+        },
+        "stream_interval": 100,
+    }
+
+    if args.gen_tp_size == 8 and not args.gen_enable_attention_dp:
+        decode_config["allreduce_strategy"] = "MNNVL"
+
+    if args.eplb_num_slots > 0:
+        moe_load_balancer_file = os.path.join(
+            os.path.dirname(config_path), "moe_load_balancer.yaml"
+        )
+        # Ensure the directory exists before writing the file
+        os.makedirs(os.path.dirname(moe_load_balancer_file), exist_ok=True)
+        moe_load_balancer_config = {
+            "num_slots": args.eplb_num_slots,
+            "layer_updates_per_iter": 1,
+        }
+        with open(moe_load_balancer_file, "w") as f:
+            yaml.dump(
+                moe_load_balancer_config, f, default_flow_style=False, sort_keys=False
+            )
+        decode_config["moe_config"]["load_balancer"] = moe_load_balancer_file
+
+    if args.mtp_size > 0:
+        prefill_config["speculative_config"] = {
+            "decoding_type": "MTP",
+            "num_nextn_predict_layers": args.mtp_size,
+        }
+        decode_config["speculative_config"] = {
+            "decoding_type": "MTP",
+            "num_nextn_predict_layers": args.mtp_size,
+        }
+    
+    return prefill_config, decode_config
+
+def generate_gpt_oss_config(
+    config_path: str,
+    decode_config_path: str,
+    instance_config_path: str,
+    args: argparse.Namespace
+):
+    gen_cuda_graph_batch_sizes = [
+        1,
+        2,
+        4,
+        8,
+        16,
+        32,
+        64,
+        128,
+        256,
+        384,
+        512,
+        768,
+        1024,
+        2048,
+        args.gen_batch_size,
+    ]
+
+    gen_moe_backend = "TRTLLM"
+
+    prefill_config: Dict[str, Any] = {
+        "max_batch_size": args.ctx_batch_size,
+        "max_num_tokens": args.ctx_max_num_tokens,
+        "max_seq_len": args.ctx_max_seq_len,
+        "tensor_parallel_size": args.ctx_tp_size,
+        "moe_expert_parallel_size": args.ctx_ep_size,
+        "enable_attention_dp": args.ctx_enable_attention_dp,
+        "pipeline_parallel_size": 1,
+        # Single cuda_graph_config entry for prefill (padding on, max_batch_size 30).
+        "cuda_graph_config": {
+            "enable_padding": True,
+            "max_batch_size": 30,
+        },
+        "print_iter_log": True,
+        "disable_overlap_scheduler": True,
+        "kv_cache_config": {
+            "enable_block_reuse": False,
+            "free_gpu_memory_fraction": args.ctx_free_gpu_memory_fraction,
+            "dtype": "fp8",
+        },
+        "num_postprocess_workers": 4,
+        "cache_transceiver_config": {
+            "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens,
+            "backend": "UCX",
+        },
+        "moe_config": {
+            "backend": "TRTLLM"
+        }
+    }
+
+    decode_config: Dict[str, Any] = {
+        "allreduce_strategy": "AUTO",
+        "attention_dp_config": {
+            "enable_balance": True
+        },
+        "disable_overlap_scheduler": False,
+        "tensor_parallel_size": args.gen_tp_size,
+        "moe_expert_parallel_size": args.gen_ep_size,
+        "enable_attention_dp": args.gen_enable_attention_dp,
+        "pipeline_parallel_size": 1,
+        "max_batch_size": args.gen_batch_size,
+        "max_num_tokens": args.gen_max_num_tokens,
+        "max_seq_len": args.gen_max_seq_len,
+        "cuda_graph_config": {
+            "enable_padding": True,
+            "batch_sizes": gen_cuda_graph_batch_sizes,
+        },
+        "print_iter_log": True,
+        "kv_cache_config": {
+            "enable_block_reuse": False,
+            "free_gpu_memory_fraction": args.gen_gpu_memory_fraction,
+            "dtype": "fp8",
+        },
+        "moe_config": {
+            "backend": gen_moe_backend,
+        },
+        "cache_transceiver_config": {
+            "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens,
+            "backend": "UCX",
+        },
+        "stream_interval": 20,
+        "num_postprocess_workers": 4
+    }
+
+    if args.eplb_num_slots > 0:
+        moe_load_balancer_file = os.path.join(
+            os.path.dirname(config_path), "moe_load_balancer.yaml"
+        )
+        # Ensure the directory exists before writing the file
+        os.makedirs(os.path.dirname(moe_load_balancer_file), exist_ok=True)
+        moe_load_balancer_config = {
+            "num_slots": args.eplb_num_slots,
+            "layer_updates_per_iter": 1,
+        }
+        with open(moe_load_balancer_file, "w") as f:
+            yaml.dump(
+                moe_load_balancer_config, f, default_flow_style=False, sort_keys=False
+            )
+        decode_config["moe_config"]["load_balancer"] = moe_load_balancer_file
+
+    if args.mtp_size > 0:
+        prefill_config["speculative_config"] = {
+            "decoding_type": "MTP",
+            "num_nextn_predict_layers": args.mtp_size,
+        }
+        decode_config["speculative_config"] = {
+            "decoding_type": "MTP",
+            "num_nextn_predict_layers": args.mtp_size,
+        }
+    
+    return prefill_config, decode_config
+
+CONFIG_MAPPING = {
+    ModelType.GPT_OSS: generate_gpt_oss_config,
+    ModelType.DSR1: generate_dsr1_config,
+}
 
 def process_node_and_task() -> tuple[int, List[str], List[str]]:
     """
@@ -144,6 +394,7 @@ def gen_config_file(
     ctx_enable_attention_dp: bool,
     num_gen_servers: int,
     gen_tp_size: int,
+    gen_ep_size: int,
     gen_batch_size: int,
     gen_max_num_tokens: int,
     gen_max_seq_len: int,
@@ -153,7 +404,7 @@ def gen_config_file(
     mtp_size: int = 0,
     worker_start_port: int = 8001,
     server_port: int = 8000,
-    cache_transceiver_max_num_tokens: int = 4608,
+    cache_transceiver_max_num_tokens: int = 9216,
 ) -> None:
     """
     Generate configuration YAML file for disaggregated inference.
@@ -170,6 +421,7 @@ def gen_config_file(
         ctx_enable_attention_dp: Enable attention DP for context servers
         num_gen_servers: Number of generation servers
         gen_tp_size: Tensor parallel size for generation servers
+        gen_ep_size: Expert parallel size for generation servers
         gen_batch_size: Batch size for generation servers
         gen_max_num_tokens: Max number of tokens for generation servers
         gen_enable_attention_dp: Enable attention DP for generation servers
@@ -178,109 +430,15 @@ def gen_config_file(
         worker_start_port: Start port for workers
         server_port: Server port
     """
-    gen_cuda_graph_batch_sizes = [
-        1,
-        2,
-        4,
-        8,
-        16,
-        32,
-        64,
-        128,
-        256,
-        384,
-        512,
-        768,
-        1024,
-        2048,
-        gen_batch_size,
-    ]
-
-    gen_moe_backend = "CUTLASS"
-    if gen_tp_size >= 16 and gen_enable_attention_dp:
-        gen_moe_backend = "WIDEEP"
-    if not gen_enable_attention_dp:
-        gen_moe_backend = "TRTLLM"
-
-    prefill_config: Dict[str, Any] = {
-        "max_batch_size": ctx_batch_size,
-        "max_num_tokens": ctx_max_num_tokens,
-        "max_seq_len": ctx_max_seq_len,
-        "tensor_parallel_size": ctx_tp_size,
-        "moe_expert_parallel_size": ctx_tp_size,
-        "enable_attention_dp": ctx_enable_attention_dp,
-        "pipeline_parallel_size": 1,
-        "cuda_graph_config": None,
-        "print_iter_log": True,
-        "disable_overlap_scheduler": True,
-        "kv_cache_config": {
-            "enable_block_reuse": False,
-            "free_gpu_memory_fraction": ctx_free_gpu_memory_fraction,
-            "dtype": "fp8",
-        },
-        "cache_transceiver_config": {
-            "max_tokens_in_buffer": cache_transceiver_max_num_tokens,
-            "backend": "DEFAULT",
-        },
-    }
-
-    decode_config: Dict[str, Any] = {
-        "tensor_parallel_size": gen_tp_size,
-        "moe_expert_parallel_size": gen_tp_size,
-        "enable_attention_dp": gen_enable_attention_dp,
-        "pipeline_parallel_size": 1,
-        "max_batch_size": gen_batch_size,
-        "max_num_tokens": gen_max_num_tokens,
-        "max_seq_len": gen_max_seq_len,
-        "cuda_graph_config": {
-            "enable_padding": True,
-            "batch_sizes": gen_cuda_graph_batch_sizes,
-        },
-        "print_iter_log": True,
-        "kv_cache_config": {
-            "enable_block_reuse": False,
-            "free_gpu_memory_fraction": gen_gpu_memory_fraction,
-            "dtype": "fp8",
-        },
-        "moe_config": {
-            "backend": gen_moe_backend,
-            "use_low_precision_moe_combine": True,
-        },
-        "cache_transceiver_config": {
-            "max_tokens_in_buffer": cache_transceiver_max_num_tokens,
-            "backend": "DEFAULT",
-        },
-        "stream_interval": 20,
-    }
-
-    if gen_tp_size == 8 and not gen_enable_attention_dp:
-        decode_config["allreduce_strategy"] = "MNNVL"
 
-    if eplb_num_slots > 0:
-        moe_load_balancer_file = os.path.join(
-            os.path.dirname(config_path), "moe_load_balancer.yaml"
-        )
-        # Ensure the directory exists before writing the file
-        os.makedirs(os.path.dirname(moe_load_balancer_file), exist_ok=True)
-        moe_load_balancer_config = {
-            "num_slots": eplb_num_slots,
-            "layer_updates_per_iter": 1,
-        }
-        with open(moe_load_balancer_file, "w") as f:
-            yaml.dump(
-                moe_load_balancer_config, f, default_flow_style=False, sort_keys=False
-            )
-        decode_config["moe_config"]["load_balancer"] = moe_load_balancer_file
+    model_type = get_model_type(model_path)
 
-    if mtp_size > 0:
-        prefill_config["speculative_config"] = {
-            "decoding_type": "MTP",
-            "num_nextn_predict_layers": mtp_size,
-        }
-        decode_config["speculative_config"] = {
-            "decoding_type": "MTP",
-            "num_nextn_predict_layers": mtp_size,
-        }
+    prefill_config, decode_config = CONFIG_MAPPING[model_type](
+        config_path,
+        decode_config_path,
+        instance_config_path,
+        args  # NOTE(review): not among the parameters visible here; presumably the module-level argparse namespace — confirm
+    )
 
     counts = {"prefill_count": num_ctx_servers, "decode_count": num_gen_servers}
 
@@ -309,6 +467,12 @@ def gen_config_file(
         required=True,
         help="Tensor parallel size for context servers",
     )
+    parser.add_argument(
+        "--ctx_ep_size",
+        type=int,
+        required=True,
+        help="Expert parallel size for context servers",
+    )
     parser.add_argument(
         "--ctx_batch_size",
         type=int,
@@ -351,6 +515,12 @@ def gen_config_file(
         required=True,
         help="Tensor parallel size for generation servers",
     )
+    parser.add_argument(
+        "--gen_ep_size",
+        type=int,
+        required=True,
+        help="Expert parallel size for generation servers",
+    )
     parser.add_argument(
         "--gen_batch_size",
         type=int,
diff --git a/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh b/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
index 305fd157ec..3ca750119b 100755
--- a/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
+++ b/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
@@ -3,13 +3,14 @@
 # SPDX-License-Identifier: Apache-2.0
 
 config_file=$1
-enable_pdl=$2
-ctx_gpus=$3
-model_name=$4
-model_path=$5
-disaggregation_mode=$6
+ctx_gpus=$2
+model_name=$3
+model_path=$4
+disaggregation_mode=$5
+is_dep=$6
+
 unset UCX_TLS
-echo "config_file: ${config_file}, enable_pdl: ${enable_pdl}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}"
+echo "config_file: ${config_file}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}, is_dep: ${is_dep}"
 
 # Read configuration values from the YAML config file
 if [ ! -f "${config_file}" ]; then
@@ -40,21 +41,22 @@ echo "  max_batch_size: ${max_batch_size}"
 echo "  max_seq_len: ${max_seq_len}"
 
 export TLLM_LOG_LEVEL=INFO
-# NOTE: This var is default behavior in recent trtllm commits, and can
-# be removed. Keeping it here in case the script is ran with older commits.
-export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1
-# NOTE: This var was replaced with an LLM API / yaml engine config field
-# "moe_backend.use_low_precision_combine: true" in recent trtllm commits, and
-# can be removed. Keeping it here in case the script is ran with older commits.
-export TRTLLM_MOE_USE_LOW_PRECISION_COMBINE=1
+export TRTLLM_ENABLE_PDL=1
 
-if [ "${enable_pdl}" = "true" ]; then
-    export TRTLLM_ENABLE_PDL=1
-fi
+export TRTLLM_SERVER_DISABLE_GC=1
+export TRTLLM_WORKER_DISABLE_GC=1
+export NCCL_GRAPH_MIXING_SUPPORT=0
 
-# NOTE: Set (or unset) these depending on what cluster you're using
-export TRTLLM_UCX_INTERFACE=enP6p9s0np0
-export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_3:1,mlx5_4:1,enP6p9s0np0
+if [[ "${model_path,,}" != *r1* ]]; then
+    echo "Inferred gpt-oss style model. Setting OVERRIDE_QUANT_ALGO to W4A8_MXFP4_MXFP8"
+    export OVERRIDE_QUANT_ALGO=W4A8_MXFP4_MXFP8
+    if [ "$is_dep" = "true" ]; then
+        echo "Using DEP with gpt-oss. Setting env vars."
+        export TRTLLM_MOE_ALLTOALL_BACKEND="mnnvlthroughput"
+        export TRTLLM_FORCE_ALLTOALL_METHOD="MNNVL"
+        export TRTLLM_MOE_A2A_WORKSPACE_MB="2048"
+    fi
+fi
 
 trtllm-llmapi-launch python3 -m dynamo.trtllm \
     --model-path ${model_path} \
diff --git a/components/backends/trtllm/performance_sweeps/submit_disagg.sh b/components/backends/trtllm/performance_sweeps/submit_disagg.sh
index 5300cc9c27..a43cbb7c67 100755
--- a/components/backends/trtllm/performance_sweeps/submit_disagg.sh
+++ b/components/backends/trtllm/performance_sweeps/submit_disagg.sh
@@ -22,6 +22,7 @@ NTASKS_PER_NODE="${NTASKS_PER_NODE:-4}"
 
 ISL="${ISL:-8150}"
 OSL="${OSL:-1024}"
+BENCHMARK_KIND="${BENCHMARK_KIND:-sa}"
 
 # Build slurm_args step-by-step with validation and defaults
 slurm_args="--time=04:00:00"
@@ -77,196 +78,31 @@ usage() {
 # Run single task
 run_single() {
     local ctx_num=$1
-    local gen_num=$2
-    local gen_tp_size=$3
-    local gen_batch_size=$4
-    local gen_max_num_tokens=$5
-    local gen_enable_attention_dp=$6
-    local gen_gpu_memory_fraction=$7
-    local gen_mtp_size=$8
-    local gen_eplb_num_slots=$9
-    local gen_concurrency_list=${10}
+    local ctx_tp_size=$2
+    local ctx_ep_size=$3
+    local ctx_enable_attention_dp=$4
+    local gen_num=$5
+    local gen_tp_size=$6
+    local gen_ep_size=$7
+    local gen_batch_size=$8
+    local gen_max_num_tokens=$9
+    local gen_enable_attention_dp=${10}
+    local gen_gpu_memory_fraction=${11}
+    local gen_mtp_size=${12}  # order must match the callers in main(): mtp size first, then eplb slots
+    local gen_eplb_num_slots=${13}
+    local gen_concurrency_list=${14}
 
     # TODO: expose kind to the command line
     local kind="dynamo_disagg"
 
-    gen_nodes=$(((gen_tp_size + 3)/4 * gen_num))
+    gen_nodes=$(((gen_tp_size + NTASKS_PER_NODE - 1)/NTASKS_PER_NODE * gen_num))  # ceil(gen_tp_size / NTASKS_PER_NODE) nodes per gen server
     total_nodes=$((ctx_num + gen_nodes))
-    total_tasks=$((total_nodes * 4))
+    total_tasks=$((total_nodes * NTASKS_PER_NODE))  # NTASKS_PER_NODE tasks on every allocated node
     set -x
-    if (( ISL == OSL )); then
-        sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 4 4608 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
-    else
-        sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 1 8448 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
-    fi
+    sbatch --nodes=${total_nodes} --gpus-per-node ${NTASKS_PER_NODE} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL} ${BENCHMARK_KIND} ${NTASKS_PER_NODE}  # literals 30 / 20000 land in the slurm script's ctx_batch_size / ctx_max_num_tokens
     set +x
 }
 
-# MTP0 Configuration (gen_mtp_size=0)
-run_4_gpus_mtp0() {
-    echo "Running 4 GPUs MTP0 combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 5 4 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 128 192"
-        run_single 1 5 4 64 64 true "0.85" 0 0 "256 384"
-        run_single 1 4 4 128 128 true "0.85" 0 0 "512 768"
-        run_single 2 5 4 256 256 true "0.85" 0 0 "1024 1536"
-        run_single 1 2 4 512 512 true "0.85" 0 0 "2048 3072"
-        run_single 2 3 4 768 768 true "0.85" 0 0 "3072 4096"
-    else
-        run_single 1 5 4 16 16 false "0.9" 0 0 "1 2 4 8 16 24"
-        run_single 1 4 4 32 32 false "0.9" 0 0 "32 48"
-        run_single 2 5 4 64 64 false "0.9" 0 0 "64 96"
-        run_single 1 2 4 128 128 false "0.9" 0 0 "128 192"
-        run_single 1 1 4 64 64 true "0.8" 0 0 "256 384"
-        run_single 3 2 4 128 128 true "0.8" 0 0 "512 768"
-    fi
-}
-
-run_8_gpus_mtp0() {
-    echo "Running 8 GPUs MTP0 combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 4 8 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 128 192 256"
-        run_single 1 4 8 32 32 true "0.8" 0 0 "256 384"
-        run_single 1 3 8 64 64 true "0.8" 0 0 "512 768"
-        run_single 1 2 8 128 128 true "0.8" 0 0 "1024 1536"
-        run_single 1 1 8 256 256 true "0.8" 0 0 "2048 3072"
-        run_single 1 1 8 512 512 true "0.8" 0 0 "4096 6144"
-        run_single 3 2 8 768 768 true "0.8" 0 0 "6144 8192"
-        run_single 3 2 8 1024 1024 true "0.8" 0 0 "8192 12288"
-    else
-        run_single 1 4 8 16 16 false "0.9" 0 0 "1 2 4 8 16 24"
-        run_single 1 3 8 32 32 false "0.9" 0 0 "32 48"
-        run_single 1 2 8 64 64 false "0.9" 0 0 "64 96"
-        run_single 1 1 8 128 128 false "0.9" 0 0 "128 192"
-        run_single 3 2 8 32 32 true "0.8" 0 0 "256 384"
-        run_single 5 2 8 64 64 true "0.8" 0 0 "512 768"
-        run_single 4 1 8 128 128 true "0.8" 0 0 "1024 1536"
-        run_single 5 1 8 256 256 true "0.8" 0 0 "2048 3072"
-    fi
-}
-
-run_16_gpus_mtp0() {
-    echo "Running 16 GPUs MTP0 combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 1 16 64 64 true "0.75" 0 0 "16 32 64 128 256 512 1024 1536"
-        run_single 2 1 16 128 128 true "0.75" 0 256 "2048 3072"
-        run_single 2 1 16 256 256 true "0.75" 0 256 "4096 6144"
-        run_single 3 1 16 512 512 true "0.75" 0 256 "8192 12288"
-        run_single 3 1 16 768 768 true "0.75" 0 256 "12288 16384"
-        run_single 3 1 16 1024 1024 true "0.75" 0 288 "16384 20480"
-    else
-        run_single 1 1 16 8 8 true "0.8" 0 0 "16 32 64 128 192" # 5
-        run_single 2 1 16 16 16 true "0.8" 0 0 "256 384"        # 6
-        run_single 3 1 16 32 32 true "0.8" 0 0 "512 768"       # 7
-        run_single 6 1 16 64 64 true "0.8" 0 0 "1024 1536"     # 10
-        run_single 8 1 16 128 128 true "0.8" 0 256 "2048 3072"   # 12
-        run_single 10 1 16 256 256 true "0.8" 0 256 "4096 6144" # 14
-    fi
-}
-
-run_32_gpus_mtp0() {
-    echo "Running 32 GPUs MTP0 combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 1 32 32 32 true "0.7" 0 0 "32 64 128 256 512 1024 1536"
-        run_single 2 1 32 64 64 true "0.7" 0 256 "2048 3072"
-        run_single 3 1 32 128 128 true "0.7" 0 288 "4096 6144"
-        run_single 4 1 32 256 256 true "0.7" 0 288 "8192 12288"
-        run_single 5 1 32 512 512 true "0.7" 0 288 "16384 20480"
-    else
-        run_single 1 1 32 4 4 true "0.7" 0 0 "32 64 128 192"  # 9
-        run_single 2 1 32 8 8 true "0.7" 0 0 "256 384"          # 10
-        run_single 4 1 32 16 16 true "0.7" 0 0 "512 768"       # 12
-        run_single 7 1 32 32 32 true "0.7" 0 0 "1024 1536"     # 15
-    fi
-}
-
-# MTP Configuration (gen_mtp_size=1,2,3)
-run_4_gpus_mtp() {
-    echo "Running 4 GPUs MTP combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 5 4 32 128 false "0.9" 3 0 "1 2 4 8 16 32 48"
-        run_single 1 5 4 32 128 true "0.9" 3 0 "64 128 192"
-        run_single 1 4 4 64 256 true "0.9" 3 0 "256 384"
-        run_single 1 3 4 128 512 true "0.9" 3 0 "512 768"
-        run_single 1 2 4 256 768 true "0.9" 2 0 "1024 1536"
-        run_single 2 3 4 512 1024 true "0.9" 1 0 "2048 3072"
-        run_single 1 1 4 768 1536 true "0.9" 1 0 "3072 4096"
-    else
-        run_single 1 5 4 8 32 false "0.9" 3 0 "1 2 4 8 12"
-        run_single 1 4 4 16 64 false "0.9" 3 0 "16 24"
-        run_single 1 3 4 32 128 false "0.9" 3 0 "32 48"
-        run_single 2 3 4 16 64 true "0.8" 3 0 "64 96"
-        run_single 1 1 4 32 128 true "0.8" 3 0 "128 192"
-        run_single 2 1 4 64 256 true "0.8" 2 0 "256 384"
-        run_single 5 2 4 128 512 true "0.8" 1 0 "512 768"
-    fi
-}
-
-run_8_gpus_mtp() {
-    echo "Running 8 GPUs MTP combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 4 8 32 128 false "0.9" 3 0 "1 2 4 8 16 32 48"
-        run_single 1 4 8 16 64 true "0.8" 3 0 "64 128 192"
-        run_single 1 3 8 32 128 true "0.8" 3 0 "256 384"
-        run_single 1 2 8 64 256 true "0.8" 3 0 "512 768"
-        run_single 1 1 8 128 512 true "0.8" 3 0 "1024 1536"
-        run_single 1 1 8 256 512 true "0.8" 1 0 "2048 3072"
-        run_single 3 2 8 512 1024 true "0.8" 1 0 "4096 6144"
-        run_single 3 2 8 768 1536 true "0.8" 1 0 "6144 8192"
-        run_single 3 2 8 1024 2048 true "0.8" 1 0 "8192 12288"
-    else
-        run_single 1 4 8 8 32 false "0.9" 3 0 "1 2 4 8 12"
-        run_single 1 3 8 16 64 false "0.9" 3 0 "16 24"
-        run_single 1 2 8 32 128 false "0.9" 3 0 "32 48"
-        run_single 1 1 8 8 32 true "0.8" 3 0 "64 96"
-        run_single 3 2 8 16 64 true "0.8" 3 0 "128 192"
-        run_single 5 2 8 32 128 true "0.8" 3 0 "256 384"
-        run_single 7 2 8 64 256 true "0.8" 2 0 "512 768"
-        run_single 5 1 8 128 256 true "0.8" 1 0 "1024 1536"
-        run_single 6 1 8 256 512 true "0.8" 1 0 "2048 3072"
-    fi
-}
-
-run_16_gpus_mtp() {
-    echo "Running 16 GPUs MTP combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 1 16 32 128 true "0.7" 3 0 "16 32 64 128 256 512 768"
-        run_single 1 1 16 64 256 true "0.7" 3 256 "1024 1536"
-        run_single 2 1 16 128 256 true "0.7" 1 288 "2048 3072"
-        run_single 2 1 16 256 512 true "0.7" 1 288 "4096 6144"
-        run_single 3 1 16 512 1024 true "0.7" 1 288 "8192 12288"
-        run_single 3 1 16 768 1536 true "0.7" 1 288 "12288 16384"
-        run_single 3 1 16 1024 1024 true "0.75" 0 288 "16384 20480"
-    else
-        run_single 1 1 16 4 16 true "0.8" 3 0 "16 32 64 96" # 5
-        run_single 2 1 16 8 32 true "0.8" 3 0 "128 192"       # 6
-        run_single 4 1 16 16 64 true "0.8" 3 0 "256 384"      # 8
-        run_single 6 1 16 32 128 true "0.8" 3 0 "512 768"    # 10
-        run_single 8 1 16 64 256 true "0.8" 2 256 "1024 1536" # 13
-        run_single 10 1 16 128 256 true "0.8" 1 256 "2048 3072" # 15
-        run_single 12 1 16 256 512 true "0.8" 1 256 "4096 6144" # 16
-    fi
-
-}
-
-run_32_gpus_mtp() {
-    echo "Running 32 GPUs MTP combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 1 32 16 64 true "0.6" 3 0 "32 64 128 256 512 768"
-        run_single 2 1 32 32 128 true "0.6" 3 288 "1024 1536"
-        run_single 3 1 32 64 256 true "0.6" 3 288 "2048 3072"
-        run_single 3 1 32 128 256 true "0.6" 1 288 "4096 6144"
-        run_single 4 1 32 256 512 true "0.6" 1 288 "8192 12288"
-        run_single 5 1 32 512 1024 true "0.6" 1 288 "16384 20480"
-    else
-        run_single 1 1 32 1 4 true "0.7" 3 0 "32 48" # 9
-        run_single 2 1 32 2 8 true "0.7" 3 0 "64 96" # 10
-        run_single 3 1 32 4 16 true "0.7" 3 0 "128 192" # 11
-        run_single 5 1 32 8 32 true "0.7" 3 0 "256 384" # 13
-        run_single 8 1 32 16 64 true "0.7" 3 256 "512 768" # 16
-    fi
-}
-
 # Main function
 main() {
     local mtp_mode=$1
@@ -279,139 +115,75 @@ main() {
     fi
 
     case $mode in
-        "all")
-            echo "Running all GPU configurations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_4_gpus_mtp0
-                run_8_gpus_mtp0
-                run_16_gpus_mtp0
-                run_32_gpus_mtp0
-            else
-                run_4_gpus_mtp
-                run_8_gpus_mtp
-                run_16_gpus_mtp
-                run_32_gpus_mtp
-            fi
-            ;;
-        "pareto")
-            # 1k/1k
-            export ISL=1024
-            export OSL=1024
-            export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608
-
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                # 1k/1k mtp=off
-                run_single 1 4 8 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 141"
-                run_single 1 1 32 32 32 true "0.7" 0 0 "1075"
-                run_single 1 1 16 64 64 true "0.75" 0 0 "1075"
-                run_single 2 1 16 256 256 true "0.75" 0 0 "2048 4300"
-                run_single 1 1 8 512 512 true "0.8" 0 0 "4300"
-
-            else
-                # 1k/1k mtp=on
-                run_single 1 4 8 32 128 false "0.9" 3 0 "1 2 4 8 16 36"
-                run_single 1 1 16 64 256 true "0.7" 3 0 "512 1075"
-                run_single 2 1 16 128 256 true "0.7" 1 0 "2150"
-                run_single 1 1 32 16 64 true "0.6" 3 0 "512"
-                run_single 1 1 8 256 512 true "0.8" 1 0 "2252"
-            fi
-
-            # 8k/1k
-            export ISL=8192
-            export OSL=1024
-            export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448
-
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                # 8k/1k mtp=off
-                run_single 1 3 8 32 32 false "0.9" 0 0 "1 2 4 8 16 34"
-                run_single 4 1 32 16 16 true "0.7" 0 0 "256 538"
-                run_single 7 1 32 32 32 true "0.7" 0 0 "1075" # remove if need 5 cofigs
-                run_single 6 1 16 64 64 true "0.75" 0 0 "1075"
-                run_single 8 1 16 128 128 true "0.75" 0 0 "2150"
-                run_single 5 1 8 256 256 true "0.8" 0 0 "2150"
-            else
-                # 8k/1k mtp=on
-                run_single 1 3 8 16 64 false "0.9" 3 0 "1 2 4 8 18"
-                run_single 5 1 32 8 32 true "0.7" 3 0 "128 269"
-                run_single 8 1 32 16 64 true "0.7" 3 0 "538"
-                run_single 6 1 16 32 128 true "0.75" 3 0 "538" # remove if need 5 configs
-                run_single 8 1 16 64 256 true "0.75" 2 0 "1075"
-                run_single 5 1 8 128 256 true "0.8" 1 0 "1075" # remove if need 5 configs
-                run_single 6 1 8 256 512 true "0.8" 1 0 "2150"
-            fi
-            ;;
-        "4GPU")
-            echo "Running 4 GPUs combinations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_4_gpus_mtp0
-            else
-                run_4_gpus_mtp
-            fi
-            ;;
-        "8GPU")
-            echo "Running 8 GPUs combinations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_8_gpus_mtp0
-            else
-                run_8_gpus_mtp
-            fi
-            ;;
-        "16GPU")
-            echo "Running 16 GPUs combinations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_16_gpus_mtp0
-            else
-                run_16_gpus_mtp
-            fi
-            ;;
-        "32GPU")
-            echo "Running 32 GPUs combinations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_32_gpus_mtp0
-            else
-                run_32_gpus_mtp
-            fi
-            ;;
         "tep")
-            if [ $# -ne 11 ]; then
-                echo "Error: TEP mode requires 11 additional parameters (including mtp_mode)"
+            if [ $# -ne 14 ]; then
+                echo "Error: TEP mode requires 14 additional parameters (including mtp_mode)"
                 usage
             fi
 
             local ctx_num=$3
-            local gen_num=$4
-            local gen_tp_size=$5
-            local gen_batch_size=$6
-            local gen_max_num_tokens=$7
-            local gen_gpu_memory_fraction=$8
-            local gen_mtp_size=$9
-            local gen_eplb_num_slots=${10}
-            local gen_concurrency_list=${11}
-
-            echo "Running TEP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
+            local ctx_tp_size=$4
+            local ctx_ep_size=$5
+            local ctx_enable_attention_dp=$6
+            local gen_num=$7
+            local gen_tp_size=$8
+            local gen_batch_size=$9
+            local gen_max_num_tokens=${10}
+            local gen_gpu_memory_fraction=${11}
+            local gen_mtp_size=${12}
+            local gen_eplb_num_slots=${13}
+            local gen_concurrency_list=${14}
+
+            echo "Running TEP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_ep_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
 
             # TEP mode: Use false to disable attention dp
-            run_single $ctx_num $gen_num $gen_tp_size $gen_batch_size $gen_max_num_tokens false $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
+            run_single $ctx_num $ctx_tp_size $ctx_ep_size $ctx_enable_attention_dp $gen_num $gen_tp_size $gen_tp_size $gen_batch_size $gen_max_num_tokens false $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
             ;;
         "dep")
-            if [ $# -ne 11 ]; then
-                echo "Error: DEP mode requires 11 additional parameters (including mtp_mode)"
+            if [ $# -ne 14 ]; then
+                echo "Error: DEP mode requires 14 additional parameters (including mtp_mode)"
                 usage
             fi
 
             local ctx_num=$3
-            local gen_num=$4
-            local gen_tp_size=$5
-            local gen_batch_size=$6
-            local gen_max_num_tokens=$7
-            local gen_gpu_memory_fraction=$8
-            local gen_mtp_size=$9
-            local gen_eplb_num_slots=${10}
-            local gen_concurrency_list=${11}
-
-            echo "Running DEP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
+            local ctx_tp_size=$4
+            local ctx_ep_size=$5
+            local ctx_enable_attention_dp=$6
+            local gen_num=$7
+            local gen_tp_size=$8
+            local gen_batch_size=$9
+            local gen_max_num_tokens=${10}
+            local gen_gpu_memory_fraction=${11}
+            local gen_mtp_size=${12}
+            local gen_eplb_num_slots=${13}
+            local gen_concurrency_list=${14}
+
+            echo "Running DEP mode ($mtp_mode) with ctx_num=$ctx_num, ctx_tp_size=$ctx_tp_size, ctx_enable_attention_dp=$ctx_enable_attention_dp, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_ep_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
+
+            run_single $ctx_num $ctx_tp_size $ctx_ep_size $ctx_enable_attention_dp $gen_num $gen_tp_size $gen_tp_size $gen_batch_size $gen_max_num_tokens true $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
+            ;;
+        "tp")
+            if [ $# -ne 14 ]; then
+                echo "Error: TP mode requires 14 additional parameters (including mtp_mode)"
+                usage
+            fi
 
-            run_single $ctx_num $gen_num $gen_tp_size $gen_batch_size $gen_max_num_tokens true $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
+            local ctx_num=$3
+            local ctx_tp_size=$4
+            local ctx_ep_size=$5
+            local ctx_enable_attention_dp=$6
+            local gen_num=$7
+            local gen_tp_size=$8
+            local gen_batch_size=$9
+            local gen_max_num_tokens=${10}
+            local gen_gpu_memory_fraction=${11}
+            local gen_mtp_size=${12}
+            local gen_eplb_num_slots=${13}
+            local gen_concurrency_list=${14}
+
+            echo "Running TP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_ep_size=1, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
+
+            run_single $ctx_num $ctx_tp_size $ctx_ep_size $ctx_enable_attention_dp $gen_num $gen_tp_size 1 $gen_batch_size $gen_max_num_tokens false $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
             ;;
         *)
             echo "Error: Unknown mode '$mode'"