initial tests

jthomson04 · jthomson04 · commit 95ac241625dd · 2025-11-27T13:51:18.000-08:00
Signed-off-by: jthomson04 <jwillthomson19@gmail.com>
diff --git a/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm b/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm
@@ -85,7 +85,6 @@ srun -l --container-name=${CONTAINER_NAME} \
             --num_ctx_servers ${num_ctx_servers} \
             --ctx_tp_size ${ctx_tp_size} \
             --ctx_ep_size ${ctx_ep_size} \
-            --ctx_enable_attention_dp ${ctx_enable_attention_dp} \
             --ctx_batch_size ${ctx_batch_size} \
             --ctx_max_num_tokens ${ctx_max_num_tokens} \
             --ctx_max_seq_len ${ctx_max_seq_len} \
@@ -180,7 +179,7 @@ for ((i=1; i<=DECODE_COUNT; i++)); do
       --ntasks $gen_tp_size \
       --oversubscribe \
       --overlap \
-      bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_decode_worker_${i}.log &
+      bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_decode_worker_${i}.log &
   echo "$!" >> "$PID_FILE"
 done
 
@@ -203,9 +202,9 @@ for ((i=1; i<=PREFILL_COUNT; i++)); do
         --mpi=pmix --overlap -w ${nodes[node_idx]} \
         --oversubscribe \
         --overlap \
-        --ntasks $(( tp_size < 4 ? tp_size : 4 )) \
+        --ntasks $(( ctx_tp_size < 4 ? ctx_tp_size : 4 )) \
         --nodes 1 \
-        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_prefill_worker_${i}.log &
+        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_prefill_worker_${i}.log &
   prefill_pids+=($!)
   echo "$!" >> "$PID_FILE"
 done
diff --git a/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py b/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py
@@ -15,17 +15,14 @@ class ModelType(Enum):
     """
     GPT_OSS = "gpt_oss"
     DSR1 = "dsr1"
-
-    def infer_model_type(self, model_path: str) -> str:
-        if "r1" in model_path.lower():
-            return self.DSR1
-        else:
-            return self.GPT_OSS
-
-CONFIG_MAPPING = {
-    ModelType.GPT_OSS: None,
-    ModelType.DSR1: generate_dsr1_config,
-}
+    
+def get_model_type(model_path: str) -> str:
+    if "r1" in model_path.lower():
+        print("Inferring DSR1-type model")
+        return ModelType.DSR1
+    else:
+        print("Inferring GPT-oss-type model")
+        return ModelType.GPT_OSS
 
 def generate_dsr1_config(    
     config_path: str,
@@ -89,7 +86,7 @@ def generate_dsr1_config(
         "max_seq_len": args.gen_max_seq_len,
         "cuda_graph_config": {
             "enable_padding": True,
-            "batch_sizes": args.gen_cuda_graph_batch_sizes,
+            "batch_sizes": gen_cuda_graph_batch_sizes,
         },
         "print_iter_log": True,
         "kv_cache_config": {
@@ -160,7 +157,7 @@ def generate_gpt_oss_config(
         768,
         1024,
         2048,
-        gen_batch_size,
+        args.gen_batch_size,
     ]
 
     gen_moe_backend = "TRTLLM"
@@ -210,7 +207,7 @@ def generate_gpt_oss_config(
         "max_seq_len": args.gen_max_seq_len,
         "cuda_graph_config": {
             "enable_padding": True,
-            "batch_sizes": args.gen_cuda_graph_batch_sizes,
+            "batch_sizes": gen_cuda_graph_batch_sizes,
         },
         "print_iter_log": True,
         "kv_cache_config": {
@@ -257,6 +254,11 @@ def generate_gpt_oss_config(
     
     return prefill_config, decode_config
 
+CONFIG_MAPPING = {
+    ModelType.GPT_OSS: generate_gpt_oss_config,
+    ModelType.DSR1: generate_dsr1_config,
+}
+
 def process_node_and_task() -> tuple[int, List[str], List[str]]:
     """
     Process SLURM node and task environment variables.
@@ -429,7 +431,7 @@ def gen_config_file(
         server_port: Server port
     """
 
-    model_type = ModelType.get_model_type(model_path)
+    model_type = get_model_type(model_path)
 
     prefill_config, decode_config = CONFIG_MAPPING[model_type](
         config_path,
@@ -471,12 +473,6 @@ def gen_config_file(
         required=True,
         help="Expert parallel size for context servers",
     )
-    parser.add_argument(
-        "--ctx_enable_attention_dp",
-        dest="ctx_enable_attention_dp",
-        action="store_true",
-        help="Enable attention DP for context servers",
-    )
     parser.add_argument(
         "--ctx_batch_size",
         type=int,
@@ -519,6 +515,12 @@ def gen_config_file(
         required=True,
         help="Tensor parallel size for generation servers",
     )
+    parser.add_argument(
+        "--gen_ep_size",
+        type=int,
+        required=True,
+        help="Expert parallel size for generation servers",
+    )
     parser.add_argument(
         "--gen_batch_size",
         type=int,
diff --git a/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh b/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
@@ -3,10 +3,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 config_file=$1
-ctx_gpus=$3
-model_name=$4
-model_path=$5
-disaggregation_mode=$6
+ctx_gpus=$2
+model_name=$3
+model_path=$4
+disaggregation_mode=$5
 unset UCX_TLS
 echo "config_file: ${config_file}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}"
 
diff --git a/components/backends/trtllm/performance_sweeps/submit_disagg.sh b/components/backends/trtllm/performance_sweeps/submit_disagg.sh
@@ -98,7 +98,7 @@ run_single() {
     total_nodes=$((ctx_num + gen_nodes))
     total_tasks=$((total_nodes * 4))
     set -x
-    sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
+    sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
     set +x
 }
 

Original file line number	Diff line number	Diff line change
`@@ -98,7 +98,7 @@ run_single() {`
`98`	`98`	`total_nodes=$((ctx_num + gen_nodes))`
`99`	`99`	`total_tasks=$((total_nodes * 4))`
`100`	`100`	`set -x`
`101`		`- sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}`
	`101`	`+ sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}`
`102`	`102`	`set +x`
`103`	`103`	`}`
`104`	`104`