Skip to content

Commit 9fa8125

Browse files
authored
chore: trtllm use unified frontend (#4097)
Signed-off-by: PeaBrane <[email protected]>
1 parent 427ca9a commit 9fa8125

File tree

28 files changed

+158
-646
lines changed

28 files changed

+158
-646
lines changed

benchmarks/router/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,13 @@ This directory contains scripts for benchmarking the Dynamo router with prefix c
1717
- `matplotlib` for plotting results
1818
- `data-generator` package (install with `pip install -e ./benchmarks` from repo root)
1919

20+
> [!NOTE]
21+
> If running outside a container, set `DYNAMO_HOME` to the root path of your Dynamo repository:
22+
> ```bash
23+
> export DYNAMO_HOME=/path/to/dynamo
24+
> ```
25+
> When running in a container, this defaults to `/workspace`.
26+
2027
### Setting up etcd and NATS
2128
2229
This benchmark requires etcd and NATS. To quickly set them up, run:

benchmarks/router/run_engines.sh

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ else
225225

226226
if [ "$USE_TRTLLM" = true ]; then
227227
echo "[$MODE_CAPITALIZED Worker-$i] Using GPUs: $GPU_DEVICES"
228-
# Run TensorRT-LLM engine with trtllm-llmapi-launch for proper initialization
228+
# Run TensorRT-LLM engine
229229
TRTLLM_ARGS=()
230230
TRTLLM_ARGS+=("--model-path" "$MODEL_PATH")
231231
TRTLLM_ARGS+=("--tensor-parallel-size" "$TENSOR_PARALLEL_SIZE")
@@ -234,7 +234,7 @@ else
234234
fi
235235
TRTLLM_ARGS+=("${EXTRA_ARGS[@]}")
236236

237-
exec env CUDA_VISIBLE_DEVICES=$GPU_DEVICES trtllm-llmapi-launch python -m dynamo.trtllm \
237+
exec env CUDA_VISIBLE_DEVICES=$GPU_DEVICES trtllm-llmapi-launch python3 -m dynamo.trtllm \
238238
"${TRTLLM_ARGS[@]}"
239239
else
240240
echo "[$MODE_CAPITALIZED Worker-$i] Using GPUs: $GPU_DEVICES"
@@ -252,12 +252,18 @@ else
252252
fi
253253
VLLM_ARGS+=("${EXTRA_ARGS[@]}")
254254

255-
exec env PYTHONHASHSEED=0 CUDA_VISIBLE_DEVICES=$GPU_DEVICES python -m dynamo.vllm \
255+
exec env PYTHONHASHSEED=0 CUDA_VISIBLE_DEVICES=$GPU_DEVICES python3 -m dynamo.vllm \
256256
"${VLLM_ARGS[@]}"
257257
fi
258258
} &
259259
PIDS+=($!)
260260
echo "Started $MODE worker $i (PID: $!)"
261+
262+
# Add delay between TensorRT-LLM worker launches to avoid MPI initialization conflicts
263+
if [ "$USE_TRTLLM" = true ] && [ "$i" -lt "$NUM_WORKERS" ]; then
264+
echo "Waiting 2 seconds before launching next TensorRT-LLM worker..."
265+
sleep 2
266+
fi
261267
done
262268
fi
263269

components/src/dynamo/planner/defaults.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -119,16 +119,14 @@ class SGLangComponentName:
119119

120120

121121
class TrtllmComponentName:
122-
# Note: Planner only supports DECODE_FIRST strategy in TRT-LLM:
123-
# - Decode worker is the first worker (tensorrt_llm)
124-
# - Prefill worker is the next worker (tensorrt_llm_next)
122+
# Unified frontend architecture (consistent with vLLM/SGLang):
123+
# - Prefill workers use "prefill" component
124+
# - Decode workers use "tensorrt_llm" component
125125
prefill_worker_k8s_name = "TRTLLMPrefillWorker"
126-
prefill_worker_component_name = (
127-
"tensorrt_llm_next" # Prefill is "next" with DECODE_FIRST
128-
)
126+
prefill_worker_component_name = "prefill"
129127
prefill_worker_endpoint = "generate"
130128
decode_worker_k8s_name = "TRTLLMDecodeWorker"
131-
decode_worker_component_name = "tensorrt_llm" # Decode is "first" with DECODE_FIRST
129+
decode_worker_component_name = "tensorrt_llm"
132130
decode_worker_endpoint = "generate"
133131

134132

components/src/dynamo/trtllm/main.py

Lines changed: 12 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
from dynamo.trtllm.health_check import TrtllmHealthCheckPayload
4646
from dynamo.trtllm.multimodal_processor import MultimodalRequestProcessor
4747
from dynamo.trtllm.publisher import get_publisher
48+
from dynamo.trtllm.request_handlers.handler_base import DisaggregationMode
4849
from dynamo.trtllm.request_handlers.handlers import (
4950
RequestHandlerConfig,
5051
RequestHandlerFactory,
@@ -53,7 +54,6 @@
5354
Config,
5455
cmd_line_args,
5556
deep_update,
56-
is_first_worker,
5757
parse_endpoint,
5858
)
5959

@@ -126,37 +126,6 @@ async def init(runtime: DistributedRuntime, config: Config):
126126
"""
127127
logging.info(f"Initializing the worker with config: {config}")
128128

129-
next_client = None
130-
if config.next_endpoint:
131-
logging.info(
132-
f"Initializing next worker client for endpoint: {config.next_endpoint}"
133-
)
134-
parsed_namespace, parsed_component_name, parsed_endpoint_name = parse_endpoint(
135-
config.next_endpoint
136-
)
137-
next_client = (
138-
await runtime.namespace(parsed_namespace)
139-
.component(parsed_component_name)
140-
.endpoint(parsed_endpoint_name)
141-
.client()
142-
)
143-
144-
# Set up prefill router client for decode workers
145-
next_router_client = None
146-
if config.disaggregation_mode.value == "decode":
147-
try:
148-
logging.info("Initializing prefill router client")
149-
next_router_client = (
150-
await runtime.namespace(config.namespace)
151-
.component("router") # Standalone router for prefill workers
152-
.endpoint("generate")
153-
.client()
154-
)
155-
logging.info("Prefill router client initialized successfully")
156-
except Exception as e:
157-
logging.warning(f"Failed to initialize prefill router client: {e}")
158-
logging.info("Will use direct prefill worker client only")
159-
160129
encode_client = None
161130
if config.encode_endpoint:
162131
logging.info(
@@ -273,7 +242,13 @@ async def init(runtime: DistributedRuntime, config: Config):
273242
default_sampling_params._setup(tokenizer)
274243
default_sampling_params.stop = None
275244
model_input = ModelInput.Tokens
276-
model_type = ModelType.Chat | ModelType.Completions
245+
246+
# Set model type based on disaggregation mode for unified frontend support
247+
if config.disaggregation_mode == DisaggregationMode.PREFILL:
248+
model_type = ModelType.Prefill
249+
else:
250+
model_type = ModelType.Chat | ModelType.Completions
251+
277252
multimodal_processor = None
278253

279254
if os.getenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR") == "1":
@@ -376,24 +351,17 @@ async def init(runtime: DistributedRuntime, config: Config):
376351
default_sampling_params=default_sampling_params,
377352
publisher=None,
378353
disaggregation_mode=config.disaggregation_mode,
379-
disaggregation_strategy=config.disaggregation_strategy,
380-
next_client=next_client,
381-
next_router_client=next_router_client,
382354
encode_client=encode_client,
383355
multimodal_processor=multimodal_processor,
384356
connector=connector,
385357
runtime=runtime, # Pass runtime for graceful shutdown
386358
metrics_collector=metrics_collector,
387359
)
388360

389-
if next_client:
390-
logging.info(
391-
f"Waiting for the next endpoint to be ready: {config.next_endpoint}"
392-
)
393-
await next_client.wait_for_instances()
394-
395-
if is_first_worker(config):
396-
# Register the model with runtime config
361+
# Register the model with runtime config
362+
# Encode workers do NOT register - they're internal workers only
363+
# Prefill and decode workers register - frontend detects their role via ModelType
364+
if config.disaggregation_mode != DisaggregationMode.ENCODE:
397365
await register_llm(
398366
model_input,
399367
model_type,

components/src/dynamo/trtllm/request_handlers/handler_base.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,6 @@ class DisaggregationMode(Enum):
5252
ENCODE = "encode"
5353

5454

55-
class DisaggregationStrategy(Enum):
56-
PREFILL_FIRST = "prefill_first"
57-
DECODE_FIRST = "decode_first"
58-
59-
6055
@dataclass
6156
class RequestHandlerConfig:
6257
"""
@@ -68,9 +63,6 @@ class RequestHandlerConfig:
6863
default_sampling_params: SamplingParams
6964
publisher: Publisher
7065
disaggregation_mode: DisaggregationMode
71-
disaggregation_strategy: DisaggregationStrategy
72-
next_client: object
73-
next_router_client: Optional[object] = None
7466
encode_client: Optional[object] = None
7567
multimodal_processor: Optional[
7668
MultimodalRequestProcessor
@@ -94,9 +86,6 @@ def __init__(self, config: RequestHandlerConfig):
9486
self.publisher = config.publisher
9587
self.metrics_collector = config.metrics_collector
9688
self.disaggregation_mode = config.disaggregation_mode
97-
self.disaggregation_strategy = config.disaggregation_strategy
98-
self.next_client = config.next_client
99-
self.next_router_client = config.next_router_client
10089
self.encode_client = config.encode_client
10190
self.multimodal_processor = config.multimodal_processor
10291
self.first_generation = True

0 commit comments

Comments
 (0)