Skip to content

Commit 4f6394d

Browse files
authored
Merge branch 'main' into bis/dep-681-add-agg-lora-tests
2 parents 0651568 + 1e5b20b commit 4f6394d

File tree

107 files changed

+9003
-1251
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

107 files changed

+9003
-1251
lines changed

components/src/dynamo/planner/utils/planner_core.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
PrefillInterpolator,
2525
)
2626
from dynamo.planner.utils.pre_swept_results_utils import PreSweptResultsHelper
27-
from dynamo.planner.utils.prometheus import PrometheusAPIClient
27+
from dynamo.planner.utils.prometheus import MetricSource, PrometheusAPIClient
2828
from dynamo.planner.utils.trace_data_extractor import extract_metrics_from_mooncake
2929
from dynamo.runtime import DistributedRuntime
3030
from dynamo.runtime.logging import configure_dynamo_logging
@@ -150,9 +150,20 @@ def __init__(
150150
else:
151151
raise ValueError(f"Invalid environment: {args.environment}")
152152

153+
# Use backend metrics for vLLM (queries vllm:* metrics directly from workers)
154+
# Use frontend metrics for other backends (queries dynamo_frontend_* metrics)
155+
metric_source = (
156+
MetricSource.VLLM
157+
if args.backend.lower() == "vllm"
158+
else MetricSource.FRONTEND
159+
)
160+
logger.info(
161+
f"Initializing Prometheus client with metric_source='{metric_source}' for backend '{args.backend}'"
162+
)
153163
self.prometheus_api_client = PrometheusAPIClient(
154164
args.metric_pulling_prometheus_endpoint,
155165
args.namespace,
166+
metric_source=metric_source,
156167
)
157168

158169
self.num_req_predictor = LOAD_PREDICTORS[args.load_predictor](

components/src/dynamo/planner/utils/prometheus.py

Lines changed: 221 additions & 46 deletions
Large diffs are not rendered by default.

components/src/dynamo/sglang/main.py

Lines changed: 44 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -103,11 +103,8 @@ async def init(runtime: DistributedRuntime, config: Config):
103103
server_args, dynamo_args = config.server_args, config.dynamo_args
104104

105105
# Prevent SGLang from blocking on non-leader nodes
106-
# We can switch this to 0 and leverage our own metrics
107-
# after https://github.com/sgl-project/sglang/pull/13686
108-
# is merged in
109106
if server_args.node_rank >= 1:
110-
os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "1"
107+
os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0"
111108

112109
engine = sgl.Engine(server_args=server_args)
113110

@@ -222,11 +219,8 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
222219
server_args, dynamo_args = config.server_args, config.dynamo_args
223220

224221
# Prevent SGLang from blocking on non-leader nodes
225-
# We can switch this to 0 and leverage our own metrics
226-
# after https://github.com/sgl-project/sglang/pull/13686
227-
# is merged in
228222
if server_args.node_rank >= 1:
229-
os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "1"
223+
os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0"
230224

231225
engine = sgl.Engine(server_args=server_args)
232226

@@ -430,16 +424,24 @@ async def init_multimodal_encode_worker(runtime: DistributedRuntime, config: Con
430424

431425
await pd_worker_client.wait_for_instances()
432426

433-
tasks = [
434-
generate_endpoint.serve_endpoint(
435-
handler.generate,
436-
graceful_shutdown=True,
437-
metrics_labels=[("model", server_args.served_model_name)],
438-
)
439-
]
427+
ready_event = asyncio.Event()
440428

441429
try:
442-
await asyncio.gather(*tasks)
430+
await asyncio.gather(
431+
generate_endpoint.serve_endpoint(
432+
handler.generate,
433+
graceful_shutdown=True,
434+
metrics_labels=[("model", server_args.served_model_name)],
435+
),
436+
register_llm_with_readiness_gate(
437+
None, # encode worker doesn't have engine
438+
generate_endpoint,
439+
server_args,
440+
dynamo_args,
441+
input_type=ModelInput.Text,
442+
readiness_gate=ready_event,
443+
),
444+
)
443445
except Exception as e:
444446
logging.error(f"Failed to serve endpoints: {e}")
445447
raise
@@ -473,11 +475,24 @@ async def init_multimodal_worker(runtime: DistributedRuntime, config: Config):
473475

474476
await handler.async_init()
475477

478+
health_check_payload = SglangHealthCheckPayload(engine).to_dict()
479+
ready_event = asyncio.Event()
480+
476481
try:
477-
await generate_endpoint.serve_endpoint(
478-
handler.generate,
479-
metrics_labels=[("model", server_args.served_model_name)],
480-
graceful_shutdown=True,
482+
await asyncio.gather(
483+
generate_endpoint.serve_endpoint(
484+
handler.generate,
485+
metrics_labels=[("model", server_args.served_model_name)],
486+
graceful_shutdown=True,
487+
health_check_payload=health_check_payload,
488+
),
489+
register_llm_with_readiness_gate(
490+
engine,
491+
generate_endpoint,
492+
server_args,
493+
dynamo_args,
494+
readiness_gate=ready_event,
495+
),
481496
)
482497
except Exception as e:
483498
logging.error(f"Failed to serve endpoints: {e}")
@@ -502,6 +517,7 @@ async def init_multimodal_prefill_worker(runtime: DistributedRuntime, config: Co
502517
await handler.async_init()
503518

504519
health_check_payload = SglangPrefillHealthCheckPayload(engine).to_dict()
520+
ready_event = asyncio.Event()
505521

506522
try:
507523
await asyncio.gather(
@@ -510,7 +526,14 @@ async def init_multimodal_prefill_worker(runtime: DistributedRuntime, config: Co
510526
graceful_shutdown=True,
511527
metrics_labels=[("model", server_args.served_model_name)],
512528
health_check_payload=health_check_payload,
513-
)
529+
),
530+
register_llm_with_readiness_gate(
531+
engine,
532+
generate_endpoint,
533+
server_args,
534+
dynamo_args,
535+
readiness_gate=ready_event,
536+
),
514537
)
515538
except Exception as e:
516539
logging.error(f"Failed to serve endpoints: {e}")

components/src/dynamo/sglang/request_handlers/handler_base.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
import asyncio
5+
import base64
6+
import json
57
import logging
68
import random
79
import socket
@@ -10,6 +12,7 @@
1012
from typing import Any, AsyncGenerator, Dict, Optional, Tuple
1113

1214
import sglang as sgl
15+
from sglang.srt.tracing import trace as sglang_trace
1316
from sglang.srt.utils import get_local_ip_auto
1417

1518
from dynamo._core import Client, Component, Context
@@ -49,6 +52,7 @@ def __init__(
4952
self.prefill_client = prefill_client
5053
self.serving_mode = config.serving_mode
5154
self.skip_tokenizer_init = config.server_args.skip_tokenizer_init
55+
self.enable_trace = config.server_args.enable_trace
5256

5357
@abstractmethod
5458
async def generate(self, request: Dict[str, Any], context: Context):
@@ -117,6 +121,39 @@ def _get_bootstrap_info(engine: sgl.Engine) -> Tuple[str, int]:
117121

118122
return bootstrap_host, bootstrap_port
119123

124+
def _propagate_trace_context_to_sglang(
125+
self, context: Context, bootstrap_room: int = 0
126+
):
127+
"""Propagate Dynamo's trace context to SGLang for distributed tracing. SGLang expects a certain
128+
format derived by looking at https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/tracing/trace.py
129+
in the to_dict() method.
130+
131+
Args:
132+
context: Dynamo Context object containing trace information.
133+
bootstrap_room: Bootstrap room ID (0 for aggregated, actual room for disaggregated).
134+
"""
135+
trace_id = context.trace_id
136+
span_id = context.span_id
137+
if not trace_id or not span_id:
138+
return
139+
140+
# Build trace context for SGLang
141+
trace_context = {
142+
str(bootstrap_room): {
143+
"root_span": {"traceparent": f"00-{trace_id}-{span_id}-01"},
144+
"prev_span": {
145+
"span_id": int(span_id, 16),
146+
"trace_id": int(trace_id, 16),
147+
},
148+
}
149+
}
150+
151+
# Encode and propagate
152+
base64_context = base64.b64encode(
153+
json.dumps(trace_context, ensure_ascii=False).encode("utf-8")
154+
).decode("utf-8")
155+
sglang_trace.trace_set_remote_propagate_context(base64_context)
156+
120157
async def _handle_cancellation(
121158
self, request_id_future: asyncio.Future, context: Context
122159
):

components/src/dynamo/sglang/request_handlers/llm/decode_handler.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ async def generate(
112112
RuntimeError: If no bootstrap info received from prefill worker.
113113
"""
114114
logging.debug(f"New Request ID: {context.id()}")
115+
trace_id = context.trace_id
115116
sampling_params = self._build_sampling_params(request)
116117
input_param = self._get_input_param(request)
117118

@@ -154,13 +155,19 @@ async def generate(
154155
if not bootstrap_info:
155156
raise RuntimeError("No bootstrap info received from prefill worker")
156157

158+
if self.enable_trace:
159+
self._propagate_trace_context_to_sglang(
160+
context, bootstrap_info["bootstrap_room"]
161+
)
162+
157163
decode = await self.engine.async_generate(
158164
**input_param,
159165
sampling_params=sampling_params,
160166
stream=True,
161167
bootstrap_host=bootstrap_info["bootstrap_host"],
162168
bootstrap_port=bootstrap_info["bootstrap_port"],
163169
bootstrap_room=bootstrap_info["bootstrap_room"],
170+
rid=trace_id,
164171
)
165172

166173
if self.skip_tokenizer_init:
@@ -170,10 +177,14 @@ async def generate(
170177
async for out in self._process_text_stream(decode, context):
171178
yield out
172179
else:
180+
if self.enable_trace:
181+
self._propagate_trace_context_to_sglang(context)
182+
173183
agg = await self.engine.async_generate(
174184
**input_param,
175185
sampling_params=sampling_params,
176186
stream=True,
187+
rid=trace_id,
177188
)
178189
if self.skip_tokenizer_init:
179190
async for out in self._process_token_stream(agg, context):

components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ async def generate(
6464
Bootstrap info dict with host, port, and room for decode worker connection.
6565
"""
6666
logging.debug(f"New Request ID: {context.id()}")
67+
trace_id = context.trace_id
6768
bootstrap_room = self._generate_bootstrap_room()
6869

6970
bootstrap_info = {
@@ -76,13 +77,18 @@ async def generate(
7677

7778
input_param = self._get_input_param(request["request"])
7879

80+
# Propagate trace context to SGLang
81+
if self.enable_trace:
82+
self._propagate_trace_context_to_sglang(context, bootstrap_room)
83+
7984
results = await self.engine.async_generate(
8085
**input_param,
8186
sampling_params=request["sampling_params"],
8287
stream=True,
8388
bootstrap_host=self.bootstrap_host,
8489
bootstrap_port=self.bootstrap_port,
8590
bootstrap_room=bootstrap_room,
91+
rid=trace_id,
8692
)
8793

8894
task = asyncio.create_task(self._consume_results(results, context))

components/src/dynamo/trtllm/request_handlers/handler_base.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -369,8 +369,12 @@ async def generate_locally(
369369

370370
# 2. Per-request errors - send to client, don't shutdown
371371
except RequestError as e:
372-
logging.warning(f"Request {request_id} error: {e}")
373-
yield {"finish_reason": "error", "token_ids": []}
372+
error_msg = str(e)
373+
logging.warning(f"Request {request_id} error: {error_msg}")
374+
yield {
375+
"finish_reason": {"error": error_msg},
376+
"token_ids": [],
377+
}
374378

375379
# 3. ALL OTHER ERRORS - graceful shutdown
376380
except Exception as e:
@@ -384,7 +388,7 @@ async def generate_locally(
384388
# Try to send error to client before shutdown
385389
try:
386390
yield {
387-
"finish_reason": "error",
391+
"finish_reason": {"error": error_msg},
388392
"token_ids": [],
389393
}
390394
except Exception:

components/src/dynamo/vllm/handlers.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from vllm.inputs import TokensPrompt
1313
from vllm.lora.request import LoRARequest
1414
from vllm.outputs import RequestOutput
15-
from vllm.sampling_params import SamplingParams
15+
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
1616
from vllm.v1.engine.exceptions import EngineDeadError
1717

1818
from dynamo.llm import (
@@ -82,8 +82,22 @@ def build_sampling_params(
8282
sampling_params = SamplingParams(**default_sampling_params)
8383
sampling_params.detokenize = False
8484

85-
# Apply sampling_options
85+
# Handle guided_decoding - convert to StructuredOutputsParams
86+
guided_decoding = request["sampling_options"].get("guided_decoding")
87+
if guided_decoding is not None and isinstance(guided_decoding, dict):
88+
sampling_params.structured_outputs = StructuredOutputsParams(
89+
json=guided_decoding.get("json"),
90+
regex=guided_decoding.get("regex"),
91+
choice=guided_decoding.get("choice"),
92+
grammar=guided_decoding.get("grammar"),
93+
whitespace_pattern=guided_decoding.get("whitespace_pattern"),
94+
)
95+
96+
# Apply remaining sampling_options
8697
for key, value in request["sampling_options"].items():
98+
# Skip guided_decoding - already handled above
99+
if key == "guided_decoding":
100+
continue
87101
if value is not None and hasattr(sampling_params, key):
88102
setattr(sampling_params, key, value)
89103

components/src/dynamo/vllm/health_check.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,15 +67,14 @@ def __init__(self, engine_client=None):
6767
self.default_payload = {
6868
"token_ids": [bos_token_id],
6969
"sampling_options": {
70-
"max_tokens": 1,
7170
"temperature": 0.0,
7271
},
7372
"stop_conditions": {
73+
"max_tokens": 1,
7474
"stop": None,
7575
"stop_token_ids": None,
7676
"include_stop_str_in_output": False,
7777
"ignore_eos": False,
78-
"min_tokens": 0,
7978
},
8079
}
8180
super().__init__()

0 commit comments

Comments
 (0)