Sgl healthcheck integration + precommit

jthomson04 · jthomson04 · commit cdeac19f35e0 · 2025-12-08T22:08:00.000-08:00
Signed-off-by: jthomson04 <jwillthomson19@gmail.com>
diff --git a/components/src/dynamo/common/utils/input_params.py b/components/src/dynamo/common/utils/input_params.py
@@ -8,6 +8,7 @@ def get_input_param(self, request: dict, use_tokenizer: bool):
         """
 
         if use_tokenizer:
+            print(f"Request: {request}")
             if self.tokenizer is None:
                 raise ValueError("Tokenizer is not available")
 
diff --git a/components/src/dynamo/sglang/health_check.py b/components/src/dynamo/sglang/health_check.py
@@ -53,7 +53,9 @@ class SglangHealthCheckPayload(HealthCheckPayload):
     Provides SGLang defaults and inherits environment override support from base class.
     """
 
-    def __init__(self, engine: Optional[sgl.Engine] = None) -> None:
+    def __init__(
+        self, engine: Optional[sgl.Engine] = None, use_text_input: bool = False
+    ) -> None:
         """Initialize SGLang health check payload with model-specific BOS token.
 
         Args:
@@ -62,7 +64,6 @@ def __init__(self, engine: Optional[sgl.Engine] = None) -> None:
         bos_token_id = _get_bos_token_id_from_engine(engine)
 
         self.default_payload = {
-            "token_ids": [bos_token_id],
             "stop_conditions": {
                 "max_tokens": 1,  # Generate only 1 token
                 "ignore_eos": False,
@@ -75,6 +76,12 @@ def __init__(self, engine: Optional[sgl.Engine] = None) -> None:
             "eos_token_ids": [],
             "annotations": [],
         }
+
+        if use_text_input:
+            self.default_payload["prompt"] = "Test"
+        else:
+            self.default_payload["token_ids"] = [bos_token_id]
+
         super().__init__()
 
 
@@ -84,7 +91,9 @@ class SglangPrefillHealthCheckPayload(HealthCheckPayload):
     The prefill handler expects a wrapped structure with 'request' and 'sampling_params'.
     """
 
-    def __init__(self, engine: Optional[sgl.Engine] = None) -> None:
+    def __init__(
+        self, engine: Optional[sgl.Engine] = None, use_text_input: bool = False
+    ) -> None:
         """Initialize SGLang prefill health check payload with proper wrapped structure.
 
         Args:
@@ -93,9 +102,7 @@ def __init__(self, engine: Optional[sgl.Engine] = None) -> None:
         bos_token_id = _get_bos_token_id_from_engine(engine)
 
         self.default_payload = {
-            "request": {
-                "token_ids": [bos_token_id],
-            },
+            "request": {},
             "sampling_params": {
                 "max_new_tokens": 1,  # Generate only 1 token
                 "temperature": 0.0,
@@ -104,4 +111,10 @@ def __init__(self, engine: Optional[sgl.Engine] = None) -> None:
                 "ignore_eos": False,
             },
         }
+
+        if use_text_input:
+            self.default_payload["request"]["prompt"] = "Test"
+        else:
+            self.default_payload["request"]["token_ids"] = [bos_token_id]
+
         super().__init__()
diff --git a/components/src/dynamo/sglang/main.py b/components/src/dynamo/sglang/main.py
@@ -171,8 +171,10 @@ async def stop_profile_handler(body: dict) -> dict:
     handler = DecodeWorkerHandler(
         component, engine, config, publisher, prefill_client, prefill_router_client
     )
-
-    health_check_payload = SglangHealthCheckPayload(engine).to_dict()
+    print(f"Config: {config}")
+    health_check_payload = SglangHealthCheckPayload(
+        engine, use_text_input=dynamo_args.use_sglang_tokenizer
+    ).to_dict()
 
     logging.info(
         f"Registering model with endpoint types: {dynamo_args.dyn_endpoint_types}"
@@ -325,7 +327,9 @@ async def init_embedding(runtime: DistributedRuntime, config: Config):
     ready_event = asyncio.Event()
 
     handler = EmbeddingWorkerHandler(component, engine, config, publisher)
-    health_check_payload = SglangHealthCheckPayload(engine).to_dict()
+    health_check_payload = SglangHealthCheckPayload(
+        engine, use_text_input=dynamo_args.use_sglang_tokenizer
+    ).to_dict()
 
     try:
         # Start endpoint immediately and register model concurrently
diff --git a/components/src/dynamo/sglang/request_handlers/handler_base.py b/components/src/dynamo/sglang/request_handlers/handler_base.py
@@ -75,10 +75,14 @@ def cleanup(self) -> None:
         pass
 
     def _get_input_param(self, request: Dict[str, Any]) -> Dict[str, Any]:
-        return self.input_param_manager.get_input_param(
+        request_input = self.input_param_manager.get_input_param(
             request, use_tokenizer=not self.skip_tokenizer_init
         )
 
+        return {
+            "prompt" if isinstance(request_input, str) else "input_ids": request_input
+        }
+
     @staticmethod
     def _generate_bootstrap_room() -> int:
         """Generate a unique bootstrap room ID for disaggregated serving.
diff --git a/components/src/dynamo/trtllm/health_check.py b/components/src/dynamo/trtllm/health_check.py
@@ -47,6 +47,7 @@ def _get_bos_token_id_from_tokenizer(tokenizer) -> int:
     logger.debug("Using default BOS token ID (1) for health check")
     return 1
 
+
 def _make_default_payload(tokenizer, use_text_input: bool) -> dict:
     default_payload = {
         "stop_conditions": {
@@ -77,6 +78,7 @@ def _make_default_payload(tokenizer, use_text_input: bool) -> dict:
 
     return default_payload
 
+
 class TrtllmHealthCheckPayload(HealthCheckPayload):
     """
     TRT-LLM-specific health check payload.
diff --git a/components/src/dynamo/trtllm/main.py b/components/src/dynamo/trtllm/main.py
@@ -435,7 +435,9 @@ async def init(runtime: DistributedRuntime, config: Config):
             )
 
         # Get health check payload (checks env var and falls back to TensorRT-LLM default)
-        health_check_payload = TrtllmHealthCheckPayload(tokenizer=tokenizer, use_text_input=config.use_trtllm_tokenizer).to_dict()
+        health_check_payload = TrtllmHealthCheckPayload(
+            tokenizer=tokenizer, use_text_input=config.use_trtllm_tokenizer
+        ).to_dict()
 
         if config.publish_events_and_metrics:
             # Initialize and pass in the publisher to the request handler to
diff --git a/components/src/dynamo/trtllm/request_handlers/handler_base.py b/components/src/dynamo/trtllm/request_handlers/handler_base.py
@@ -323,7 +323,9 @@ async def generate_locally(
                 stop_token_ids = stop_conditions.get("stop_token_ids_hidden")
                 if stop_token_ids:
                     existing = sampling_params.stop_token_ids or []
-                    sampling_params.stop_token_ids = list(set(existing).union(stop_token_ids))
+                    sampling_params.stop_token_ids = list(
+                        set(existing).union(stop_token_ids)
+                    )
 
         # TODO: Instead of True, we should use streaming from the request.
         # However, currently dynamo run does not send streaming in the request.
diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py
@@ -10,7 +10,7 @@
 from contextlib import asynccontextmanager
 from typing import Any, AsyncGenerator, Dict, Final
 
-from vllm.inputs import TokensPrompt, TextPrompt
+from vllm.inputs import TextPrompt, TokensPrompt
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams, StructuredOutputsParams
diff --git a/components/src/dynamo/vllm/health_check.py b/components/src/dynamo/vllm/health_check.py
@@ -8,12 +8,15 @@
 """
 
 import logging
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
 from dynamo.health_check import HealthCheckPayload
 
 logger = logging.getLogger(__name__)
 
+if TYPE_CHECKING:
+    from vllm.v1.engine.async_llm import AsyncLLM
+
 
 def _get_bos_token_id_from_engine(engine_client) -> int:
     """
@@ -45,8 +48,10 @@ def _get_bos_token_id_from_engine(engine_client) -> int:
     logger.debug("Using default BOS token ID (1) for health check")
     return 1
 
-def _make_default_payload(engine_client: Optional["AsyncLLM"], use_text_input: bool) -> dict:
 
+def _make_default_payload(
+    engine_client: Optional["AsyncLLM"], use_text_input: bool
+) -> dict:
     sampling_options = {
         "temperature": 0.0,
     }
@@ -72,7 +77,7 @@ def _make_default_payload(engine_client: Optional["AsyncLLM"], use_text_input: b
             "sampling_options": sampling_options,
             "stop_conditions": stop_conditions,
         }
-    
+
 
 class VllmHealthCheckPayload(HealthCheckPayload):
     """
diff --git a/components/src/dynamo/vllm/main.py b/components/src/dynamo/vllm/main.py
@@ -432,7 +432,9 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
             migration_limit=0,  # Prefill doesn't support migration
         )
 
-    health_check_payload = VllmPrefillHealthCheckPayload(engine_client, use_text_input=config.use_vllm_tokenizer).to_dict()
+    health_check_payload = VllmPrefillHealthCheckPayload(
+        engine_client, use_text_input=config.use_vllm_tokenizer
+    ).to_dict()
 
     try:
         logger.debug("Starting serve_endpoint for prefill worker")

Original file line number	Diff line number	Diff line change
`@@ -435,7 +435,9 @@ async def init(runtime: DistributedRuntime, config: Config):`
`435`	`435`	`)`
`436`	`436`
`437`	`437`	`# Get health check payload (checks env var and falls back to TensorRT-LLM default)`
`438`		`- health_check_payload = TrtllmHealthCheckPayload(tokenizer=tokenizer, use_text_input=config.use_trtllm_tokenizer).to_dict()`
	`438`	`+ health_check_payload = TrtllmHealthCheckPayload(`
	`439`	`+ tokenizer=tokenizer, use_text_input=config.use_trtllm_tokenizer`
	`440`	`+ ).to_dict()`
`439`	`441`
`440`	`442`	`if config.publish_events_and_metrics:`
`441`	`443`	`# Initialize and pass in the publisher to the request handler to`
Original file line number	Diff line number	Diff line change
`@@ -432,7 +432,9 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):`
`432`	`432`	`migration_limit=0, # Prefill doesn't support migration`
`433`	`433`	`)`
`434`	`434`
`435`		`- health_check_payload = VllmPrefillHealthCheckPayload(engine_client, use_text_input=config.use_vllm_tokenizer).to_dict()`
	`435`	`+ health_check_payload = VllmPrefillHealthCheckPayload(`
	`436`	`+ engine_client, use_text_input=config.use_vllm_tokenizer`
	`437`	`+ ).to_dict()`
`436`	`438`
`437`	`439`	`try:`
`438`	`440`	`logger.debug("Starting serve_endpoint for prefill worker")`