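Fixes: rename `_should_include_timing_metrics` to `_request_contains_timing_metrics`, document cross-machine clock drift, and change `request_received_seconds` from `f64` to `f32`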

zhongxuanwang-nv · zhongxuanwang-nv · commit b0236886f1b0 · 2025-12-04T20:44:14.000Z
Signed-off-by: Zhongxuan Wang <daniewang@nvidia.com>
diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py
@@ -74,7 +74,7 @@ def build_sampling_params(
     return sampling_params
 
 
-def _should_include_timing_metrics(request: Dict[str, Any]) -> bool:
+def _request_contains_timing_metrics(request: Dict[str, Any]) -> bool:
     """Check if timing_metrics is requested in extra_fields."""
     extra_fields: Optional[List[str]] = request.get("extra_fields")
     if extra_fields is None:
@@ -259,10 +259,10 @@ async def generate_tokens(
                     out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
                     if output.finish_reason:
                         out["finish_reason"] = output.finish_reason
-                        out[
-                            "completion_usage"
-                        ] = BaseWorkerHandler._build_completion_usage(
-                            request_output=res,
+                        out["completion_usage"] = (
+                            BaseWorkerHandler._build_completion_usage(
+                                request_output=res,
+                            )
                         )
                     if output.stop_reason:
                         out["stop_reason"] = output.stop_reason
@@ -306,12 +306,14 @@ async def generate(self, request, context):
         logger.debug(f"Decode Request ID: {request_id}")
 
         # Check if timing metrics are requested
-        include_timing = _should_include_timing_metrics(request)
+        include_timing = _request_contains_timing_metrics(request)
 
         # Initialize timing metrics using request_received_seconds from frontend (passed via PreprocessedRequest)
-        timing_metrics: Optional[Dict[str, float]] = None
+        # NOTE: If the frontend, prefill workers, and decode workers run on different machines,
+        # their clocks may drift slightly apart. As a result, timing values recorded on
+        # different machines may not be perfectly synchronized and could show minor inconsistencies.
+        timing_metrics: Dict[str, float] = {}
         if include_timing:
-            timing_metrics = {}
             # Use request_received_seconds from the request (set by frontend) if available
             frontend_received = request.get("request_received_seconds")
             if frontend_received is not None:
@@ -394,9 +396,7 @@ async def generate(self, request, context):
                     # On finish, record decode_end_seconds and inject timing_metrics
                     # Note: request_finish_seconds is set in the Rust HTTP layer when the response actually leaves the server
                     if tok.get("finish_reason") is not None and include_timing:
-                        timing_metrics[
-                            "decode_end_seconds"
-                        ] = time.time()
+                        timing_metrics["decode_end_seconds"] = time.time()
 
                         # Inject timing_metrics into disaggregated_params
                         if (
@@ -439,12 +439,14 @@ async def generate(self, request, context):
         logger.debug(f"Prefill Request ID: {request_id}")
 
         # Check if timing metrics are requested
-        include_timing = _should_include_timing_metrics(request)
+        include_timing = _request_contains_timing_metrics(request)
 
         # Initialize timing metrics using request_received_seconds from frontend (passed via PreprocessedRequest)
-        timing_metrics: Optional[Dict[str, float]] = None
+        # NOTE: If the frontend, prefill workers, and decode workers run on different machines,
+        # their clocks may drift slightly apart. As a result, timing values recorded on
+        # different machines may not be perfectly synchronized and could show minor inconsistencies.
+        timing_metrics: Dict[str, float] = {}
         if include_timing:
-            timing_metrics = {}
             # Use request_received_seconds from the request (set by frontend) if available
             frontend_received = request.get("request_received_seconds")
             if frontend_received is not None:
@@ -509,14 +511,12 @@ async def generate(self, request, context):
                     disaggregated_params: Optional[Dict[str, Any]] = {}
 
                     if res.kv_transfer_params:
-                        disaggregated_params[
-                            "kv_transfer_params"
-                        ] = res.kv_transfer_params
+                        disaggregated_params["kv_transfer_params"] = (
+                            res.kv_transfer_params
+                        )
 
                     if include_timing and timing_metrics:
-                        timing_metrics[
-                            "prefill_end_seconds"
-                        ] = time.time()
+                        timing_metrics["prefill_end_seconds"] = time.time()
                         disaggregated_params["timing_metrics"] = timing_metrics
 
                     output: Dict[str, Any] = {
diff --git a/components/src/dynamo/vllm/tests/test_vllm_extra_fields.py b/components/src/dynamo/vllm/tests/test_vllm_extra_fields.py
@@ -19,7 +19,7 @@
 from dynamo.vllm.handlers import (  # noqa: E402
     DecodeWorkerHandler,
     PrefillWorkerHandler,
-    _should_include_timing_metrics,
+    _request_contains_timing_metrics,
 )
 
 
@@ -32,22 +32,22 @@
 
 
 class TestShouldIncludeTimingMetrics:
-    """Tests for _should_include_timing_metrics helper function."""
+    """Tests for _request_contains_timing_metrics helper function."""
 
     def test_returns_true_with_multiple_extra_fields(self):
         """Timing metrics should be included when explicitly requested."""
         request = {"extra_fields": ["worker_id", "timing_metrics", "other_field"]}
-        assert _should_include_timing_metrics(request) is True
+        assert _request_contains_timing_metrics(request) is True
 
     def test_returns_false_when_extra_fields_is_none(self):
         """Timing metrics should not be included when extra_fields is None."""
         request = {"extra_fields": None}
-        assert _should_include_timing_metrics(request) is False
+        assert _request_contains_timing_metrics(request) is False
 
     def test_returns_false_when_extra_fields_missing(self):
         """Timing metrics should not be included when extra_fields key is absent."""
         request: dict[str, list[str]] = {}
-        assert _should_include_timing_metrics(request) is False
+        assert _request_contains_timing_metrics(request) is False
 
 
 def make_mock_request_output(
diff --git a/lib/llm/src/http/service/openai.rs b/lib/llm/src/http/service/openai.rs
@@ -305,6 +305,9 @@ async fn handler_completions(
     Json(mut request): Json<NvCreateCompletionRequest>,
 ) -> Result<Response, ErrorResponse> {
     // Capture received timestamp immediately when request arrives at the frontend
+    // NOTE: If the frontend, prefill workers, and decode workers run on different machines,
+    // their clocks may drift slightly apart. As a result, timing values recorded on
+    // different machines may not be perfectly synchronized and could show minor inconsistencies.
     let request_received_seconds = SystemTime::now()
         .duration_since(UNIX_EPOCH)
         .map(|d| d.as_secs_f32())
@@ -739,6 +742,9 @@ async fn handler_chat_completions(
     Json(mut request): Json<NvCreateChatCompletionRequest>,
 ) -> Result<Response, ErrorResponse> {
     // Capture received timestamp immediately when request arrives at the frontend
+    // NOTE: If the frontend, prefill workers, and decode workers run on different machines,
+    // their clocks may drift slightly apart. As a result, timing values recorded on
+    // different machines may not be perfectly synchronized and could show minor inconsistencies.
     let request_received_seconds = SystemTime::now()
         .duration_since(UNIX_EPOCH)
         .map(|d| d.as_secs_f32())
diff --git a/lib/llm/src/protocols/common/preprocessor.rs b/lib/llm/src/protocols/common/preprocessor.rs
@@ -106,7 +106,7 @@ pub struct PreprocessedRequest {
     /// Used for timing metrics to track end-to-end latency
     #[builder(default)]
     #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub request_received_seconds: Option<f64>,
+    pub request_received_seconds: Option<f32>,
 }
 
 impl PreprocessedRequest {

Original file line number	Diff line number	Diff line change
`@@ -106,7 +106,7 @@ pub struct PreprocessedRequest {`
`106`	`106`	`/// Used for timing metrics to track end-to-end latency`
`107`	`107`	`#[builder(default)]`
`108`	`108`	`#[serde(default, skip_serializing_if = "Option::is_none")]`
`109`		`- pub request_received_seconds: Option<f64>,`
	`109`	`+ pub request_received_seconds: Option<f32>,`
`110`	`110`	`}`
`111`	`111`
`112`	`112`	`impl PreprocessedRequest {`