Use time.perf_counter() instead of time.time() for first-token timing + formatting

zhongxuanwang-nv · zhongxuanwang-nv · commit 133893ac5cd4 · 2025-12-09T08:52:25.000Z
Signed-off-by: Zhongxuan Wang <daniewang@nvidia.com>
diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py
@@ -633,10 +633,10 @@ async def generate_tokens(
                     out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
                     if output.finish_reason:
                         out["finish_reason"] = output.finish_reason
-                        out["completion_usage"] = (
-                            BaseWorkerHandler._build_completion_usage(
-                                request_output=res,
-                            )
+                        out[
+                            "completion_usage"
+                        ] = BaseWorkerHandler._build_completion_usage(
+                            request_output=res,
                         )
                         # Log completion with LoRA info (debug level to avoid log spam)
                         if lora_request:
@@ -805,7 +805,9 @@ async def generate(self, request, context):
                 ):
                     # Capture first token timing
                     if include_timing and not first_token_sent:
-                        first_token_time = time.time()
+                        first_token_time = decode_start_seconds + (
+                            time.perf_counter() - decode_start_perf_counter
+                        )
                         timing_metrics["decode_first_token_seconds"] = first_token_time
                         # In aggregated mode, prefill finishes when first token is generated
                         if prefill_result is None:
@@ -966,14 +968,15 @@ async def generate(self, request, context):
                     disaggregated_params: Optional[Dict[str, Any]] = {}
 
                     if res.kv_transfer_params:
-                        disaggregated_params["kv_transfer_params"] = (
-                            res.kv_transfer_params
-                        )
+                        disaggregated_params[
+                            "kv_transfer_params"
+                        ] = res.kv_transfer_params
 
                     if include_timing and timing_metrics:
-                        timing_metrics["prefill_end_seconds"] = (
-                            prefill_start_seconds
-                            + (time.perf_counter() - prefill_start_perf_counter)
+                        timing_metrics[
+                            "prefill_end_seconds"
+                        ] = prefill_start_seconds + (
+                            time.perf_counter() - prefill_start_perf_counter
                         )
                         disaggregated_params["timing_metrics"] = timing_metrics
 
diff --git a/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py b/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py
@@ -227,9 +227,9 @@ async def generate(self, request: vLLMMultimodalRequest, context):
                 # Update the prompt token id in the decode request to the one
                 # in response, which has image templated filled in. So that
                 # the decode worker will fetch correct amount of KV blocks.
-                decode_request.engine_prompt["prompt_token_ids"] = (
-                    prefill_response.prompt_token_ids
-                )
+                decode_request.engine_prompt[
+                    "prompt_token_ids"
+                ] = prefill_response.prompt_token_ids
                 logger.debug(
                     f"Prefill response kv_transfer_params: {prefill_response.kv_transfer_params}"
                 )
diff --git a/components/src/dynamo/vllm/tests/test_vllm_observability_fields.py b/components/src/dynamo/vllm/tests/test_vllm_observability_fields.py
@@ -35,7 +35,9 @@ class TestShouldIncludeTimingMetrics:
 
     def test_returns_true_with_multiple_observability_fields(self):
         """Timing metrics should be included when explicitly requested."""
-        request = {"observability_fields": ["worker_id", "timing_metrics", "other_field"]}
+        request = {
+            "observability_fields": ["worker_id", "timing_metrics", "other_field"]
+        }
         assert _request_contains_timing_metrics(request) is True
 
     def test_returns_false_when_observability_fields_is_none(self):