@@ -75,11 +75,11 @@ def build_sampling_params(
 
 
 def _request_contains_timing_metrics(request: Dict[str, Any]) -> bool:
-    """Check if timing_metrics is requested in extra_fields."""
-    extra_fields: Optional[List[str]] = request.get("extra_fields")
-    if extra_fields is None:
+    """Check if timing_metrics is requested in observability_fields."""
+    observability_fields: Optional[List[str]] = request.get("observability_fields")
+    if observability_fields is None:
         return False
-    return "timing_metrics" in extra_fields
+    return "timing_metrics" in observability_fields
 
 
 class BaseWorkerHandler(ABC):
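
The renamed helper gates timing purely on the request dict. A rough sketch of its behavior (the payloads below are illustrative; only the `observability_fields` and `timing_metrics` names come from this diff):

```python
# Hypothetical request dicts, using only the field names visible in the diff.
assert _request_contains_timing_metrics({"observability_fields": ["timing_metrics"]})
assert not _request_contains_timing_metrics({"observability_fields": []})
assert not _request_contains_timing_metrics({})  # field absent -> timing disabled
```
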
@@ -259,10 +259,10 @@ async def generate_tokens(
                 out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
                 if output.finish_reason:
                     out["finish_reason"] = output.finish_reason
-                    out[
-                        "completion_usage"
-                    ] = BaseWorkerHandler._build_completion_usage(
-                        request_output=res,
+                    out["completion_usage"] = (
+                        BaseWorkerHandler._build_completion_usage(
+                            request_output=res,
+                        )
                     )
                 if output.stop_reason:
                     out["stop_reason"] = output.stop_reason
@@ -309,9 +309,18 @@ async def generate(self, request, context):
         include_timing = _request_contains_timing_metrics(request)
 
         # Initialize timing metrics using request_received_seconds from frontend (passed via PreprocessedRequest)
-        # NOTE: If frontend, prefill workers, and decode workers are running on different machines,
-        # there may be slight clock drifts between them. As a result, timing values recorded on
-        # different machines may not be perfectly synchronized and could show minor inconsistencies.
+        #
+        # TIMING METRICS:
+        # - Reliable durations: Use same-machine timestamps (e.g., decode_end - decode_start).
+        #   We use time.perf_counter() for intra-worker duration calculations to ensure monotonic,
+        #   high-resolution timing that's immune to system clock adjustments.
+        # - Cross-machine calculations (e.g., prefill_start - request_received) assume perfect NTP
+        #   synchronization and should be used with UTMOST CAUTION due to clock drift. Even with NTP,
+        #   clocks can drift by milliseconds each day, leading to negative durations or misleading latency values.
+        #   These cross-machine metrics are useful for rough end-to-end analysis but should not be
+        #   relied upon for precise performance measurements.
+        # - TODO: Measure actual overhead (network, queueing, etc.) - expected to be low but needs
+        #   benchmarking
         timing_metrics: Dict[str, float] = {}
         if include_timing:
             # Use request_received_seconds from the request (set by frontend) if available
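
The comment above boils down to one pattern: capture a single wall-clock anchor, then derive later timestamps from `time.perf_counter()` deltas so same-worker durations stay monotonic. A minimal, self-contained sketch of that pattern (names are illustrative, not taken from this codebase):

```python
import time

class StageTimer:
    """Illustrative helper: wall-clock anchor plus monotonic perf_counter deltas."""

    def __init__(self) -> None:
        self.start_seconds = time.time()        # epoch anchor, comparable with other metrics
        self._start_perf = time.perf_counter()  # monotonic reference for durations

    def now_seconds(self) -> float:
        # Epoch-style timestamp derived from a monotonic delta, so a duration
        # computed against start_seconds cannot go negative even if the system
        # clock is stepped by NTP mid-request.
        return self.start_seconds + (time.perf_counter() - self._start_perf)

timer = StageTimer()
# ... decode work ...
decode_end_seconds = timer.now_seconds()
decode_duration = decode_end_seconds - timer.start_seconds  # always >= 0
```
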
@@ -371,6 +380,7 @@ async def generate(self, request, context):
         # Record decode start time
         if include_timing:
             decode_start_seconds = time.time()
+            decode_start_perf_counter = time.perf_counter()
             # If this is aggregated mode (no prefill_result), prefill_start == decode_start
             if prefill_result is None:
                 timing_metrics["prefill_start_seconds"] = decode_start_seconds
@@ -396,7 +406,9 @@ async def generate(self, request, context):
             # On finish, record decode_end_seconds and inject timing_metrics
             # Note: request_finish_seconds is set in the Rust HTTP layer when the response actually leaves the server
             if tok.get("finish_reason") is not None and include_timing:
-                timing_metrics["decode_end_seconds"] = time.time()
+                timing_metrics["decode_end_seconds"] = decode_start_seconds + (
+                    time.perf_counter() - decode_start_perf_counter
+                )
 
                 # Inject timing_metrics into disaggregated_params
                 if (
@@ -442,9 +454,7 @@ async def generate(self, request, context):
         include_timing = _request_contains_timing_metrics(request)
 
         # Initialize timing metrics using request_received_seconds from frontend (passed via PreprocessedRequest)
-        # NOTE: If frontend, prefill workers, and decode workers are running on different machines,
-        # there may be slight clock drifts between them. As a result, timing values recorded on
-        # different machines may not be perfectly synchronized and could show minor inconsistencies.
+        # See DecodeWorkerHandler.generate() for timing metrics documentation
         timing_metrics: Dict[str, float] = {}
         if include_timing:
             # Use request_received_seconds from the request (set by frontend) if available
@@ -453,7 +463,9 @@ async def generate(self, request, context):
                 timing_metrics["request_received_seconds"] = frontend_received
 
             # Record prefill_start as when we start processing in the prefill worker
-            timing_metrics["prefill_start_seconds"] = time.time()
+            prefill_start_seconds = time.time()
+            prefill_start_perf_counter = time.perf_counter()
+            timing_metrics["prefill_start_seconds"] = prefill_start_seconds
 
         # Extract and decode multimodal data if present
         multi_modal_data = await self._extract_multimodal_data(request)
@@ -511,12 +523,15 @@ async def generate(self, request, context):
                     disaggregated_params: Optional[Dict[str, Any]] = {}
 
                     if res.kv_transfer_params:
-                        disaggregated_params[
-                            "kv_transfer_params"
-                        ] = res.kv_transfer_params
+                        disaggregated_params["kv_transfer_params"] = (
+                            res.kv_transfer_params
+                        )
 
                     if include_timing and timing_metrics:
-                        timing_metrics["prefill_end_seconds"] = time.time()
+                        timing_metrics["prefill_end_seconds"] = (
+                            prefill_start_seconds
+                            + (time.perf_counter() - prefill_start_perf_counter)
+                        )
                         disaggregated_params["timing_metrics"] = timing_metrics
 
                     output: Dict[str, Any] = {
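
On the consumer side, the distinction the timing comment draws could look roughly like this. Only key names visible in this diff are used; the full set emitted by the workers may differ, and the helper itself is hypothetical:

```python
from typing import Dict

def summarize_timing(tm: Dict[str, float]) -> Dict[str, float]:
    """Illustrative only: separate same-machine durations from drift-prone cross-machine gaps."""
    summary: Dict[str, float] = {}
    # Same-worker duration: both ends come from the prefill worker's anchored clock.
    if "prefill_start_seconds" in tm and "prefill_end_seconds" in tm:
        summary["prefill_seconds"] = tm["prefill_end_seconds"] - tm["prefill_start_seconds"]
    # Cross-machine gap (frontend -> prefill worker): subject to clock drift,
    # so treat it as a rough indicator and clamp at zero.
    if "request_received_seconds" in tm and "prefill_start_seconds" in tm:
        summary["frontend_to_prefill_seconds_approx"] = max(
            0.0, tm["prefill_start_seconds"] - tm["request_received_seconds"]
        )
    return summary
```
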