chore: cleanups of passing around prefill and decode worker ids (#4829)

PeaBrane · web-flow · commit 1e5b20b2b394 · 2025-12-09T21:37:45.000Z
Signed-off-by: PeaBrane <yanrpei@gmail.com>
diff --git a/lib/llm/src/kv_router.rs b/lib/llm/src/kv_router.rs
@@ -22,6 +22,8 @@ use futures::stream::{self, StreamExt};
 use serde::{Deserialize, Serialize};
 use serde_json::json;
 
+use crate::protocols::openai::nvext::WorkerIdInfo;
+
 pub mod approx;
 pub mod indexer;
 pub mod prefill_router;
@@ -646,13 +648,19 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
         backend_input.estimated_prefix_hit_num_blocks = Some(overlap_amount);
         backend_input.dp_rank = Some(dp_rank);
 
-        // Get prefill worker ID if available (stored by PrefillRouter)
-        // In aggregated mode, prefill_worker_id is None, so we use decode_worker_id for both
+        // Get prefill worker ID from prefill_result if available
+        // In aggregated mode, prefill_result is None, so we use decode_worker_id for both
         let decode_worker_id = instance_id;
-        let prefill_worker_id = context
-            .get::<u64>("prefill_worker_id")
-            .ok()
-            .map(|arc| *arc)
+        let prefill_worker_id = backend_input
+            .prefill_result
+            .as_ref()
+            .and_then(|prefill_result| {
+                prefill_result
+                    .disaggregated_params
+                    .get("worker_id")
+                    .and_then(|v| serde_json::from_value::<WorkerIdInfo>(v.clone()).ok())
+                    .and_then(|info| info.prefill_worker_id)
+            })
             .or(Some(decode_worker_id)); // Use decode_worker_id if no separate prefill worker
 
         let updated_request = context.map(|_| backend_input);
@@ -699,12 +707,14 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
                                 continue;
                             };
 
-                            // prefill_worker_id comes from context (set by PrefillRouter) or falls back to instance_id
+                            // prefill_worker_id comes from prefill_result.disaggregated_params or falls back to instance_id
                             // decode_worker_id is always the current instance_id
-                            let worker_id_json = json!({
-                                "prefill_worker_id": prefill_worker_id,
-                                "decode_worker_id": decode_worker_id,
-                            });
+                            let worker_id_info = WorkerIdInfo {
+                                prefill_worker_id,
+                                decode_worker_id: Some(decode_worker_id),
+                            };
+                            let worker_id_json = serde_json::to_value(&worker_id_info)
+                                .expect("WorkerIdInfo serialization should not fail");
 
                             if let Some(obj) = data.disaggregated_params.as_mut().and_then(|p| p.as_object_mut()) {
                                 obj.insert("worker_id".to_string(), worker_id_json);
diff --git a/lib/llm/src/kv_router/prefill_router.rs b/lib/llm/src/kv_router/prefill_router.rs
@@ -176,11 +176,11 @@ impl PrefillRouter {
         Ok(())
     }
 
-    /// Call the prefill router and extract structured prefill result and worker ID
+    /// Call the prefill router and extract structured prefill result
     async fn call_prefill(
         &self,
         request: SingleIn<PreprocessedRequest>,
-    ) -> Result<(PrefillResult, Option<u64>), PrefillError> {
+    ) -> Result<PrefillResult, PrefillError> {
         // Get the prefill router, error if not activated
         let Some(prefill_router) = self.prefill_router.get() else {
             return Err(PrefillError::NotActivated);
@@ -239,21 +239,10 @@ impl PrefillRouter {
             ));
         };
 
-        // Extract prefill worker ID from disaggregated_params
-        let prefill_worker_id = disaggregated_params
-            .get("worker_id")
-            .and_then(|worker_id_json| {
-                worker_id_json
-                    .get("prefill_worker_id")
-                    .and_then(|v| v.as_u64())
-            });
-        Ok((
-            PrefillResult {
-                disaggregated_params,
-                prompt_tokens_details,
-            },
-            prefill_worker_id,
-        ))
+        Ok(PrefillResult {
+            disaggregated_params,
+            prompt_tokens_details,
+        })
     }
 }
 
@@ -310,7 +299,7 @@ impl
 
         // Handle prefill result
         match prefill_result {
-            Ok((prefill_result, prefill_worker_id)) => {
+            Ok(prefill_result) => {
                 tracing::debug!("Prefill succeeded, using disaggregated params for decode");
 
                 let mut decode_req = req;
@@ -326,14 +315,8 @@ impl
                     ..existing_override.unwrap_or_default()
                 });
 
-                // Store prefill worker ID in context if available
-                let mut decode_context = context;
-                if let Some(worker_id) = prefill_worker_id {
-                    decode_context.insert("prefill_worker_id", worker_id);
-                }
-
                 // Map the modified request through with preserved context
-                let decode_request = decode_context.map(|_| decode_req);
+                let decode_request = context.map(|_| decode_req);
                 next.generate(decode_request).await
             }
             Err(PrefillError::NotActivated) => {
diff --git a/lib/llm/src/protocols/openai/chat_completions/delta.rs b/lib/llm/src/protocols/openai/chat_completions/delta.rs
@@ -4,7 +4,10 @@
 use super::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse};
 use crate::{
     local_model::runtime_config::ModelRuntimeConfig,
-    protocols::common::{self},
+    protocols::{
+        common,
+        openai::nvext::{NvExtResponse, WorkerIdInfo},
+    },
     types::TokenIdType,
 };
 
@@ -363,35 +366,22 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateChatCompletionStreamRes
         let mut stream_response = self.create_choice(index, delta.text, finish_reason, logprobs);
 
         // Extract worker_id from disaggregated_params and inject into nvext if present
-        if let Some(worker_id_json) = delta
+        if let Some(worker_id_info) = delta
             .disaggregated_params
             .as_ref()
             .and_then(|params| params.get("worker_id"))
+            .and_then(|v| serde_json::from_value::<WorkerIdInfo>(v.clone()).ok())
         {
-            use crate::protocols::openai::nvext::{NvExtResponse, WorkerIdInfo};
-
-            let prefill_worker_id = worker_id_json
-                .get("prefill_worker_id")
-                .and_then(|v| v.as_u64());
-            let decode_worker_id = worker_id_json
-                .get("decode_worker_id")
-                .and_then(|v| v.as_u64());
-
-            let worker_id_info = WorkerIdInfo {
-                prefill_worker_id,
-                decode_worker_id,
-            };
-
             let nvext_response = NvExtResponse {
-                worker_id: Some(worker_id_info),
+                worker_id: Some(worker_id_info.clone()),
             };
 
             if let Ok(nvext_json) = serde_json::to_value(&nvext_response) {
                 stream_response.nvext = Some(nvext_json);
                 tracing::debug!(
                     "Injected worker_id into chat completion nvext: prefill={:?}, decode={:?}",
-                    prefill_worker_id,
-                    decode_worker_id
+                    worker_id_info.prefill_worker_id,
+                    worker_id_info.decode_worker_id
                 );
             }
         }
diff --git a/lib/llm/src/protocols/openai/completions/delta.rs b/lib/llm/src/protocols/openai/completions/delta.rs
@@ -2,7 +2,13 @@
 // SPDX-License-Identifier: Apache-2.0
 
 use super::{NvCreateCompletionRequest, NvCreateCompletionResponse};
-use crate::{protocols::common, types::TokenIdType};
+use crate::{
+    protocols::{
+        common,
+        openai::nvext::{NvExtResponse, WorkerIdInfo},
+    },
+    types::TokenIdType,
+};
 
 impl NvCreateCompletionRequest {
     /// Enables usage tracking for non-streaming requests to comply with OpenAI API specification.
@@ -266,35 +272,22 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateCompletionResponse> for
         let mut response = self.create_choice(index, delta.text.clone(), finish_reason, logprobs);
 
         // Extract worker_id from disaggregated_params and inject into nvext if present
-        if let Some(worker_id_json) = delta
+        if let Some(worker_id_info) = delta
             .disaggregated_params
             .as_ref()
             .and_then(|params| params.get("worker_id"))
+            .and_then(|v| serde_json::from_value::<WorkerIdInfo>(v.clone()).ok())
         {
-            use crate::protocols::openai::nvext::{NvExtResponse, WorkerIdInfo};
-
-            let prefill_worker_id = worker_id_json
-                .get("prefill_worker_id")
-                .and_then(|v| v.as_u64());
-            let decode_worker_id = worker_id_json
-                .get("decode_worker_id")
-                .and_then(|v| v.as_u64());
-
-            let worker_id_info = WorkerIdInfo {
-                prefill_worker_id,
-                decode_worker_id,
-            };
-
             let nvext_response = NvExtResponse {
-                worker_id: Some(worker_id_info),
+                worker_id: Some(worker_id_info.clone()),
             };
 
             if let Ok(nvext_json) = serde_json::to_value(&nvext_response) {
                 response.inner.nvext = Some(nvext_json);
                 tracing::debug!(
                     "Injected worker_id into completions nvext: prefill={:?}, decode={:?}",
-                    prefill_worker_id,
-                    decode_worker_id
+                    worker_id_info.prefill_worker_id,
+                    worker_id_info.decode_worker_id
                 );
             }
         }
diff --git a/tests/router/common.py b/tests/router/common.py
diff --git a/tests/router/test_router_e2e_with_mockers.py b/tests/router/test_router_e2e_with_mockers.py