use u64 for tokens threshold

PeaBrane · PeaBrane · commit 4b7b430d4da7 · 2025-12-10T20:33:18.000-08:00
Signed-off-by: PeaBrane <yanrpei@gmail.com>
diff --git a/components/src/dynamo/frontend/main.py b/components/src/dynamo/frontend/main.py
@@ -197,9 +197,9 @@ def parse_args():
     )
     parser.add_argument(
         "--active-prefill-tokens-threshold",
-        type=float,
+        type=int,
         default=None,
-        help="Threshold percentage for determining when a worker is considered busy based on prefill token utilization. Can exceed 1.0 since active prefill tokens include queued tokens. If not set, tokens-based busy detection is disabled.",
+        help="Literal token count threshold for determining when a worker is considered busy based on prefill token utilization. When active prefill tokens exceed this threshold, the worker is marked as busy. If not set, tokens-based busy detection is disabled.",
     )
     parser.add_argument(
         "--model-name",
diff --git a/docs/router/kv_cache_routing.md b/docs/router/kv_cache_routing.md
@@ -33,7 +33,7 @@ The main KV-aware routing arguments:
 
 - `--active-decode-blocks-threshold`: Initial threshold (0.0-1.0) for determining when a worker is considered busy based on KV cache block utilization. When a worker's KV cache active blocks exceed this percentage of total blocks, it will be marked as busy and excluded from routing. If not set, blocks-based busy detection is disabled. This feature works with all routing modes (`--router-mode kv|round-robin|random`) as long as backend engines emit `ForwardPassMetrics`. The threshold can be dynamically updated at runtime via the `/busy_threshold` HTTP endpoint (see [Dynamic Threshold Configuration](#dynamic-threshold-configuration)).
 
-- `--active-prefill-tokens-threshold`: Threshold for determining when a worker is considered busy based on prefill token utilization. Can exceed 1.0 since active prefill tokens include queued tokens (pending prefill work). If not set, tokens-based busy detection is disabled. When set, the router checks if active prefill tokens exceed `threshold * max_num_batch_tokens`. Generally, set this higher than 1.0 to account for queued requests.
+- `--active-prefill-tokens-threshold`: Literal token count threshold for determining when a worker is considered busy based on prefill token utilization. When active prefill tokens exceed this threshold, the worker is marked as busy. If not set, tokens-based busy detection is disabled.
 
 - `--router-ttl`: Time-to-live in seconds for blocks in the router's local cache predictions. Blocks older than this duration will be automatically expired and removed from the router's radix tree. Defaults to 120.0 seconds when `--no-kv-events` is used. This helps manage memory usage by removing stale cache predictions that are unlikely to be accurate.
 
@@ -594,8 +594,8 @@ The busy thresholds can be updated at runtime without restarting the frontend. T
 # Set both thresholds for a model
 curl -X POST http://localhost:8000/busy_threshold \
   -H "Content-Type: application/json" \
-  -d '{"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1.5}'
-# Response: {"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1.5}
+  -d '{"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}'
+# Response: {"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}
 
 # Set only active decode blocks threshold
 curl -X POST http://localhost:8000/busy_threshold \
@@ -607,12 +607,12 @@ curl -X POST http://localhost:8000/busy_threshold \
 curl -X POST http://localhost:8000/busy_threshold \
   -H "Content-Type: application/json" \
   -d '{"model": "meta-llama/Llama-2-7b-hf"}'
-# Response: {"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1.5}
+# Response: {"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}
 # Or if not configured: {"model": "...", "active_decode_blocks_threshold": null, "active_prefill_tokens_threshold": null}
 ```
 
 **List all configured thresholds (GET):**
 ```bash
 curl http://localhost:8000/busy_threshold
-# Response: {"thresholds": [{"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1.5}]}
+# Response: {"thresholds": [{"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}]}
 ```
diff --git a/lib/bindings/c/src/lib.rs b/lib/bindings/c/src/lib.rs
@@ -1033,8 +1033,8 @@ pub async fn create_worker_selection_pipeline_chat(
 
     // Create worker monitor if busy_threshold is set
     // Note: C bindings don't register with ModelManager, so HTTP endpoint won't see this
-    // C bindings only support active_decode_blocks_threshold for now (active_prefill_tokens_threshold defaults to 1000.0 = effectively disabled)
-    let worker_monitor = busy_threshold.map(|t| KvWorkerMonitor::new(client.clone(), t, 1000.0));
+    // C bindings only support active_decode_blocks_threshold for now (active_prefill_tokens_threshold defaults to 1000000 tokens = effectively disabled)
+    let worker_monitor = busy_threshold.map(|t| KvWorkerMonitor::new(client.clone(), t, 1000000));
 
     let engine = build_routed_pipeline::<
         NvCreateChatCompletionRequest,
diff --git a/lib/bindings/python/rust/llm/entrypoint.rs b/lib/bindings/python/rust/llm/entrypoint.rs
@@ -79,8 +79,8 @@ pub struct RouterConfig {
     kv_router_config: KvRouterConfig,
     /// Threshold for active decode blocks utilization (0.0-1.0)
     active_decode_blocks_threshold: Option<f64>,
-    /// Threshold for active prefill tokens utilization (can exceed 1.0)
-    active_prefill_tokens_threshold: Option<f64>,
+    /// Threshold for active prefill tokens utilization (literal token count)
+    active_prefill_tokens_threshold: Option<u64>,
     enforce_disagg: bool,
 }
 
@@ -92,7 +92,7 @@ impl RouterConfig {
         mode: RouterMode,
         config: Option<KvRouterConfig>,
         active_decode_blocks_threshold: Option<f64>,
-        active_prefill_tokens_threshold: Option<f64>,
+        active_prefill_tokens_threshold: Option<u64>,
         enforce_disagg: bool,
     ) -> Self {
         Self {
diff --git a/lib/llm/src/discovery/model_manager.rs b/lib/llm/src/discovery/model_manager.rs
@@ -520,7 +520,7 @@ impl ModelManager {
 
     /// Get or set the active prefill tokens threshold for a model's worker monitor.
     ///
-    /// The threshold can exceed 1.0 since active prefill tokens include queued tokens.
+    /// The threshold is a literal token count (not a percentage).
     ///
     /// # Arguments
     ///
@@ -529,12 +529,12 @@ impl ModelManager {
     ///
     /// # Returns
     ///
-    /// The threshold value as f64, or `None` if no monitor exists for this model.
+    /// The threshold value as u64, or `None` if no monitor exists for this model.
     pub fn active_prefill_tokens_threshold(
         &self,
         model: &str,
-        threshold: Option<f64>,
-    ) -> Option<f64> {
+        threshold: Option<u64>,
+    ) -> Option<u64> {
         let monitors = self.worker_monitors.read();
         let monitor = monitors.get(model)?;
 
@@ -557,7 +557,7 @@ impl ModelManager {
     /// * `model` - The model name
     /// * `client` - The client for subscribing to KV metrics (only used if creating new)
     /// * `active_decode_blocks_threshold` - The initial/updated active decode blocks threshold value (0.0-1.0)
-    /// * `active_prefill_tokens_threshold` - The initial/updated active prefill tokens threshold value (can exceed 1.0)
+    /// * `active_prefill_tokens_threshold` - The initial/updated active prefill tokens threshold value (literal token count)
     ///
     /// # Returns
     ///
@@ -567,7 +567,7 @@ impl ModelManager {
         model: &str,
         client: Client,
         active_decode_blocks_threshold: f64,
-        active_prefill_tokens_threshold: f64,
+        active_prefill_tokens_threshold: u64,
     ) -> KvWorkerMonitor {
         let mut monitors = self.worker_monitors.write();
 
@@ -594,7 +594,7 @@ impl ModelManager {
     /// Lists all models that have worker monitors (and thus busy thresholds) configured.
     ///
     /// Returns a vector of (model_name, active_decode_blocks_threshold, active_prefill_tokens_threshold) tuples.
-    pub fn list_busy_thresholds(&self) -> Vec<(String, f64, f64)> {
+    pub fn list_busy_thresholds(&self) -> Vec<(String, f64, u64)> {
         self.worker_monitors
             .read()
             .iter()
diff --git a/lib/llm/src/discovery/watcher.rs b/lib/llm/src/discovery/watcher.rs
@@ -408,15 +408,15 @@ impl ModelWatcher {
             let worker_monitor = if self.router_config.active_decode_blocks_threshold.is_some()
                 || self.router_config.active_prefill_tokens_threshold.is_some()
             {
-                // Default thresholds: active_decode_blocks=1.0 (disabled), active_prefill_tokens=1000.0 (effectively disabled)
+                // Default thresholds: active_decode_blocks=1.0 (disabled), active_prefill_tokens=1000000 (effectively disabled)
                 let active_decode_blocks = self
                     .router_config
                     .active_decode_blocks_threshold
                     .unwrap_or(1.0);
                 let active_prefill_tokens = self
                     .router_config
                     .active_prefill_tokens_threshold
-                    .unwrap_or(1000.0);
+                    .unwrap_or(1000000);
                 Some(self.manager.get_or_create_worker_monitor(
                     card.name(),
                     client.clone(),
diff --git a/lib/llm/src/discovery/worker_monitor.rs b/lib/llm/src/discovery/worker_monitor.rs
@@ -17,24 +17,20 @@ use tokio_stream::StreamExt;
 /// Scale factor for storing f64 thresholds as u32 (10000 = 4 decimal places)
 const THRESHOLD_SCALE: u32 = 10000;
 
-/// Scale factor for storing f64 tokens threshold as u64 (values can exceed 1.0)
-const TOKENS_THRESHOLD_SCALE: u64 = 10000;
-
 /// Worker load monitoring state per dp_rank
 #[derive(Clone, Debug, Default)]
 pub struct WorkerLoadState {
     pub active_decode_blocks: HashMap<u32, u64>,
     pub kv_total_blocks: HashMap<u32, u64>,
     pub active_prefill_tokens: HashMap<u32, u64>,
-    pub max_num_batch_tokens: HashMap<u32, u64>,
 }
 
 impl WorkerLoadState {
     /// Returns true if ALL dp_ranks are considered busy based on the dual-threshold logic:
     ///
     /// For each dp_rank:
-    /// 1. If `active_prefill_tokens` and `max_num_batch_tokens` are both available,
-    ///    check if tokens exceed threshold. If so, that dp_rank is busy.
+    /// 1. If `active_prefill_tokens` is available, check if tokens exceed the literal threshold.
+    ///    If so, that dp_rank is busy.
     /// 2. If not, check if `active_decode_blocks` and `kv_total_blocks` are both available,
     ///    and if blocks exceed threshold. If so, that dp_rank is busy.
     /// 3. If neither check can be performed (missing data), that dp_rank is considered free.
@@ -43,7 +39,7 @@ impl WorkerLoadState {
     pub fn is_busy(
         &self,
         active_decode_blocks_threshold: f64,
-        active_prefill_tokens_threshold: f64,
+        active_prefill_tokens_threshold: u64,
     ) -> bool {
         // Get all dp_ranks we know about
         let all_dp_ranks: std::collections::HashSet<_> = self
@@ -60,13 +56,9 @@ impl WorkerLoadState {
 
         // Check if ALL dp_ranks are busy
         all_dp_ranks.iter().all(|&dp_rank| {
-            // First check: prefill tokens threshold
-            // Skip if max_tokens is 0 (no capacity means threshold check is meaningless)
-            if let (Some(&active_tokens), Some(&max_tokens)) = (
-                self.active_prefill_tokens.get(&dp_rank),
-                self.max_num_batch_tokens.get(&dp_rank),
-            ) && max_tokens > 0
-                && (active_tokens as f64) > (active_prefill_tokens_threshold * max_tokens as f64)
+            // First check: prefill tokens threshold (literal token count)
+            if let Some(&active_tokens) = self.active_prefill_tokens.get(&dp_rank)
+                && active_tokens > active_prefill_tokens_threshold
             {
                 return true; // This dp_rank is busy due to tokens
             }
@@ -98,7 +90,7 @@ pub struct KvWorkerMonitor {
     worker_load_states: Arc<RwLock<HashMap<u64, WorkerLoadState>>>,
     /// Active decode blocks threshold stored as parts-per-10000 (e.g., 8500 = 0.85)
     active_decode_blocks_threshold: Arc<AtomicU32>,
-    /// Active prefill tokens threshold stored as parts-per-10000 (can exceed 10000 for values > 1.0)
+    /// Active prefill tokens threshold stored as literal token count (u64)
     active_prefill_tokens_threshold: Arc<AtomicU64>,
     /// Guard to ensure start_monitoring() only runs once across clones
     started: Arc<AtomicBool>,
@@ -107,15 +99,15 @@ pub struct KvWorkerMonitor {
 impl KvWorkerMonitor {
     /// Create a new worker monitor with the given thresholds.
     ///
-    /// - `active_decode_blocks_threshold` (0.0-1.0): Threshold for KV cache block utilization
-    /// - `active_prefill_tokens_threshold` (can exceed 1.0): Threshold for prefill token utilization
+    /// - `active_decode_blocks_threshold` (0.0-1.0): Threshold percentage for KV cache block utilization
+    /// - `active_prefill_tokens_threshold`: Literal token count threshold for prefill token utilization
     ///
     /// Both thresholds can be dynamically updated via `set_active_decode_blocks_threshold()` and
     /// `set_active_prefill_tokens_threshold()`.
     pub fn new(
         client: Client,
         active_decode_blocks_threshold: f64,
-        active_prefill_tokens_threshold: f64,
+        active_prefill_tokens_threshold: u64,
     ) -> Self {
         Self {
             client,
@@ -124,7 +116,7 @@ impl KvWorkerMonitor {
                 Self::active_decode_blocks_threshold_to_scaled(active_decode_blocks_threshold),
             )),
             active_prefill_tokens_threshold: Arc::new(AtomicU64::new(
-                Self::active_prefill_tokens_threshold_to_scaled(active_prefill_tokens_threshold),
+                active_prefill_tokens_threshold,
             )),
             started: Arc::new(AtomicBool::new(false)),
         }
@@ -142,18 +134,6 @@ impl KvWorkerMonitor {
         scaled as f64 / THRESHOLD_SCALE as f64
     }
 
-    /// Convert a f64 active prefill tokens threshold (can exceed 1.0) to scaled u64 for atomic storage.
-    #[inline]
-    fn active_prefill_tokens_threshold_to_scaled(threshold: f64) -> u64 {
-        (threshold * TOKENS_THRESHOLD_SCALE as f64) as u64
-    }
-
-    /// Convert a scaled u64 back to f64 active prefill tokens threshold.
-    #[inline]
-    fn scaled_to_active_prefill_tokens_threshold(scaled: u64) -> f64 {
-        scaled as f64 / TOKENS_THRESHOLD_SCALE as f64
-    }
-
     /// Get the current active decode blocks threshold value as f64.
     pub fn active_decode_blocks_threshold(&self) -> f64 {
         Self::scaled_to_active_decode_blocks_threshold(
@@ -169,19 +149,15 @@ impl KvWorkerMonitor {
         );
     }
 
-    /// Get the current active prefill tokens threshold value as f64.
-    pub fn active_prefill_tokens_threshold(&self) -> f64 {
-        Self::scaled_to_active_prefill_tokens_threshold(
-            self.active_prefill_tokens_threshold.load(Ordering::Relaxed),
-        )
+    /// Get the current active prefill tokens threshold value as u64.
+    pub fn active_prefill_tokens_threshold(&self) -> u64 {
+        self.active_prefill_tokens_threshold.load(Ordering::Relaxed)
     }
 
-    /// Set the active prefill tokens threshold value from f64.
-    pub fn set_active_prefill_tokens_threshold(&self, threshold: f64) {
-        self.active_prefill_tokens_threshold.store(
-            Self::active_prefill_tokens_threshold_to_scaled(threshold),
-            Ordering::Relaxed,
-        );
+    /// Set the active prefill tokens threshold value from u64.
+    pub fn set_active_prefill_tokens_threshold(&self, threshold: u64) {
+        self.active_prefill_tokens_threshold
+            .store(threshold, Ordering::Relaxed);
     }
 
     /// Get the worker load states for external access
@@ -244,7 +220,7 @@ impl WorkerLoadMonitor for KvWorkerMonitor {
                         let mut states = worker_load_states.write().unwrap();
                         states.retain(|lease_id, _| runtime_configs.contains_key(lease_id));
 
-                        // Update worker load states with total blocks and max batch tokens for all dp_ranks
+                        // Update worker load states with total blocks for all dp_ranks
                         for (lease_id, runtime_config) in runtime_configs.iter() {
                             let state = states.entry(*lease_id).or_default();
 
@@ -254,13 +230,6 @@ impl WorkerLoadMonitor for KvWorkerMonitor {
                                     state.kv_total_blocks.insert(dp_rank, total_blocks);
                                 }
                             }
-
-                            // Populate max_num_batch_tokens for all dp_ranks
-                            if let Some(max_tokens) = runtime_config.max_num_batched_tokens {
-                                for dp_rank in 0..runtime_config.data_parallel_size {
-                                    state.max_num_batch_tokens.insert(dp_rank, max_tokens);
-                                }
-                            }
                         }
                     }
 
@@ -294,9 +263,7 @@ impl WorkerLoadMonitor for KvWorkerMonitor {
                         let current_active_decode_blocks_threshold = Self::scaled_to_active_decode_blocks_threshold(
                             active_decode_blocks_threshold.load(Ordering::Relaxed),
                         );
-                        let current_active_prefill_tokens_threshold = Self::scaled_to_active_prefill_tokens_threshold(
-                            active_prefill_tokens_threshold.load(Ordering::Relaxed),
-                        );
+                        let current_active_prefill_tokens_threshold = active_prefill_tokens_threshold.load(Ordering::Relaxed);
 
                         // Recalculate all busy instances and update
                         let states = worker_load_states.read().unwrap();
diff --git a/lib/llm/src/entrypoint.rs b/lib/llm/src/entrypoint.rs
@@ -23,8 +23,8 @@ pub struct RouterConfig {
     pub kv_router_config: KvRouterConfig,
     /// Threshold for active decode blocks utilization (0.0-1.0)
     pub active_decode_blocks_threshold: Option<f64>,
-    /// Threshold for active prefill tokens utilization (can exceed 1.0)
-    pub active_prefill_tokens_threshold: Option<f64>,
+    /// Threshold for active prefill tokens utilization (literal token count)
+    pub active_prefill_tokens_threshold: Option<u64>,
     pub enforce_disagg: bool,
 }
 
@@ -44,7 +44,7 @@ impl RouterConfig {
         self
     }
 
-    pub fn with_active_prefill_tokens_threshold(mut self, threshold: Option<f64>) -> Self {
+    pub fn with_active_prefill_tokens_threshold(mut self, threshold: Option<u64>) -> Self {
         self.active_prefill_tokens_threshold = threshold;
         self
     }
diff --git a/lib/llm/src/http/service/busy_threshold.rs b/lib/llm/src/http/service/busy_threshold.rs
diff --git a/tests/router/common.py b/tests/router/common.py

Original file line number	Diff line number	Diff line change
`@@ -197,9 +197,9 @@ def parse_args():`
`197`	`197`	`)`
`198`	`198`	`parser.add_argument(`
`199`	`199`	`"--active-prefill-tokens-threshold",`
`200`		`- type=float,`
	`200`	`+ type=int,`
`201`	`201`	`default=None,`
`202`		`- help="Threshold percentage for determining when a worker is considered busy based on prefill token utilization. Can exceed 1.0 since active prefill tokens include queued tokens. If not set, tokens-based busy detection is disabled.",`
	`202`	`+ help="Literal token count threshold for determining when a worker is considered busy based on prefill token utilization. When active prefill tokens exceed this threshold, the worker is marked as busy. If not set, tokens-based busy detection is disabled.",`
`203`	`203`	`)`
`204`	`204`	`parser.add_argument(`
`205`	`205`	`"--model-name",`
Original file line number	Diff line number	Diff line change
`@@ -23,8 +23,8 @@ pub struct RouterConfig {`
`23`	`23`	`pub kv_router_config: KvRouterConfig,`
`24`	`24`	`/// Threshold for active decode blocks utilization (0.0-1.0)`
`25`	`25`	`pub active_decode_blocks_threshold: Option<f64>,`
`26`		`- /// Threshold for active prefill tokens utilization (can exceed 1.0)`
`27`		`- pub active_prefill_tokens_threshold: Option<f64>,`
	`26`	`+ /// Threshold for active prefill tokens utilization (literal token count)`
	`27`	`+ pub active_prefill_tokens_threshold: Option<u64>,`
`28`	`28`	`pub enforce_disagg: bool,`
`29`	`29`	`}`
`30`	`30`
`@@ -44,7 +44,7 @@ impl RouterConfig {`
`44`	`44`	`self`
`45`	`45`	`}`
`46`	`46`
`47`		`- pub fn with_active_prefill_tokens_threshold(mut self, threshold: Option<f64>) -> Self {`
	`47`	`+ pub fn with_active_prefill_tokens_threshold(mut self, threshold: Option<u64>) -> Self {`
`48`	`48`	`self.active_prefill_tokens_threshold = threshold;`
`49`	`49`	`self`
`50`	`50`	`}`