ai-dynamo
diff --git a/‎Cargo.lock‎
Lines changed: 2 additions & 2 deletions b/‎Cargo.lock‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎components/src/dynamo/frontend/main.py‎
Lines changed: 13 additions & 3 deletions b/‎components/src/dynamo/frontend/main.py‎
Lines changed: 13 additions & 3 deletions
diff --git a/‎components/src/dynamo/mocker/args.py‎
Lines changed: 7 additions & 0 deletions b/‎components/src/dynamo/mocker/args.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎components/src/dynamo/planner/utils/perf_interpolation.py‎
Lines changed: 6 additions & 1 deletion b/‎components/src/dynamo/planner/utils/perf_interpolation.py‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎components/src/dynamo/vllm/args.py‎
Lines changed: 10 additions & 0 deletions b/‎components/src/dynamo/vllm/args.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎components/src/dynamo/vllm/main.py‎
Lines changed: 2 additions & 0 deletions b/‎components/src/dynamo/vllm/main.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/router/kv_cache_routing.md‎
Lines changed: 18 additions & 12 deletions b/‎docs/router/kv_cache_routing.md‎
Lines changed: 18 additions & 12 deletions
diff --git a/‎lib/bindings/c/src/lib.rs‎
Lines changed: 5 additions & 2 deletions b/‎lib/bindings/c/src/lib.rs‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎lib/bindings/python/rust/llm/entrypoint.rs‎
Lines changed: 11 additions & 5 deletions b/‎lib/bindings/python/rust/llm/entrypoint.rs‎
Lines changed: 11 additions & 5 deletions
diff --git a/‎lib/bindings/python/rust/llm/kv.rs‎
Lines changed: 11 additions & 4 deletions b/‎lib/bindings/python/rust/llm/kv.rs‎
Lines changed: 11 additions & 4 deletions
@@ -190,10 +190,16 @@ def parse_args():
         help="Enforce disaggregated prefill-decode. When set, unactivated prefill router will return an error instead of falling back to decode-only mode.",
     )
     parser.add_argument(
-        "--busy-threshold",
+        "--active-decode-blocks-threshold",
         type=float,
         default=None,
-        help="Threshold (0.0-1.0) for determining when a worker is considered busy based on KV cache usage. If not set, busy detection is disabled.",
+        help="Threshold percentage (0.0-1.0) for determining when a worker is considered busy based on KV cache block utilization. If not set, blocks-based busy detection is disabled.",
+    )
+    parser.add_argument(
+        "--active-prefill-tokens-threshold",
+        type=int,
+        default=None,
+        help="Literal token count threshold for determining when a worker is considered busy based on prefill token utilization. When active prefill tokens exceed this threshold, the worker is marked as busy. If not set, tokens-based busy detection is disabled.",
     )
     parser.add_argument(
         "--model-name",
@@ -316,7 +322,11 @@ def signal_handler():
         "http_port": flags.http_port,
         "kv_cache_block_size": flags.kv_cache_block_size,
         "router_config": RouterConfig(
-            router_mode, kv_router_config, flags.busy_threshold, flags.enforce_disagg
+            router_mode,
+            kv_router_config,
+            active_decode_blocks_threshold=flags.active_decode_blocks_threshold,
+            active_prefill_tokens_threshold=flags.active_prefill_tokens_threshold,
+            enforce_disagg=flags.enforce_disagg,
         ),
     }
 
 
@@ -113,6 +113,7 @@ def create_temp_engine_args_file(args) -> Path:
         else None,
         "is_prefill": getattr(args, "is_prefill_worker", None),
         "is_decode": getattr(args, "is_decode_worker", None),
+        "enable_local_indexer": getattr(args, "enable_local_indexer", None),
     }
 
     # Remove None values to only include explicitly set arguments
@@ -284,6 +285,12 @@ def parse_args():
         default=False,
         help="Mark this as a decode worker which does not publish KV events and skips prefill cost estimation (default: False)",
     )
+    parser.add_argument(
+        "--enable-local-indexer",
+        action="store_true",
+        default=False,
+        help="Enable worker-local KV indexer for tracking this worker's own KV cache state (default: False)",
+    )
     parser.add_argument(
         "--store-kv",
         type=str,
 
@@ -20,7 +20,6 @@
 from typing import Optional
 
 import numpy as np
-import scipy
 
 from dynamo.runtime.logging import configure_dynamo_logging
 
@@ -80,6 +79,9 @@ def __init__(
         self.min_isl = min(self.prefill_isl)
         self.max_isl = max(self.prefill_isl)
 
+        # Import scipy here rather than at module level so that merely importing this module does not load scipy
+        import scipy.interpolate
+
         # perform 1d interpolation
         self.ttft_interpolator = scipy.interpolate.interp1d(
             self.prefill_isl, self.prefill_ttft, kind="cubic"
@@ -151,6 +153,9 @@ def __init__(
         self.yi = np.linspace(0, max(self.y_context_length), resolution)
         self.X, self.Y = np.meshgrid(self.xi, self.yi)
 
+        # Import scipy here rather than at module level so that merely importing this module does not load scipy
+        import scipy.interpolate
+
         # perform 2d interpolation with fallback for NaN values
         self.itl_interpolator = scipy.interpolate.griddata(
             (self.x_kv_usage, self.y_context_length),
 
@@ -40,6 +40,7 @@ class Config:
     custom_jinja_template: Optional[str] = None
     store_kv: str
     request_plane: str
+    enable_local_indexer: bool = False
 
     # mirror vLLM
     model: str
@@ -204,6 +205,13 @@ def parse_args() -> Config:
         default=os.environ.get("DYN_REQUEST_PLANE", "nats"),
         help="Determines how requests are distributed from routers to workers. 'tcp' is fastest [nats|http|tcp]",
     )
+    parser.add_argument(
+        "--enable-local-indexer",
+        type=str,
+        choices=["true", "false"],
+        default=os.environ.get("DYN_LOCAL_INDEXER", "false"),
+        help="Enable worker-local KV indexer for tracking this worker's own KV cache state (can also be toggled with env var DYN_LOCAL_INDEXER).",
+    )
     parser.add_argument(
         "--use-vllm-tokenizer",
         action="store_true",
@@ -214,6 +222,7 @@ def parse_args() -> Config:
 
     parser = AsyncEngineArgs.add_cli_args(parser)
     args = parser.parse_args()
+    args.enable_local_indexer = str(args.enable_local_indexer).lower() == "true"
     engine_args = AsyncEngineArgs.from_cli_args(args)
 
     # Workaround for vLLM GIL contention bug with NIXL connector when using UniProcExecutor.
@@ -312,6 +321,7 @@ def parse_args() -> Config:
     config.mm_prompt_template = args.mm_prompt_template
     config.store_kv = args.store_kv
     config.request_plane = args.request_plane
+    config.enable_local_indexer = args.enable_local_indexer
     config.use_vllm_tokenizer = args.use_vllm_tokenizer
 
     # Validate custom Jinja template file exists if provided
 
@@ -224,6 +224,7 @@ def setup_kv_event_publisher(
             worker_id=generate_endpoint.connection_id(),
             kv_block_size=vllm_config.cache_config.block_size,
             zmq_endpoint=zmq_endpoint,
+            enable_local_indexer=config.enable_local_indexer,
         )
         kv_publisher = ZmqKvEventPublisher(component=component, config=zmq_config)
         kv_publishers.append(kv_publisher)
@@ -336,6 +337,7 @@ async def register_vllm_model(
     runtime_config.total_kv_blocks = runtime_values["num_gpu_blocks"]
     runtime_config.max_num_seqs = runtime_values["max_num_seqs"]
     runtime_config.max_num_batched_tokens = runtime_values["max_num_batched_tokens"]
+    runtime_config.enable_local_indexer = config.enable_local_indexer
 
     # Add tool/reasoning parsers for decode models
     if model_type != ModelType.Prefill:
 
@@ -31,7 +31,9 @@ The main KV-aware routing arguments:
 
 - `--no-track-active-blocks`: Disables tracking of active blocks (blocks being used for ongoing generation/decode phases). By default, the router tracks active blocks for load balancing. Disable this when routing to workers that only perform prefill (no decode phase), as tracking decode load is not relevant. This reduces router overhead and simplifies state management.
 
-- `--busy-threshold`: Initial threshold (0.0-1.0) for determining when a worker is considered busy based on KV cache usage. When a worker's KV cache active blocks exceed this percentage of total blocks, it will be marked as busy and excluded from routing. If not set, busy detection is disabled. This feature works with all routing modes (`--router-mode kv|round-robin|random`) as long as backend engines emit `ForwardPassMetrics`. The threshold can be dynamically updated at runtime via the `/busy_threshold` HTTP endpoint (see [Dynamic Threshold Configuration](#dynamic-threshold-configuration)).
+- `--active-decode-blocks-threshold`: Initial threshold, expressed as a fraction (0.0-1.0), for determining when a worker is considered busy based on KV cache block utilization. When a worker's active KV cache blocks exceed this fraction of its total blocks, it will be marked as busy and excluded from routing. If not set, blocks-based busy detection is disabled. This feature works with all routing modes (`--router-mode kv|round-robin|random`) as long as backend engines emit `ForwardPassMetrics`. The threshold can be dynamically updated at runtime via the `/busy_threshold` HTTP endpoint (see [Dynamic Threshold Configuration](#dynamic-threshold-configuration)).
+
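+- `--active-prefill-tokens-threshold`: Absolute token-count threshold (an integer number of tokens, not a fraction) for determining when a worker is considered busy based on prefill token utilization. When a worker's active prefill tokens exceed this threshold, the worker is marked as busy. If not set, tokens-based busy detection is disabled. Like the blocks threshold, it can be updated at runtime via the `/busy_threshold` HTTP endpoint (see [Dynamic Threshold Configuration](#dynamic-threshold-configuration)).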
 
 - `--router-ttl`: Time-to-live in seconds for blocks in the router's local cache predictions. Blocks older than this duration will be automatically expired and removed from the router's radix tree. Defaults to 120.0 seconds when `--no-kv-events` is used. This helps manage memory usage by removing stale cache predictions that are unlikely to be accurate.
 
@@ -585,28 +587,32 @@ See [KV Router Architecture](../router/README.md) for performance tuning details
 
 ## Dynamic Threshold Configuration
 
-The busy threshold can be updated at runtime without restarting the frontend. The frontend exposes HTTP endpoints at `/busy_threshold`:
+The busy thresholds can be updated at runtime without restarting the frontend. The frontend exposes HTTP endpoints at `/busy_threshold`:
 
-**Get or set a model's threshold (POST):**
+**Get or set a model's thresholds (POST):**
 ```bash
-# Set threshold for a model
+# Set both thresholds for a model
 curl -X POST http://localhost:8000/busy_threshold \
   -H "Content-Type: application/json" \
-  -d '{"model": "meta-llama/Llama-2-7b-hf", "threshold": 0.85}'
-# Response: {"model": "meta-llama/Llama-2-7b-hf", "threshold": 0.85}
+  -d '{"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}'
+# Response: {"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}
 
-# Get current threshold (omit threshold field)
+# Set only active decode blocks threshold
+curl -X POST http://localhost:8000/busy_threshold \
+  -H "Content-Type: application/json" \
+  -d '{"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85}'
+# Response: {"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": <current_value>}
+
+# Get current thresholds (omit threshold fields)
 curl -X POST http://localhost:8000/busy_threshold \
   -H "Content-Type: application/json" \
   -d '{"model": "meta-llama/Llama-2-7b-hf"}'
-# Response: {"model": "meta-llama/Llama-2-7b-hf", "threshold": 0.85}
-# Or if not configured: {"model": "...", "threshold": null}
+# Response: {"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}
+# Or if not configured: {"model": "...", "active_decode_blocks_threshold": null, "active_prefill_tokens_threshold": null}
 ```
 
 **List all configured thresholds (GET):**
 ```bash
 curl http://localhost:8000/busy_threshold
-# Response: {"thresholds": [{"model": "meta-llama/Llama-2-7b-hf", "threshold": 0.85}]}
+# Response: {"thresholds": [{"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}]}
 ```
-
-This allows you to tune the busy threshold based on observed system behavior without service interruption.
 
@@ -966,7 +966,9 @@ pub async fn create_worker_selection_pipeline_chat(
     let router_config = dynamo_llm::entrypoint::RouterConfig {
         router_mode,
         kv_router_config: kv_router_config.unwrap_or_default(),
-        busy_threshold,
+        // C bindings only support active_decode_blocks_threshold for now (via busy_threshold param)
+        active_decode_blocks_threshold: busy_threshold,
+        active_prefill_tokens_threshold: None,
         enforce_disagg: false,
     };
     let watcher = ModelWatcher::new(
@@ -1031,7 +1033,8 @@ pub async fn create_worker_selection_pipeline_chat(
 
     // Create worker monitor if busy_threshold is set
     // Note: C bindings don't register with ModelManager, so HTTP endpoint won't see this
-    let worker_monitor = busy_threshold.map(|t| KvWorkerMonitor::new(client.clone(), t));
+    // C bindings only expose active_decode_blocks_threshold for now; the prefill-token threshold is hard-coded to 1000000 tokens, a value intended to be effectively unreachable (i.e. tokens-based busy detection is disabled)
+    let worker_monitor = busy_threshold.map(|t| KvWorkerMonitor::new(client.clone(), t, 1000000));
 
     let engine = build_routed_pipeline::<
         NvCreateChatCompletionRequest,
 
@@ -77,24 +77,29 @@ impl KvRouterConfig {
 pub struct RouterConfig {
     router_mode: RouterMode,
     kv_router_config: KvRouterConfig,
-    busy_threshold: Option<f64>,
+    /// Busy threshold on KV cache block utilization, as a fraction in 0.0-1.0; `None` disables blocks-based busy detection
+    active_decode_blocks_threshold: Option<f64>,
+    /// Busy threshold on active prefill tokens, as an absolute token count; `None` disables tokens-based busy detection
+    active_prefill_tokens_threshold: Option<u64>,
     enforce_disagg: bool,
 }
 
 #[pymethods]
 impl RouterConfig {
     #[new]
-    #[pyo3(signature = (mode, config=None, busy_threshold=None, enforce_disagg=false))]
+    #[pyo3(signature = (mode, config=None, active_decode_blocks_threshold=None, active_prefill_tokens_threshold=None, enforce_disagg=false))]
     pub fn new(
         mode: RouterMode,
         config: Option<KvRouterConfig>,
-        busy_threshold: Option<f64>,
+        active_decode_blocks_threshold: Option<f64>, // takes the position of the former `busy_threshold`; fraction in 0.0-1.0
+        active_prefill_tokens_threshold: Option<u64>, // absolute token count; shifts `enforce_disagg` one position later
         enforce_disagg: bool,
     ) -> Self {
         Self {
             router_mode: mode,
             kv_router_config: config.unwrap_or_default(),
-            busy_threshold,
+            active_decode_blocks_threshold,
+            active_prefill_tokens_threshold,
             enforce_disagg,
         }
     }
@@ -105,7 +110,8 @@ impl From<RouterConfig> for RsRouterConfig {
         RsRouterConfig {
             router_mode: rc.router_mode.into(),
             kv_router_config: rc.kv_router_config.inner,
-            busy_threshold: rc.busy_threshold,
+            active_decode_blocks_threshold: rc.active_decode_blocks_threshold,
+            active_prefill_tokens_threshold: rc.active_prefill_tokens_threshold,
             enforce_disagg: rc.enforce_disagg,
         }
     }
 
@@ -21,7 +21,7 @@ use rs::traits::events::EventSubscriber;
 use tracing;
 
 use llm_rs::kv_router::protocols::*;
-use llm_rs::kv_router::publisher::{KvEventSourceConfig, create_stored_blocks};
+use llm_rs::kv_router::publisher::{KvEventSourceConfig, create_stored_blocks, start_zmq_listener};
 use llm_rs::protocols::common::{OutputOptions, SamplingOptions, StopConditions};
 
 #[pyfunction]
@@ -106,6 +106,9 @@ pub struct ZmqKvEventPublisherConfig {
     pub zmq_endpoint: String,
     #[pyo3(get, set)]
     pub zmq_topic: String,
+    #[pyo3(get, set)]
+    pub enable_local_indexer: bool, // whether the underlying KvEventPublisher publishes to
+                                    // both global and worker-local KvIndexers
 }
 
 #[pymethods]
@@ -115,19 +118,22 @@ impl ZmqKvEventPublisherConfig {
         worker_id,
         kv_block_size,
         zmq_endpoint = "tcp://127.0.0.1:5557".to_string(),
-        zmq_topic = "".to_string()
+        zmq_topic = "".to_string(),
+        enable_local_indexer = false
     ))]
     pub fn new(
         worker_id: WorkerId,
         kv_block_size: usize,
         zmq_endpoint: String,
         zmq_topic: String,
+        enable_local_indexer: bool, // defaults to false in the pyo3 signature; stored unchanged on the config
     ) -> Self {
         Self {
             worker_id,
             kv_block_size,
             zmq_endpoint,
             zmq_topic,
+            enable_local_indexer,
         }
     }
 }
@@ -141,13 +147,14 @@ pub(crate) struct ZmqKvEventPublisher {
 impl ZmqKvEventPublisher {
     #[new]
     fn new(component: Component, config: ZmqKvEventPublisherConfig) -> PyResult<Self> {
-        let inner = llm_rs::kv_router::publisher::KvEventPublisher::new(
+        let inner = llm_rs::kv_router::publisher::KvEventPublisher::new_with_local_indexer(
             component.inner,
             config.kv_block_size as u32,
             Some(KvEventSourceConfig::Zmq {
                 endpoint: config.zmq_endpoint,
                 topic: config.zmq_topic,
             }),
+            config.enable_local_indexer,
         )
         .map_err(to_pyerr)?;
         Ok(Self { inner })
@@ -179,7 +186,7 @@ impl ZmqKvEventListener {
             let (tx, rx) = tokio::sync::mpsc::unbounded_channel::<KvCacheEvent>();
             let shutdown_token = tokio_util::sync::CancellationToken::new();
 
-            tokio::spawn(llm_rs::kv_router::publisher::start_zmq_listener(
+            tokio::spawn(start_zmq_listener(
                 zmq_endpoint,
                 zmq_topic,
                 tx,