Commit 27207c9

kvbm: offload update; moved to distributed, updated policies
Signed-off-by: Ryan Olson <[email protected]>
1 parent: 0f9211b

File tree: 30 files changed (+1581, -222 lines)


lib/bindings/kvbm/python/kvbm/v2/vllm/schedulers/worker.py

Lines changed: 5 additions & 13 deletions
@@ -80,7 +80,7 @@ def __init__(
        self.runtime = KvbmRuntime.build_worker(self.kvbm_override_config)

        # Create the Rust ConnectorWorker that handles NIXL registration
-       self.connector_worker = ConnectorWorker(self.runtime)
+       self.worker = ConnectorWorker(self.runtime)

        # Store peer info for handshake
        instance_id, worker_addr = self.runtime.peer_info()

@@ -149,7 +149,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]) -> None:
        # This caches tensor state for deferred NIXL registration
        # The actual NIXL registration happens when the leader triggers
        # initialization via bind_connector_metadata()
-       self.connector_worker.register_kv_caches(
+       self.worker.register_kv_caches(
            tensors,
            num_device_blocks,
            page_size,

@@ -171,7 +171,7 @@ def bind_connector_metadata(self, data: bytes) -> None:
        """
        Bind connector metadata from the leader.
        """
-       self.connector_worker.bind_connector_metadata(data)
+       self.worker.bind_connector_metadata(data)

    def clear_connector_metadata(self) -> None:
        """

@@ -222,19 +222,11 @@ def get_finished(
        Returns:
            (None, None): No finished sends/receives
        """
-       # Just acknowledge the finished requests
-       # Since our leader's request_finished() always returns False,
-       # these requests have already had their blocks freed
-       if len(finished_req_ids) > 0:
-           print(
-               f"SchedulerConnectorWorker.get_finished() acknowledging {len(finished_req_ids)} finished requests"
-           )
-
-       return (None, None)
+       return self.worker.get_finished()

    def get_block_ids_with_load_errors(self) -> set[int]:
        """Returns empty set - no load errors tracked."""
-       return set()
+       return self.worker.get_failed_onboarding()

    def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata:
        """

lib/bindings/kvbm/src/v2/connector/worker/mod.rs

Lines changed: 6 additions & 7 deletions
@@ -3,6 +3,7 @@

//! Python bindings for the v2 connector worker.

+use std::collections::HashSet;
use std::sync::Arc;

use pyo3::prelude::*;

@@ -139,14 +140,8 @@ impl PyConnectorWorker {
    /// Returns:
    ///     tuple: (Optional[set[str]], Optional[set[str]]) for (offload_ids, onboard_ids)
    ///     Returns None for each set if there are no completed requests of that type.
-   #[pyo3(name = "get_finished")]
    #[allow(clippy::type_complexity)]
-   pub fn py_get_finished(
-       &self,
-   ) -> PyResult<(
-       Option<std::collections::HashSet<String>>,
-       Option<std::collections::HashSet<String>>,
-   )> {
+   pub fn get_finished(&self) -> PyResult<(Option<HashSet<String>>, Option<HashSet<String>>)> {
        let (offload_ids, onboard_ids) = self.inner.get_finished();

        let offload = if offload_ids.is_empty() {

@@ -162,4 +157,8 @@ impl PyConnectorWorker {

        Ok((offload, onboard))
    }
+
+   pub fn get_failed_onboarding(&self) -> PyResult<HashSet<usize>> {
+       Ok(self.inner.get_failed_onboarding())
+   }
}
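
Renaming py_get_finished to get_finished makes the #[pyo3(name = "get_finished")] attribute unnecessary: PyO3 exposes a #[pymethods] function to Python under its Rust name by default, and the attribute is only needed when the Python-facing name should differ. A minimal sketch of that behavior (the Demo type is illustrative, not part of these bindings):

use pyo3::prelude::*;

// Illustrative #[pyclass]; not part of the kvbm bindings.
#[pyclass]
struct Demo;

#[pymethods]
impl Demo {
    /// Exposed to Python as `get_finished` simply because that is the Rust name.
    fn get_finished(&self) -> (Option<Vec<String>>, Option<Vec<String>>) {
        (None, None)
    }

    /// Exposed under a different Python name only because of the attribute.
    #[pyo3(name = "finished_ids")]
    fn get_finished_internal(&self) -> Vec<String> {
        Vec::new()
    }
}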

lib/kvbm/src/v2/distributed/mod.rs

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
// pub mod cohort;

pub mod leader;
+pub mod offload;
pub mod worker;

pub mod parallelism;

lib/kvbm/src/v2/integrations/offload/batch.rs renamed to lib/kvbm/src/v2/distributed/offload/batch.rs

Lines changed: 6 additions & 0 deletions
@@ -16,6 +16,7 @@ use crate::v2::logical::blocks::BlockMetadata;
use crate::v2::{BlockId, SequenceHash};

use super::handle::TransferId;
+use super::pending::PendingGuard;
use super::queue::CancellableQueue;
use super::source::SourceBlock;

@@ -73,6 +74,11 @@ pub struct QueuedBlock<T: BlockMetadata> {
    pub source: SourceBlock<T>,
    /// Transfer state for completion tracking
    pub state: Arc<std::sync::Mutex<TransferState>>,
+   /// RAII guard that removes this block from pending set on drop.
+   ///
+   /// This ensures duplicate prevention tracking is automatically cleaned up
+   /// when the block completes transfer, is cancelled, or errors out.
+   pub pending_guard: Option<PendingGuard>,
}

impl<T: BlockMetadata> std::fmt::Debug for QueuedBlock<T> {
File renamed without changes.
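
The new pending_guard field ties duplicate-prevention bookkeeping to the QueuedBlock lifetime: whichever way the block leaves the pipeline (completed, cancelled, or errored), dropping the guard removes its entry from the pending set. The PendingGuard/PendingTracker internals live in the new pending module and are not shown in this commit; the sketch below illustrates the RAII pattern under assumed types (a shared HashSet keyed by an integer block ID):

use std::collections::HashSet;
use std::sync::{Arc, Mutex};

/// Hypothetical pending-set tracker (the real PendingTracker is not shown in this diff).
#[derive(Default, Clone)]
struct Tracker {
    pending: Arc<Mutex<HashSet<u64>>>,
}

impl Tracker {
    /// Returns a guard if the block was not already pending, None for a duplicate.
    fn try_insert(&self, block_id: u64) -> Option<Guard> {
        if self.pending.lock().unwrap().insert(block_id) {
            Some(Guard { tracker: self.clone(), block_id })
        } else {
            None
        }
    }
}

/// RAII guard: removal happens in Drop, so every exit path cleans up the pending entry.
struct Guard {
    tracker: Tracker,
    block_id: u64,
}

impl Drop for Guard {
    fn drop(&mut self) {
        self.tracker.pending.lock().unwrap().remove(&self.block_id);
    }
}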

lib/kvbm/src/v2/integrations/offload/engine.rs renamed to lib/kvbm/src/v2/distributed/offload/engine.rs

Lines changed: 107 additions & 3 deletions
@@ -33,15 +33,17 @@ use std::sync::Arc;

use anyhow::Result;
use dashmap::DashMap;
+use tokio::task::JoinHandle;

use crate::v2::distributed::leader::InstanceLeader;
use crate::v2::logical::LogicalLayoutHandle;
-use crate::v2::logical::blocks::{BlockMetadata, BlockRegistry};
+use crate::v2::logical::blocks::{BlockMetadata, BlockRegistry, WeakBlock};
use crate::v2::logical::manager::BlockManager;
use crate::v2::{BlockId, G1, G2, G3, G4};

use super::handle::{TransferHandle, TransferId, TransferState};
-use super::pipeline::{Pipeline, PipelineConfig};
+use super::pipeline::{ChainOutput, ChainOutputRx, Pipeline, PipelineConfig, PipelineInput};
+use super::queue::CancellableQueue;
use super::source::SourceBlocks;

/// Central coordinator for offload pipelines.

@@ -62,6 +64,8 @@ pub struct OffloadEngine {
    g2_to_g4: Option<Pipeline<G2, G4>>,
    /// Active transfer tracking
    transfers: Arc<DashMap<TransferId, Arc<std::sync::Mutex<TransferState>>>>,
+   /// Chain router task handle (routes G1→G2 output to downstream pipelines)
+   _chain_router_handle: Option<JoinHandle<()>>,
}

impl OffloadEngine {

@@ -276,7 +280,7 @@ impl OffloadEngineBuilder {
        let runtime = self.runtime.unwrap_or_else(|| self.leader.runtime());

        // Build G1→G2 pipeline if configured
-       let g1_to_g2 = if let Some(config) = self.g1_to_g2_config {
+       let mut g1_to_g2 = if let Some(config) = self.g1_to_g2_config {
            let g1_manager = self
                .g1_manager
                .ok_or_else(|| anyhow::anyhow!("G1 manager required for G1→G2 pipeline"))?;

@@ -346,17 +350,117 @@ impl OffloadEngineBuilder {
            None
        };

+       // Wire up auto-chaining from G1→G2 to downstream G2→G3/G2→G4 pipelines
+       let chain_router_handle = if let Some(ref mut g1_to_g2_pipeline) = g1_to_g2 {
+           if g1_to_g2_pipeline.auto_chain() {
+               if let Some(chain_rx) = g1_to_g2_pipeline.take_chain_rx() {
+                   // Get references to downstream pipeline queues
+                   let g2_to_g3_queue = g2_to_g3.as_ref().map(|p| p.eval_queue.clone());
+                   let g2_to_g4_queue = g2_to_g4.as_ref().map(|p| p.eval_queue.clone());
+
+                   // Only spawn if there's at least one downstream pipeline
+                   if g2_to_g3_queue.is_some() || g2_to_g4_queue.is_some() {
+                       tracing::debug!(
+                           has_g2_to_g3 = g2_to_g3_queue.is_some(),
+                           has_g2_to_g4 = g2_to_g4_queue.is_some(),
+                           "Spawning chain router for G1→G2 auto-chaining"
+                       );
+                       Some(runtime.spawn(chain_router_task(
+                           chain_rx,
+                           g2_to_g3_queue,
+                           g2_to_g4_queue,
+                       )))
+                   } else {
+                       tracing::debug!(
+                           "G1→G2 auto_chain enabled but no downstream pipelines configured"
+                       );
+                       None
+                   }
+               } else {
+                   None
+               }
+           } else {
+               None
+           }
+       } else {
+           None
+       };
+
        Ok(OffloadEngine {
            leader: self.leader,
            registry,
            g1_to_g2,
            g2_to_g3,
            g2_to_g4,
            transfers: Arc::new(DashMap::new()),
+           _chain_router_handle: chain_router_handle,
        })
    }
}

+/// Routes chain output from G1→G2 to downstream G2→G3 and G2→G4 pipelines.
+///
+/// Blocks are converted to WeakBlocks for best-effort offloading - if they're
+/// evicted before the downstream pipeline processes them, that's acceptable.
+/// This enables graceful degradation under memory pressure.
+async fn chain_router_task(
+   mut chain_rx: ChainOutputRx<G2>,
+   g2_to_g3_queue: Option<Arc<CancellableQueue<PipelineInput<G2>>>>,
+   g2_to_g4_queue: Option<Arc<CancellableQueue<PipelineInput<G2>>>>,
+) {
+   while let Some(output) = chain_rx.recv().await {
+       let ChainOutput {
+           transfer_id,
+           blocks,
+           state,
+       } = output;
+
+       if blocks.is_empty() {
+           continue;
+       }
+
+       // Convert strong blocks to weak blocks for best-effort downstream processing
+       // This allows blocks to be evicted if memory pressure requires it
+       let weak_blocks: Vec<WeakBlock<G2>> =
+           blocks.iter().map(|block| block.downgrade()).collect();
+
+       // Drop strong references - blocks can now be evicted if needed
+       drop(blocks);
+
+       tracing::debug!(
+           %transfer_id,
+           num_blocks = weak_blocks.len(),
+           "Routing chain output to downstream pipelines as WeakBlocks"
+       );
+
+       // Enqueue to G2→G3 if available
+       if let Some(ref queue) = g2_to_g3_queue {
+           let input = PipelineInput {
+               transfer_id,
+               source: SourceBlocks::Weak(weak_blocks.clone()),
+               state: state.clone(),
+           };
+           if !queue.push(transfer_id, input) {
+               tracing::debug!(%transfer_id, "G2→G3 chain enqueue skipped (cancelled)");
+           }
+       }
+
+       // Enqueue to G2→G4 if available
+       if let Some(ref queue) = g2_to_g4_queue {
+           let input = PipelineInput {
+               transfer_id,
+               source: SourceBlocks::Weak(weak_blocks),
+               state,
+           };
+           if !queue.push(transfer_id, input) {
+               tracing::debug!(%transfer_id, "G2→G4 chain enqueue skipped (cancelled)");
+           }
+       }
+   }
+
+   tracing::debug!("Chain router task shutting down");
+}
+
#[cfg(test)]
mod tests {
    use super::*;
File renamed without changes.
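
The downgrade-then-drop step in chain_router_task is the standard strong-to-weak reference pattern: downstream pipelines hold only weak handles, so a block evicted under memory pressure simply fails to resolve rather than staying pinned in G2. A plain std::sync::Arc/Weak sketch of the same idea (assuming WeakBlock behaves analogously, which this diff does not spell out):

use std::sync::{Arc, Weak};

fn main() {
    // Strong handle produced by the G1→G2 stage (a String stands in for a registered block).
    let block: Arc<String> = Arc::new("block contents".to_string());

    // Downstream stages keep only a weak handle, so they never pin the block in memory.
    let weak: Weak<String> = Arc::downgrade(&block);

    // Dropping the strong handle is what allows eviction to reclaim the block.
    drop(block);

    // Best-effort consumption: upgrade() returns None once the block is gone.
    match weak.upgrade() {
        Some(b) => println!("offloading {b}"),
        None => println!("block evicted before downstream offload; skipping"),
    }
}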

lib/kvbm/src/v2/integrations/offload/mod.rs renamed to lib/kvbm/src/v2/distributed/offload/mod.rs

Lines changed: 4 additions & 2 deletions
@@ -43,7 +43,7 @@
//! # Example
//!
//! ```ignore
-//! use kvbm::v2::integrations::offload::{
+//! use kvbm::v2::distributed::offload::{
//!     OffloadEngine, PipelineBuilder, PresenceFilter, PresenceAndLFUFilter,
//! };
//!

@@ -79,6 +79,7 @@ mod batch;
mod cancel;
mod engine;
mod handle;
+mod pending;
mod pipeline;
mod policy;
mod queue;

@@ -88,11 +89,12 @@ mod source;
pub use cancel::{CancelConfirmation, CancelState, CancellationToken};
pub use engine::{OffloadEngine, OffloadEngineBuilder};
pub use handle::{TransferHandle, TransferId, TransferResult, TransferStatus};
+pub use pending::{PendingGuard, PendingTracker};
pub use pipeline::{Pipeline, PipelineBuilder, PipelineConfig};
pub use policy::{
    AllOfPolicy, AnyOfPolicy, BoxFuture, EvalContext, OffloadPolicy, PassAllPolicy,
    PolicyBatchFuture, PolicyFuture, PresenceAndLFUFilter, PresenceFilter, async_batch_result,
-   async_result, sync_batch_result, sync_result,
+   async_result, create_policy_from_config, sync_batch_result, sync_result,
};
pub use queue::CancellableQueue;
pub use source::{ExternalBlock, SourceBlock, SourceBlocks};
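
The policy exports now include create_policy_from_config alongside the existing combinators, but neither its signature nor its config type appears in this commit. As a loose illustration only, a config-driven policy factory generally has the following shape; every type below is hypothetical and only the function name mirrors the new export:

use std::sync::Arc;

/// Hypothetical policy trait and config, standing in for kvbm's OffloadPolicy and its
/// (unshown) configuration; none of these definitions come from the commit.
trait Policy: Send + Sync {
    fn should_offload(&self, frequency: u64, already_present: bool) -> bool;
}

struct PassAll;
impl Policy for PassAll {
    fn should_offload(&self, _frequency: u64, _already_present: bool) -> bool {
        true
    }
}

struct PresenceAndLfu {
    min_frequency: u64,
}
impl Policy for PresenceAndLfu {
    fn should_offload(&self, frequency: u64, already_present: bool) -> bool {
        !already_present && frequency >= self.min_frequency
    }
}

enum PolicyConfig {
    PassAll,
    PresenceAndLfu { min_frequency: u64 },
}

/// Shape of a create_policy_from_config-style factory: map a declarative config
/// onto a shared policy object the pipeline can evaluate.
fn create_policy_from_config(config: PolicyConfig) -> Arc<dyn Policy> {
    match config {
        PolicyConfig::PassAll => Arc::new(PassAll),
        PolicyConfig::PresenceAndLfu { min_frequency } => Arc::new(PresenceAndLfu { min_frequency }),
    }
}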
