ai-dynamo
diff --git a/‎docs/guides/run_kvbm_in_trtllm.md‎
Lines changed: 5 additions & 0 deletions b/‎docs/guides/run_kvbm_in_trtllm.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/guides/run_kvbm_in_vllm.md‎
Lines changed: 5 additions & 0 deletions b/‎docs/guides/run_kvbm_in_vllm.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎lib/bindings/python/rust/llm/block_manager.rs‎
Lines changed: 29 additions & 5 deletions b/‎lib/bindings/python/rust/llm/block_manager.rs‎
Lines changed: 29 additions & 5 deletions
diff --git a/‎lib/llm/src/block_manager.rs‎
Lines changed: 1 addition & 1 deletion b/‎lib/llm/src/block_manager.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lib/llm/src/block_manager/block/factory.rs‎
Lines changed: 4 additions & 1 deletion b/‎lib/llm/src/block_manager/block/factory.rs‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎lib/llm/src/block_manager/block/factory/local.rs‎
Lines changed: 7 additions & 0 deletions b/‎lib/llm/src/block_manager/block/factory/local.rs‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎lib/llm/src/block_manager/block/factory/logical.rs‎
Lines changed: 12 additions & 1 deletion b/‎lib/llm/src/block_manager/block/factory/logical.rs‎
Lines changed: 12 additions & 1 deletion
diff --git a/‎lib/llm/src/block_manager/config.rs‎
Lines changed: 5 additions & 0 deletions b/‎lib/llm/src/block_manager/config.rs‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎lib/llm/src/block_manager/controller.rs‎
Lines changed: 1 addition & 1 deletion b/‎lib/llm/src/block_manager/controller.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lib/llm/src/block_manager/offload.rs‎
Lines changed: 65 additions & 9 deletions b/‎lib/llm/src/block_manager/offload.rs‎
Lines changed: 65 additions & 9 deletions
@@ -58,6 +58,11 @@ export DYN_KVBM_DISK_CACHE_GB=8
 export DYN_KVBM_LEADER_WORKER_INIT_TIMEOUT_SECS=1200
 ```
 
+> [!NOTE]
+> When disk offloading is enabled, disk offload filtering is enabled by default to extend SSD lifespan. The current policy offloads a KV block from CPU to disk only if the block's frequency is greater than or equal to `2`. A block's frequency starts at 1, doubles on each cache hit, and is decremented by 1 at each time-decay step.
+>
+> To disable disk offload filtering, set the `DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER` environment variable to `true` or `1` (the value is matched exactly, so use lowercase `true`).
+
 ```bash
 # write an example LLM API config
 # Note: Disable partial reuse "enable_partial_reuse: false" in the LLM API config’s "kv_connector_config" to increase offloading cache hits.
 
@@ -61,6 +61,11 @@ cd $DYNAMO_HOME/components/backends/vllm
 > [!NOTE]
 > `DYN_KVBM_CPU_CACHE_GB` must be set and `DYN_KVBM_DISK_CACHE_GB` is optional.
 
+> [!NOTE]
+> When disk offloading is enabled, disk offload filtering is enabled by default to extend SSD lifespan. The current policy offloads a KV block from CPU to disk only if the block's frequency is greater than or equal to `2`. A block's frequency starts at 1, doubles on each cache hit, and is decremented by 1 at each time-decay step.
+>
+> To disable disk offload filtering, set the `DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER` environment variable to `true` or `1` (the value is matched exactly, so use lowercase `true`).
+
 ### Sample Request
 ```bash
 # make a request to verify vLLM with KVBM is started up correctly
 
@@ -6,8 +6,11 @@ use anyhow::Result;
 use dynamo_llm::block_manager::block::{
     data::logical::distributed_leader_worker::DistributedLeaderWorkerResources, locality::Logical,
 };
+use dynamo_llm::block_manager::offload::filter::FrequencyFilter;
 use dynamo_llm::block_manager::{BasicMetadata, BlockParallelismStrategy};
+
 use pyo3::PyResult;
+use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 
 mod controller;
@@ -94,13 +97,34 @@ impl BlockManager {
 
             if leader.num_host_blocks() > 0 {
                 tracing::info!("Using {} host blocks", leader.num_host_blocks());
-                config = config.host_layout(
+
+                let mut host_layout_config =
                     dynamo_llm::block_manager::KvManagerLayoutConfig::builder()
                         .num_blocks(leader.num_host_blocks())
-                        .logical(Some(BlockParallelismStrategy::LeaderWorkerSharded))
-                        .build()
-                        .map_err(to_pyerr)?,
-                );
+                        .logical(Some(BlockParallelismStrategy::LeaderWorkerSharded));
+
+                if leader.num_disk_blocks() > 0 {
+                    // Check if disk offload filter is disabled via environment variable
+                    let disable_filter = std::env::var("DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER")
+                        .map(|v| v == "true" || v == "1")
+                        .unwrap_or(false);
+
+                    if !disable_filter {
+                        // TODO: These values seem plausible for most use cases, but we need to figure out a better way to configure them.
+                        let frequency_filter = FrequencyFilter::new(
+                            2,
+                            Duration::from_secs(600),
+                            1e6 as usize,
+                            cancel_token.child_token(),
+                            rt.inner().runtime().primary().clone(),
+                        )
+                        .map_err(to_pyerr)?;
+                        host_layout_config =
+                            host_layout_config.offload_filter(Some(Arc::new(frequency_filter)));
+                    }
+                }
+
+                config = config.host_layout(host_layout_config.build().map_err(to_pyerr)?);
             }
 
             if leader.num_disk_blocks() > 0 {
 
@@ -33,7 +33,7 @@ pub use block::{
 pub use config::*;
 
 pub use layout::{LayoutConfig, LayoutConfigBuilder, LayoutError, LayoutType, nixl::NixlLayout};
-pub use offload::request::BlockResult;
+pub use offload::{filter::OffloadFilter, request::BlockResult};
 pub use pool::{BlockPool, ManagedBlockPool};
 pub use storage::{
     DeviceStorage, DiskStorage, PinnedStorage, Storage, StorageAllocator,
 
@@ -6,7 +6,7 @@ pub mod logical;
 
 pub use local::LocalBlockDataFactory;
 
-use crate::block_manager::LayoutConfig;
+use crate::block_manager::{LayoutConfig, OffloadFilter};
 
 use super::*;
 
@@ -47,6 +47,9 @@ pub trait BlockFactory<S: Storage, L: LocalityProvider> {
 
     /// Get the layout configuration information
     fn layout_config(&self) -> &LayoutConfig;
+
+    /// Get the offload filter for this factory
+    fn offload_filter(&self) -> Option<Arc<dyn OffloadFilter>>;
 }
 
 /// Extension trait for factories that can produce all blocks at once
 
@@ -8,18 +8,21 @@ pub struct LocalBlockDataFactory<S: Storage> {
     layout: Arc<dyn BlockLayout<StorageType = S>>,
     block_set_idx: usize,
     worker_id: WorkerID,
+    offload_filter: Option<Arc<dyn OffloadFilter>>,
 }
 
 impl<S: Storage> LocalBlockDataFactory<S> {
     pub fn new( // Plain constructor: stores each argument in the field of the same name, with no validation.
         layout: Arc<dyn BlockLayout<StorageType = S>>,
         block_set_idx: usize,
         worker_id: WorkerID,
+        offload_filter: Option<Arc<dyn OffloadFilter>>, // Optional filter; `None` means this factory carries no offload filter.
     ) -> Self {
         Self {
             layout,
             block_set_idx,
             worker_id,
+            offload_filter, // Kept as-is so the factory's offload_filter() accessor can hand out clones of the Arc.
         }
     }
 }
@@ -46,6 +49,10 @@ impl<S: Storage> BlockFactory<S, locality::Local> for LocalBlockDataFactory<S> {
     fn layout_config(&self) -> &LayoutConfig {
         self.layout.config()
     }
+
+    fn offload_filter(&self) -> Option<Arc<dyn OffloadFilter>> {
+        self.offload_filter.clone()
+    }
 }
 
 impl<S: Storage> IntoBlocks<S, locality::Local> for LocalBlockDataFactory<S> {}
@@ -2,7 +2,10 @@
 // SPDX-License-Identifier: Apache-2.0
 
 use super::*;
-use crate::block_manager::locality::{Logical, LogicalBlockData, LogicalResources};
+use crate::block_manager::{
+    OffloadFilter,
+    locality::{Logical, LogicalBlockData, LogicalResources},
+};
 
 #[derive(Debug)]
 pub struct LogicalBlockFactory<S: Storage, R: LogicalResources> {
@@ -12,6 +15,7 @@ pub struct LogicalBlockFactory<S: Storage, R: LogicalResources> {
     resources: Arc<R>,
     storage_type: StorageType,
     storage: std::marker::PhantomData<S>,
+    offload_filter: Option<Arc<dyn OffloadFilter>>,
 }
 
 impl<S: Storage, R: LogicalResources> LogicalBlockFactory<S, R> {
@@ -21,6 +25,7 @@ impl<S: Storage, R: LogicalResources> LogicalBlockFactory<S, R> {
         worker_id: WorkerID,
         resources: Arc<R>,
         storage_type: StorageType,
+        offload_filter: Option<Arc<dyn OffloadFilter>>,
     ) -> Self {
         Self {
             layout_config,
@@ -29,6 +34,7 @@ impl<S: Storage, R: LogicalResources> LogicalBlockFactory<S, R> {
             resources,
             storage_type,
             storage: std::marker::PhantomData,
+            offload_filter,
         }
     }
 }
@@ -57,6 +63,10 @@ impl<S: Storage, R: LogicalResources> BlockFactory<S, Logical<R>> for LogicalBlo
     fn layout_config(&self) -> &LayoutConfig {
         &self.layout_config
     }
+
+    fn offload_filter(&self) -> Option<Arc<dyn OffloadFilter>> {
+        self.offload_filter.clone()
+    }
 }
 
 impl<S: Storage, R: LogicalResources> IntoBlocks<S, Logical<R>> for LogicalBlockFactory<S, R> {}
@@ -89,6 +99,7 @@ mod tests {
             TEST_WORKER_ID,
             Arc::new(NullResources),
             StorageType::Pinned,
+            None,
         );
 
         let block_data = factory.create_block_data(0).unwrap();
 
@@ -116,6 +116,11 @@ pub struct KvManagerLayoutConfig<S: Storage + NixlRegisterableStorage> {
     /// The type of block parallelism strategy to use
     #[builder(default)]
     pub logical: Option<BlockParallelismStrategy>,
+
+    /// The offload filter to use (if any).
+    /// This dictates which blocks will be offloaded to the next-lowest cache level.
+    #[builder(default = "None")]
+    pub offload_filter: Option<Arc<dyn OffloadFilter>>,
 }
 
 impl<S: Storage + NixlRegisterableStorage> KvManagerLayoutConfig<S> {
 
@@ -95,7 +95,7 @@ pub enum ResetResponse {
     ResetBlocks(ResetBlocksResponse),
 }
 
-#[cfg(all(test, feature = "testing-full"))]
+#[cfg(all(test, feature = "testing-etcd", feature = "testing-full"))]
 mod tests {
     use crate::tokens::Tokens;
 
 
@@ -42,10 +42,12 @@ use super::pool::{BlockPool, BlockPoolError};
 use super::storage::{Cuda, Storage};
 use super::{DeviceStorage, DiskStorage, KvManagerModelConfig, PinnedStorage};
 use nixl_sys::Agent as NixlAgent;
-use std::sync::Arc;
+use std::sync::{
+    Arc,
+    atomic::{AtomicU64, Ordering},
+};
 use tokio::runtime::Handle;
 use tokio::sync::{
-    Mutex,
     mpsc::{self, error::TryRecvError},
     oneshot,
 };
@@ -56,12 +58,16 @@ use std::any::Any;
 
 use std::collections::BTreeSet;
 
+pub mod filter;
 mod pending;
 pub mod request;
 
+use filter::OffloadFilter;
 use pending::{LocalTransferManager, PendingTransfer, TransferBatcher, TransferManager};
 use request::{BlockResult, OffloadRequest, OffloadRequestKey, OnboardRequest};
 
+use derive_builder::Builder;
+use derive_getters::Getters;
 use dynamo_runtime::utils::task::CriticalTaskExecutionHandle;
 
 pub const MAX_CONCURRENT_TRANSFERS: usize = 4;
@@ -94,16 +100,18 @@ pub struct OffloadManager<Locality: LocalityProvider, Metadata: BlockMetadata> {
         mpsc::UnboundedSender<OnboardRequest<DiskStorage, DeviceStorage, Locality, Metadata>>,
 
     /// An incrementing counter for offloaded blocks. Within the same priority, blocks with lower tick values are processed first.
-    tick: Arc<Mutex<u64>>,
+    tick: Arc<AtomicU64>,
 }
 
 impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
     OffloadManager<Locality, Metadata>
 {
+    #[allow(clippy::too_many_arguments)]
     pub fn new(
         disk: Option<Arc<dyn BlockPool<DiskStorage, Locality, Metadata>>>,
         host: Option<Arc<dyn BlockPool<PinnedStorage, Locality, Metadata>>>,
         device: Option<Arc<dyn BlockPool<DeviceStorage, Locality, Metadata>>>,
+        filters: OffloadFilters,
         config: OffloadManagerConfig,
     ) -> Result<Arc<Self>> {
         let (device_offload_tx, device_offload_rx) = mpsc::unbounded_channel();
@@ -120,7 +128,7 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
             host_offload_tx,
             host_onboard_tx,
             disk_onboard_tx,
-            tick: Arc::new(Mutex::new(0)),
+            tick: Arc::new(AtomicU64::new(0)),
         });
 
         let cuda_ctx = Cuda::device_or_create(0)?;
@@ -163,6 +171,7 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
                 &config.async_rt_handle,
                 config.cancellation_token.clone(),
             )),
+            filters.device.clone(),
             device_metrics.clone(),
             config.cancellation_token.clone(),
         );
@@ -199,6 +208,7 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
                 &config.async_rt_handle,
                 config.cancellation_token.clone(),
             )),
+            filters.host.clone(),
             host_metrics.clone(),
             config.cancellation_token.clone(),
         );
@@ -276,6 +286,7 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
         target_pool: Option<Arc<dyn BlockPool<Target, Locality, Metadata>>>,
         mut offload_rx: mpsc::UnboundedReceiver<OffloadRequest<Source, Locality, Metadata>>,
         transfer_manager: Arc<dyn TransferManager<Source, Target, Locality, Metadata>>,
+        offload_filter: Option<Arc<dyn OffloadFilter>>,
         pool_metrics: Arc<PoolMetrics>,
         cancellation_token: CancellationToken,
     ) -> Result<()> {
@@ -331,6 +342,12 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
                         continue;
                     }
 
+                    if let Some(offload_filter) = offload_filter.as_ref()
+                        && !offload_filter.should_offload(request.sequence_hash)
+                    {
+                        continue;
+                    }
+
                     let target_block = 'target_block: {
                         if let Ok(blocks) = target_pool.allocate_blocks(1).await
                             && let Some(block) = blocks.into_iter().next()
@@ -443,14 +460,11 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
             }
         }
 
-        let mut tick = self.tick.lock().await;
+        let tick = self.tick.fetch_add(1, Ordering::Relaxed);
         let key = OffloadRequestKey {
             priority,
-            timestamp: *tick,
+            timestamp: tick,
         };
-        // Increment a counter for each block. Within the same priority, blocks with lower counter values are processed first.
-        *tick += 1;
-        drop(tick);
 
         // This can get called by all pools, regardless of whether or not they have a place to offload to.
         // Because of this, we need to check the block type here.
@@ -584,6 +598,47 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
     }
 }
 
+#[derive(Debug, Clone, Getters, Builder)]
+#[builder(pattern = "owned", build_fn(validate = "Self::validate"))]
+pub struct OffloadFilters { // Optional offload filters, one slot per storage tier; every slot defaults to `None`.
+    #[builder(default)]
+    device: Option<Arc<dyn OffloadFilter>>, // Passed by OffloadManager::new as `filters.device`; presumably gates device -> host offloads (confirm against the worker).
+    #[builder(default)]
+    host: Option<Arc<dyn OffloadFilter>>, // Host -> disk filter; when absent the builder warns that all host blocks will be offloaded to disk.
+    #[builder(default)]
+    disk: Option<Arc<dyn OffloadFilter>>, // Must stay `None`: the builder's validate() rejects a disk filter as unsupported.
+}
+
+impl OffloadFilters {
+    pub fn builder() -> OffloadFiltersBuilder { // Convenience entry point: returns a builder with no filter slots set yet.
+        OffloadFiltersBuilder::default()
+    }
+}
+
+impl OffloadFiltersBuilder {
+    pub fn validate(&self) -> Result<(), String> { // Build-time check; returns Err only when a disk filter was supplied.
+        if let Some(disk) = self.disk.as_ref() // Outer Option: whether the builder's `disk` setter was called at all.
+            && disk.is_some() // Inner Option: whether an actual filter (rather than `None`) was supplied.
+        {
+            return Err("Disk offload filter is not supported.".to_string());
+        }
+
+        let host_is_none = if let Some(host) = self.host.as_ref() { // True when `host` was never set or was set to `None`.
+            host.is_none()
+        } else {
+            true
+        };
+
+        if host_is_none { // Not an error: the configuration is accepted, but the operator is warned.
+            tracing::warn!(
+                "Host to Disk offload filter is not provided. All blocks in host will be offloaded to disk. This may result in excessive disk offloading and accelerated SSD degradation."
+            );
+        }
+
+        Ok(())
+    }
+}
+
 #[cfg(all(test, feature = "testing-cuda"))]
 mod tests {
     use super::*;
@@ -771,6 +826,7 @@ mod tests {
             disk_pool.clone(),
             host_pool.clone(),
             device_pool.clone(),
+            OffloadFilters::builder().build()?,
             config,
         )?;
Original file line number	Diff line number	Diff line change
`@@ -8,18 +8,21 @@ pub struct LocalBlockDataFactory<S: Storage> {`
`8`	`8`	`layout: Arc<dyn BlockLayout<StorageType = S>>,`
`9`	`9`	`block_set_idx: usize,`
`10`	`10`	`worker_id: WorkerID,`
	`11`	`+ offload_filter: Option<Arc<dyn OffloadFilter>>,`
`11`	`12`	`}`
`12`	`13`
`13`	`14`	`impl<S: Storage> LocalBlockDataFactory<S> {`
`14`	`15`	`pub fn new(`
`15`	`16`	`layout: Arc<dyn BlockLayout<StorageType = S>>,`
`16`	`17`	`block_set_idx: usize,`
`17`	`18`	`worker_id: WorkerID,`
	`19`	`+ offload_filter: Option<Arc<dyn OffloadFilter>>,`
`18`	`20`	`) -> Self {`
`19`	`21`	`Self {`
`20`	`22`	`layout,`
`21`	`23`	`block_set_idx,`
`22`	`24`	`worker_id,`
	`25`	`+ offload_filter,`
`23`	`26`	`}`
`24`	`27`	`}`
`25`	`28`	`}`
`@@ -46,6 +49,10 @@ impl<S: Storage> BlockFactory<S, locality::Local> for LocalBlockDataFactory<S> {`
`46`	`49`	`fn layout_config(&self) -> &LayoutConfig {`
`47`	`50`	`self.layout.config()`
`48`	`51`	`}`
	`52`	`+`
	`53`	`+ fn offload_filter(&self) -> Option<Arc<dyn OffloadFilter>> {`
	`54`	`+ self.offload_filter.clone()`
	`55`	`+ }`
`49`	`56`	`}`
`50`	`57`
`51`	`58`	`impl<S: Storage> IntoBlocks<S, locality::Local> for LocalBlockDataFactory<S> {}`
Original file line number	Diff line number	Diff line change
`@@ -95,7 +95,7 @@ pub enum ResetResponse {`
`95`	`95`	`ResetBlocks(ResetBlocksResponse),`
`96`	`96`	`}`
`97`	`97`
`98`		`-#[cfg(all(test, feature = "testing-full"))]`
	`98`	`+#[cfg(all(test, feature = "testing-etcd", feature = "testing-full"))]`
`99`	`99`	`mod tests {`
`100`	`100`	`use crate::tokens::Tokens;`
`101`	`101`