Skip to content

Commit 788a6f0

Browse files
jthomson04zxue2
authored andcommitted
feat: KVBM V2 optimized bounce buffer transfer + benchmark (ai-dynamo#3947)
Signed-off-by: jthomson04 <[email protected]>
1 parent 7071697 commit 788a6f0

File tree

4 files changed

+329
-24
lines changed

4 files changed

+329
-24
lines changed

Cargo.lock

Lines changed: 39 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/llm/Cargo.toml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ testing-cuda = ["dep:cudarc"]
2222
testing-nixl = ["dep:nixl-sys"]
2323
testing-etcd = []
2424
block-manager = ["dep:nixl-sys", "dep:cudarc", "dep:nix", "dep:aligned-vec"]
25+
block-manager-bench = ["block-manager", "testing-full", "dep:clap", "dep:indicatif"]
2526
cuda = ["dep:cudarc"]
2627
integration = ["dynamo-runtime/integration"]
2728
media-nixl = ["dep:nixl-sys", "dep:dynamo-memory"]
@@ -105,6 +106,10 @@ nixl-sys = { version = "=0.7.1", optional = true }
105106
cudarc = { workspace = true, optional = true }
106107
nix = { version = "0.26", optional = true }
107108

109+
# block_manager_bench
110+
clap = { version = "4.5.49", features = ["derive"], optional = true }
111+
indicatif = { version = "0.18.0", optional = true }
112+
108113

109114
# protocols
110115
unicode-segmentation = "1.12"
@@ -188,3 +193,8 @@ mockito = "1.7.0"
188193

189194
[build-dependencies]
190195
tonic-build = { version = "0.13.1" }
196+
197+
[[bin]]
198+
name = "bench_local_transfer_v2"
199+
path = "bin/bench_local_transfer_v2.rs"
200+
required-features = ["block-manager-bench"]
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
16+
use anyhow::Result;
17+
use clap::Parser;
18+
19+
use core::time::Duration;
20+
use indicatif::ProgressIterator;
21+
use std::time::Instant;
22+
23+
use dynamo_llm::block_manager::v2::physical::{
24+
layout::LayoutConfig,
25+
transfer::{
26+
BounceBufferSpec, NixlAgent, PhysicalLayout, StorageKind, TransferOptions,
27+
TransportManager, executor::execute_transfer,
28+
},
29+
};
30+
31+
use std::sync::Arc;
32+
33+
#[derive(Parser)]
34+
struct Args {
35+
/// Amount of layers
36+
#[clap(long, default_value_t = 24)]
37+
num_layers: usize,
38+
39+
/// Inner dimension
40+
#[clap(long, default_value_t = 4096)]
41+
inner_dim: usize,
42+
43+
/// Block size
44+
#[clap(long, default_value_t = 32)]
45+
block_size: usize,
46+
47+
/// Amount of blocks per pool
48+
#[clap(long, default_value_t = 16)]
49+
num_blocks: usize,
50+
51+
/// Amount of blocks per transferred batch
52+
#[clap(long, default_value_t = 4)]
53+
blocks_per_batch: usize,
54+
55+
/// Amount of pinned bounce buffer blocks
56+
#[clap(long, default_value_t = 2)]
57+
num_bounce_blocks: usize,
58+
59+
/// Amount of iterations
60+
#[clap(long, default_value_t = 100)]
61+
iterations: usize,
62+
}
63+
64+
struct DummyBounceBufferSpec {
65+
pub layout: PhysicalLayout,
66+
pub block_ids: Vec<usize>,
67+
}
68+
69+
impl BounceBufferSpec for DummyBounceBufferSpec {
70+
fn layout(&self) -> &PhysicalLayout {
71+
&self.layout
72+
}
73+
fn block_ids(&self) -> &[usize] {
74+
&self.block_ids
75+
}
76+
}
77+
78+
#[tokio::main]
79+
pub async fn main() -> Result<()> {
80+
let args = Args::parse();
81+
82+
// let manager = build_manager(&args).await?;
83+
84+
benchmark(&args).await?;
85+
86+
Ok(())
87+
}
88+
89+
fn build_layout(
90+
agent: NixlAgent,
91+
config: LayoutConfig,
92+
storage_kind: StorageKind,
93+
) -> PhysicalLayout {
94+
let builder = PhysicalLayout::builder(agent)
95+
.with_config(config)
96+
.fully_contiguous();
97+
98+
match storage_kind {
99+
StorageKind::System => builder.allocate_system().build().unwrap(),
100+
StorageKind::Pinned => builder.allocate_pinned(false).build().unwrap(),
101+
StorageKind::Device(device_id) => builder.allocate_device(device_id).build().unwrap(),
102+
StorageKind::Disk(_) => builder.allocate_disk(None).build().unwrap(),
103+
}
104+
}
105+
106+
fn get_bandwidth_gbs(latencies: Vec<Duration>, args: &Args) -> f64 {
107+
let total_bytes =
108+
args.num_layers * args.inner_dim * args.block_size * args.blocks_per_batch * 2;
109+
let mean = latencies.iter().sum::<Duration>() / latencies.len() as u32;
110+
111+
total_bytes as f64 / mean.as_nanos() as f64
112+
}
113+
114+
async fn benchmark(args: &Args) -> Result<()> {
115+
let agent = NixlAgent::require_backends("test_agent", &["POSIX", "GDS_MT"])?;
116+
let src_dst_config = LayoutConfig::builder()
117+
.num_blocks(args.num_blocks)
118+
.num_layers(args.num_layers)
119+
.outer_dim(2)
120+
.page_size(args.block_size)
121+
.inner_dim(args.inner_dim)
122+
.dtype_width_bytes(2)
123+
.build()?;
124+
125+
let disk_layout = build_layout(agent.clone(), src_dst_config.clone(), StorageKind::Disk(0));
126+
let device_layout = build_layout(
127+
agent.clone(),
128+
src_dst_config.clone(),
129+
StorageKind::Device(0),
130+
);
131+
132+
let bounce_config = LayoutConfig::builder()
133+
.num_blocks(args.num_bounce_blocks)
134+
.num_layers(args.num_layers)
135+
.outer_dim(2)
136+
.page_size(args.block_size)
137+
.inner_dim(args.inner_dim)
138+
.dtype_width_bytes(2)
139+
.build()?;
140+
141+
let bounce_layout = build_layout(agent.clone(), bounce_config.clone(), StorageKind::Pinned);
142+
143+
let ctx = TransportManager::builder()
144+
.worker_id(0)
145+
.nixl_agent(agent)
146+
.cuda_device_id(0)
147+
.build()?;
148+
149+
let bounce_buffer_spec: Arc<dyn BounceBufferSpec> = Arc::new(DummyBounceBufferSpec {
150+
layout: bounce_layout,
151+
block_ids: (0..args.num_bounce_blocks).collect(),
152+
});
153+
154+
let options = TransferOptions::builder()
155+
.bounce_buffer(bounce_buffer_spec)
156+
.build()?;
157+
158+
anyhow::ensure!(
159+
args.blocks_per_batch <= args.num_blocks,
160+
"blocks_per_batch must be less than or equal to num_blocks"
161+
);
162+
let blocks = (0..args.blocks_per_batch).collect::<Vec<_>>();
163+
164+
for (src, dst, name) in vec![
165+
(disk_layout.clone(), device_layout.clone(), "disk_to_device"),
166+
(device_layout, disk_layout, "device_to_disk"),
167+
] {
168+
println!("Starting {} benchmark...", name);
169+
170+
let mut latencies = Vec::new();
171+
for _ in (0..args.iterations).progress() {
172+
let options_clone = options.clone();
173+
let start = Instant::now();
174+
execute_transfer(
175+
&src,
176+
&dst,
177+
blocks.as_slice(),
178+
blocks.as_slice(),
179+
options_clone,
180+
ctx.context(),
181+
)?
182+
.await?;
183+
let end = Instant::now();
184+
let duration = end.duration_since(start);
185+
latencies.push(duration);
186+
}
187+
188+
println!(
189+
"{} bandwidth: {:?} GB/s",
190+
name,
191+
get_bandwidth_gbs(latencies, args)
192+
);
193+
}
194+
195+
Ok(())
196+
}

0 commit comments

Comments
 (0)