|
| 1 | +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | +// SPDX-License-Identifier: Apache-2.0 |
| 3 | +// |
| 4 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +// you may not use this file except in compliance with the License. |
| 6 | +// You may obtain a copy of the License at |
| 7 | +// |
| 8 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +// |
| 10 | +// Unless required by applicable law or agreed to in writing, software |
| 11 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +// See the License for the specific language governing permissions and |
| 14 | +// limitations under the License. |
| 15 | + |
| 16 | +use anyhow::Result; |
| 17 | +use clap::Parser; |
| 18 | + |
| 19 | +use core::time::Duration; |
| 20 | +use indicatif::ProgressIterator; |
| 21 | +use std::time::Instant; |
| 22 | + |
| 23 | +use dynamo_llm::block_manager::v2::physical::{ |
| 24 | + layout::LayoutConfig, |
| 25 | + transfer::{ |
| 26 | + BounceBufferSpec, NixlAgent, PhysicalLayout, StorageKind, TransferOptions, |
| 27 | + TransportManager, executor::execute_transfer, |
| 28 | + }, |
| 29 | +}; |
| 30 | + |
| 31 | +use std::sync::Arc; |
| 32 | + |
| 33 | +#[derive(Parser)] |
| 34 | +struct Args { |
| 35 | + /// Amount of layers |
| 36 | + #[clap(long, default_value_t = 24)] |
| 37 | + num_layers: usize, |
| 38 | + |
| 39 | + /// Inner dimension |
| 40 | + #[clap(long, default_value_t = 4096)] |
| 41 | + inner_dim: usize, |
| 42 | + |
| 43 | + /// Block size |
| 44 | + #[clap(long, default_value_t = 32)] |
| 45 | + block_size: usize, |
| 46 | + |
| 47 | + /// Amount of blocks per pool |
| 48 | + #[clap(long, default_value_t = 16)] |
| 49 | + num_blocks: usize, |
| 50 | + |
| 51 | + /// Amount of blocks per transferred batch |
| 52 | + #[clap(long, default_value_t = 4)] |
| 53 | + blocks_per_batch: usize, |
| 54 | + |
| 55 | + /// Amount of pinned bounce buffer blocks |
| 56 | + #[clap(long, default_value_t = 2)] |
| 57 | + num_bounce_blocks: usize, |
| 58 | + |
| 59 | + /// Amount of iterations |
| 60 | + #[clap(long, default_value_t = 100)] |
| 61 | + iterations: usize, |
| 62 | +} |
| 63 | + |
/// Minimal [`BounceBufferSpec`] backed by a fixed layout and a static list of
/// block ids, used to hand the transfer executor a pinned-memory staging area.
struct DummyBounceBufferSpec {
    // Layout backing the bounce buffer (built with `StorageKind::Pinned` in
    // `benchmark`).
    pub layout: PhysicalLayout,
    // Block indices within `layout` that the executor may use as staging slots.
    pub block_ids: Vec<usize>,
}

impl BounceBufferSpec for DummyBounceBufferSpec {
    /// Returns the physical layout backing the bounce buffer.
    fn layout(&self) -> &PhysicalLayout {
        &self.layout
    }
    /// Returns the block ids available for staging.
    fn block_ids(&self) -> &[usize] {
        &self.block_ids
    }
}
| 77 | + |
| 78 | +#[tokio::main] |
| 79 | +pub async fn main() -> Result<()> { |
| 80 | + let args = Args::parse(); |
| 81 | + |
| 82 | + // let manager = build_manager(&args).await?; |
| 83 | + |
| 84 | + benchmark(&args).await?; |
| 85 | + |
| 86 | + Ok(()) |
| 87 | +} |
| 88 | + |
| 89 | +fn build_layout( |
| 90 | + agent: NixlAgent, |
| 91 | + config: LayoutConfig, |
| 92 | + storage_kind: StorageKind, |
| 93 | +) -> PhysicalLayout { |
| 94 | + let builder = PhysicalLayout::builder(agent) |
| 95 | + .with_config(config) |
| 96 | + .fully_contiguous(); |
| 97 | + |
| 98 | + match storage_kind { |
| 99 | + StorageKind::System => builder.allocate_system().build().unwrap(), |
| 100 | + StorageKind::Pinned => builder.allocate_pinned(false).build().unwrap(), |
| 101 | + StorageKind::Device(device_id) => builder.allocate_device(device_id).build().unwrap(), |
| 102 | + StorageKind::Disk(_) => builder.allocate_disk(None).build().unwrap(), |
| 103 | + } |
| 104 | +} |
| 105 | + |
| 106 | +fn get_bandwidth_gbs(latencies: Vec<Duration>, args: &Args) -> f64 { |
| 107 | + let total_bytes = |
| 108 | + args.num_layers * args.inner_dim * args.block_size * args.blocks_per_batch * 2; |
| 109 | + let mean = latencies.iter().sum::<Duration>() / latencies.len() as u32; |
| 110 | + |
| 111 | + total_bytes as f64 / mean.as_nanos() as f64 |
| 112 | +} |
| 113 | + |
| 114 | +async fn benchmark(args: &Args) -> Result<()> { |
| 115 | + let agent = NixlAgent::require_backends("test_agent", &["POSIX", "GDS_MT"])?; |
| 116 | + let src_dst_config = LayoutConfig::builder() |
| 117 | + .num_blocks(args.num_blocks) |
| 118 | + .num_layers(args.num_layers) |
| 119 | + .outer_dim(2) |
| 120 | + .page_size(args.block_size) |
| 121 | + .inner_dim(args.inner_dim) |
| 122 | + .dtype_width_bytes(2) |
| 123 | + .build()?; |
| 124 | + |
| 125 | + let disk_layout = build_layout(agent.clone(), src_dst_config.clone(), StorageKind::Disk(0)); |
| 126 | + let device_layout = build_layout( |
| 127 | + agent.clone(), |
| 128 | + src_dst_config.clone(), |
| 129 | + StorageKind::Device(0), |
| 130 | + ); |
| 131 | + |
| 132 | + let bounce_config = LayoutConfig::builder() |
| 133 | + .num_blocks(args.num_bounce_blocks) |
| 134 | + .num_layers(args.num_layers) |
| 135 | + .outer_dim(2) |
| 136 | + .page_size(args.block_size) |
| 137 | + .inner_dim(args.inner_dim) |
| 138 | + .dtype_width_bytes(2) |
| 139 | + .build()?; |
| 140 | + |
| 141 | + let bounce_layout = build_layout(agent.clone(), bounce_config.clone(), StorageKind::Pinned); |
| 142 | + |
| 143 | + let ctx = TransportManager::builder() |
| 144 | + .worker_id(0) |
| 145 | + .nixl_agent(agent) |
| 146 | + .cuda_device_id(0) |
| 147 | + .build()?; |
| 148 | + |
| 149 | + let bounce_buffer_spec: Arc<dyn BounceBufferSpec> = Arc::new(DummyBounceBufferSpec { |
| 150 | + layout: bounce_layout, |
| 151 | + block_ids: (0..args.num_bounce_blocks).collect(), |
| 152 | + }); |
| 153 | + |
| 154 | + let options = TransferOptions::builder() |
| 155 | + .bounce_buffer(bounce_buffer_spec) |
| 156 | + .build()?; |
| 157 | + |
| 158 | + anyhow::ensure!( |
| 159 | + args.blocks_per_batch <= args.num_blocks, |
| 160 | + "blocks_per_batch must be less than or equal to num_blocks" |
| 161 | + ); |
| 162 | + let blocks = (0..args.blocks_per_batch).collect::<Vec<_>>(); |
| 163 | + |
| 164 | + for (src, dst, name) in vec![ |
| 165 | + (disk_layout.clone(), device_layout.clone(), "disk_to_device"), |
| 166 | + (device_layout, disk_layout, "device_to_disk"), |
| 167 | + ] { |
| 168 | + println!("Starting {} benchmark...", name); |
| 169 | + |
| 170 | + let mut latencies = Vec::new(); |
| 171 | + for _ in (0..args.iterations).progress() { |
| 172 | + let options_clone = options.clone(); |
| 173 | + let start = Instant::now(); |
| 174 | + execute_transfer( |
| 175 | + &src, |
| 176 | + &dst, |
| 177 | + blocks.as_slice(), |
| 178 | + blocks.as_slice(), |
| 179 | + options_clone, |
| 180 | + ctx.context(), |
| 181 | + )? |
| 182 | + .await?; |
| 183 | + let end = Instant::now(); |
| 184 | + let duration = end.duration_since(start); |
| 185 | + latencies.push(duration); |
| 186 | + } |
| 187 | + |
| 188 | + println!( |
| 189 | + "{} bandwidth: {:?} GB/s", |
| 190 | + name, |
| 191 | + get_bandwidth_gbs(latencies, args) |
| 192 | + ); |
| 193 | + } |
| 194 | + |
| 195 | + Ok(()) |
| 196 | +} |
0 commit comments