diff --git a/crates/cuda_std/src/rt/mod.rs b/crates/cuda_std/src/rt/mod.rs index ed6440ab..35bc2d43 100644 --- a/crates/cuda_std/src/rt/mod.rs +++ b/crates/cuda_std/src/rt/mod.rs @@ -107,69 +107,59 @@ macro_rules! launch { #[derive(Debug, Clone, PartialEq, Eq)] pub struct GridSize { /// Width of grid in blocks - pub x: u32, + pub x: usize, /// Height of grid in blocks - pub y: u32, + pub y: usize, /// Depth of grid in blocks - pub z: u32, + pub z: usize, } impl GridSize { /// Create a one-dimensional grid of `x` blocks #[inline] - pub fn x(x: u32) -> GridSize { + pub fn x(x: usize) -> GridSize { GridSize { x, y: 1, z: 1 } } /// Create a two-dimensional grid of `x * y` blocks #[inline] - pub fn xy(x: u32, y: u32) -> GridSize { + pub fn xy(x: usize, y: usize) -> GridSize { GridSize { x, y, z: 1 } } /// Create a three-dimensional grid of `x * y * z` blocks #[inline] - pub fn xyz(x: u32, y: u32, z: u32) -> GridSize { + pub fn xyz(x: usize, y: usize, z: usize) -> GridSize { GridSize { x, y, z } } } -impl From for GridSize { - fn from(x: u32) -> GridSize { +impl From for GridSize { + fn from(x: usize) -> GridSize { GridSize::x(x) } } -impl From<(u32, u32)> for GridSize { - fn from((x, y): (u32, u32)) -> GridSize { +impl From<(usize, usize)> for GridSize { + fn from((x, y): (usize, usize)) -> GridSize { GridSize::xy(x, y) } } -impl From<(u32, u32, u32)> for GridSize { - fn from((x, y, z): (u32, u32, u32)) -> GridSize { +impl From<(usize, usize, usize)> for GridSize { + fn from((x, y, z): (usize, usize, usize)) -> GridSize { GridSize::xyz(x, y, z) } } -impl<'a> From<&'a GridSize> for GridSize { +impl From<&GridSize> for GridSize { fn from(other: &GridSize) -> GridSize { other.clone() } } -impl From for GridSize { - fn from(vec: glam::UVec2) -> Self { - GridSize::xy(vec.x, vec.y) - } -} -impl From for GridSize { - fn from(vec: glam::UVec3) -> Self { - GridSize::xyz(vec.x, vec.y, vec.z) - } -} impl From for GridSize { fn from(vec: glam::USizeVec2) -> Self { - GridSize::xy(vec.x as u32, vec.y as u32) + GridSize::xy(vec.x, vec.y) } } impl From for GridSize { fn from(vec: glam::USizeVec3) -> Self { - GridSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32) + GridSize::xyz(vec.x, vec.y, vec.z) } } @@ -183,68 +173,58 @@ impl From for GridSize { #[derive(Debug, Clone, PartialEq, Eq)] pub struct BlockSize { /// X dimension of each thread block - pub x: u32, + pub x: usize, /// Y dimension of each thread block - pub y: u32, + pub y: usize, /// Z dimension of each thread block - pub z: u32, + pub z: usize, } impl BlockSize { /// Create a one-dimensional block of `x` threads #[inline] - pub fn x(x: u32) -> BlockSize { + pub fn x(x: usize) -> BlockSize { BlockSize { x, y: 1, z: 1 } } /// Create a two-dimensional block of `x * y` threads #[inline] - pub fn xy(x: u32, y: u32) -> BlockSize { + pub fn xy(x: usize, y: usize) -> BlockSize { BlockSize { x, y, z: 1 } } /// Create a three-dimensional block of `x * y * z` threads #[inline] - pub fn xyz(x: u32, y: u32, z: u32) -> BlockSize { + pub fn xyz(x: usize, y: usize, z: usize) -> BlockSize { BlockSize { x, y, z } } } -impl From for BlockSize { - fn from(x: u32) -> BlockSize { +impl From for BlockSize { + fn from(x: usize) -> BlockSize { BlockSize::x(x) } } -impl From<(u32, u32)> for BlockSize { - fn from((x, y): (u32, u32)) -> BlockSize { +impl From<(usize, usize)> for BlockSize { + fn from((x, y): (usize, usize)) -> BlockSize { BlockSize::xy(x, y) } } -impl From<(u32, u32, u32)> for BlockSize { - fn from((x, y, z): (u32, u32, u32)) -> BlockSize { +impl From<(usize, usize, usize)> for BlockSize { + fn from((x, y, z): (usize, usize, usize)) -> BlockSize { BlockSize::xyz(x, y, z) } } -impl<'a> From<&'a BlockSize> for BlockSize { +impl From<&BlockSize> for BlockSize { fn from(other: &BlockSize) -> BlockSize { other.clone() } } -impl From for BlockSize { - fn from(vec: glam::UVec2) -> Self { - BlockSize::xy(vec.x, vec.y) - } -} -impl From for BlockSize { - fn from(vec: glam::UVec3) -> Self { - BlockSize::xyz(vec.x, vec.y, vec.z) - } -} impl From for BlockSize { fn from(vec: glam::USizeVec2) -> Self { - BlockSize::xy(vec.x as u32, vec.y as u32) + BlockSize::xy(vec.x, vec.y) } } impl From for BlockSize { fn from(vec: glam::USizeVec3) -> Self { - BlockSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32) + BlockSize::xyz(vec.x, vec.y, vec.z) } } diff --git a/crates/cuda_std/src/thread.rs b/crates/cuda_std/src/thread.rs index 8df6f1b8..a772c351 100644 --- a/crates/cuda_std/src/thread.rs +++ b/crates/cuda_std/src/thread.rs @@ -66,7 +66,7 @@ //! vary by device. Query device properties when you need exact limits. //! use cuda_std_macros::gpu_only; -use glam::{UVec2, UVec3}; +use glam::{USizeVec2, USizeVec3}; // different calling conventions dont exist in nvptx, so we just use C as a placeholder. unsafe extern "C" { @@ -99,116 +99,116 @@ macro_rules! in_range { #[gpu_only] #[inline(always)] -pub fn thread_idx_x() -> u32 { - // The range is derived from the `block_idx_x` range. - in_range!(core::arch::nvptx::_thread_idx_x, 0..1024) +pub fn thread_idx_x() -> usize { + // The range is derived from the `block_dim_x` range. + in_range!(core::arch::nvptx::_thread_idx_x, 0..1024) as usize } #[gpu_only] #[inline(always)] -pub fn thread_idx_y() -> u32 { - // The range is derived from the `block_idx_y` range. - in_range!(core::arch::nvptx::_thread_idx_y, 0..1024) +pub fn thread_idx_y() -> usize { + // The range is derived from the `block_dim_y` range. + in_range!(core::arch::nvptx::_thread_idx_y, 0..1024) as usize } #[gpu_only] #[inline(always)] -pub fn thread_idx_z() -> u32 { - // The range is derived from the `block_idx_z` range. - in_range!(core::arch::nvptx::_thread_idx_z, 0..64) +pub fn thread_idx_z() -> usize { + // The range is derived from the `block_dim_z` range. + in_range!(core::arch::nvptx::_thread_idx_z, 0..64) as usize } #[gpu_only] #[inline(always)] -pub fn block_idx_x() -> u32 { - // The range is derived from the `grid_idx_x` range. - in_range!(core::arch::nvptx::_block_idx_x, 0..2147483647) +pub fn block_idx_x() -> usize { + // The range is derived from the `grid_dim_x` range. + in_range!(core::arch::nvptx::_block_idx_x, 0..2147483647) as usize } #[gpu_only] #[inline(always)] -pub fn block_idx_y() -> u32 { - // The range is derived from the `grid_idx_y` range. - in_range!(core::arch::nvptx::_block_idx_y, 0..65535) +pub fn block_idx_y() -> usize { + // The range is derived from the `grid_dim_y` range. + in_range!(core::arch::nvptx::_block_idx_y, 0..65535) as usize } #[gpu_only] #[inline(always)] -pub fn block_idx_z() -> u32 { - // The range is derived from the `grid_idx_z` range. - in_range!(core::arch::nvptx::_block_idx_z, 0..65535) +pub fn block_idx_z() -> usize { + // The range is derived from the `grid_dim_z` range. + in_range!(core::arch::nvptx::_block_idx_z, 0..65535) as usize } #[gpu_only] #[inline(always)] -pub fn block_dim_x() -> u32 { +pub fn block_dim_x() -> usize { // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024. - in_range!(core::arch::nvptx::_block_dim_x, 1..=1024) + in_range!(core::arch::nvptx::_block_dim_x, 1..=1024) as usize } #[gpu_only] #[inline(always)] -pub fn block_dim_y() -> u32 { +pub fn block_dim_y() -> usize { // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024. - in_range!(core::arch::nvptx::_block_dim_y, 1..=1024) + in_range!(core::arch::nvptx::_block_dim_y, 1..=1024) as usize } #[gpu_only] #[inline(always)] -pub fn block_dim_z() -> u32 { +pub fn block_dim_z() -> usize { // CUDA Compute Capabilities: "Maximum z-dimension of a block" is 64. - in_range!(core::arch::nvptx::_block_dim_z, 1..=64) + in_range!(core::arch::nvptx::_block_dim_z, 1..=64) as usize } #[gpu_only] #[inline(always)] -pub fn grid_dim_x() -> u32 { +pub fn grid_dim_x() -> usize { // CUDA Compute Capabilities: "Maximum x-dimension of a grid of thread blocks" is 2^32 - 1. - in_range!(core::arch::nvptx::_grid_dim_x, 1..=2147483647) + in_range!(core::arch::nvptx::_grid_dim_x, 1..=2147483647) as usize } #[gpu_only] #[inline(always)] -pub fn grid_dim_y() -> u32 { +pub fn grid_dim_y() -> usize { // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535. - in_range!(core::arch::nvptx::_grid_dim_y, 1..=65535) + in_range!(core::arch::nvptx::_grid_dim_y, 1..=65535) as usize } #[gpu_only] #[inline(always)] -pub fn grid_dim_z() -> u32 { +pub fn grid_dim_z() -> usize { // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535. - in_range!(core::arch::nvptx::_grid_dim_z, 1..=65535) + in_range!(core::arch::nvptx::_grid_dim_z, 1..=65535) as usize } /// Gets the 3d index of the thread currently executing the kernel. #[gpu_only] #[inline(always)] -pub fn thread_idx() -> UVec3 { - UVec3::new(thread_idx_x(), thread_idx_y(), thread_idx_z()) +pub fn thread_idx() -> USizeVec3 { + USizeVec3::new(thread_idx_x(), thread_idx_y(), thread_idx_z()) } /// Gets the 3d index of the block that the thread currently executing the kernel is located in. #[gpu_only] #[inline(always)] -pub fn block_idx() -> UVec3 { - UVec3::new(block_idx_x(), block_idx_y(), block_idx_z()) +pub fn block_idx() -> USizeVec3 { + USizeVec3::new(block_idx_x(), block_idx_y(), block_idx_z()) } /// Gets the 3d layout of the thread blocks executing this kernel. In other words, /// how many threads exist in each thread block in every direction. #[gpu_only] #[inline(always)] -pub fn block_dim() -> UVec3 { - UVec3::new(block_dim_x(), block_dim_y(), block_dim_z()) +pub fn block_dim() -> USizeVec3 { + USizeVec3::new(block_dim_x(), block_dim_y(), block_dim_z()) } /// Gets the 3d layout of the block grids executing this kernel. In other words, /// how many thread blocks exist in each grid in every direction. #[gpu_only] #[inline(always)] -pub fn grid_dim() -> UVec3 { - UVec3::new(grid_dim_x(), grid_dim_y(), grid_dim_z()) +pub fn grid_dim() -> USizeVec3 { + USizeVec3::new(grid_dim_x(), grid_dim_y(), grid_dim_z()) } /// Gets the overall thread index, accounting for 1d/2d/3d block/grid dimensions. This @@ -220,7 +220,7 @@ pub fn grid_dim() -> UVec3 { #[gpu_only] #[rustfmt::skip] #[inline(always)] -pub fn index() -> u32 { +pub fn index() -> usize { let grid_dim = grid_dim(); let block_idx = block_idx(); let block_dim = block_dim(); @@ -235,23 +235,23 @@ pub fn index() -> u32 { } #[inline(always)] -pub fn index_1d() -> u32 { - thread_idx_x() as u32 + block_idx_x() as u32 * block_dim_x() as u32 +pub fn index_1d() -> usize { + thread_idx_x() + block_idx_x() * block_dim_x() } #[inline(always)] -pub fn index_2d() -> UVec2 { +pub fn index_2d() -> USizeVec2 { let i = thread_idx_x() + block_idx_x() * block_dim_x(); let j = thread_idx_y() + block_idx_y() * block_dim_y(); - UVec2::new(i, j) + USizeVec2::new(i, j) } #[inline(always)] -pub fn index_3d() -> UVec3 { +pub fn index_3d() -> USizeVec3 { let i = thread_idx_x() + block_idx_x() * block_dim_x(); let j = thread_idx_y() + block_idx_y() * block_dim_y(); let k = thread_idx_z() + block_idx_z() * block_dim_z(); - UVec3::new(i, j, k) + USizeVec3::new(i, j, k) } /// Whether this is the first thread (not the first thread to be executing). This function is guaranteed @@ -259,7 +259,7 @@ pub fn index_3d() -> UVec3 { /// once. #[inline(always)] pub fn first() -> bool { - block_idx() == UVec3::ZERO && thread_idx() == UVec3::ZERO + block_idx() == USizeVec3::ZERO && thread_idx() == USizeVec3::ZERO } /// Gets the number of threads inside of a warp. Currently 32 threads on every GPU architecture. diff --git a/crates/cuda_std_macros/src/lib.rs b/crates/cuda_std_macros/src/lib.rs index 467a6319..acf2bb9e 100644 --- a/crates/cuda_std_macros/src/lib.rs +++ b/crates/cuda_std_macros/src/lib.rs @@ -253,7 +253,7 @@ pub fn externally_visible( /// pub unsafe fn reverse_array(d: *mut u32, n: usize) { /// ##[address_space(shared)] /// static mut S: [MaybeUninit; 64] = [const { MaybeUninit::uninit() }; 64]; -/// let i = thread::thread_idx_x() as usize; +/// let i = thread::thread_idx_x(); /// let ir = n - i - 1; /// unsafe { S[i].write(*d.add(i)); }; /// thread::sync_threads(); diff --git a/crates/cust/src/function.rs b/crates/cust/src/function.rs index 67a5fb84..e73ad38c 100644 --- a/crates/cust/src/function.rs +++ b/crates/cust/src/function.rs @@ -18,43 +18,43 @@ use crate::module::Module; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct GridSize { /// Width of grid in blocks - pub x: u32, + pub x: usize, /// Height of grid in blocks - pub y: u32, + pub y: usize, /// Depth of grid in blocks - pub z: u32, + pub z: usize, } impl GridSize { /// Create a one-dimensional grid of `x` blocks #[inline] - pub fn x(x: u32) -> GridSize { + pub fn x(x: usize) -> GridSize { GridSize { x, y: 1, z: 1 } } /// Create a two-dimensional grid of `x * y` blocks #[inline] - pub fn xy(x: u32, y: u32) -> GridSize { + pub fn xy(x: usize, y: usize) -> GridSize { GridSize { x, y, z: 1 } } /// Create a three-dimensional grid of `x * y * z` blocks #[inline] - pub fn xyz(x: u32, y: u32, z: u32) -> GridSize { + pub fn xyz(x: usize, y: usize, z: usize) -> GridSize { GridSize { x, y, z } } } -impl From for GridSize { - fn from(x: u32) -> GridSize { +impl From for GridSize { + fn from(x: usize) -> GridSize { GridSize::x(x) } } -impl From<(u32, u32)> for GridSize { - fn from((x, y): (u32, u32)) -> GridSize { +impl From<(usize, usize)> for GridSize { + fn from((x, y): (usize, usize)) -> GridSize { GridSize::xy(x, y) } } -impl From<(u32, u32, u32)> for GridSize { - fn from((x, y, z): (u32, u32, u32)) -> GridSize { +impl From<(usize, usize, usize)> for GridSize { + fn from((x, y, z): (usize, usize, usize)) -> GridSize { GridSize::xyz(x, y, z) } } @@ -64,52 +64,28 @@ impl From<&GridSize> for GridSize { } } #[cfg(feature = "vek")] -impl From> for GridSize { - fn from(vec: vek::Vec2) -> Self { - GridSize::xy(vec.x, vec.y) - } -} -#[cfg(feature = "vek")] -impl From> for GridSize { - fn from(vec: vek::Vec3) -> Self { - GridSize::xyz(vec.x, vec.y, vec.z) - } -} -#[cfg(feature = "vek")] impl From> for GridSize { fn from(vec: vek::Vec2) -> Self { - GridSize::xy(vec.x as u32, vec.y as u32) + GridSize::xy(vec.x, vec.y) } } #[cfg(feature = "vek")] impl From> for GridSize { fn from(vec: vek::Vec3) -> Self { - GridSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32) - } -} - -#[cfg(feature = "glam")] -impl From for GridSize { - fn from(vec: glam::UVec2) -> Self { - GridSize::xy(vec.x, vec.y) - } -} -#[cfg(feature = "glam")] -impl From for GridSize { - fn from(vec: glam::UVec3) -> Self { GridSize::xyz(vec.x, vec.y, vec.z) } } + #[cfg(feature = "glam")] impl From for GridSize { fn from(vec: glam::USizeVec2) -> Self { - GridSize::xy(vec.x as u32, vec.y as u32) + GridSize::xy(vec.x, vec.y) } } #[cfg(feature = "glam")] impl From for GridSize { fn from(vec: glam::USizeVec3) -> Self { - GridSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32) + GridSize::xyz(vec.x, vec.y, vec.z) } } @@ -123,43 +99,43 @@ impl From for GridSize { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct BlockSize { /// X dimension of each thread block - pub x: u32, + pub x: usize, /// Y dimension of each thread block - pub y: u32, + pub y: usize, /// Z dimension of each thread block - pub z: u32, + pub z: usize, } impl BlockSize { /// Create a one-dimensional block of `x` threads #[inline] - pub fn x(x: u32) -> BlockSize { + pub fn x(x: usize) -> BlockSize { BlockSize { x, y: 1, z: 1 } } /// Create a two-dimensional block of `x * y` threads #[inline] - pub fn xy(x: u32, y: u32) -> BlockSize { + pub fn xy(x: usize, y: usize) -> BlockSize { BlockSize { x, y, z: 1 } } /// Create a three-dimensional block of `x * y * z` threads #[inline] - pub fn xyz(x: u32, y: u32, z: u32) -> BlockSize { + pub fn xyz(x: usize, y: usize, z: usize) -> BlockSize { BlockSize { x, y, z } } } -impl From for BlockSize { - fn from(x: u32) -> BlockSize { +impl From for BlockSize { + fn from(x: usize) -> BlockSize { BlockSize::x(x) } } -impl From<(u32, u32)> for BlockSize { - fn from((x, y): (u32, u32)) -> BlockSize { +impl From<(usize, usize)> for BlockSize { + fn from((x, y): (usize, usize)) -> BlockSize { BlockSize::xy(x, y) } } -impl From<(u32, u32, u32)> for BlockSize { - fn from((x, y, z): (u32, u32, u32)) -> BlockSize { +impl From<(usize, usize, usize)> for BlockSize { + fn from((x, y, z): (usize, usize, usize)) -> BlockSize { BlockSize::xyz(x, y, z) } } @@ -169,52 +145,28 @@ impl From<&BlockSize> for BlockSize { } } #[cfg(feature = "vek")] -impl From> for BlockSize { - fn from(vec: vek::Vec2) -> Self { - BlockSize::xy(vec.x, vec.y) - } -} -#[cfg(feature = "vek")] -impl From> for BlockSize { - fn from(vec: vek::Vec3) -> Self { - BlockSize::xyz(vec.x, vec.y, vec.z) - } -} -#[cfg(feature = "vek")] impl From> for BlockSize { fn from(vec: vek::Vec2) -> Self { - BlockSize::xy(vec.x as u32, vec.y as u32) + BlockSize::xy(vec.x, vec.y) } } #[cfg(feature = "vek")] impl From> for BlockSize { fn from(vec: vek::Vec3) -> Self { - BlockSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32) - } -} - -#[cfg(feature = "glam")] -impl From for BlockSize { - fn from(vec: glam::UVec2) -> Self { - BlockSize::xy(vec.x, vec.y) - } -} -#[cfg(feature = "glam")] -impl From for BlockSize { - fn from(vec: glam::UVec3) -> Self { BlockSize::xyz(vec.x, vec.y, vec.z) } } + #[cfg(feature = "glam")] impl From for BlockSize { fn from(vec: glam::USizeVec2) -> Self { - BlockSize::xy(vec.x as u32, vec.y as u32) + BlockSize::xy(vec.x, vec.y) } } #[cfg(feature = "glam")] impl From for BlockSize { fn from(vec: glam::USizeVec3) -> Self { - BlockSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32) + BlockSize::xyz(vec.x, vec.y, vec.z) } } @@ -448,7 +400,7 @@ impl Function<'_> { &self, dynamic_smem_size: usize, block_size_limit: BlockSize, - ) -> CudaResult<(u32, u32)> { + ) -> CudaResult<(usize, usize)> { let mut min_grid_size = MaybeUninit::uninit(); let mut block_size = MaybeUninit::uninit(); @@ -465,8 +417,8 @@ impl Function<'_> { ) .to_result()?; Ok(( - min_grid_size.assume_init() as u32, - block_size.assume_init() as u32, + min_grid_size.assume_init() as usize, + block_size.assume_init() as usize, )) } } diff --git a/crates/cust/src/graph.rs b/crates/cust/src/graph.rs index 9f04329e..f97a49d4 100644 --- a/crates/cust/src/graph.rs +++ b/crates/cust/src/graph.rs @@ -83,14 +83,18 @@ impl KernelInvocation { } pub fn to_raw(self) -> driver_sys::CUDA_KERNEL_NODE_PARAMS { + let to_u32 = |i: usize| { + i.try_into() + .expect("invocation dimension must fit in a `u32`") + }; driver_sys::CUDA_KERNEL_NODE_PARAMS { func: self.func, - gridDimX: self.grid_dim.x, - gridDimY: self.grid_dim.y, - gridDimZ: self.grid_dim.z, - blockDimX: self.block_dim.x, - blockDimY: self.block_dim.y, - blockDimZ: self.block_dim.z, + gridDimX: to_u32(self.grid_dim.x), + gridDimY: to_u32(self.grid_dim.y), + gridDimZ: to_u32(self.grid_dim.z), + blockDimX: to_u32(self.block_dim.x), + blockDimY: to_u32(self.block_dim.y), + blockDimZ: to_u32(self.block_dim.z), kernelParams: Box::into_raw(self.params), sharedMemBytes: self.shared_mem_bytes, extra: ptr::null_mut(), @@ -109,8 +113,16 @@ impl KernelInvocation { pub unsafe fn from_raw(raw: driver_sys::CUDA_KERNEL_NODE_PARAMS) -> Self { Self { func: raw.func, - grid_dim: GridSize::xyz(raw.gridDimX, raw.gridDimY, raw.gridDimZ), - block_dim: BlockSize::xyz(raw.blockDimX, raw.gridDimY, raw.gridDimZ), + grid_dim: GridSize::xyz( + raw.gridDimX as usize, + raw.gridDimY as usize, + raw.gridDimZ as usize, + ), + block_dim: BlockSize::xyz( + raw.blockDimX as usize, + raw.gridDimY as usize, + raw.gridDimZ as usize, + ), params: Box::from_raw(raw.kernelParams), shared_mem_bytes: raw.sharedMemBytes, params_len: None, diff --git a/crates/cust/src/stream.rs b/crates/cust/src/stream.rs index 2cbbab84..34c24865 100644 --- a/crates/cust/src/stream.rs +++ b/crates/cust/src/stream.rs @@ -271,15 +271,16 @@ impl Stream { let grid_size: GridSize = grid_size.into(); let block_size: BlockSize = block_size.into(); + let to_u32 = |i: usize| i.try_into().expect("launch size must fit in a `u32`"); unsafe { driver_sys::cuLaunchKernel( func.to_raw(), - grid_size.x, - grid_size.y, - grid_size.z, - block_size.x, - block_size.y, - block_size.z, + to_u32(grid_size.x), + to_u32(grid_size.y), + to_u32(grid_size.z), + to_u32(block_size.x), + to_u32(block_size.y), + to_u32(block_size.z), shared_mem_bytes, self.inner, args.as_ptr() as *mut _, diff --git a/crates/optix/examples/path_tracer/kernels/src/render.rs b/crates/optix/examples/path_tracer/kernels/src/render.rs index 86cd8cc6..eb589710 100644 --- a/crates/optix/examples/path_tracer/kernels/src/render.rs +++ b/crates/optix/examples/path_tracer/kernels/src/render.rs @@ -1,5 +1,5 @@ use crate::*; -use cuda_std::glam::UVec2; +use cuda_std::glam::USizeVec2; const BACKGROUND_BLUE_MULTIPLIER: f32 = 0.7; @@ -9,7 +9,7 @@ pub fn color(ray: Ray) -> Vec3 { (1.0 - t) * Vec3::ONE + t * Vec3::new(0.5, 0.7, 1.0) } -pub fn generate_ray(idx: UVec2, view: &Viewport, offset: Vec2) -> Ray { +pub fn generate_ray(idx: USizeVec2, view: &Viewport, offset: Vec2) -> Ray { let uv = (idx.as_vec2() + offset) / view.bounds.as_vec2(); Ray { origin: view.origin, diff --git a/crates/optix/examples/path_tracer/kernels/src/render_kernels.rs b/crates/optix/examples/path_tracer/kernels/src/render_kernels.rs index e3141bf4..af63bee9 100644 --- a/crates/optix/examples/path_tracer/kernels/src/render_kernels.rs +++ b/crates/optix/examples/path_tracer/kernels/src/render_kernels.rs @@ -6,10 +6,10 @@ use gpu_rand::{DefaultRand, GpuRand}; #[kernel] pub unsafe fn render(fb: *mut Vec3, view: Viewport, scene: &Scene, rand_states: *mut DefaultRand) { let idx = thread::index_2d(); - if idx.x >= view.bounds.x as u32 || idx.y >= view.bounds.y as u32 { + if idx.x >= view.bounds.x || idx.y >= view.bounds.y { return; } - let px_idx = idx.y as usize * view.bounds.x + idx.x as usize; + let px_idx = idx.y * view.bounds.x + idx.x; // generate a tiny offset for the ray for antialiasing let rng = unsafe { &mut *rand_states.add(px_idx) }; @@ -27,10 +27,10 @@ pub unsafe fn render(fb: *mut Vec3, view: Viewport, scene: &Scene, rand_states: #[kernel] pub unsafe fn scale_buffer(fb: *const Vec3, out: *mut Vec3, samples: u32, view: Viewport) { let idx_2d = thread::index_2d(); - if idx_2d.x >= view.bounds.x as u32 || idx_2d.y >= view.bounds.y as u32 { + if idx_2d.x >= view.bounds.x || idx_2d.y >= view.bounds.y { return; } - let idx = idx_2d.y as usize * view.bounds.x + idx_2d.x as usize; + let idx = idx_2d.y * view.bounds.x + idx_2d.x; let original = unsafe { &*fb.add(idx) }; let out = unsafe { &mut *out.add(idx) }; @@ -43,10 +43,10 @@ pub unsafe fn scale_buffer(fb: *const Vec3, out: *mut Vec3, samples: u32, view: #[kernel] pub unsafe fn postprocess(fb: *const Vec3, out: *mut U8Vec3, view: Viewport) { let idx_2d = thread::index_2d(); - if idx_2d.x >= view.bounds.x as u32 || idx_2d.y >= view.bounds.y as u32 { + if idx_2d.x >= view.bounds.x || idx_2d.y >= view.bounds.y { return; } - let idx = idx_2d.y as usize * view.bounds.x + idx_2d.x as usize; + let idx = idx_2d.y * view.bounds.x + idx_2d.x; let original = unsafe { &*fb.add(idx) }; let out = unsafe { &mut *out.add(idx) }; // gamma=2.0 diff --git a/crates/optix/examples/path_tracer/src/cpu/mod.rs b/crates/optix/examples/path_tracer/src/cpu/mod.rs index e5f3f1ee..d06c5485 100644 --- a/crates/optix/examples/path_tracer/src/cpu/mod.rs +++ b/crates/optix/examples/path_tracer/src/cpu/mod.rs @@ -1,6 +1,6 @@ use std::time::Duration; -use glam::{U8Vec3, USizeVec2, UVec2, Vec2, Vec3}; +use glam::{U8Vec3, USizeVec2, Vec2, Vec3}; use gpu_rand::{DefaultRand, GpuRand}; use imgui::Ui; use path_tracer_kernels::{ @@ -131,7 +131,7 @@ impl CpuRenderer { .for_each(|(idx, (px, rng))| { let x = idx % viewport.bounds.x; let y = idx / viewport.bounds.x; - let idx = UVec2::new(x as u32, y as u32); + let idx = USizeVec2::new(x, y); let offset = Vec2::from(rng.normal_f32_2()); diff --git a/crates/optix/examples/path_tracer/src/main.rs b/crates/optix/examples/path_tracer/src/main.rs index f65d75fa..da789a38 100644 --- a/crates/optix/examples/path_tracer/src/main.rs +++ b/crates/optix/examples/path_tracer/src/main.rs @@ -16,8 +16,8 @@ use path_tracer_kernels::{ }; use std::error::Error; -pub const WIDTH: u32 = 1920; -pub const HEIGHT: u32 = 1080; +pub const WIDTH: usize = 1920; +pub const HEIGHT: usize = 1080; fn main() -> Result<(), Box> { let camera = Camera { diff --git a/crates/optix/examples/path_tracer/src/viewer.rs b/crates/optix/examples/path_tracer/src/viewer.rs index 3086d078..c15884ff 100644 --- a/crates/optix/examples/path_tracer/src/viewer.rs +++ b/crates/optix/examples/path_tracer/src/viewer.rs @@ -60,11 +60,7 @@ pub fn run(camera: &Camera, scene: &Scene) -> ! { .with_inner_size(PhysicalSize::new(WIDTH as f64, HEIGHT as f64)); let cb = ContextBuilder::new().with_vsync(true); let display = Display::new(wb, cb, &event_loop).unwrap(); - let renderer = Renderer::new( - USizeVec2::new(WIDTH as usize, HEIGHT as usize), - camera, - scene, - ); + let renderer = Renderer::new(USizeVec2::new(WIDTH, HEIGHT), camera, scene); let mut viewer = ViewerRenderer::new(display, renderer); let mut last_frame = Instant::now(); @@ -93,8 +89,7 @@ impl ViewerRenderer { let size = display.gl_window().window().inner_size(); let image_size = USizeVec2::new(size.width as usize, size.height as usize); - let texture = - SrgbTexture2d::empty(&display, image_size.x as u32, image_size.y as u32).unwrap(); + let texture = SrgbTexture2d::empty(&display, size.width, size.height).unwrap(); let mut imgui_ctx = imgui::Context::create(); imgui_ctx.set_ini_filename(None); diff --git a/examples/gemm/kernels/src/gemm_naive.rs b/examples/gemm/kernels/src/gemm_naive.rs index 02ba504f..7897867b 100644 --- a/examples/gemm/kernels/src/gemm_naive.rs +++ b/examples/gemm/kernels/src/gemm_naive.rs @@ -32,15 +32,15 @@ pub unsafe fn gemm_naive( alpha: f32, beta: f32, ) { - let row = (thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x()) as usize; - let col = (thread::block_dim_y() * thread::block_idx_y() + thread::thread_idx_y()) as usize; + let row = thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x(); + let col = thread::block_dim_y() * thread::block_idx_y() + thread::thread_idx_y(); if row < m && col < n { let mut sum = 0.0f32; for i in 0..k { sum += mat_a[row * k + i] * mat_b[i * n + col]; } - let elem = unsafe { &mut *mat_c.add((row * n + col) as usize) }; + let elem = unsafe { &mut *mat_c.add(row * n + col) }; *elem = alpha * sum + beta * *elem; } } diff --git a/examples/gemm/kernels/src/gemm_tiled.rs b/examples/gemm/kernels/src/gemm_tiled.rs index 5cc172f8..378d7779 100644 --- a/examples/gemm/kernels/src/gemm_tiled.rs +++ b/examples/gemm/kernels/src/gemm_tiled.rs @@ -53,12 +53,12 @@ pub unsafe fn gemm_tiled( static mut TILE_B: [MaybeUninit; TILE_SIZE_2D] = [MaybeUninit::uninit(); TILE_SIZE_2D]; // Thread indices within the block. - let tx = thread::thread_idx_x() as usize; - let ty = thread::thread_idx_y() as usize; + let tx = thread::thread_idx_x(); + let ty = thread::thread_idx_y(); // Calculate row and column in the mat_c. - let row = thread::block_idx_x() as usize * TILE_SIZE + ty; - let col = thread::block_idx_y() as usize * TILE_SIZE + tx; + let row = thread::block_idx_x() * TILE_SIZE + ty; + let col = thread::block_idx_y() * TILE_SIZE + tx; let mut sum = 0.0f32; // Loop over tiles of mat_a and mat_b in the k dimension. diff --git a/examples/gemm/src/main.rs b/examples/gemm/src/main.rs index 531e0c7c..a57e91bd 100644 --- a/examples/gemm/src/main.rs +++ b/examples/gemm/src/main.rs @@ -365,16 +365,12 @@ pub fn gemm_naive( // This will try to maximize how much of the GPU is used by finding the best launch configuration for the // current CUDA device/architecture. let (_, block_size) = kernel.suggested_launch_configuration(0, 0.into())?; - let block_size = block_size as usize; let (block_size_x, block_size_y) = if block_size > m * n { - (block_size.div_ceil(m) as u32, m as u32) + (block_size.div_ceil(m), m) } else { - (1, block_size as u32) + (1, block_size) }; - let (grid_size_x, grid_size_y) = ( - (m as u32).div_ceil(block_size_x), - (n as u32).div_ceil(block_size_y), - ); + let (grid_size_x, grid_size_y) = (m.div_ceil(block_size_x), n.div_ceil(block_size_y)); unsafe { launch!( kernel<<< @@ -438,12 +434,12 @@ pub fn gemm_tiled( }); let kernel = &*kernel_cell; - let (grid_size_x, grid_size_y) = (n.div_ceil(TILE_SIZE) as u32, m.div_ceil(TILE_SIZE) as u32); + let (grid_size_x, grid_size_y) = (n.div_ceil(TILE_SIZE), m.div_ceil(TILE_SIZE)); unsafe { launch!( kernel<<< (grid_size_x, grid_size_y), - (TILE_SIZE as u32, TILE_SIZE as u32), + (TILE_SIZE, TILE_SIZE), 0, stream >>>( diff --git a/examples/i128_demo/kernels/src/lib.rs b/examples/i128_demo/kernels/src/lib.rs index 8598e15e..5d4a9411 100644 --- a/examples/i128_demo/kernels/src/lib.rs +++ b/examples/i128_demo/kernels/src/lib.rs @@ -18,7 +18,7 @@ pub unsafe fn i128_ops( urem_out: *mut u128, srem_out: *mut u128, ) { - let idx = thread::index_1d() as usize; + let idx = thread::index_1d(); if idx >= a.len() || idx >= b.len() { return; } diff --git a/examples/i128_demo/src/main.rs b/examples/i128_demo/src/main.rs index 3ed9f92d..a2e6e7b3 100644 --- a/examples/i128_demo/src/main.rs +++ b/examples/i128_demo/src/main.rs @@ -119,8 +119,8 @@ fn main() -> Result<(), Box> { let urem_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; let srem_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; - let block_size = 128u32; - let grid_size = (len as u32).div_ceil(block_size); + let block_size = 128usize; + let grid_size = len.div_ceil(block_size); unsafe { launch!( @@ -241,7 +241,7 @@ fn main() -> Result<(), Box> { let trap_launch = unsafe { launch!( - kernel<<<1u32, 1u32, 0, trap_stream>>>( + kernel<<<1, 1, 0, trap_stream>>>( trap_a.as_device_ptr(), trap_a.len(), trap_b.as_device_ptr(), diff --git a/examples/sha2_crates_io/kernels/src/lib.rs b/examples/sha2_crates_io/kernels/src/lib.rs index 7d5da4da..49b17f8b 100644 --- a/examples/sha2_crates_io/kernels/src/lib.rs +++ b/examples/sha2_crates_io/kernels/src/lib.rs @@ -5,7 +5,7 @@ use sha2::{Digest, Sha256, Sha512}; #[kernel] #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)] pub unsafe fn sha256_oneshot(input: &[u8], output: *mut [u8; 32]) { - let idx = thread::index_1d() as usize; + let idx = thread::index_1d(); if idx == 0 { let hash = Sha256::digest(input); @@ -21,7 +21,7 @@ pub unsafe fn sha256_oneshot(input: &[u8], output: *mut [u8; 32]) { #[kernel] #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)] pub unsafe fn sha256_incremental(input1: &[u8], input2: &[u8], output: *mut [u8; 32]) { - let idx = thread::index_1d() as usize; + let idx = thread::index_1d(); if idx == 0 { let mut hasher = Sha256::new(); @@ -40,7 +40,7 @@ pub unsafe fn sha256_incremental(input1: &[u8], input2: &[u8], output: *mut [u8; #[kernel] #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)] pub unsafe fn sha512_oneshot(input: &[u8], output: *mut [u8; 64]) { - let idx = thread::index_1d() as usize; + let idx = thread::index_1d(); if idx == 0 { let hash = Sha512::digest(input); @@ -56,7 +56,7 @@ pub unsafe fn sha512_oneshot(input: &[u8], output: *mut [u8; 64]) { #[kernel] #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)] pub unsafe fn sha512_incremental(input: &[u8], output: *mut [u8; 64]) { - let idx = thread::index_1d() as usize; + let idx = thread::index_1d(); if idx == 0 { let mut hasher = Sha512::new(); diff --git a/examples/vecadd/kernels/src/lib.rs b/examples/vecadd/kernels/src/lib.rs index ba1c3038..2020b256 100644 --- a/examples/vecadd/kernels/src/lib.rs +++ b/examples/vecadd/kernels/src/lib.rs @@ -3,7 +3,7 @@ use cuda_std::prelude::*; #[kernel] #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)] pub unsafe fn vecadd(a: &[f32], b: &[f32], c: *mut f32) { - let idx = thread::index_1d() as usize; + let idx = thread::index_1d(); if idx < a.len() { let elem = unsafe { &mut *c.add(idx) }; *elem = a[idx] + b[idx]; diff --git a/examples/vecadd/src/main.rs b/examples/vecadd/src/main.rs index cf804d0c..73260c59 100644 --- a/examples/vecadd/src/main.rs +++ b/examples/vecadd/src/main.rs @@ -45,7 +45,7 @@ fn main() -> Result<(), Box> { // current CUDA device/architecture. let (_, block_size) = vecadd.suggested_launch_configuration(0, 0.into())?; - let grid_size = (NUMBERS_LEN as u32).div_ceil(block_size); + let grid_size = NUMBERS_LEN.div_ceil(block_size); println!("using {grid_size} blocks and {block_size} threads per block"); diff --git a/guide/src/guide/getting_started.md b/guide/src/guide/getting_started.md index 762e17cb..1206bb3a 100644 --- a/guide/src/guide/getting_started.md +++ b/guide/src/guide/getting_started.md @@ -119,7 +119,7 @@ pub type T = f32; #[kernel] #[allow(improper_ctypes_definitions)] pub unsafe fn add(a: &[T], b: &[T], c: *mut T) { - let i = thread::index_1d() as usize; + let i = thread::index_1d(); if i < a.len() { let elem = unsafe { &mut *c.add(i) }; *elem = a[i] + b[i]; diff --git a/samples/introduction/async_api/kernels/src/lib.rs b/samples/introduction/async_api/kernels/src/lib.rs index ffcc07d3..1d98c9a3 100644 --- a/samples/introduction/async_api/kernels/src/lib.rs +++ b/samples/introduction/async_api/kernels/src/lib.rs @@ -8,9 +8,9 @@ use cuda_std::prelude::*; pub unsafe fn increment(g_data: *mut u32, inc_value: u32) { // This can also be obtained directly as // - // let idx: usize = cuda_std::thread::index() as usize; - let idx: usize = (cuda_std::thread::block_dim().x * cuda_std::thread::block_idx().x - + cuda_std::thread::thread_idx().x) as usize; + // let idx: usize = cuda_std::thread::index(); + let idx: usize = cuda_std::thread::block_dim().x * cuda_std::thread::block_idx().x + + cuda_std::thread::thread_idx().x; let elem: &mut u32 = unsafe { &mut *g_data.add(idx) }; *elem += inc_value; diff --git a/samples/introduction/async_api/src/main.rs b/samples/introduction/async_api/src/main.rs index 063efbd3..9ddc31c9 100644 --- a/samples/introduction/async_api/src/main.rs +++ b/samples/introduction/async_api/src/main.rs @@ -37,7 +37,7 @@ fn main() -> Result<(), cust::error::CudaError> { let value = 26; let blocks = BlockSize::xy(512, 1); - let grids = GridSize::xy((N / (blocks.x as usize)).try_into().unwrap(), 1); + let grids = GridSize::xy(N / blocks.x, 1); let start_event = Event::new(EventFlags::DEFAULT)?; let stop_event = Event::new(EventFlags::DEFAULT)?; diff --git a/tests/compiletests/ui/shared/shared_memory.rs b/tests/compiletests/ui/shared/shared_memory.rs index a9ed3b7b..fd5e53da 100644 --- a/tests/compiletests/ui/shared/shared_memory.rs +++ b/tests/compiletests/ui/shared/shared_memory.rs @@ -13,7 +13,7 @@ pub unsafe fn test_static_shared_memory() { #[address_space(shared)] static mut SHARED_DATA: [MaybeUninit; 256] = [MaybeUninit::uninit(); 256]; - let tid = thread::thread_idx_x() as usize; + let tid = thread::thread_idx_x(); // Write to shared memory unsafe { SHARED_DATA[tid] }.write(tid as i32);