Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 30 additions & 50 deletions crates/cuda_std/src/rt/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,69 +107,59 @@ macro_rules! launch {
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GridSize {
/// Width of grid in blocks
pub x: u32,
pub x: usize,
/// Height of grid in blocks
pub y: u32,
pub y: usize,
/// Depth of grid in blocks
pub z: u32,
pub z: usize,
}
impl GridSize {
/// Create a one-dimensional grid of `x` blocks
#[inline]
pub fn x(x: u32) -> GridSize {
pub fn x(x: usize) -> GridSize {
GridSize { x, y: 1, z: 1 }
}

/// Create a two-dimensional grid of `x * y` blocks
#[inline]
pub fn xy(x: u32, y: u32) -> GridSize {
pub fn xy(x: usize, y: usize) -> GridSize {
GridSize { x, y, z: 1 }
}

/// Create a three-dimensional grid of `x * y * z` blocks
#[inline]
pub fn xyz(x: u32, y: u32, z: u32) -> GridSize {
pub fn xyz(x: usize, y: usize, z: usize) -> GridSize {
GridSize { x, y, z }
}
}
impl From<u32> for GridSize {
fn from(x: u32) -> GridSize {
impl From<usize> for GridSize {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe consider keeping the old impl, if that does not cause any issues(to allow casting from u32 too).

IDK if this would be of any worth.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about it but none of the existing example code requires it so it doesn't seem useful. We can add it back if necessary.

fn from(x: usize) -> GridSize {
GridSize::x(x)
}
}
impl From<(u32, u32)> for GridSize {
fn from((x, y): (u32, u32)) -> GridSize {
impl From<(usize, usize)> for GridSize {
fn from((x, y): (usize, usize)) -> GridSize {
GridSize::xy(x, y)
}
}
impl From<(u32, u32, u32)> for GridSize {
fn from((x, y, z): (u32, u32, u32)) -> GridSize {
impl From<(usize, usize, usize)> for GridSize {
fn from((x, y, z): (usize, usize, usize)) -> GridSize {
GridSize::xyz(x, y, z)
}
}
impl<'a> From<&'a GridSize> for GridSize {
impl From<&GridSize> for GridSize {
fn from(other: &GridSize) -> GridSize {
other.clone()
}
}
impl From<glam::UVec2> for GridSize {
fn from(vec: glam::UVec2) -> Self {
GridSize::xy(vec.x, vec.y)
}
}
impl From<glam::UVec3> for GridSize {
fn from(vec: glam::UVec3) -> Self {
GridSize::xyz(vec.x, vec.y, vec.z)
}
}
impl From<glam::USizeVec2> for GridSize {
fn from(vec: glam::USizeVec2) -> Self {
GridSize::xy(vec.x as u32, vec.y as u32)
GridSize::xy(vec.x, vec.y)
}
}
impl From<glam::USizeVec3> for GridSize {
fn from(vec: glam::USizeVec3) -> Self {
GridSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
GridSize::xyz(vec.x, vec.y, vec.z)
}
}

Expand All @@ -183,68 +173,58 @@ impl From<glam::USizeVec3> for GridSize {
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BlockSize {
/// X dimension of each thread block
pub x: u32,
pub x: usize,
/// Y dimension of each thread block
pub y: u32,
pub y: usize,
/// Z dimension of each thread block
pub z: u32,
pub z: usize,
}
impl BlockSize {
/// Create a one-dimensional block of `x` threads
#[inline]
pub fn x(x: u32) -> BlockSize {
pub fn x(x: usize) -> BlockSize {
BlockSize { x, y: 1, z: 1 }
}

/// Create a two-dimensional block of `x * y` threads
#[inline]
pub fn xy(x: u32, y: u32) -> BlockSize {
pub fn xy(x: usize, y: usize) -> BlockSize {
BlockSize { x, y, z: 1 }
}

/// Create a three-dimensional block of `x * y * z` threads
#[inline]
pub fn xyz(x: u32, y: u32, z: u32) -> BlockSize {
pub fn xyz(x: usize, y: usize, z: usize) -> BlockSize {
BlockSize { x, y, z }
}
}
impl From<u32> for BlockSize {
fn from(x: u32) -> BlockSize {
impl From<usize> for BlockSize {
fn from(x: usize) -> BlockSize {
BlockSize::x(x)
}
}
impl From<(u32, u32)> for BlockSize {
fn from((x, y): (u32, u32)) -> BlockSize {
impl From<(usize, usize)> for BlockSize {
fn from((x, y): (usize, usize)) -> BlockSize {
BlockSize::xy(x, y)
}
}
impl From<(u32, u32, u32)> for BlockSize {
fn from((x, y, z): (u32, u32, u32)) -> BlockSize {
impl From<(usize, usize, usize)> for BlockSize {
fn from((x, y, z): (usize, usize, usize)) -> BlockSize {
BlockSize::xyz(x, y, z)
}
}
impl<'a> From<&'a BlockSize> for BlockSize {
impl From<&BlockSize> for BlockSize {
fn from(other: &BlockSize) -> BlockSize {
other.clone()
}
}
impl From<glam::UVec2> for BlockSize {
fn from(vec: glam::UVec2) -> Self {
BlockSize::xy(vec.x, vec.y)
}
}
impl From<glam::UVec3> for BlockSize {
fn from(vec: glam::UVec3) -> Self {
BlockSize::xyz(vec.x, vec.y, vec.z)
}
}
impl From<glam::USizeVec2> for BlockSize {
fn from(vec: glam::USizeVec2) -> Self {
BlockSize::xy(vec.x as u32, vec.y as u32)
BlockSize::xy(vec.x, vec.y)
}
}
impl From<glam::USizeVec3> for BlockSize {
fn from(vec: glam::USizeVec3) -> Self {
BlockSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
BlockSize::xyz(vec.x, vec.y, vec.z)
}
}
94 changes: 47 additions & 47 deletions crates/cuda_std/src/thread.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
//! vary by device. Query device properties when you need exact limits.
//!
use cuda_std_macros::gpu_only;
use glam::{UVec2, UVec3};
use glam::{USizeVec2, USizeVec3};

// different calling conventions don't exist in nvptx, so we just use C as a placeholder.
unsafe extern "C" {
Expand Down Expand Up @@ -99,116 +99,116 @@ macro_rules! in_range {

#[gpu_only]
#[inline(always)]
pub fn thread_idx_x() -> u32 {
// The range is derived from the `block_idx_x` range.
in_range!(core::arch::nvptx::_thread_idx_x, 0..1024)
pub fn thread_idx_x() -> usize {
// The range is derived from the `block_dim_x` range.
in_range!(core::arch::nvptx::_thread_idx_x, 0..1024) as usize
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated to current change, but I find the range here suspicious:
https://docs.nvidia.com/cuda/cuda-c-programming-guide/#thread-hierarchy

There is a limit to the number of threads per block, since all threads of a block are expected to reside on the same streaming multiprocessor core and must share the limited memory resources of that core. On current GPUs, a thread block may contain up to 1024 threads.
However, a kernel can be executed by multiple equally-shaped thread blocks, so that the total number of threads is equal to the number of threads per block times the number of blocks.

The "current GPUs" suggests this is not an API promise, but merely the current highest value of this. Can this rise in the future? What happens then?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See table 30 in this section for a more specific description of the limits here. You are right that it's not a guarantee, but the relevant numbers haven't changed from Compute Capability 5.0 all the way to 12.x.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at this I realize that some of these comments have an error in them. I'll fix that.

}

#[gpu_only]
#[inline(always)]
pub fn thread_idx_y() -> u32 {
// The range is derived from the `block_idx_y` range.
in_range!(core::arch::nvptx::_thread_idx_y, 0..1024)
pub fn thread_idx_y() -> usize {
// The range is derived from the `block_dim_y` range.
in_range!(core::arch::nvptx::_thread_idx_y, 0..1024) as usize
}

#[gpu_only]
#[inline(always)]
pub fn thread_idx_z() -> u32 {
// The range is derived from the `block_idx_z` range.
in_range!(core::arch::nvptx::_thread_idx_z, 0..64)
pub fn thread_idx_z() -> usize {
// The range is derived from the `block_dim_z` range.
in_range!(core::arch::nvptx::_thread_idx_z, 0..64) as usize
}

#[gpu_only]
#[inline(always)]
pub fn block_idx_x() -> u32 {
// The range is derived from the `grid_idx_x` range.
in_range!(core::arch::nvptx::_block_idx_x, 0..2147483647)
pub fn block_idx_x() -> usize {
// The range is derived from the `grid_dim_x` range.
in_range!(core::arch::nvptx::_block_idx_x, 0..2147483647) as usize
}

#[gpu_only]
#[inline(always)]
pub fn block_idx_y() -> u32 {
// The range is derived from the `grid_idx_y` range.
in_range!(core::arch::nvptx::_block_idx_y, 0..65535)
pub fn block_idx_y() -> usize {
// The range is derived from the `grid_dim_y` range.
in_range!(core::arch::nvptx::_block_idx_y, 0..65535) as usize
}

#[gpu_only]
#[inline(always)]
pub fn block_idx_z() -> u32 {
// The range is derived from the `grid_idx_z` range.
in_range!(core::arch::nvptx::_block_idx_z, 0..65535)
pub fn block_idx_z() -> usize {
// The range is derived from the `grid_dim_z` range.
in_range!(core::arch::nvptx::_block_idx_z, 0..65535) as usize
}

#[gpu_only]
#[inline(always)]
pub fn block_dim_x() -> u32 {
pub fn block_dim_x() -> usize {
// CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
in_range!(core::arch::nvptx::_block_dim_x, 1..=1024)
in_range!(core::arch::nvptx::_block_dim_x, 1..=1024) as usize
}

#[gpu_only]
#[inline(always)]
pub fn block_dim_y() -> u32 {
pub fn block_dim_y() -> usize {
// CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
in_range!(core::arch::nvptx::_block_dim_y, 1..=1024)
in_range!(core::arch::nvptx::_block_dim_y, 1..=1024) as usize
}

#[gpu_only]
#[inline(always)]
pub fn block_dim_z() -> u32 {
pub fn block_dim_z() -> usize {
// CUDA Compute Capabilities: "Maximum z-dimension of a block" is 64.
in_range!(core::arch::nvptx::_block_dim_z, 1..=64)
in_range!(core::arch::nvptx::_block_dim_z, 1..=64) as usize
}

#[gpu_only]
#[inline(always)]
pub fn grid_dim_x() -> u32 {
pub fn grid_dim_x() -> usize {
// CUDA Compute Capabilities: "Maximum x-dimension of a grid of thread blocks" is 2^31 - 1.
in_range!(core::arch::nvptx::_grid_dim_x, 1..=2147483647)
in_range!(core::arch::nvptx::_grid_dim_x, 1..=2147483647) as usize
}

#[gpu_only]
#[inline(always)]
pub fn grid_dim_y() -> u32 {
pub fn grid_dim_y() -> usize {
// CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
in_range!(core::arch::nvptx::_grid_dim_y, 1..=65535)
in_range!(core::arch::nvptx::_grid_dim_y, 1..=65535) as usize
}

#[gpu_only]
#[inline(always)]
pub fn grid_dim_z() -> u32 {
pub fn grid_dim_z() -> usize {
// CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
in_range!(core::arch::nvptx::_grid_dim_z, 1..=65535)
in_range!(core::arch::nvptx::_grid_dim_z, 1..=65535) as usize
}

/// Gets the 3d index of the thread currently executing the kernel.
#[gpu_only]
#[inline(always)]
pub fn thread_idx() -> UVec3 {
UVec3::new(thread_idx_x(), thread_idx_y(), thread_idx_z())
pub fn thread_idx() -> USizeVec3 {
USizeVec3::new(thread_idx_x(), thread_idx_y(), thread_idx_z())
}

/// Gets the 3d index of the block that the thread currently executing the kernel is located in.
#[gpu_only]
#[inline(always)]
pub fn block_idx() -> UVec3 {
UVec3::new(block_idx_x(), block_idx_y(), block_idx_z())
pub fn block_idx() -> USizeVec3 {
USizeVec3::new(block_idx_x(), block_idx_y(), block_idx_z())
}

/// Gets the 3d layout of the thread blocks executing this kernel. In other words,
/// how many threads exist in each thread block in every direction.
#[gpu_only]
#[inline(always)]
pub fn block_dim() -> UVec3 {
UVec3::new(block_dim_x(), block_dim_y(), block_dim_z())
pub fn block_dim() -> USizeVec3 {
USizeVec3::new(block_dim_x(), block_dim_y(), block_dim_z())
}

/// Gets the 3d layout of the block grids executing this kernel. In other words,
/// how many thread blocks exist in each grid in every direction.
#[gpu_only]
#[inline(always)]
pub fn grid_dim() -> UVec3 {
UVec3::new(grid_dim_x(), grid_dim_y(), grid_dim_z())
pub fn grid_dim() -> USizeVec3 {
USizeVec3::new(grid_dim_x(), grid_dim_y(), grid_dim_z())
}

/// Gets the overall thread index, accounting for 1d/2d/3d block/grid dimensions. This
Expand All @@ -220,7 +220,7 @@ pub fn grid_dim() -> UVec3 {
#[gpu_only]
#[rustfmt::skip]
#[inline(always)]
pub fn index() -> u32 {
pub fn index() -> usize {
let grid_dim = grid_dim();
let block_idx = block_idx();
let block_dim = block_dim();
Expand All @@ -235,31 +235,31 @@ pub fn index() -> u32 {
}

#[inline(always)]
pub fn index_1d() -> u32 {
thread_idx_x() as u32 + block_idx_x() as u32 * block_dim_x() as u32
pub fn index_1d() -> usize {
thread_idx_x() + block_idx_x() * block_dim_x()
}

#[inline(always)]
pub fn index_2d() -> UVec2 {
pub fn index_2d() -> USizeVec2 {
let i = thread_idx_x() + block_idx_x() * block_dim_x();
let j = thread_idx_y() + block_idx_y() * block_dim_y();
UVec2::new(i, j)
USizeVec2::new(i, j)
}

#[inline(always)]
pub fn index_3d() -> UVec3 {
pub fn index_3d() -> USizeVec3 {
let i = thread_idx_x() + block_idx_x() * block_dim_x();
let j = thread_idx_y() + block_idx_y() * block_dim_y();
let k = thread_idx_z() + block_idx_z() * block_dim_z();
UVec3::new(i, j, k)
USizeVec3::new(i, j, k)
}

/// Whether this is the first thread (not the first thread to be executing). This function is guaranteed
/// to only return true in a single thread that is invoking it. This is useful for only doing something
/// once.
#[inline(always)]
pub fn first() -> bool {
block_idx() == UVec3::ZERO && thread_idx() == UVec3::ZERO
block_idx() == USizeVec3::ZERO && thread_idx() == USizeVec3::ZERO
}

/// Gets the number of threads inside of a warp. Currently 32 threads on every GPU architecture.
Expand Down
2 changes: 1 addition & 1 deletion crates/cuda_std_macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ pub fn externally_visible(
/// pub unsafe fn reverse_array(d: *mut u32, n: usize) {
/// ##[address_space(shared)]
/// static mut S: [MaybeUninit<u32>; 64] = [const { MaybeUninit::uninit() }; 64];
/// let i = thread::thread_idx_x() as usize;
/// let i = thread::thread_idx_x();
/// let ir = n - i - 1;
/// unsafe { S[i].write(*d.add(i)); };
/// thread::sync_threads();
Expand Down
Loading