//! vary by device. Query device properties when you need exact limits.
//!
6868use cuda_std_macros:: gpu_only;
69- use glam:: { UVec2 , UVec3 } ;
69+ use glam:: { USizeVec2 , USizeVec3 } ;
7070
7171// different calling conventions dont exist in nvptx, so we just use C as a placeholder.
7272unsafe extern "C" {
@@ -99,116 +99,116 @@ macro_rules! in_range {
9999
100100#[ gpu_only]
101101#[ inline( always) ]
102- pub fn thread_idx_x ( ) -> u32 {
102+ pub fn thread_idx_x ( ) -> usize {
103103 // The range is derived from the `block_idx_x` range.
104- in_range ! ( core:: arch:: nvptx:: _thread_idx_x, 0 ..1024 )
104+ in_range ! ( core:: arch:: nvptx:: _thread_idx_x, 0 ..1024 ) as usize
105105}
106106
107107#[ gpu_only]
108108#[ inline( always) ]
109- pub fn thread_idx_y ( ) -> u32 {
109+ pub fn thread_idx_y ( ) -> usize {
110110 // The range is derived from the `block_idx_y` range.
111- in_range ! ( core:: arch:: nvptx:: _thread_idx_y, 0 ..1024 )
111+ in_range ! ( core:: arch:: nvptx:: _thread_idx_y, 0 ..1024 ) as usize
112112}
113113
114114#[ gpu_only]
115115#[ inline( always) ]
116- pub fn thread_idx_z ( ) -> u32 {
116+ pub fn thread_idx_z ( ) -> usize {
117117 // The range is derived from the `block_idx_z` range.
118- in_range ! ( core:: arch:: nvptx:: _thread_idx_z, 0 ..64 )
118+ in_range ! ( core:: arch:: nvptx:: _thread_idx_z, 0 ..64 ) as usize
119119}
120120
121121#[ gpu_only]
122122#[ inline( always) ]
123- pub fn block_idx_x ( ) -> u32 {
123+ pub fn block_idx_x ( ) -> usize {
124124 // The range is derived from the `grid_idx_x` range.
125- in_range ! ( core:: arch:: nvptx:: _block_idx_x, 0 ..2147483647 )
125+ in_range ! ( core:: arch:: nvptx:: _block_idx_x, 0 ..2147483647 ) as usize
126126}
127127
128128#[ gpu_only]
129129#[ inline( always) ]
130- pub fn block_idx_y ( ) -> u32 {
130+ pub fn block_idx_y ( ) -> usize {
131131 // The range is derived from the `grid_idx_y` range.
132- in_range ! ( core:: arch:: nvptx:: _block_idx_y, 0 ..65535 )
132+ in_range ! ( core:: arch:: nvptx:: _block_idx_y, 0 ..65535 ) as usize
133133}
134134
135135#[ gpu_only]
136136#[ inline( always) ]
137- pub fn block_idx_z ( ) -> u32 {
137+ pub fn block_idx_z ( ) -> usize {
138138 // The range is derived from the `grid_idx_z` range.
139- in_range ! ( core:: arch:: nvptx:: _block_idx_z, 0 ..65535 )
139+ in_range ! ( core:: arch:: nvptx:: _block_idx_z, 0 ..65535 ) as usize
140140}
141141
142142#[ gpu_only]
143143#[ inline( always) ]
144- pub fn block_dim_x ( ) -> u32 {
144+ pub fn block_dim_x ( ) -> usize {
145145 // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
146- in_range ! ( core:: arch:: nvptx:: _block_dim_x, 1 ..=1024 )
146+ in_range ! ( core:: arch:: nvptx:: _block_dim_x, 1 ..=1024 ) as usize
147147}
148148
149149#[ gpu_only]
150150#[ inline( always) ]
151- pub fn block_dim_y ( ) -> u32 {
151+ pub fn block_dim_y ( ) -> usize {
152152 // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
153- in_range ! ( core:: arch:: nvptx:: _block_dim_y, 1 ..=1024 )
153+ in_range ! ( core:: arch:: nvptx:: _block_dim_y, 1 ..=1024 ) as usize
154154}
155155
156156#[ gpu_only]
157157#[ inline( always) ]
158- pub fn block_dim_z ( ) -> u32 {
158+ pub fn block_dim_z ( ) -> usize {
159159 // CUDA Compute Capabilities: "Maximum z-dimension of a block" is 64.
160- in_range ! ( core:: arch:: nvptx:: _block_dim_z, 1 ..=64 )
160+ in_range ! ( core:: arch:: nvptx:: _block_dim_z, 1 ..=64 ) as usize
161161}
162162
163163#[ gpu_only]
164164#[ inline( always) ]
165- pub fn grid_dim_x ( ) -> u32 {
165+ pub fn grid_dim_x ( ) -> usize {
166166 // CUDA Compute Capabilities: "Maximum x-dimension of a grid of thread blocks" is 2^32 - 1.
167- in_range ! ( core:: arch:: nvptx:: _grid_dim_x, 1 ..=2147483647 )
167+ in_range ! ( core:: arch:: nvptx:: _grid_dim_x, 1 ..=2147483647 ) as usize
168168}
169169
170170#[ gpu_only]
171171#[ inline( always) ]
172- pub fn grid_dim_y ( ) -> u32 {
172+ pub fn grid_dim_y ( ) -> usize {
173173 // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
174- in_range ! ( core:: arch:: nvptx:: _grid_dim_y, 1 ..=65535 )
174+ in_range ! ( core:: arch:: nvptx:: _grid_dim_y, 1 ..=65535 ) as usize
175175}
176176
177177#[ gpu_only]
178178#[ inline( always) ]
179- pub fn grid_dim_z ( ) -> u32 {
179+ pub fn grid_dim_z ( ) -> usize {
180180 // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
181- in_range ! ( core:: arch:: nvptx:: _grid_dim_z, 1 ..=65535 )
181+ in_range ! ( core:: arch:: nvptx:: _grid_dim_z, 1 ..=65535 ) as usize
182182}
183183
184184/// Gets the 3d index of the thread currently executing the kernel.
185185#[ gpu_only]
186186#[ inline( always) ]
187- pub fn thread_idx ( ) -> UVec3 {
188- UVec3 :: new ( thread_idx_x ( ) , thread_idx_y ( ) , thread_idx_z ( ) )
187+ pub fn thread_idx ( ) -> USizeVec3 {
188+ USizeVec3 :: new ( thread_idx_x ( ) , thread_idx_y ( ) , thread_idx_z ( ) )
189189}
190190
191191/// Gets the 3d index of the block that the thread currently executing the kernel is located in.
192192#[ gpu_only]
193193#[ inline( always) ]
194- pub fn block_idx ( ) -> UVec3 {
195- UVec3 :: new ( block_idx_x ( ) , block_idx_y ( ) , block_idx_z ( ) )
194+ pub fn block_idx ( ) -> USizeVec3 {
195+ USizeVec3 :: new ( block_idx_x ( ) , block_idx_y ( ) , block_idx_z ( ) )
196196}
197197
198198/// Gets the 3d layout of the thread blocks executing this kernel. In other words,
199199/// how many threads exist in each thread block in every direction.
200200#[ gpu_only]
201201#[ inline( always) ]
202- pub fn block_dim ( ) -> UVec3 {
203- UVec3 :: new ( block_dim_x ( ) , block_dim_y ( ) , block_dim_z ( ) )
202+ pub fn block_dim ( ) -> USizeVec3 {
203+ USizeVec3 :: new ( block_dim_x ( ) , block_dim_y ( ) , block_dim_z ( ) )
204204}
205205
206206/// Gets the 3d layout of the block grids executing this kernel. In other words,
207207/// how many thread blocks exist in each grid in every direction.
208208#[ gpu_only]
209209#[ inline( always) ]
210- pub fn grid_dim ( ) -> UVec3 {
211- UVec3 :: new ( grid_dim_x ( ) , grid_dim_y ( ) , grid_dim_z ( ) )
210+ pub fn grid_dim ( ) -> USizeVec3 {
211+ USizeVec3 :: new ( grid_dim_x ( ) , grid_dim_y ( ) , grid_dim_z ( ) )
212212}
213213
214214/// Gets the overall thread index, accounting for 1d/2d/3d block/grid dimensions. This
@@ -220,7 +220,7 @@ pub fn grid_dim() -> UVec3 {
220220#[ gpu_only]
221221#[ rustfmt:: skip]
222222#[ inline( always) ]
223- pub fn index ( ) -> u32 {
223+ pub fn index ( ) -> usize {
224224 let grid_dim = grid_dim ( ) ;
225225 let block_idx = block_idx ( ) ;
226226 let block_dim = block_dim ( ) ;
@@ -235,31 +235,31 @@ pub fn index() -> u32 {
235235}
236236
237237#[ inline( always) ]
238- pub fn index_1d ( ) -> u32 {
239- thread_idx_x ( ) as u32 + block_idx_x ( ) as u32 * block_dim_x ( ) as u32
238+ pub fn index_1d ( ) -> usize {
239+ thread_idx_x ( ) + block_idx_x ( ) * block_dim_x ( )
240240}
241241
242242#[ inline( always) ]
243- pub fn index_2d ( ) -> UVec2 {
243+ pub fn index_2d ( ) -> USizeVec2 {
244244 let i = thread_idx_x ( ) + block_idx_x ( ) * block_dim_x ( ) ;
245245 let j = thread_idx_y ( ) + block_idx_y ( ) * block_dim_y ( ) ;
246- UVec2 :: new ( i, j)
246+ USizeVec2 :: new ( i, j)
247247}
248248
249249#[ inline( always) ]
250- pub fn index_3d ( ) -> UVec3 {
250+ pub fn index_3d ( ) -> USizeVec3 {
251251 let i = thread_idx_x ( ) + block_idx_x ( ) * block_dim_x ( ) ;
252252 let j = thread_idx_y ( ) + block_idx_y ( ) * block_dim_y ( ) ;
253253 let k = thread_idx_z ( ) + block_idx_z ( ) * block_dim_z ( ) ;
254- UVec3 :: new ( i, j, k)
254+ USizeVec3 :: new ( i, j, k)
255255}
256256
257257/// Whether this is the first thread (not the first thread to be executing). This function is guaranteed
258258/// to only return true in a single thread that is invoking it. This is useful for only doing something
259259/// once.
260260#[ inline( always) ]
261261pub fn first ( ) -> bool {
262- block_idx ( ) == UVec3 :: ZERO && thread_idx ( ) == UVec3 :: ZERO
262+ block_idx ( ) == USizeVec3 :: ZERO && thread_idx ( ) == USizeVec3 :: ZERO
263263}
264264
/// Gets the number of threads inside of a warp. Currently 32 threads on every GPU architecture.
0 commit comments