diff --git a/crates/cuda_std/src/rt/mod.rs b/crates/cuda_std/src/rt/mod.rs
index ed6440ab..35bc2d43 100644
--- a/crates/cuda_std/src/rt/mod.rs
+++ b/crates/cuda_std/src/rt/mod.rs
@@ -107,69 +107,59 @@ macro_rules! launch {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct GridSize {
     /// Width of grid in blocks
-    pub x: u32,
+    pub x: usize,
     /// Height of grid in blocks
-    pub y: u32,
+    pub y: usize,
     /// Depth of grid in blocks
-    pub z: u32,
+    pub z: usize,
 }
 impl GridSize {
     /// Create a one-dimensional grid of `x` blocks
     #[inline]
-    pub fn x(x: u32) -> GridSize {
+    pub fn x(x: usize) -> GridSize {
         GridSize { x, y: 1, z: 1 }
     }
 
     /// Create a two-dimensional grid of `x * y` blocks
     #[inline]
-    pub fn xy(x: u32, y: u32) -> GridSize {
+    pub fn xy(x: usize, y: usize) -> GridSize {
         GridSize { x, y, z: 1 }
     }
 
     /// Create a three-dimensional grid of `x * y * z` blocks
     #[inline]
-    pub fn xyz(x: u32, y: u32, z: u32) -> GridSize {
+    pub fn xyz(x: usize, y: usize, z: usize) -> GridSize {
         GridSize { x, y, z }
     }
 }
-impl From<u32> for GridSize {
-    fn from(x: u32) -> GridSize {
+impl From<usize> for GridSize {
+    fn from(x: usize) -> GridSize {
         GridSize::x(x)
     }
 }
-impl From<(u32, u32)> for GridSize {
-    fn from((x, y): (u32, u32)) -> GridSize {
+impl From<(usize, usize)> for GridSize {
+    fn from((x, y): (usize, usize)) -> GridSize {
         GridSize::xy(x, y)
     }
 }
-impl From<(u32, u32, u32)> for GridSize {
-    fn from((x, y, z): (u32, u32, u32)) -> GridSize {
+impl From<(usize, usize, usize)> for GridSize {
+    fn from((x, y, z): (usize, usize, usize)) -> GridSize {
         GridSize::xyz(x, y, z)
     }
 }
-impl<'a> From<&'a GridSize> for GridSize {
+impl From<&GridSize> for GridSize {
     fn from(other: &GridSize) -> GridSize {
         other.clone()
     }
 }
-impl From<glam::UVec2> for GridSize {
-    fn from(vec: glam::UVec2) -> Self {
-        GridSize::xy(vec.x, vec.y)
-    }
-}
-impl From<glam::UVec3> for GridSize {
-    fn from(vec: glam::UVec3) -> Self {
-        GridSize::xyz(vec.x, vec.y, vec.z)
-    }
-}
 impl From<glam::USizeVec2> for GridSize {
     fn from(vec: glam::USizeVec2) -> Self {
-        GridSize::xy(vec.x as u32, vec.y as u32)
+        GridSize::xy(vec.x, vec.y)
     }
 }
 impl From<glam::USizeVec3> for GridSize {
     fn from(vec: glam::USizeVec3) -> Self {
-        GridSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
+        GridSize::xyz(vec.x, vec.y, vec.z)
     }
 }
 
@@ -183,68 +173,58 @@ impl From<glam::USizeVec3> for GridSize {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct BlockSize {
     /// X dimension of each thread block
-    pub x: u32,
+    pub x: usize,
     /// Y dimension of each thread block
-    pub y: u32,
+    pub y: usize,
     /// Z dimension of each thread block
-    pub z: u32,
+    pub z: usize,
 }
 impl BlockSize {
     /// Create a one-dimensional block of `x` threads
     #[inline]
-    pub fn x(x: u32) -> BlockSize {
+    pub fn x(x: usize) -> BlockSize {
         BlockSize { x, y: 1, z: 1 }
     }
 
     /// Create a two-dimensional block of `x * y` threads
     #[inline]
-    pub fn xy(x: u32, y: u32) -> BlockSize {
+    pub fn xy(x: usize, y: usize) -> BlockSize {
         BlockSize { x, y, z: 1 }
     }
 
     /// Create a three-dimensional block of `x * y * z` threads
     #[inline]
-    pub fn xyz(x: u32, y: u32, z: u32) -> BlockSize {
+    pub fn xyz(x: usize, y: usize, z: usize) -> BlockSize {
         BlockSize { x, y, z }
     }
 }
-impl From<u32> for BlockSize {
-    fn from(x: u32) -> BlockSize {
+impl From<usize> for BlockSize {
+    fn from(x: usize) -> BlockSize {
         BlockSize::x(x)
     }
 }
-impl From<(u32, u32)> for BlockSize {
-    fn from((x, y): (u32, u32)) -> BlockSize {
+impl From<(usize, usize)> for BlockSize {
+    fn from((x, y): (usize, usize)) -> BlockSize {
         BlockSize::xy(x, y)
     }
 }
-impl From<(u32, u32, u32)> for BlockSize {
-    fn from((x, y, z): (u32, u32, u32)) -> BlockSize {
+impl From<(usize, usize, usize)> for BlockSize {
+    fn from((x, y, z): (usize, usize, usize)) -> BlockSize {
         BlockSize::xyz(x, y, z)
     }
 }
-impl<'a> From<&'a BlockSize> for BlockSize {
+impl From<&BlockSize> for BlockSize {
     fn from(other: &BlockSize) -> BlockSize {
         other.clone()
     }
 }
-impl From<glam::UVec2> for BlockSize {
-    fn from(vec: glam::UVec2) -> Self {
-        BlockSize::xy(vec.x, vec.y)
-    }
-}
-impl From<glam::UVec3> for BlockSize {
-    fn from(vec: glam::UVec3) -> Self {
-        BlockSize::xyz(vec.x, vec.y, vec.z)
-    }
-}
 impl From<glam::USizeVec2> for BlockSize {
     fn from(vec: glam::USizeVec2) -> Self {
-        BlockSize::xy(vec.x as u32, vec.y as u32)
+        BlockSize::xy(vec.x, vec.y)
     }
 }
 impl From<glam::USizeVec3> for BlockSize {
     fn from(vec: glam::USizeVec3) -> Self {
-        BlockSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
+        BlockSize::xyz(vec.x, vec.y, vec.z)
     }
 }
diff --git a/crates/cuda_std/src/thread.rs b/crates/cuda_std/src/thread.rs
index 8df6f1b8..a772c351 100644
--- a/crates/cuda_std/src/thread.rs
+++ b/crates/cuda_std/src/thread.rs
@@ -66,7 +66,7 @@
 //! vary by device. Query device properties when you need exact limits.
 //!
 use cuda_std_macros::gpu_only;
-use glam::{UVec2, UVec3};
+use glam::{USizeVec2, USizeVec3};
 
 // different calling conventions dont exist in nvptx, so we just use C as a placeholder.
 unsafe extern "C" {
@@ -99,116 +99,116 @@ macro_rules! in_range {
 
 #[gpu_only]
 #[inline(always)]
-pub fn thread_idx_x() -> u32 {
-    // The range is derived from the `block_idx_x` range.
-    in_range!(core::arch::nvptx::_thread_idx_x, 0..1024)
+pub fn thread_idx_x() -> usize {
+    // The range is derived from the `block_dim_x` range.
+    in_range!(core::arch::nvptx::_thread_idx_x, 0..1024) as usize
 }
 
 #[gpu_only]
 #[inline(always)]
-pub fn thread_idx_y() -> u32 {
-    // The range is derived from the `block_idx_y` range.
-    in_range!(core::arch::nvptx::_thread_idx_y, 0..1024)
+pub fn thread_idx_y() -> usize {
+    // The range is derived from the `block_dim_y` range.
+    in_range!(core::arch::nvptx::_thread_idx_y, 0..1024) as usize
 }
 
 #[gpu_only]
 #[inline(always)]
-pub fn thread_idx_z() -> u32 {
-    // The range is derived from the `block_idx_z` range.
-    in_range!(core::arch::nvptx::_thread_idx_z, 0..64)
+pub fn thread_idx_z() -> usize {
+    // The range is derived from the `block_dim_z` range.
+    in_range!(core::arch::nvptx::_thread_idx_z, 0..64) as usize
 }
 
 #[gpu_only]
 #[inline(always)]
-pub fn block_idx_x() -> u32 {
-    // The range is derived from the `grid_idx_x` range.
-    in_range!(core::arch::nvptx::_block_idx_x, 0..2147483647)
+pub fn block_idx_x() -> usize {
+    // The range is derived from the `grid_dim_x` range.
+    in_range!(core::arch::nvptx::_block_idx_x, 0..2147483647) as usize
 }
 
 #[gpu_only]
 #[inline(always)]
-pub fn block_idx_y() -> u32 {
-    // The range is derived from the `grid_idx_y` range.
-    in_range!(core::arch::nvptx::_block_idx_y, 0..65535)
+pub fn block_idx_y() -> usize {
+    // The range is derived from the `grid_dim_y` range.
+    in_range!(core::arch::nvptx::_block_idx_y, 0..65535) as usize
 }
 
 #[gpu_only]
 #[inline(always)]
-pub fn block_idx_z() -> u32 {
-    // The range is derived from the `grid_idx_z` range.
-    in_range!(core::arch::nvptx::_block_idx_z, 0..65535)
+pub fn block_idx_z() -> usize {
+    // The range is derived from the `grid_dim_z` range.
+    in_range!(core::arch::nvptx::_block_idx_z, 0..65535) as usize
 }
 
 #[gpu_only]
 #[inline(always)]
-pub fn block_dim_x() -> u32 {
+pub fn block_dim_x() -> usize {
     // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
-    in_range!(core::arch::nvptx::_block_dim_x, 1..=1024)
+    in_range!(core::arch::nvptx::_block_dim_x, 1..=1024) as usize
 }
 
 #[gpu_only]
 #[inline(always)]
-pub fn block_dim_y() -> u32 {
+pub fn block_dim_y() -> usize {
     // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
-    in_range!(core::arch::nvptx::_block_dim_y, 1..=1024)
+    in_range!(core::arch::nvptx::_block_dim_y, 1..=1024) as usize
 }
 
 #[gpu_only]
 #[inline(always)]
-pub fn block_dim_z() -> u32 {
+pub fn block_dim_z() -> usize {
     // CUDA Compute Capabilities: "Maximum z-dimension of a block" is 64.
-    in_range!(core::arch::nvptx::_block_dim_z, 1..=64)
+    in_range!(core::arch::nvptx::_block_dim_z, 1..=64) as usize
 }
 
 #[gpu_only]
 #[inline(always)]
-pub fn grid_dim_x() -> u32 {
+pub fn grid_dim_x() -> usize {
     // CUDA Compute Capabilities: "Maximum x-dimension of a grid of thread blocks" is 2^32 - 1.
-    in_range!(core::arch::nvptx::_grid_dim_x, 1..=2147483647)
+    in_range!(core::arch::nvptx::_grid_dim_x, 1..=2147483647) as usize
 }
 
 #[gpu_only]
 #[inline(always)]
-pub fn grid_dim_y() -> u32 {
+pub fn grid_dim_y() -> usize {
     // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
-    in_range!(core::arch::nvptx::_grid_dim_y, 1..=65535)
+    in_range!(core::arch::nvptx::_grid_dim_y, 1..=65535) as usize
 }
 
 #[gpu_only]
 #[inline(always)]
-pub fn grid_dim_z() -> u32 {
+pub fn grid_dim_z() -> usize {
     // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
-    in_range!(core::arch::nvptx::_grid_dim_z, 1..=65535)
+    in_range!(core::arch::nvptx::_grid_dim_z, 1..=65535) as usize
 }
 
 /// Gets the 3d index of the thread currently executing the kernel.
 #[gpu_only]
 #[inline(always)]
-pub fn thread_idx() -> UVec3 {
-    UVec3::new(thread_idx_x(), thread_idx_y(), thread_idx_z())
+pub fn thread_idx() -> USizeVec3 {
+    USizeVec3::new(thread_idx_x(), thread_idx_y(), thread_idx_z())
 }
 
 /// Gets the 3d index of the block that the thread currently executing the kernel is located in.
 #[gpu_only]
 #[inline(always)]
-pub fn block_idx() -> UVec3 {
-    UVec3::new(block_idx_x(), block_idx_y(), block_idx_z())
+pub fn block_idx() -> USizeVec3 {
+    USizeVec3::new(block_idx_x(), block_idx_y(), block_idx_z())
 }
 
 /// Gets the 3d layout of the thread blocks executing this kernel. In other words,
 /// how many threads exist in each thread block in every direction.
 #[gpu_only]
 #[inline(always)]
-pub fn block_dim() -> UVec3 {
-    UVec3::new(block_dim_x(), block_dim_y(), block_dim_z())
+pub fn block_dim() -> USizeVec3 {
+    USizeVec3::new(block_dim_x(), block_dim_y(), block_dim_z())
 }
 
 /// Gets the 3d layout of the block grids executing this kernel. In other words,
 /// how many thread blocks exist in each grid in every direction.
 #[gpu_only]
 #[inline(always)]
-pub fn grid_dim() -> UVec3 {
-    UVec3::new(grid_dim_x(), grid_dim_y(), grid_dim_z())
+pub fn grid_dim() -> USizeVec3 {
+    USizeVec3::new(grid_dim_x(), grid_dim_y(), grid_dim_z())
 }
 
 /// Gets the overall thread index, accounting for 1d/2d/3d block/grid dimensions. This
@@ -220,7 +220,7 @@ pub fn grid_dim() -> UVec3 {
 #[gpu_only]
 #[rustfmt::skip]
 #[inline(always)]
-pub fn index() -> u32 {
+pub fn index() -> usize {
     let grid_dim = grid_dim();
     let block_idx = block_idx();
     let block_dim = block_dim();
@@ -235,23 +235,23 @@ pub fn index() -> u32 {
 }
 
 #[inline(always)]
-pub fn index_1d() -> u32 {
-    thread_idx_x() as u32 + block_idx_x() as u32 * block_dim_x() as u32
+pub fn index_1d() -> usize {
+    thread_idx_x() + block_idx_x() * block_dim_x()
 }
 
 #[inline(always)]
-pub fn index_2d() -> UVec2 {
+pub fn index_2d() -> USizeVec2 {
     let i = thread_idx_x() + block_idx_x() * block_dim_x();
     let j = thread_idx_y() + block_idx_y() * block_dim_y();
-    UVec2::new(i, j)
+    USizeVec2::new(i, j)
 }
 
 #[inline(always)]
-pub fn index_3d() -> UVec3 {
+pub fn index_3d() -> USizeVec3 {
     let i = thread_idx_x() + block_idx_x() * block_dim_x();
     let j = thread_idx_y() + block_idx_y() * block_dim_y();
     let k = thread_idx_z() + block_idx_z() * block_dim_z();
-    UVec3::new(i, j, k)
+    USizeVec3::new(i, j, k)
 }
 
 /// Whether this is the first thread (not the first thread to be executing). This function is guaranteed
@@ -259,7 +259,7 @@ pub fn index_3d() -> UVec3 {
 /// once.
 #[inline(always)]
 pub fn first() -> bool {
-    block_idx() == UVec3::ZERO && thread_idx() == UVec3::ZERO
+    block_idx() == USizeVec3::ZERO && thread_idx() == USizeVec3::ZERO
 }
 
 /// Gets the number of threads inside of a warp. Currently 32 threads on every GPU architecture.
diff --git a/crates/cuda_std_macros/src/lib.rs b/crates/cuda_std_macros/src/lib.rs
index 467a6319..acf2bb9e 100644
--- a/crates/cuda_std_macros/src/lib.rs
+++ b/crates/cuda_std_macros/src/lib.rs
@@ -253,7 +253,7 @@ pub fn externally_visible(
 /// pub unsafe fn reverse_array(d: *mut u32, n: usize) {
 ///     ##[address_space(shared)]
 ///     static mut S: [MaybeUninit<u32>; 64] = [const { MaybeUninit::uninit() }; 64];
-///     let i = thread::thread_idx_x() as usize;
+///     let i = thread::thread_idx_x();
 ///     let ir = n - i - 1;
 ///     unsafe { S[i].write(*d.add(i)); };
 ///     thread::sync_threads();
diff --git a/crates/cust/src/function.rs b/crates/cust/src/function.rs
index 67a5fb84..e73ad38c 100644
--- a/crates/cust/src/function.rs
+++ b/crates/cust/src/function.rs
@@ -18,43 +18,43 @@ use crate::module::Module;
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct GridSize {
     /// Width of grid in blocks
-    pub x: u32,
+    pub x: usize,
     /// Height of grid in blocks
-    pub y: u32,
+    pub y: usize,
     /// Depth of grid in blocks
-    pub z: u32,
+    pub z: usize,
 }
 impl GridSize {
     /// Create a one-dimensional grid of `x` blocks
     #[inline]
-    pub fn x(x: u32) -> GridSize {
+    pub fn x(x: usize) -> GridSize {
         GridSize { x, y: 1, z: 1 }
     }
 
     /// Create a two-dimensional grid of `x * y` blocks
     #[inline]
-    pub fn xy(x: u32, y: u32) -> GridSize {
+    pub fn xy(x: usize, y: usize) -> GridSize {
         GridSize { x, y, z: 1 }
     }
 
     /// Create a three-dimensional grid of `x * y * z` blocks
     #[inline]
-    pub fn xyz(x: u32, y: u32, z: u32) -> GridSize {
+    pub fn xyz(x: usize, y: usize, z: usize) -> GridSize {
         GridSize { x, y, z }
     }
 }
-impl From<u32> for GridSize {
-    fn from(x: u32) -> GridSize {
+impl From<usize> for GridSize {
+    fn from(x: usize) -> GridSize {
         GridSize::x(x)
     }
 }
-impl From<(u32, u32)> for GridSize {
-    fn from((x, y): (u32, u32)) -> GridSize {
+impl From<(usize, usize)> for GridSize {
+    fn from((x, y): (usize, usize)) -> GridSize {
         GridSize::xy(x, y)
     }
 }
-impl From<(u32, u32, u32)> for GridSize {
-    fn from((x, y, z): (u32, u32, u32)) -> GridSize {
+impl From<(usize, usize, usize)> for GridSize {
+    fn from((x, y, z): (usize, usize, usize)) -> GridSize {
         GridSize::xyz(x, y, z)
     }
 }
@@ -64,52 +64,28 @@ impl From<&GridSize> for GridSize {
     }
 }
 #[cfg(feature = "vek")]
-impl From<vek::Vec2<u32>> for GridSize {
-    fn from(vec: vek::Vec2<u32>) -> Self {
-        GridSize::xy(vec.x, vec.y)
-    }
-}
-#[cfg(feature = "vek")]
-impl From<vek::Vec3<u32>> for GridSize {
-    fn from(vec: vek::Vec3<u32>) -> Self {
-        GridSize::xyz(vec.x, vec.y, vec.z)
-    }
-}
-#[cfg(feature = "vek")]
 impl From<vek::Vec2<usize>> for GridSize {
     fn from(vec: vek::Vec2<usize>) -> Self {
-        GridSize::xy(vec.x as u32, vec.y as u32)
+        GridSize::xy(vec.x, vec.y)
     }
 }
 #[cfg(feature = "vek")]
 impl From<vek::Vec3<usize>> for GridSize {
     fn from(vec: vek::Vec3<usize>) -> Self {
-        GridSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
-    }
-}
-
-#[cfg(feature = "glam")]
-impl From<glam::UVec2> for GridSize {
-    fn from(vec: glam::UVec2) -> Self {
-        GridSize::xy(vec.x, vec.y)
-    }
-}
-#[cfg(feature = "glam")]
-impl From<glam::UVec3> for GridSize {
-    fn from(vec: glam::UVec3) -> Self {
         GridSize::xyz(vec.x, vec.y, vec.z)
     }
 }
+
 #[cfg(feature = "glam")]
 impl From<glam::USizeVec2> for GridSize {
     fn from(vec: glam::USizeVec2) -> Self {
-        GridSize::xy(vec.x as u32, vec.y as u32)
+        GridSize::xy(vec.x, vec.y)
     }
 }
 #[cfg(feature = "glam")]
 impl From<glam::USizeVec3> for GridSize {
     fn from(vec: glam::USizeVec3) -> Self {
-        GridSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
+        GridSize::xyz(vec.x, vec.y, vec.z)
     }
 }
 
@@ -123,43 +99,43 @@ impl From<glam::USizeVec3> for GridSize {
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct BlockSize {
     /// X dimension of each thread block
-    pub x: u32,
+    pub x: usize,
     /// Y dimension of each thread block
-    pub y: u32,
+    pub y: usize,
     /// Z dimension of each thread block
-    pub z: u32,
+    pub z: usize,
 }
 impl BlockSize {
     /// Create a one-dimensional block of `x` threads
     #[inline]
-    pub fn x(x: u32) -> BlockSize {
+    pub fn x(x: usize) -> BlockSize {
         BlockSize { x, y: 1, z: 1 }
     }
 
     /// Create a two-dimensional block of `x * y` threads
     #[inline]
-    pub fn xy(x: u32, y: u32) -> BlockSize {
+    pub fn xy(x: usize, y: usize) -> BlockSize {
         BlockSize { x, y, z: 1 }
     }
 
     /// Create a three-dimensional block of `x * y * z` threads
     #[inline]
-    pub fn xyz(x: u32, y: u32, z: u32) -> BlockSize {
+    pub fn xyz(x: usize, y: usize, z: usize) -> BlockSize {
         BlockSize { x, y, z }
     }
 }
-impl From<u32> for BlockSize {
-    fn from(x: u32) -> BlockSize {
+impl From<usize> for BlockSize {
+    fn from(x: usize) -> BlockSize {
         BlockSize::x(x)
     }
 }
-impl From<(u32, u32)> for BlockSize {
-    fn from((x, y): (u32, u32)) -> BlockSize {
+impl From<(usize, usize)> for BlockSize {
+    fn from((x, y): (usize, usize)) -> BlockSize {
         BlockSize::xy(x, y)
     }
 }
-impl From<(u32, u32, u32)> for BlockSize {
-    fn from((x, y, z): (u32, u32, u32)) -> BlockSize {
+impl From<(usize, usize, usize)> for BlockSize {
+    fn from((x, y, z): (usize, usize, usize)) -> BlockSize {
         BlockSize::xyz(x, y, z)
     }
 }
@@ -169,52 +145,28 @@ impl From<&BlockSize> for BlockSize {
     }
 }
 #[cfg(feature = "vek")]
-impl From<vek::Vec2<u32>> for BlockSize {
-    fn from(vec: vek::Vec2<u32>) -> Self {
-        BlockSize::xy(vec.x, vec.y)
-    }
-}
-#[cfg(feature = "vek")]
-impl From<vek::Vec3<u32>> for BlockSize {
-    fn from(vec: vek::Vec3<u32>) -> Self {
-        BlockSize::xyz(vec.x, vec.y, vec.z)
-    }
-}
-#[cfg(feature = "vek")]
 impl From<vek::Vec2<usize>> for BlockSize {
     fn from(vec: vek::Vec2<usize>) -> Self {
-        BlockSize::xy(vec.x as u32, vec.y as u32)
+        BlockSize::xy(vec.x, vec.y)
     }
 }
 #[cfg(feature = "vek")]
 impl From<vek::Vec3<usize>> for BlockSize {
     fn from(vec: vek::Vec3<usize>) -> Self {
-        BlockSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
-    }
-}
-
-#[cfg(feature = "glam")]
-impl From<glam::UVec2> for BlockSize {
-    fn from(vec: glam::UVec2) -> Self {
-        BlockSize::xy(vec.x, vec.y)
-    }
-}
-#[cfg(feature = "glam")]
-impl From<glam::UVec3> for BlockSize {
-    fn from(vec: glam::UVec3) -> Self {
         BlockSize::xyz(vec.x, vec.y, vec.z)
     }
 }
+
 #[cfg(feature = "glam")]
 impl From<glam::USizeVec2> for BlockSize {
     fn from(vec: glam::USizeVec2) -> Self {
-        BlockSize::xy(vec.x as u32, vec.y as u32)
+        BlockSize::xy(vec.x, vec.y)
     }
 }
 #[cfg(feature = "glam")]
 impl From<glam::USizeVec3> for BlockSize {
     fn from(vec: glam::USizeVec3) -> Self {
-        BlockSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
+        BlockSize::xyz(vec.x, vec.y, vec.z)
     }
 }
 
@@ -448,7 +400,7 @@ impl Function<'_> {
         &self,
         dynamic_smem_size: usize,
         block_size_limit: BlockSize,
-    ) -> CudaResult<(u32, u32)> {
+    ) -> CudaResult<(usize, usize)> {
         let mut min_grid_size = MaybeUninit::uninit();
         let mut block_size = MaybeUninit::uninit();
 
@@ -465,8 +417,8 @@ impl Function<'_> {
             )
             .to_result()?;
             Ok((
-                min_grid_size.assume_init() as u32,
-                block_size.assume_init() as u32,
+                min_grid_size.assume_init() as usize,
+                block_size.assume_init() as usize,
             ))
         }
     }
diff --git a/crates/cust/src/graph.rs b/crates/cust/src/graph.rs
index 9f04329e..f97a49d4 100644
--- a/crates/cust/src/graph.rs
+++ b/crates/cust/src/graph.rs
@@ -83,14 +83,18 @@ impl KernelInvocation {
     }
 
     pub fn to_raw(self) -> driver_sys::CUDA_KERNEL_NODE_PARAMS {
+        let to_u32 = |i: usize| {
+            i.try_into()
+                .expect("invocation dimension must fit in a `u32`")
+        };
         driver_sys::CUDA_KERNEL_NODE_PARAMS {
             func: self.func,
-            gridDimX: self.grid_dim.x,
-            gridDimY: self.grid_dim.y,
-            gridDimZ: self.grid_dim.z,
-            blockDimX: self.block_dim.x,
-            blockDimY: self.block_dim.y,
-            blockDimZ: self.block_dim.z,
+            gridDimX: to_u32(self.grid_dim.x),
+            gridDimY: to_u32(self.grid_dim.y),
+            gridDimZ: to_u32(self.grid_dim.z),
+            blockDimX: to_u32(self.block_dim.x),
+            blockDimY: to_u32(self.block_dim.y),
+            blockDimZ: to_u32(self.block_dim.z),
             kernelParams: Box::into_raw(self.params),
             sharedMemBytes: self.shared_mem_bytes,
             extra: ptr::null_mut(),
@@ -109,8 +113,16 @@ impl KernelInvocation {
     pub unsafe fn from_raw(raw: driver_sys::CUDA_KERNEL_NODE_PARAMS) -> Self {
         Self {
             func: raw.func,
-            grid_dim: GridSize::xyz(raw.gridDimX, raw.gridDimY, raw.gridDimZ),
-            block_dim: BlockSize::xyz(raw.blockDimX, raw.gridDimY, raw.gridDimZ),
+            grid_dim: GridSize::xyz(
+                raw.gridDimX as usize,
+                raw.gridDimY as usize,
+                raw.gridDimZ as usize,
+            ),
+            block_dim: BlockSize::xyz(
+                raw.blockDimX as usize,
+                raw.gridDimY as usize,
+                raw.gridDimZ as usize,
+            ),
             params: Box::from_raw(raw.kernelParams),
             shared_mem_bytes: raw.sharedMemBytes,
             params_len: None,
diff --git a/crates/cust/src/stream.rs b/crates/cust/src/stream.rs
index 2cbbab84..34c24865 100644
--- a/crates/cust/src/stream.rs
+++ b/crates/cust/src/stream.rs
@@ -271,15 +271,16 @@ impl Stream {
         let grid_size: GridSize = grid_size.into();
         let block_size: BlockSize = block_size.into();
 
+        let to_u32 = |i: usize| i.try_into().expect("launch size must fit in a `u32`");
         unsafe {
             driver_sys::cuLaunchKernel(
                 func.to_raw(),
-                grid_size.x,
-                grid_size.y,
-                grid_size.z,
-                block_size.x,
-                block_size.y,
-                block_size.z,
+                to_u32(grid_size.x),
+                to_u32(grid_size.y),
+                to_u32(grid_size.z),
+                to_u32(block_size.x),
+                to_u32(block_size.y),
+                to_u32(block_size.z),
                 shared_mem_bytes,
                 self.inner,
                 args.as_ptr() as *mut _,
diff --git a/crates/optix/examples/path_tracer/kernels/src/render.rs b/crates/optix/examples/path_tracer/kernels/src/render.rs
index 86cd8cc6..eb589710 100644
--- a/crates/optix/examples/path_tracer/kernels/src/render.rs
+++ b/crates/optix/examples/path_tracer/kernels/src/render.rs
@@ -1,5 +1,5 @@
 use crate::*;
-use cuda_std::glam::UVec2;
+use cuda_std::glam::USizeVec2;
 
 const BACKGROUND_BLUE_MULTIPLIER: f32 = 0.7;
 
@@ -9,7 +9,7 @@ pub fn color(ray: Ray) -> Vec3 {
     (1.0 - t) * Vec3::ONE + t * Vec3::new(0.5, 0.7, 1.0)
 }
 
-pub fn generate_ray(idx: UVec2, view: &Viewport, offset: Vec2) -> Ray {
+pub fn generate_ray(idx: USizeVec2, view: &Viewport, offset: Vec2) -> Ray {
     let uv = (idx.as_vec2() + offset) / view.bounds.as_vec2();
     Ray {
         origin: view.origin,
diff --git a/crates/optix/examples/path_tracer/kernels/src/render_kernels.rs b/crates/optix/examples/path_tracer/kernels/src/render_kernels.rs
index e3141bf4..af63bee9 100644
--- a/crates/optix/examples/path_tracer/kernels/src/render_kernels.rs
+++ b/crates/optix/examples/path_tracer/kernels/src/render_kernels.rs
@@ -6,10 +6,10 @@ use gpu_rand::{DefaultRand, GpuRand};
 #[kernel]
 pub unsafe fn render(fb: *mut Vec3, view: Viewport, scene: &Scene, rand_states: *mut DefaultRand) {
     let idx = thread::index_2d();
-    if idx.x >= view.bounds.x as u32 || idx.y >= view.bounds.y as u32 {
+    if idx.x >= view.bounds.x || idx.y >= view.bounds.y {
         return;
     }
-    let px_idx = idx.y as usize * view.bounds.x + idx.x as usize;
+    let px_idx = idx.y * view.bounds.x + idx.x;
 
     // generate a tiny offset for the ray for antialiasing
     let rng = unsafe { &mut *rand_states.add(px_idx) };
@@ -27,10 +27,10 @@ pub unsafe fn render(fb: *mut Vec3, view: Viewport, scene: &Scene, rand_states:
 #[kernel]
 pub unsafe fn scale_buffer(fb: *const Vec3, out: *mut Vec3, samples: u32, view: Viewport) {
     let idx_2d = thread::index_2d();
-    if idx_2d.x >= view.bounds.x as u32 || idx_2d.y >= view.bounds.y as u32 {
+    if idx_2d.x >= view.bounds.x || idx_2d.y >= view.bounds.y {
         return;
     }
-    let idx = idx_2d.y as usize * view.bounds.x + idx_2d.x as usize;
+    let idx = idx_2d.y * view.bounds.x + idx_2d.x;
     let original = unsafe { &*fb.add(idx) };
     let out = unsafe { &mut *out.add(idx) };
 
@@ -43,10 +43,10 @@ pub unsafe fn scale_buffer(fb: *const Vec3, out: *mut Vec3, samples: u32, view:
 #[kernel]
 pub unsafe fn postprocess(fb: *const Vec3, out: *mut U8Vec3, view: Viewport) {
     let idx_2d = thread::index_2d();
-    if idx_2d.x >= view.bounds.x as u32 || idx_2d.y >= view.bounds.y as u32 {
+    if idx_2d.x >= view.bounds.x || idx_2d.y >= view.bounds.y {
         return;
     }
-    let idx = idx_2d.y as usize * view.bounds.x + idx_2d.x as usize;
+    let idx = idx_2d.y * view.bounds.x + idx_2d.x;
     let original = unsafe { &*fb.add(idx) };
     let out = unsafe { &mut *out.add(idx) };
     // gamma=2.0
diff --git a/crates/optix/examples/path_tracer/src/cpu/mod.rs b/crates/optix/examples/path_tracer/src/cpu/mod.rs
index e5f3f1ee..d06c5485 100644
--- a/crates/optix/examples/path_tracer/src/cpu/mod.rs
+++ b/crates/optix/examples/path_tracer/src/cpu/mod.rs
@@ -1,6 +1,6 @@
 use std::time::Duration;
 
-use glam::{U8Vec3, USizeVec2, UVec2, Vec2, Vec3};
+use glam::{U8Vec3, USizeVec2, Vec2, Vec3};
 use gpu_rand::{DefaultRand, GpuRand};
 use imgui::Ui;
 use path_tracer_kernels::{
@@ -131,7 +131,7 @@ impl CpuRenderer {
             .for_each(|(idx, (px, rng))| {
                 let x = idx % viewport.bounds.x;
                 let y = idx / viewport.bounds.x;
-                let idx = UVec2::new(x as u32, y as u32);
+                let idx = USizeVec2::new(x, y);
 
                 let offset = Vec2::from(rng.normal_f32_2());
 
diff --git a/crates/optix/examples/path_tracer/src/main.rs b/crates/optix/examples/path_tracer/src/main.rs
index f65d75fa..da789a38 100644
--- a/crates/optix/examples/path_tracer/src/main.rs
+++ b/crates/optix/examples/path_tracer/src/main.rs
@@ -16,8 +16,8 @@ use path_tracer_kernels::{
 };
 use std::error::Error;
 
-pub const WIDTH: u32 = 1920;
-pub const HEIGHT: u32 = 1080;
+pub const WIDTH: usize = 1920;
+pub const HEIGHT: usize = 1080;
 
 fn main() -> Result<(), Box<dyn Error>> {
     let camera = Camera {
diff --git a/crates/optix/examples/path_tracer/src/viewer.rs b/crates/optix/examples/path_tracer/src/viewer.rs
index 3086d078..c15884ff 100644
--- a/crates/optix/examples/path_tracer/src/viewer.rs
+++ b/crates/optix/examples/path_tracer/src/viewer.rs
@@ -60,11 +60,7 @@ pub fn run(camera: &Camera, scene: &Scene) -> ! {
         .with_inner_size(PhysicalSize::new(WIDTH as f64, HEIGHT as f64));
     let cb = ContextBuilder::new().with_vsync(true);
     let display = Display::new(wb, cb, &event_loop).unwrap();
-    let renderer = Renderer::new(
-        USizeVec2::new(WIDTH as usize, HEIGHT as usize),
-        camera,
-        scene,
-    );
+    let renderer = Renderer::new(USizeVec2::new(WIDTH, HEIGHT), camera, scene);
     let mut viewer = ViewerRenderer::new(display, renderer);
 
     let mut last_frame = Instant::now();
@@ -93,8 +89,7 @@ impl ViewerRenderer {
 
         let size = display.gl_window().window().inner_size();
         let image_size = USizeVec2::new(size.width as usize, size.height as usize);
-        let texture =
-            SrgbTexture2d::empty(&display, image_size.x as u32, image_size.y as u32).unwrap();
+        let texture = SrgbTexture2d::empty(&display, size.width, size.height).unwrap();
 
         let mut imgui_ctx = imgui::Context::create();
         imgui_ctx.set_ini_filename(None);
diff --git a/examples/gemm/kernels/src/gemm_naive.rs b/examples/gemm/kernels/src/gemm_naive.rs
index 02ba504f..7897867b 100644
--- a/examples/gemm/kernels/src/gemm_naive.rs
+++ b/examples/gemm/kernels/src/gemm_naive.rs
@@ -32,15 +32,15 @@ pub unsafe fn gemm_naive(
     alpha: f32,
     beta: f32,
 ) {
-    let row = (thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x()) as usize;
-    let col = (thread::block_dim_y() * thread::block_idx_y() + thread::thread_idx_y()) as usize;
+    let row = thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x();
+    let col = thread::block_dim_y() * thread::block_idx_y() + thread::thread_idx_y();
 
     if row < m && col < n {
         let mut sum = 0.0f32;
         for i in 0..k {
             sum += mat_a[row * k + i] * mat_b[i * n + col];
         }
-        let elem = unsafe { &mut *mat_c.add((row * n + col) as usize) };
+        let elem = unsafe { &mut *mat_c.add(row * n + col) };
         *elem = alpha * sum + beta * *elem;
     }
 }
diff --git a/examples/gemm/kernels/src/gemm_tiled.rs b/examples/gemm/kernels/src/gemm_tiled.rs
index 5cc172f8..378d7779 100644
--- a/examples/gemm/kernels/src/gemm_tiled.rs
+++ b/examples/gemm/kernels/src/gemm_tiled.rs
@@ -53,12 +53,12 @@ pub unsafe fn gemm_tiled(
     static mut TILE_B: [MaybeUninit<f32>; TILE_SIZE_2D] = [MaybeUninit::uninit(); TILE_SIZE_2D];
 
     // Thread indices within the block.
-    let tx = thread::thread_idx_x() as usize;
-    let ty = thread::thread_idx_y() as usize;
+    let tx = thread::thread_idx_x();
+    let ty = thread::thread_idx_y();
 
     // Calculate row and column in the mat_c.
-    let row = thread::block_idx_x() as usize * TILE_SIZE + ty;
-    let col = thread::block_idx_y() as usize * TILE_SIZE + tx;
+    let row = thread::block_idx_x() * TILE_SIZE + ty;
+    let col = thread::block_idx_y() * TILE_SIZE + tx;
 
     let mut sum = 0.0f32;
     // Loop over tiles of mat_a and mat_b in the k dimension.
diff --git a/examples/gemm/src/main.rs b/examples/gemm/src/main.rs
index 531e0c7c..a57e91bd 100644
--- a/examples/gemm/src/main.rs
+++ b/examples/gemm/src/main.rs
@@ -365,16 +365,12 @@ pub fn gemm_naive(
     // This will try to maximize how much of the GPU is used by finding the best launch configuration for the
     // current CUDA device/architecture.
     let (_, block_size) = kernel.suggested_launch_configuration(0, 0.into())?;
-    let block_size = block_size as usize;
     let (block_size_x, block_size_y) = if block_size > m * n {
-        (block_size.div_ceil(m) as u32, m as u32)
+        (block_size.div_ceil(m), m)
     } else {
-        (1, block_size as u32)
+        (1, block_size)
     };
-    let (grid_size_x, grid_size_y) = (
-        (m as u32).div_ceil(block_size_x),
-        (n as u32).div_ceil(block_size_y),
-    );
+    let (grid_size_x, grid_size_y) = (m.div_ceil(block_size_x), n.div_ceil(block_size_y));
     unsafe {
         launch!(
             kernel<<<
@@ -438,12 +434,12 @@ pub fn gemm_tiled(
     });
     let kernel = &*kernel_cell;
 
-    let (grid_size_x, grid_size_y) = (n.div_ceil(TILE_SIZE) as u32, m.div_ceil(TILE_SIZE) as u32);
+    let (grid_size_x, grid_size_y) = (n.div_ceil(TILE_SIZE), m.div_ceil(TILE_SIZE));
     unsafe {
         launch!(
             kernel<<<
                 (grid_size_x, grid_size_y),
-                (TILE_SIZE as u32, TILE_SIZE as u32),
+                (TILE_SIZE, TILE_SIZE),
                 0,
                 stream
             >>>(
diff --git a/examples/i128_demo/kernels/src/lib.rs b/examples/i128_demo/kernels/src/lib.rs
index 8598e15e..5d4a9411 100644
--- a/examples/i128_demo/kernels/src/lib.rs
+++ b/examples/i128_demo/kernels/src/lib.rs
@@ -18,7 +18,7 @@ pub unsafe fn i128_ops(
     urem_out: *mut u128,
     srem_out: *mut u128,
 ) {
-    let idx = thread::index_1d() as usize;
+    let idx = thread::index_1d();
     if idx >= a.len() || idx >= b.len() {
         return;
     }
diff --git a/examples/i128_demo/src/main.rs b/examples/i128_demo/src/main.rs
index 3ed9f92d..a2e6e7b3 100644
--- a/examples/i128_demo/src/main.rs
+++ b/examples/i128_demo/src/main.rs
@@ -119,8 +119,8 @@ fn main() -> Result<(), Box<dyn Error>> {
     let urem_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?;
     let srem_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?;
 
-    let block_size = 128u32;
-    let grid_size = (len as u32).div_ceil(block_size);
+    let block_size = 128usize;
+    let grid_size = len.div_ceil(block_size);
 
     unsafe {
         launch!(
@@ -241,7 +241,7 @@ fn main() -> Result<(), Box<dyn Error>> {
 
     let trap_launch = unsafe {
         launch!(
-            kernel<<<1u32, 1u32, 0, trap_stream>>>(
+            kernel<<<1, 1, 0, trap_stream>>>(
                 trap_a.as_device_ptr(),
                 trap_a.len(),
                 trap_b.as_device_ptr(),
diff --git a/examples/sha2_crates_io/kernels/src/lib.rs b/examples/sha2_crates_io/kernels/src/lib.rs
index 7d5da4da..49b17f8b 100644
--- a/examples/sha2_crates_io/kernels/src/lib.rs
+++ b/examples/sha2_crates_io/kernels/src/lib.rs
@@ -5,7 +5,7 @@ use sha2::{Digest, Sha256, Sha512};
 #[kernel]
 #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
 pub unsafe fn sha256_oneshot(input: &[u8], output: *mut [u8; 32]) {
-    let idx = thread::index_1d() as usize;
+    let idx = thread::index_1d();
 
     if idx == 0 {
         let hash = Sha256::digest(input);
@@ -21,7 +21,7 @@ pub unsafe fn sha256_oneshot(input: &[u8], output: *mut [u8; 32]) {
 #[kernel]
 #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
 pub unsafe fn sha256_incremental(input1: &[u8], input2: &[u8], output: *mut [u8; 32]) {
-    let idx = thread::index_1d() as usize;
+    let idx = thread::index_1d();
 
     if idx == 0 {
         let mut hasher = Sha256::new();
@@ -40,7 +40,7 @@ pub unsafe fn sha256_incremental(input1: &[u8], input2: &[u8], output: *mut [u8;
 #[kernel]
 #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
 pub unsafe fn sha512_oneshot(input: &[u8], output: *mut [u8; 64]) {
-    let idx = thread::index_1d() as usize;
+    let idx = thread::index_1d();
 
     if idx == 0 {
         let hash = Sha512::digest(input);
@@ -56,7 +56,7 @@ pub unsafe fn sha512_oneshot(input: &[u8], output: *mut [u8; 64]) {
 #[kernel]
 #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
 pub unsafe fn sha512_incremental(input: &[u8], output: *mut [u8; 64]) {
-    let idx = thread::index_1d() as usize;
+    let idx = thread::index_1d();
 
     if idx == 0 {
         let mut hasher = Sha512::new();
diff --git a/examples/vecadd/kernels/src/lib.rs b/examples/vecadd/kernels/src/lib.rs
index ba1c3038..2020b256 100644
--- a/examples/vecadd/kernels/src/lib.rs
+++ b/examples/vecadd/kernels/src/lib.rs
@@ -3,7 +3,7 @@ use cuda_std::prelude::*;
 #[kernel]
 #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
 pub unsafe fn vecadd(a: &[f32], b: &[f32], c: *mut f32) {
-    let idx = thread::index_1d() as usize;
+    let idx = thread::index_1d();
     if idx < a.len() {
         let elem = unsafe { &mut *c.add(idx) };
         *elem = a[idx] + b[idx];
diff --git a/examples/vecadd/src/main.rs b/examples/vecadd/src/main.rs
index cf804d0c..73260c59 100644
--- a/examples/vecadd/src/main.rs
+++ b/examples/vecadd/src/main.rs
@@ -45,7 +45,7 @@ fn main() -> Result<(), Box<dyn Error>> {
     // current CUDA device/architecture.
     let (_, block_size) = vecadd.suggested_launch_configuration(0, 0.into())?;
 
-    let grid_size = (NUMBERS_LEN as u32).div_ceil(block_size);
+    let grid_size = NUMBERS_LEN.div_ceil(block_size);
 
     println!("using {grid_size} blocks and {block_size} threads per block");
 
diff --git a/guide/src/guide/getting_started.md b/guide/src/guide/getting_started.md
index 762e17cb..1206bb3a 100644
--- a/guide/src/guide/getting_started.md
+++ b/guide/src/guide/getting_started.md
@@ -119,7 +119,7 @@ pub type T = f32;
 #[kernel]
 #[allow(improper_ctypes_definitions)]
 pub unsafe fn add(a: &[T], b: &[T], c: *mut T) {
-    let i = thread::index_1d() as usize;
+    let i = thread::index_1d();
     if i < a.len() {
         let elem = unsafe { &mut *c.add(i) };
         *elem = a[i] + b[i];
diff --git a/samples/introduction/async_api/kernels/src/lib.rs b/samples/introduction/async_api/kernels/src/lib.rs
index ffcc07d3..1d98c9a3 100644
--- a/samples/introduction/async_api/kernels/src/lib.rs
+++ b/samples/introduction/async_api/kernels/src/lib.rs
@@ -8,9 +8,9 @@ use cuda_std::prelude::*;
 pub unsafe fn increment(g_data: *mut u32, inc_value: u32) {
     // This can also be obtained directly as
     //
-    // let idx: usize = cuda_std::thread::index() as usize;
-    let idx: usize = (cuda_std::thread::block_dim().x * cuda_std::thread::block_idx().x
-        + cuda_std::thread::thread_idx().x) as usize;
+    // let idx: usize = cuda_std::thread::index();
+    let idx: usize = cuda_std::thread::block_dim().x * cuda_std::thread::block_idx().x
+        + cuda_std::thread::thread_idx().x;
 
     let elem: &mut u32 = unsafe { &mut *g_data.add(idx) };
     *elem += inc_value;
diff --git a/samples/introduction/async_api/src/main.rs b/samples/introduction/async_api/src/main.rs
index 063efbd3..9ddc31c9 100644
--- a/samples/introduction/async_api/src/main.rs
+++ b/samples/introduction/async_api/src/main.rs
@@ -37,7 +37,7 @@ fn main() -> Result<(), cust::error::CudaError> {
     let value = 26;
 
     let blocks = BlockSize::xy(512, 1);
-    let grids = GridSize::xy((N / (blocks.x as usize)).try_into().unwrap(), 1);
+    let grids = GridSize::xy(N / blocks.x, 1);
 
     let start_event = Event::new(EventFlags::DEFAULT)?;
     let stop_event = Event::new(EventFlags::DEFAULT)?;
diff --git a/tests/compiletests/ui/shared/shared_memory.rs b/tests/compiletests/ui/shared/shared_memory.rs
index a9ed3b7b..fd5e53da 100644
--- a/tests/compiletests/ui/shared/shared_memory.rs
+++ b/tests/compiletests/ui/shared/shared_memory.rs
@@ -13,7 +13,7 @@ pub unsafe fn test_static_shared_memory() {
     #[address_space(shared)]
     static mut SHARED_DATA: [MaybeUninit<i32>; 256] = [MaybeUninit::uninit(); 256];
 
-    let tid = thread::thread_idx_x() as usize;
+    let tid = thread::thread_idx_x();
 
     // Write to shared memory
     unsafe { SHARED_DATA[tid] }.write(tid as i32);