Skip to content

Commit 55cd2bc

Browse files
committed
Use usize in index and dimension functions.
Specifically: - `thread_idx*` - `block_idx*` - `block_dim*` - `grid_dim*` - `index*` This removes lots of `as u32`/`as usize` casts.
1 parent 92217f0 commit 55cd2bc

File tree

15 files changed

+74
-79
lines changed

15 files changed

+74
-79
lines changed

crates/cuda_std/src/thread.rs

Lines changed: 41 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@
6666
//! vary by device. Query device properties when you need exact limits.
6767
//!
6868
use cuda_std_macros::gpu_only;
69-
use glam::{UVec2, UVec3};
69+
use glam::{USizeVec2, USizeVec3};
7070

7171
// Different calling conventions don't exist in nvptx, so we just use C as a placeholder.
7272
unsafe extern "C" {
@@ -99,116 +99,116 @@ macro_rules! in_range {
9999

100100
#[gpu_only]
101101
#[inline(always)]
102-
pub fn thread_idx_x() -> u32 {
102+
pub fn thread_idx_x() -> usize {
103103
// The range is derived from the `block_dim_x` range.
104-
in_range!(core::arch::nvptx::_thread_idx_x, 0..1024)
104+
in_range!(core::arch::nvptx::_thread_idx_x, 0..1024) as usize
105105
}
106106

107107
#[gpu_only]
108108
#[inline(always)]
109-
pub fn thread_idx_y() -> u32 {
109+
pub fn thread_idx_y() -> usize {
110110
// The range is derived from the `block_dim_y` range.
111-
in_range!(core::arch::nvptx::_thread_idx_y, 0..1024)
111+
in_range!(core::arch::nvptx::_thread_idx_y, 0..1024) as usize
112112
}
113113

114114
#[gpu_only]
115115
#[inline(always)]
116-
pub fn thread_idx_z() -> u32 {
116+
pub fn thread_idx_z() -> usize {
117117
// The range is derived from the `block_dim_z` range.
118-
in_range!(core::arch::nvptx::_thread_idx_z, 0..64)
118+
in_range!(core::arch::nvptx::_thread_idx_z, 0..64) as usize
119119
}
120120

121121
#[gpu_only]
122122
#[inline(always)]
123-
pub fn block_idx_x() -> u32 {
123+
pub fn block_idx_x() -> usize {
124124
// The range is derived from the `grid_dim_x` range.
125-
in_range!(core::arch::nvptx::_block_idx_x, 0..2147483647)
125+
in_range!(core::arch::nvptx::_block_idx_x, 0..2147483647) as usize
126126
}
127127

128128
#[gpu_only]
129129
#[inline(always)]
130-
pub fn block_idx_y() -> u32 {
130+
pub fn block_idx_y() -> usize {
131131
// The range is derived from the `grid_dim_y` range.
132-
in_range!(core::arch::nvptx::_block_idx_y, 0..65535)
132+
in_range!(core::arch::nvptx::_block_idx_y, 0..65535) as usize
133133
}
134134

135135
#[gpu_only]
136136
#[inline(always)]
137-
pub fn block_idx_z() -> u32 {
137+
pub fn block_idx_z() -> usize {
138138
// The range is derived from the `grid_dim_z` range.
139-
in_range!(core::arch::nvptx::_block_idx_z, 0..65535)
139+
in_range!(core::arch::nvptx::_block_idx_z, 0..65535) as usize
140140
}
141141

142142
#[gpu_only]
143143
#[inline(always)]
144-
pub fn block_dim_x() -> u32 {
144+
pub fn block_dim_x() -> usize {
145145
// CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
146-
in_range!(core::arch::nvptx::_block_dim_x, 1..=1024)
146+
in_range!(core::arch::nvptx::_block_dim_x, 1..=1024) as usize
147147
}
148148

149149
#[gpu_only]
150150
#[inline(always)]
151-
pub fn block_dim_y() -> u32 {
151+
pub fn block_dim_y() -> usize {
152152
// CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
153-
in_range!(core::arch::nvptx::_block_dim_y, 1..=1024)
153+
in_range!(core::arch::nvptx::_block_dim_y, 1..=1024) as usize
154154
}
155155

156156
#[gpu_only]
157157
#[inline(always)]
158-
pub fn block_dim_z() -> u32 {
158+
pub fn block_dim_z() -> usize {
159159
// CUDA Compute Capabilities: "Maximum z-dimension of a block" is 64.
160-
in_range!(core::arch::nvptx::_block_dim_z, 1..=64)
160+
in_range!(core::arch::nvptx::_block_dim_z, 1..=64) as usize
161161
}
162162

163163
#[gpu_only]
164164
#[inline(always)]
165-
pub fn grid_dim_x() -> u32 {
165+
pub fn grid_dim_x() -> usize {
166166
// CUDA Compute Capabilities: "Maximum x-dimension of a grid of thread blocks" is 2^31 - 1.
167-
in_range!(core::arch::nvptx::_grid_dim_x, 1..=2147483647)
167+
in_range!(core::arch::nvptx::_grid_dim_x, 1..=2147483647) as usize
168168
}
169169

170170
#[gpu_only]
171171
#[inline(always)]
172-
pub fn grid_dim_y() -> u32 {
172+
pub fn grid_dim_y() -> usize {
173173
// CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
174-
in_range!(core::arch::nvptx::_grid_dim_y, 1..=65535)
174+
in_range!(core::arch::nvptx::_grid_dim_y, 1..=65535) as usize
175175
}
176176

177177
#[gpu_only]
178178
#[inline(always)]
179-
pub fn grid_dim_z() -> u32 {
179+
pub fn grid_dim_z() -> usize {
180180
// CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
181-
in_range!(core::arch::nvptx::_grid_dim_z, 1..=65535)
181+
in_range!(core::arch::nvptx::_grid_dim_z, 1..=65535) as usize
182182
}
183183

184184
/// Gets the 3d index of the thread currently executing the kernel.
185185
#[gpu_only]
186186
#[inline(always)]
187-
pub fn thread_idx() -> UVec3 {
188-
UVec3::new(thread_idx_x(), thread_idx_y(), thread_idx_z())
187+
pub fn thread_idx() -> USizeVec3 {
188+
USizeVec3::new(thread_idx_x(), thread_idx_y(), thread_idx_z())
189189
}
190190

191191
/// Gets the 3d index of the block that the thread currently executing the kernel is located in.
192192
#[gpu_only]
193193
#[inline(always)]
194-
pub fn block_idx() -> UVec3 {
195-
UVec3::new(block_idx_x(), block_idx_y(), block_idx_z())
194+
pub fn block_idx() -> USizeVec3 {
195+
USizeVec3::new(block_idx_x(), block_idx_y(), block_idx_z())
196196
}
197197

198198
/// Gets the 3d layout of the thread blocks executing this kernel. In other words,
199199
/// how many threads exist in each thread block in every direction.
200200
#[gpu_only]
201201
#[inline(always)]
202-
pub fn block_dim() -> UVec3 {
203-
UVec3::new(block_dim_x(), block_dim_y(), block_dim_z())
202+
pub fn block_dim() -> USizeVec3 {
203+
USizeVec3::new(block_dim_x(), block_dim_y(), block_dim_z())
204204
}
205205

206206
/// Gets the 3d layout of the block grids executing this kernel. In other words,
207207
/// how many thread blocks exist in each grid in every direction.
208208
#[gpu_only]
209209
#[inline(always)]
210-
pub fn grid_dim() -> UVec3 {
211-
UVec3::new(grid_dim_x(), grid_dim_y(), grid_dim_z())
210+
pub fn grid_dim() -> USizeVec3 {
211+
USizeVec3::new(grid_dim_x(), grid_dim_y(), grid_dim_z())
212212
}
213213

214214
/// Gets the overall thread index, accounting for 1d/2d/3d block/grid dimensions. This
@@ -220,7 +220,7 @@ pub fn grid_dim() -> UVec3 {
220220
#[gpu_only]
221221
#[rustfmt::skip]
222222
#[inline(always)]
223-
pub fn index() -> u32 {
223+
pub fn index() -> usize {
224224
let grid_dim = grid_dim();
225225
let block_idx = block_idx();
226226
let block_dim = block_dim();
@@ -235,31 +235,31 @@ pub fn index() -> u32 {
235235
}
236236

237237
#[inline(always)]
238-
pub fn index_1d() -> u32 {
239-
thread_idx_x() as u32 + block_idx_x() as u32 * block_dim_x() as u32
238+
pub fn index_1d() -> usize {
239+
thread_idx_x() + block_idx_x() * block_dim_x()
240240
}
241241

242242
#[inline(always)]
243-
pub fn index_2d() -> UVec2 {
243+
pub fn index_2d() -> USizeVec2 {
244244
let i = thread_idx_x() + block_idx_x() * block_dim_x();
245245
let j = thread_idx_y() + block_idx_y() * block_dim_y();
246-
UVec2::new(i, j)
246+
USizeVec2::new(i, j)
247247
}
248248

249249
#[inline(always)]
250-
pub fn index_3d() -> UVec3 {
250+
pub fn index_3d() -> USizeVec3 {
251251
let i = thread_idx_x() + block_idx_x() * block_dim_x();
252252
let j = thread_idx_y() + block_idx_y() * block_dim_y();
253253
let k = thread_idx_z() + block_idx_z() * block_dim_z();
254-
UVec3::new(i, j, k)
254+
USizeVec3::new(i, j, k)
255255
}
256256

257257
/// Whether this is the first thread (not the first thread to be executing). This function is guaranteed
258258
/// to only return true in a single thread that is invoking it. This is useful for only doing something
259259
/// once.
260260
#[inline(always)]
261261
pub fn first() -> bool {
262-
block_idx() == UVec3::ZERO && thread_idx() == UVec3::ZERO
262+
block_idx() == USizeVec3::ZERO && thread_idx() == USizeVec3::ZERO
263263
}
264264

265265
/// Gets the number of threads inside of a warp. Currently 32 threads on every GPU architecture.

crates/cuda_std_macros/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ pub fn externally_visible(
253253
/// pub unsafe fn reverse_array(d: *mut u32, n: usize) {
254254
/// ##[address_space(shared)]
255255
/// static mut S: [MaybeUninit<u32>; 64] = [const { MaybeUninit::uninit() }; 64];
256-
/// let i = thread::thread_idx_x() as usize;
256+
/// let i = thread::thread_idx_x();
257257
/// let ir = n - i - 1;
258258
/// unsafe { S[i].write(*d.add(i)); };
259259
/// thread::sync_threads();

crates/optix/examples/path_tracer/kernels/src/render.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use crate::*;
2-
use cuda_std::glam::UVec2;
2+
use cuda_std::glam::USizeVec2;
33

44
const BACKGROUND_BLUE_MULTIPLIER: f32 = 0.7;
55

@@ -9,7 +9,7 @@ pub fn color(ray: Ray) -> Vec3 {
99
(1.0 - t) * Vec3::ONE + t * Vec3::new(0.5, 0.7, 1.0)
1010
}
1111

12-
pub fn generate_ray(idx: UVec2, view: &Viewport, offset: Vec2) -> Ray {
12+
pub fn generate_ray(idx: USizeVec2, view: &Viewport, offset: Vec2) -> Ray {
1313
let uv = (idx.as_vec2() + offset) / view.bounds.as_vec2();
1414
Ray {
1515
origin: view.origin,

crates/optix/examples/path_tracer/kernels/src/render_kernels.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ use gpu_rand::{DefaultRand, GpuRand};
66
#[kernel]
77
pub unsafe fn render(fb: *mut Vec3, view: Viewport, scene: &Scene, rand_states: *mut DefaultRand) {
88
let idx = thread::index_2d();
9-
if idx.x >= view.bounds.x as u32 || idx.y >= view.bounds.y as u32 {
9+
if idx.x >= view.bounds.x || idx.y >= view.bounds.y {
1010
return;
1111
}
12-
let px_idx = idx.y as usize * view.bounds.x + idx.x as usize;
12+
let px_idx = idx.y * view.bounds.x + idx.x;
1313

1414
// generate a tiny offset for the ray for antialiasing
1515
let rng = unsafe { &mut *rand_states.add(px_idx) };
@@ -27,10 +27,10 @@ pub unsafe fn render(fb: *mut Vec3, view: Viewport, scene: &Scene, rand_states:
2727
#[kernel]
2828
pub unsafe fn scale_buffer(fb: *const Vec3, out: *mut Vec3, samples: u32, view: Viewport) {
2929
let idx_2d = thread::index_2d();
30-
if idx_2d.x >= view.bounds.x as u32 || idx_2d.y >= view.bounds.y as u32 {
30+
if idx_2d.x >= view.bounds.x || idx_2d.y >= view.bounds.y {
3131
return;
3232
}
33-
let idx = idx_2d.y as usize * view.bounds.x + idx_2d.x as usize;
33+
let idx = idx_2d.y * view.bounds.x + idx_2d.x;
3434
let original = unsafe { &*fb.add(idx) };
3535
let out = unsafe { &mut *out.add(idx) };
3636

@@ -43,10 +43,10 @@ pub unsafe fn scale_buffer(fb: *const Vec3, out: *mut Vec3, samples: u32, view:
4343
#[kernel]
4444
pub unsafe fn postprocess(fb: *const Vec3, out: *mut U8Vec3, view: Viewport) {
4545
let idx_2d = thread::index_2d();
46-
if idx_2d.x >= view.bounds.x as u32 || idx_2d.y >= view.bounds.y as u32 {
46+
if idx_2d.x >= view.bounds.x || idx_2d.y >= view.bounds.y {
4747
return;
4848
}
49-
let idx = idx_2d.y as usize * view.bounds.x + idx_2d.x as usize;
49+
let idx = idx_2d.y * view.bounds.x + idx_2d.x;
5050
let original = unsafe { &*fb.add(idx) };
5151
let out = unsafe { &mut *out.add(idx) };
5252
// gamma=2.0

crates/optix/examples/path_tracer/src/cpu/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::time::Duration;
22

3-
use glam::{U8Vec3, USizeVec2, UVec2, Vec2, Vec3};
3+
use glam::{U8Vec3, USizeVec2, Vec2, Vec3};
44
use gpu_rand::{DefaultRand, GpuRand};
55
use imgui::Ui;
66
use path_tracer_kernels::{
@@ -131,7 +131,7 @@ impl CpuRenderer {
131131
.for_each(|(idx, (px, rng))| {
132132
let x = idx % viewport.bounds.x;
133133
let y = idx / viewport.bounds.x;
134-
let idx = UVec2::new(x as u32, y as u32);
134+
let idx = USizeVec2::new(x, y);
135135

136136
let offset = Vec2::from(rng.normal_f32_2());
137137

crates/optix/examples/path_tracer/src/main.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ use path_tracer_kernels::{
1616
};
1717
use std::error::Error;
1818

19-
pub const WIDTH: u32 = 1920;
20-
pub const HEIGHT: u32 = 1080;
19+
pub const WIDTH: usize = 1920;
20+
pub const HEIGHT: usize = 1080;
2121

2222
fn main() -> Result<(), Box<dyn Error>> {
2323
let camera = Camera {

crates/optix/examples/path_tracer/src/viewer.rs

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,7 @@ pub fn run(camera: &Camera, scene: &Scene) -> ! {
6060
.with_inner_size(PhysicalSize::new(WIDTH as f64, HEIGHT as f64));
6161
let cb = ContextBuilder::new().with_vsync(true);
6262
let display = Display::new(wb, cb, &event_loop).unwrap();
63-
let renderer = Renderer::new(
64-
USizeVec2::new(WIDTH as usize, HEIGHT as usize),
65-
camera,
66-
scene,
67-
);
63+
let renderer = Renderer::new(USizeVec2::new(WIDTH, HEIGHT), camera, scene);
6864
let mut viewer = ViewerRenderer::new(display, renderer);
6965

7066
let mut last_frame = Instant::now();
@@ -93,8 +89,7 @@ impl ViewerRenderer {
9389

9490
let size = display.gl_window().window().inner_size();
9591
let image_size = USizeVec2::new(size.width as usize, size.height as usize);
96-
let texture =
97-
SrgbTexture2d::empty(&display, image_size.x as u32, image_size.y as u32).unwrap();
92+
let texture = SrgbTexture2d::empty(&display, size.width, size.height).unwrap();
9893

9994
let mut imgui_ctx = imgui::Context::create();
10095
imgui_ctx.set_ini_filename(None);

examples/gemm/kernels/src/gemm_naive.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,15 @@ pub unsafe fn gemm_naive(
3232
alpha: f32,
3333
beta: f32,
3434
) {
35-
let row = (thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x()) as usize;
36-
let col = (thread::block_dim_y() * thread::block_idx_y() + thread::thread_idx_y()) as usize;
35+
let row = thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x();
36+
let col = thread::block_dim_y() * thread::block_idx_y() + thread::thread_idx_y();
3737

3838
if row < m && col < n {
3939
let mut sum = 0.0f32;
4040
for i in 0..k {
4141
sum += mat_a[row * k + i] * mat_b[i * n + col];
4242
}
43-
let elem = unsafe { &mut *mat_c.add((row * n + col) as usize) };
43+
let elem = unsafe { &mut *mat_c.add(row * n + col) };
4444
*elem = alpha * sum + beta * *elem;
4545
}
4646
}

examples/gemm/kernels/src/gemm_tiled.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,12 @@ pub unsafe fn gemm_tiled(
5353
static mut TILE_B: [MaybeUninit<f32>; TILE_SIZE_2D] = [MaybeUninit::uninit(); TILE_SIZE_2D];
5454

5555
// Thread indices within the block.
56-
let tx = thread::thread_idx_x() as usize;
57-
let ty = thread::thread_idx_y() as usize;
56+
let tx = thread::thread_idx_x();
57+
let ty = thread::thread_idx_y();
5858

5959
// Calculate row and column in the mat_c.
60-
let row = thread::block_idx_x() as usize * TILE_SIZE + ty;
61-
let col = thread::block_idx_y() as usize * TILE_SIZE + tx;
60+
let row = thread::block_idx_x() * TILE_SIZE + ty;
61+
let col = thread::block_idx_y() * TILE_SIZE + tx;
6262

6363
let mut sum = 0.0f32;
6464
// Loop over tiles of mat_a and mat_b in the k dimension.

examples/i128_demo/kernels/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ pub unsafe fn i128_ops(
1818
urem_out: *mut u128,
1919
srem_out: *mut u128,
2020
) {
21-
let idx = thread::index_1d() as usize;
21+
let idx = thread::index_1d();
2222
if idx >= a.len() || idx >= b.len() {
2323
return;
2424
}

0 commit comments

Comments
 (0)