//! vary by device. Query device properties when you need exact limits.
//!
6868use cuda_std_macros:: gpu_only;
69- use glam:: { UVec2 , UVec3 } ;
69+ use glam:: { USizeVec2 , USizeVec3 } ;
7070
7171// different calling conventions dont exist in nvptx, so we just use C as a placeholder.
7272unsafe extern "C" {
@@ -99,116 +99,116 @@ macro_rules! in_range {
9999
100100#[ gpu_only]
101101#[ inline( always) ]
102- pub fn thread_idx_x ( ) -> u32 {
102+ pub fn thread_idx_x ( ) -> usize {
103103 // The range is derived from the `block_idx_x` range.
104- in_range ! ( core:: arch:: nvptx:: _thread_idx_x, 0 ..1024 )
104+ in_range ! ( core:: arch:: nvptx:: _thread_idx_x, 0 ..1024 ) as usize
105105}
106106
107107#[ gpu_only]
108108#[ inline( always) ]
109- pub fn thread_idx_y ( ) -> u32 {
109+ pub fn thread_idx_y ( ) -> usize {
110110 // The range is derived from the `block_idx_y` range.
111- in_range ! ( core:: arch:: nvptx:: _thread_idx_y, 0 ..1024 )
111+ in_range ! ( core:: arch:: nvptx:: _thread_idx_y, 0 ..1024 ) as usize
112112}
113113
114114#[ gpu_only]
115115#[ inline( always) ]
116- pub fn thread_idx_z ( ) -> u32 {
116+ pub fn thread_idx_z ( ) -> usize {
117117 // The range is derived from the `block_idx_z` range.
118- in_range ! ( core:: arch:: nvptx:: _thread_idx_z, 0 ..64 )
118+ in_range ! ( core:: arch:: nvptx:: _thread_idx_z, 0 ..64 ) as usize
119119}
120120
121121#[ gpu_only]
122122#[ inline( always) ]
123- pub fn block_idx_x ( ) -> u32 {
123+ pub fn block_idx_x ( ) -> usize {
124124 // The range is derived from the `grid_idx_x` range.
125- in_range ! ( core:: arch:: nvptx:: _block_idx_x, 0 ..2147483647 )
125+ in_range ! ( core:: arch:: nvptx:: _block_idx_x, 0 ..2147483647 ) as usize
126126}
127127
128128#[ gpu_only]
129129#[ inline( always) ]
130- pub fn block_idx_y ( ) -> u32 {
130+ pub fn block_idx_y ( ) -> usize {
131131 // The range is derived from the `grid_idx_y` range.
132- in_range ! ( core:: arch:: nvptx:: _block_idx_y, 0 ..65535 )
132+ in_range ! ( core:: arch:: nvptx:: _block_idx_y, 0 ..65535 ) as usize
133133}
134134
135135#[ gpu_only]
136136#[ inline( always) ]
137- pub fn block_idx_z ( ) -> u32 {
137+ pub fn block_idx_z ( ) -> usize {
138138 // The range is derived from the `grid_idx_z` range.
139- in_range ! ( core:: arch:: nvptx:: _block_idx_z, 0 ..65535 )
139+ in_range ! ( core:: arch:: nvptx:: _block_idx_z, 0 ..65535 ) as usize
140140}
141141
142142#[ gpu_only]
143143#[ inline( always) ]
144- pub fn block_dim_x ( ) -> u32 {
144+ pub fn block_dim_x ( ) -> usize {
145145 // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
146- in_range ! ( core:: arch:: nvptx:: _block_dim_x, 1 ..=1024 )
146+ in_range ! ( core:: arch:: nvptx:: _block_dim_x, 1 ..=1024 ) as usize
147147}
148148
149149#[ gpu_only]
150150#[ inline( always) ]
151- pub fn block_dim_y ( ) -> u32 {
151+ pub fn block_dim_y ( ) -> usize {
152152 // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
153- in_range ! ( core:: arch:: nvptx:: _block_dim_y, 1 ..=1024 )
153+ in_range ! ( core:: arch:: nvptx:: _block_dim_y, 1 ..=1024 ) as usize
154154}
155155
156156#[ gpu_only]
157157#[ inline( always) ]
158- pub fn block_dim_z ( ) -> u32 {
158+ pub fn block_dim_z ( ) -> usize {
159159 // CUDA Compute Capabilities: "Maximum z-dimension of a block" is 64.
160- in_range ! ( core:: arch:: nvptx:: _block_dim_z, 1 ..=64 )
160+ in_range ! ( core:: arch:: nvptx:: _block_dim_z, 1 ..=64 ) as usize
161161}
162162
163163#[ gpu_only]
164164#[ inline( always) ]
165- pub fn grid_dim_x ( ) -> u32 {
165+ pub fn grid_dim_x ( ) -> usize {
166166 // CUDA Compute Capabilities: "Maximum x-dimension of a grid of thread blocks" is 2^32 - 1.
167- in_range ! ( core:: arch:: nvptx:: _grid_dim_x, 1 ..=2147483647 )
167+ in_range ! ( core:: arch:: nvptx:: _grid_dim_x, 1 ..=2147483647 ) as usize
168168}
169169
170170#[ gpu_only]
171171#[ inline( always) ]
172- pub fn grid_dim_y ( ) -> u32 {
172+ pub fn grid_dim_y ( ) -> usize {
173173 // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
174- in_range ! ( core:: arch:: nvptx:: _grid_dim_y, 1 ..=65535 )
174+ in_range ! ( core:: arch:: nvptx:: _grid_dim_y, 1 ..=65535 ) as usize
175175}
176176
177177#[ gpu_only]
178178#[ inline( always) ]
179- pub fn grid_dim_z ( ) -> u32 {
179+ pub fn grid_dim_z ( ) -> usize {
180180 // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
181- in_range ! ( core:: arch:: nvptx:: _grid_dim_z, 1 ..=65535 )
181+ in_range ! ( core:: arch:: nvptx:: _grid_dim_z, 1 ..=65535 ) as usize
182182}
183183
184184/// Gets the 3d index of the thread currently executing the kernel.
185185#[ gpu_only]
186186#[ inline( always) ]
187- pub fn thread_idx ( ) -> UVec3 {
188- UVec3 :: new ( thread_idx_x ( ) , thread_idx_y ( ) , thread_idx_z ( ) )
187+ pub fn thread_idx ( ) -> USizeVec3 {
188+ USizeVec3 :: new ( thread_idx_x ( ) , thread_idx_y ( ) , thread_idx_z ( ) )
189189}
190190
191191/// Gets the 3d index of the block that the thread currently executing the kernel is located in.
192192#[ gpu_only]
193193#[ inline( always) ]
194- pub fn block_idx ( ) -> UVec3 {
195- UVec3 :: new ( block_idx_x ( ) , block_idx_y ( ) , block_idx_z ( ) )
194+ pub fn block_idx ( ) -> USizeVec3 {
195+ USizeVec3 :: new ( block_idx_x ( ) , block_idx_y ( ) , block_idx_z ( ) )
196196}
197197
198198/// Gets the 3d layout of the thread blocks executing this kernel. In other words,
199199/// how many threads exist in each thread block in every direction.
200200#[ gpu_only]
201201#[ inline( always) ]
202- pub fn block_dim ( ) -> UVec3 {
203- UVec3 :: new ( block_dim_x ( ) , block_dim_y ( ) , block_dim_z ( ) )
202+ pub fn block_dim ( ) -> USizeVec3 {
203+ USizeVec3 :: new ( block_dim_x ( ) , block_dim_y ( ) , block_dim_z ( ) )
204204}
205205
206206/// Gets the 3d layout of the block grids executing this kernel. In other words,
207207/// how many thread blocks exist in each grid in every direction.
208208#[ gpu_only]
209209#[ inline( always) ]
210- pub fn grid_dim ( ) -> UVec3 {
211- UVec3 :: new ( grid_dim_x ( ) , grid_dim_y ( ) , grid_dim_z ( ) )
210+ pub fn grid_dim ( ) -> USizeVec3 {
211+ USizeVec3 :: new ( grid_dim_x ( ) , grid_dim_y ( ) , grid_dim_z ( ) )
212212}
213213
214214/// Gets the overall thread index, accounting for 1d/2d/3d block/grid dimensions. This
@@ -220,7 +220,7 @@ pub fn grid_dim() -> UVec3 {
220220#[ gpu_only]
221221#[ rustfmt:: skip]
222222#[ inline( always) ]
223- pub fn index ( ) -> u32 {
223+ pub fn index ( ) -> usize {
224224 let grid_dim = grid_dim ( ) ;
225225 let block_idx = block_idx ( ) ;
226226 let block_dim = block_dim ( ) ;
@@ -235,31 +235,31 @@ pub fn index() -> u32 {
235235}
236236
237237#[ inline( always) ]
238- pub fn index_1d ( ) -> u32 {
239- thread_idx_x ( ) as u32 + block_idx_x ( ) as u32 * block_dim_x ( ) as u32
238+ pub fn index_1d ( ) -> usize {
239+ thread_idx_x ( ) + block_idx_x ( ) * block_dim_x ( )
240240}
241241
242242#[ inline( always) ]
243- pub fn index_2d ( ) -> UVec2 {
243+ pub fn index_2d ( ) -> USizeVec2 {
244244 let i = thread_idx_x ( ) + block_idx_x ( ) * block_dim_x ( ) ;
245245 let j = thread_idx_y ( ) + block_idx_y ( ) * block_dim_y ( ) ;
246- UVec2 :: new ( i, j)
246+ USizeVec2 :: new ( i, j)
247247}
248248
249249#[ inline( always) ]
250- pub fn index_3d ( ) -> UVec3 {
250+ pub fn index_3d ( ) -> USizeVec3 {
251251 let i = thread_idx_x ( ) + block_idx_x ( ) * block_dim_x ( ) ;
252252 let j = thread_idx_y ( ) + block_idx_y ( ) * block_dim_y ( ) ;
253253 let k = thread_idx_z ( ) + block_idx_z ( ) * block_dim_z ( ) ;
254- UVec3 :: new ( i, j, k)
254+ USizeVec3 :: new ( i, j, k)
255255}
256256
257257/// Whether this is the first thread (not the first thread to be executing). This function is guaranteed
258258/// to only return true in a single thread that is invoking it. This is useful for only doing something
259259/// once.
260260#[ inline( always) ]
261261pub fn first ( ) -> bool {
262- block_idx ( ) == UVec3 :: ZERO && thread_idx ( ) == UVec3 :: ZERO
262+ block_idx ( ) == USizeVec3 :: ZERO && thread_idx ( ) == USizeVec3 :: ZERO
263263}
264264
/// Gets the number of threads inside of a warp. Currently 32 threads on every GPU architecture.
0 commit comments