Skip to content

Commit 641c868

Browse files
amukkaraPointKernelpre-commit-ci[bot]
authored
Add dynamic_bitset (#352)
This PR adds `dynamic_bitset` code that will be used in Trie data structure. Trie will be integrated in a separate PR #350. Since `dynamic_bitset` is not intended to be part of public-facing API, all files (.cuh and .inl) are located in include/cuco/detail/trie/dynamic_bitset Tests are added in tests/dynamic_bitset --------- Co-authored-by: Yunsong Wang <yunsongw@nvidia.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 7c76a12 commit 641c868

9 files changed

Lines changed: 1355 additions & 0 deletions

File tree

Lines changed: 375 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,375 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
#pragma once
19+
20+
#include <cuco/cuda_stream_ref.hpp>
21+
22+
#include <thrust/device_malloc_allocator.h>
23+
#include <thrust/device_vector.h>
24+
25+
#include <cuda/std/array>
26+
27+
#include <climits>
28+
#include <cstddef>
29+
30+
namespace cuco {
31+
namespace experimental {
32+
namespace detail {
33+
34+
/**
35+
* @brief Struct to store ranks of bits at 256-bit intervals (or blocks)
36+
*
37+
* This struct encodes a list of four rank values using base + offset format
38+
* e.g. [1000, 1005, 1006, 1009] is stored as base = 1000, offsets = [5, 6, 9]
39+
* base uses 40 bits, split between one uint32_t and one uint8_t
40+
* each offset uses 8 bits
41+
*/
42+
struct rank {
43+
uint32_t base_hi_; ///< Upper 32 bits of base
44+
uint8_t base_lo_; ///< Lower 8 bits of base
45+
cuda::std::array<uint8_t, 3> offsets_; ///< Offsets for 64-bit sub-intervals, relative to base
46+
47+
/**
48+
* @brief Gets base rank of current 256-bit interval
49+
*
50+
* @return The base rank
51+
*/
52+
__host__ __device__ constexpr uint64_t base() const noexcept
53+
{
54+
return (static_cast<uint64_t>(base_hi_) << CHAR_BIT) | base_lo_;
55+
}
56+
57+
/**
58+
* @brief Sets base rank of current 256-bit interval
59+
*
60+
* @param base Base rank
61+
*/
62+
__host__ __device__ constexpr void set_base(uint64_t base) noexcept
63+
{
64+
base_hi_ = static_cast<uint32_t>(base >> CHAR_BIT);
65+
base_lo_ = static_cast<uint8_t>(base);
66+
}
67+
};
68+
69+
/**
70+
* @brief Bitset class with rank and select index structures
71+
*
72+
* In addition to standard bitset set/test operations, this class provides
73+
* rank and select operation API. It maintains index structures to make both these
74+
* new operations close to constant time.
75+
*
76+
* Current limitations:
77+
* - Stream controls are partially supported due to the use of `thrust::device_vector` as storage
78+
* - Device ref doesn't support modifiers like `set`, `reset`, etc.
79+
*
80+
* @tparam Allocator Type of allocator used for device storage
81+
*/
82+
// TODO: have to use device_malloc_allocator for now otherwise the container cannot grow
83+
template <class Allocator = thrust::device_malloc_allocator<std::byte>>
84+
class dynamic_bitset {
85+
public:
86+
using size_type = std::size_t; ///< size type to specify bit index
87+
using word_type = uint64_t; ///< word type
88+
/// Type of the allocator to (de)allocate words
89+
using allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<word_type>;
90+
91+
/// Number of bits per block. Note this is a tradeoff between space efficiency and perf.
92+
static constexpr size_type words_per_block = 4;
93+
/// Number of bits in a word
94+
static constexpr size_type bits_per_word = sizeof(word_type) * CHAR_BIT;
95+
/// Number of bits in a block
96+
static constexpr size_type bits_per_block = words_per_block * bits_per_word;
97+
98+
/**
99+
* @brief Constructs an empty bitset
100+
*
101+
* @param allocator Allocator used for allocating device storage
102+
*/
103+
constexpr dynamic_bitset(Allocator const& allocator = Allocator{});
104+
105+
/**
106+
* @brief Appends the given element `value` to the end of the bitset
107+
*
108+
* This API may involve data reallocation if the current storage is exhausted.
109+
*
110+
* @param value Boolean value of the new bit to be added
111+
*/
112+
constexpr void push_back(bool value) noexcept;
113+
114+
/**
115+
* @brief Sets the target bit indexed by `index` to a specified `value`.
116+
*
117+
* @param index Position of bit to be modified
118+
* @param value New value of the target bit
119+
*/
120+
constexpr void set(size_type index, bool value) noexcept;
121+
122+
/**
123+
* @brief Sets the last bit to a specified value
124+
*
125+
* @param value New value of the last bit
126+
*/
127+
constexpr void set_last(bool value) noexcept;
128+
129+
/**
130+
* @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the
131+
* boolean value at position `keys_begin[i]` to `output_begin[i]`.
132+
*
133+
* @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
134+
* `size_type`
135+
* @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean
136+
* type
137+
*
138+
* @param keys_begin Begin iterator to keys list whose values are queried
139+
* @param keys_end End iterator to keys list
140+
* @param outputs_begin Begin iterator to outputs of test operation
141+
* @param stream Stream to execute test kernel
142+
*/
143+
template <typename KeyIt, typename OutputIt>
144+
constexpr void test(KeyIt keys_begin,
145+
KeyIt keys_end,
146+
OutputIt outputs_begin,
147+
cuda_stream_ref stream = {}) noexcept;
148+
149+
/**
150+
* @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total
151+
* count of `1` bits preceeding (but not including) position `keys_begin[i]` to `output_begin[i]`.
152+
*
153+
* @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
154+
* `size_type`
155+
* @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's
156+
* `size_type`
157+
*
158+
* @param keys_begin Begin iterator to keys list whose ranks are queried
159+
* @param keys_end End iterator to keys list
160+
* @param outputs_begin Begin iterator to outputs ranks list
161+
* @param stream Stream to execute ranks kernel
162+
*/
163+
template <typename KeyIt, typename OutputIt>
164+
constexpr void rank(KeyIt keys_begin,
165+
KeyIt keys_end,
166+
OutputIt outputs_begin,
167+
cuda_stream_ref stream = {}) noexcept;
168+
169+
/**
170+
* @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the
171+
* position of `keys_begin[i]`th `1` bit to `output_begin[i]`.
172+
*
173+
* @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
174+
* `size_type`
175+
* @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's
176+
* `size_type`
177+
*
178+
* @param keys_begin Begin iterator to keys list whose select values are queried
179+
* @param keys_end End iterator to keys list
180+
* @param outputs_begin Begin iterator to outputs selects list
181+
* @param stream Stream to execute selects kernel
182+
*/
183+
template <typename KeyIt, typename OutputIt>
184+
constexpr void select(KeyIt keys_begin,
185+
KeyIt keys_end,
186+
OutputIt outputs_begin,
187+
cuda_stream_ref stream = {}) noexcept;
188+
189+
using rank_type = cuco::experimental::detail::rank; ///< Rank type
190+
191+
/**
192+
*@brief Struct to hold all storage refs needed by reference
193+
*/
194+
// TODO: this is not a real ref type, to be changed
195+
struct storage_ref_type {
196+
const word_type* words_ref_; ///< Words ref
197+
198+
const rank_type* ranks_true_ref_; ///< Ranks ref for 1 bits
199+
const size_type* selects_true_ref_; ///< Selects ref for 1 bits
200+
201+
const rank_type* ranks_false_ref_; ///< Ranks ref for 0 bits
202+
const size_type* selects_false_ref_; ///< Selects ref 0 bits
203+
};
204+
205+
/**
206+
* @brief Device non-owning reference type of dynamic_bitset
207+
*/
208+
class reference {
209+
public:
210+
/**
211+
* @brief Constructs a reference
212+
*
213+
* @param storage Struct with non-owning refs to bitset storage arrays
214+
*/
215+
__host__ __device__ explicit constexpr reference(storage_ref_type storage) noexcept;
216+
217+
/**
218+
* @brief Access value of a single bit
219+
*
220+
* @param key Position of bit
221+
*
222+
* @return Value of bit at position specified by key
223+
*/
224+
[[nodiscard]] __device__ constexpr bool test(size_type key) const noexcept;
225+
226+
/**
227+
* @brief Access a single word of internal storage
228+
*
229+
* @param word_id Index of word
230+
*
231+
* @return Word at position specified by index
232+
*/
233+
[[nodiscard]] __device__ constexpr word_type word(size_type word_id) const noexcept;
234+
235+
/**
236+
* @brief Find position of first set bit starting from a given position (inclusive)
237+
*
238+
* @param key Position of starting bit
239+
*
240+
* @return Index of next set bit
241+
*/
242+
[[nodiscard]] __device__ size_type find_next(size_type key) const noexcept;
243+
244+
/**
245+
* @brief Find number of set bits (rank) in all positions before the input position (exclusive)
246+
*
247+
* @param key Input bit position
248+
*
249+
* @return Rank of input position
250+
*/
251+
[[nodiscard]] __device__ constexpr size_type rank(size_type key) const noexcept;
252+
253+
/**
254+
* @brief Find position of Nth set (1) bit counting from start
255+
*
256+
* @param count Input N
257+
*
258+
* @return Position of Nth set bit
259+
*/
260+
[[nodiscard]] __device__ constexpr size_type select(size_type count) const noexcept;
261+
262+
/**
263+
* @brief Find position of Nth not-set (0) bit counting from start
264+
*
265+
* @param count Input N
266+
*
267+
* @return Position of Nth not-set bit
268+
*/
269+
[[nodiscard]] __device__ constexpr size_type select_false(size_type count) const noexcept;
270+
271+
private:
272+
/**
273+
* @brief Helper function for select operation that computes an initial rank estimate
274+
*
275+
* @param count Input count for which select operation is being performed
276+
* @param selects Selects array
277+
* @param ranks Ranks array
278+
*
279+
* @return index in ranks which corresponds to highest rank less than count (least upper bound)
280+
*/
281+
template <typename SelectsRef, typename RanksRef>
282+
[[nodiscard]] __device__ constexpr size_type initial_rank_estimate(
283+
size_type count, const SelectsRef& selects, const RanksRef& ranks) const noexcept;
284+
285+
/**
286+
* @brief Subtract rank estimate from input count and return an increment to word_id
287+
*
288+
* @tparam Rank type
289+
*
290+
* @param count Input count that will be updated
291+
* @param rank Initial rank estimate for count
292+
*
293+
* @return Increment to word_id based on rank values
294+
*/
295+
template <typename Rank>
296+
[[nodiscard]] __device__ constexpr size_type subtract_rank_from_count(size_type& count,
297+
Rank rank) const noexcept;
298+
299+
/**
300+
* @brief Find position of Nth set bit in a 64-bit word
301+
*
302+
* @param N Input count
303+
*
304+
* @return Position of Nth set bit
305+
*/
306+
[[nodiscard]] __device__ size_type select_bit_in_word(size_type N,
307+
word_type word) const noexcept;
308+
309+
storage_ref_type storage_; ///< Non-owning storage
310+
};
311+
312+
using ref_type = reference; ///< Non-owning container ref type
313+
314+
/**
315+
* @brief Gets non-owning device ref of the current object
316+
*
317+
* @return Device ref of the current `dynamic_bitset` object
318+
*/
319+
[[nodiscard]] constexpr ref_type ref() const noexcept;
320+
321+
/**
322+
* @brief Gets the number of bits dynamic_bitset holds
323+
*
324+
* @return Number of bits dynamic_bitset holds
325+
*/
326+
[[nodiscard]] constexpr size_type size() const noexcept;
327+
328+
private:
329+
/// Type of the allocator to (de)allocate ranks
330+
using rank_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<rank_type>;
331+
/// Type of the allocator to (de)allocate indices
332+
using size_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<size_type>;
333+
334+
allocator_type allocator_; ///< Words allocator
335+
size_type n_bits_; ///< Number of bits dynamic_bitset currently holds
336+
bool is_built_; ///< Flag indicating whether the rank and select indices are built or not
337+
338+
/// Words vector that represents all bits
339+
thrust::device_vector<word_type, allocator_type> words_;
340+
/// Rank values for every 256-th bit (4-th word)
341+
thrust::device_vector<rank_type, rank_allocator_type> ranks_true_;
342+
/// Same as ranks_ but for `0` bits
343+
thrust::device_vector<rank_type, rank_allocator_type> ranks_false_;
344+
/// Block indices of (0, 256, 512...)th `1` bit
345+
thrust::device_vector<size_type, size_allocator_type> selects_true_;
346+
/// Same as selects_, but for `0` bits
347+
thrust::device_vector<size_type, size_allocator_type> selects_false_;
348+
349+
/**
350+
* @brief Builds indexes for rank and select
351+
*
352+
* @param stream Stream to execute kernels
353+
*/
354+
constexpr void build(cuda_stream_ref stream = {}) noexcept;
355+
356+
/**
357+
* @brief Populates rank and select indexes for true or false bits
358+
*
359+
* @param ranks Output array of ranks
360+
* @param selects Output array of selects
361+
* @param flip_bits If true, negate bits to construct indexes for false bits
362+
* @param stream Stream to execute kernels
363+
*/
364+
constexpr void build_ranks_and_selects(
365+
thrust::device_vector<rank_type, rank_allocator_type>& ranks,
366+
thrust::device_vector<size_type, size_allocator_type>& selects,
367+
bool flip_bits,
368+
cuda_stream_ref stream = {});
369+
};
370+
371+
} // namespace detail
372+
} // namespace experimental
373+
} // namespace cuco
374+
375+
#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl>

0 commit comments

Comments
 (0)