Skip to content

Commit f4efc1a

Browse files
committed
feat(gpu): add boolean bitops in cuda backend
1 parent 19c2146 commit f4efc1a

File tree

8 files changed

+1534
-77
lines changed

8 files changed

+1534
-77
lines changed

backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h

Lines changed: 149 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,111 @@
11
#pragma once
22
#include "integer_utilities.h"
33

4+
template <typename Torus> struct boolean_bitop_buffer {
5+
6+
int_radix_params params;
7+
int_radix_lut<Torus> *lut;
8+
int_radix_lut<Torus> *message_extract_lut;
9+
10+
CudaRadixCiphertextFFI *tmp_lwe_left;
11+
CudaRadixCiphertextFFI *tmp_lwe_right;
12+
13+
BITOP_TYPE op;
14+
bool unchecked;
15+
bool gpu_memory_allocated;
16+
17+
boolean_bitop_buffer(CudaStreams streams, BITOP_TYPE op, bool is_unchecked,
18+
int_radix_params params, uint32_t lwe_ciphertext_count,
19+
bool allocate_gpu_memory, uint64_t &size_tracker) {
20+
gpu_memory_allocated = allocate_gpu_memory;
21+
this->op = op;
22+
this->params = params;
23+
auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
24+
this->unchecked = is_unchecked;
25+
switch (op) {
26+
case BITAND:
27+
case BITOR:
28+
case BITXOR:
29+
lut = new int_radix_lut<Torus>(streams, params, 1, lwe_ciphertext_count,
30+
allocate_gpu_memory, size_tracker);
31+
{
32+
auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus {
33+
if (op == BITOP_TYPE::BITAND) {
34+
// AND
35+
return lhs & rhs;
36+
} else if (op == BITOP_TYPE::BITOR) {
37+
// OR
38+
return lhs | rhs;
39+
} else {
40+
// XOR
41+
return lhs ^ rhs;
42+
}
43+
};
44+
45+
// BooleanBlock can have degree 0 or 1. when ct is 0 path is hardcoded,
46+
// only lut for degree = 1 is generated
47+
generate_device_accumulator_bivariate_with_factor<Torus>(
48+
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
49+
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
50+
params.polynomial_size, params.message_modulus,
51+
params.carry_modulus, lut_bivariate_f, 2, gpu_memory_allocated);
52+
lut->broadcast_lut(active_streams);
53+
}
54+
break;
55+
default:
56+
PANIC("Boolean bitwise operation type is not specified");
57+
}
58+
59+
if (!unchecked) {
60+
message_extract_lut =
61+
new int_radix_lut<Torus>(streams, params, 1, lwe_ciphertext_count,
62+
gpu_memory_allocated, size_tracker);
63+
auto lut_f_message_extract = [params](Torus x) -> Torus {
64+
return x % params.message_modulus;
65+
};
66+
67+
generate_device_accumulator<Torus>(
68+
streams.stream(0), streams.gpu_index(0),
69+
message_extract_lut->get_lut(0, 0),
70+
message_extract_lut->get_degree(0),
71+
message_extract_lut->get_max_degree(0), params.glwe_dimension,
72+
params.polynomial_size, params.message_modulus, params.carry_modulus,
73+
lut_f_message_extract, gpu_memory_allocated);
74+
message_extract_lut->broadcast_lut(active_streams);
75+
}
76+
tmp_lwe_left = new CudaRadixCiphertextFFI;
77+
create_zero_radix_ciphertext_async<Torus>(
78+
streams.stream(0), streams.gpu_index(0), tmp_lwe_left,
79+
lwe_ciphertext_count, params.big_lwe_dimension, size_tracker,
80+
allocate_gpu_memory);
81+
tmp_lwe_right = new CudaRadixCiphertextFFI;
82+
create_zero_radix_ciphertext_async<Torus>(
83+
streams.stream(0), streams.gpu_index(0), tmp_lwe_right,
84+
lwe_ciphertext_count, params.big_lwe_dimension, size_tracker,
85+
allocate_gpu_memory);
86+
}
87+
88+
void release(CudaStreams streams) {
89+
if (!unchecked) {
90+
message_extract_lut->release(streams);
91+
delete message_extract_lut;
92+
}
93+
94+
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
95+
tmp_lwe_left, gpu_memory_allocated);
96+
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
97+
tmp_lwe_right, gpu_memory_allocated);
98+
delete tmp_lwe_left;
99+
delete tmp_lwe_right;
100+
tmp_lwe_left = nullptr;
101+
tmp_lwe_right = nullptr;
102+
103+
lut->release(streams);
104+
delete lut;
105+
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
106+
}
107+
};
108+
4109
template <typename Torus> struct int_bitop_buffer {
5110

6111
int_radix_params params;
@@ -81,6 +186,50 @@ template <typename Torus> struct int_bitop_buffer {
81186
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
82187
}
83188
};
189+
190+
template <typename Torus> struct boolean_bitnot_buffer {
191+
int_radix_params params;
192+
int_radix_lut<Torus> *message_extract_lut;
193+
bool gpu_memory_allocated;
194+
bool unchecked;
195+
boolean_bitnot_buffer(CudaStreams streams, int_radix_params params,
196+
uint32_t lwe_ciphertext_count, bool is_unchecked,
197+
bool allocate_gpu_memory, uint64_t &size_tracker) {
198+
gpu_memory_allocated = allocate_gpu_memory;
199+
unchecked = is_unchecked;
200+
this->params = params;
201+
202+
auto message_modulus = params.message_modulus;
203+
204+
if (!unchecked) {
205+
message_extract_lut =
206+
new int_radix_lut<Torus>(streams, params, 1, lwe_ciphertext_count,
207+
gpu_memory_allocated, size_tracker);
208+
auto lut_f_message_extract = [message_modulus](Torus x) -> Torus {
209+
return x % message_modulus;
210+
};
211+
212+
generate_device_accumulator<Torus>(
213+
streams.stream(0), streams.gpu_index(0),
214+
message_extract_lut->get_lut(0, 0),
215+
message_extract_lut->get_degree(0),
216+
message_extract_lut->get_max_degree(0), params.glwe_dimension,
217+
params.polynomial_size, params.message_modulus, params.carry_modulus,
218+
lut_f_message_extract, gpu_memory_allocated);
219+
auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
220+
message_extract_lut->broadcast_lut(active_streams);
221+
}
222+
}
223+
224+
void release(CudaStreams streams) {
225+
if (!unchecked) {
226+
message_extract_lut->release(streams);
227+
delete message_extract_lut;
228+
}
229+
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
230+
}
231+
};
232+
84233
void update_degrees_after_bitand(uint64_t *output_degrees,
85234
uint64_t *lwe_array_1_degrees,
86235
uint64_t *lwe_array_2_degrees,

backends/tfhe-cuda-backend/cuda/include/integer/integer.h

Lines changed: 47 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -280,6 +280,47 @@ void cuda_scalar_comparison_ciphertext_64(
280280
void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
281281
int8_t **mem_ptr_void);
282282

283+
// Creates the scratch buffer for a boolean bitwise operation (`op_type`).
// The definition in bitwise_ops.cu packs the scalar parameters into an
// int_radix_params, forwards to scratch_cuda_boolean_bitop<uint64_t> with
// `mem_ptr` reinterpreted as boolean_bitop_buffer<uint64_t> **, and returns
// that call's uint64_t result (presumably the tracked allocation size —
// TODO confirm).
uint64_t scratch_cuda_boolean_bitop_64(
284+
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
285+
uint32_t polynomial_size, uint32_t big_lwe_dimension,
286+
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
287+
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
288+
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
289+
uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
290+
bool is_unchecked, bool allocate_gpu_memory,
291+
PBS_MS_REDUCTION_T noise_reduction_type);
292+
293+
// Runs the boolean bitwise operation on `lwe_array_1` and `lwe_array_2`,
// writing to `lwe_array_out`. The definition forwards to
// host_boolean_bitop<uint64_t>, treating `mem_ptr` as a
// boolean_bitop_buffer<uint64_t> * and `ksks` as uint64_t **.
void cuda_boolean_bitop_ciphertext_64(CudaStreamsFFI streams,
294+
CudaRadixCiphertextFFI *lwe_array_out,
295+
CudaRadixCiphertextFFI const *lwe_array_1,
296+
CudaRadixCiphertextFFI const *lwe_array_2,
297+
int8_t *mem_ptr, void *const *bsks,
298+
void *const *ksks);
299+
300+
// Releases and deletes the boolean_bitop_buffer<uint64_t> stored in
// *mem_ptr_void, then sets *mem_ptr_void to nullptr.
void cleanup_cuda_boolean_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void);
301+
302+
// Creates the scratch buffer for a boolean bitwise NOT. The definition packs
// the scalar parameters into an int_radix_params and forwards to
// scratch_cuda_boolean_bitnot<uint64_t>, returning its uint64_t result.
// NOTE: unlike scratch_cuda_boolean_bitop_64, `lwe_ciphertext_count` comes
// after `pbs_type` in this signature.
uint64_t scratch_cuda_boolean_bitnot_64(
303+
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
304+
uint32_t polynomial_size, uint32_t big_lwe_dimension,
305+
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
306+
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
307+
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
308+
uint32_t lwe_ciphertext_count, bool is_unchecked, bool allocate_gpu_memory,
309+
PBS_MS_REDUCTION_T noise_reduction_type);
310+
311+
// Runs the boolean bitwise NOT on `lwe_array` (the only ciphertext argument,
// so presumably updated in place — TODO confirm). The definition forwards to
// host_boolean_bitnot<uint64_t>, treating `mem_ptr` as a
// boolean_bitnot_buffer<uint64_t> *.
void cuda_boolean_bitnot_ciphertext_64(CudaStreamsFFI streams,
312+
CudaRadixCiphertextFFI *lwe_array,
313+
int8_t *mem_ptr, void *const *bsks,
314+
void *const *ksks);
315+
316+
// Releases and deletes the boolean_bitnot_buffer<uint64_t> stored in
// *mem_ptr_void, then sets *mem_ptr_void to nullptr.
void cleanup_cuda_boolean_bitnot(CudaStreamsFFI streams, int8_t **mem_ptr_void);
317+
318+
// Bitwise NOT that needs neither a scratch buffer nor keys: the definition
// calls host_bitnot<uint64_t> with the three moduli and then
// cuda_synchronize_stream on stream(0) / gpu_index(0).
void cuda_bitnot_ciphertext_64(CudaStreamsFFI streams,
319+
CudaRadixCiphertextFFI *radix_ciphertext,
320+
uint32_t ct_message_modulus,
321+
uint32_t param_message_modulus,
322+
uint32_t param_carry_modulus);
323+
283324
uint64_t scratch_cuda_bitop_64(
284325
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
285326
uint32_t polynomial_size, uint32_t big_lwe_dimension,
@@ -289,19 +330,19 @@ uint64_t scratch_cuda_bitop_64(
289330
uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
290331
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
291332

333+
// Bitwise operation between a radix ciphertext and clear blocks. The
// definition is not visible in this view; judging by the names,
// `clear_blocks` and `h_clear_blocks` are device and host copies of the same
// `num_clear_blocks` scalars — TODO confirm against the implementation.
void cuda_scalar_bitop_ciphertext_64(
334+
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
335+
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
336+
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
337+
void *const *bsks, void *const *ksks);
338+
292339
// Bitwise operation between two radix ciphertexts, written to
// `lwe_array_out`. `mem_ptr` is presumably the buffer created by
// scratch_cuda_bitop_64 — TODO confirm; the definition is only partially
// visible here.
void cuda_bitop_ciphertext_64(CudaStreamsFFI streams,
293340
CudaRadixCiphertextFFI *lwe_array_out,
294341
CudaRadixCiphertextFFI const *lwe_array_1,
295342
CudaRadixCiphertextFFI const *lwe_array_2,
296343
int8_t *mem_ptr, void *const *bsks,
297344
void *const *ksks);
298345

299-
void cuda_scalar_bitop_ciphertext_64(
300-
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
301-
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
302-
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
303-
void *const *bsks, void *const *ksks);
304-
305346
// Cleanup entry point for the integer bitop scratch buffer. Its definition is
// not visible here; presumably it releases the buffer created by
// scratch_cuda_bitop_64 and clears *mem_ptr_void — TODO confirm.
void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void);
306347

307348
uint64_t scratch_cuda_cmux_64(CudaStreamsFFI streams, int8_t **mem_ptr,

backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu

Lines changed: 90 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,84 @@
11
#include "integer/bitwise_ops.cuh"
22

3+
// 64-bit entry point that prepares the scratch buffer for a boolean bitwise
// operation. The scalar arguments are bundled into an int_radix_params and
// everything is handed to scratch_cuda_boolean_bitop<uint64_t>, whose return
// value is passed straight back to the caller.
uint64_t scratch_cuda_boolean_bitop_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
    bool is_unchecked, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  // Bundle the flat FFI arguments into the parameter struct used internally.
  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  // The opaque int8_t ** slot receives a boolean_bitop_buffer<uint64_t> *.
  return scratch_cuda_boolean_bitop<uint64_t>(
      CudaStreams(streams),
      reinterpret_cast<boolean_bitop_buffer<uint64_t> **>(mem_ptr),
      lwe_ciphertext_count, radix_params, op_type, is_unchecked,
      allocate_gpu_memory);
}
22+
23+
// 64-bit entry point executing a boolean bitwise operation: recovers the typed
// scratch buffer and keyswitch-key array from the opaque FFI pointers and
// delegates to host_boolean_bitop<uint64_t>.
void cuda_boolean_bitop_ciphertext_64(CudaStreamsFFI streams,
                                      CudaRadixCiphertextFFI *lwe_array_out,
                                      CudaRadixCiphertextFFI const *lwe_array_1,
                                      CudaRadixCiphertextFFI const *lwe_array_2,
                                      int8_t *mem_ptr, void *const *bsks,
                                      void *const *ksks) {

  auto *bitop_buffer =
      reinterpret_cast<boolean_bitop_buffer<uint64_t> *>(mem_ptr);
  auto typed_ksks = (uint64_t **)(ksks);

  host_boolean_bitop<uint64_t>(CudaStreams(streams), lwe_array_out,
                               lwe_array_1, lwe_array_2, bitop_buffer, bsks,
                               typed_ksks);
}
34+
35+
// Destroys the boolean_bitop_buffer<uint64_t> stored in *mem_ptr_void: calls
// its release(), deletes it and resets the handle to nullptr.
//
// Because the handle is nulled on exit, a second cleanup call used to
// dereference a null pointer. A null handle (or null slot) is now treated as
// "nothing to free", mirroring `delete nullptr`.
void cleanup_cuda_boolean_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void) {

  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr) {
    return;
  }

  boolean_bitop_buffer<uint64_t> *mem_ptr =
      (boolean_bitop_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}
43+
44+
// 64-bit entry point that prepares the scratch buffer for a boolean bitwise
// NOT. The scalar arguments are bundled into an int_radix_params and handed to
// scratch_cuda_boolean_bitnot<uint64_t>; its return value is passed back
// unchanged.
uint64_t scratch_cuda_boolean_bitnot_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    uint32_t lwe_ciphertext_count, bool is_unchecked, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  // Bundle the flat FFI arguments into the parameter struct used internally.
  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  // The opaque int8_t ** slot receives a boolean_bitnot_buffer<uint64_t> *.
  return scratch_cuda_boolean_bitnot<uint64_t>(
      CudaStreams(streams),
      reinterpret_cast<boolean_bitnot_buffer<uint64_t> **>(mem_ptr),
      radix_params, lwe_ciphertext_count, is_unchecked, allocate_gpu_memory);
}
62+
63+
// 64-bit entry point executing a boolean bitwise NOT: recovers the typed
// scratch buffer and keyswitch-key array from the opaque FFI pointers and
// delegates to host_boolean_bitnot<uint64_t>.
void cuda_boolean_bitnot_ciphertext_64(CudaStreamsFFI streams,
                                       CudaRadixCiphertextFFI *lwe_array,
                                       int8_t *mem_ptr, void *const *bsks,
                                       void *const *ksks) {
  auto *bitnot_buffer =
      reinterpret_cast<boolean_bitnot_buffer<uint64_t> *>(mem_ptr);
  auto typed_ksks = (uint64_t **)(ksks);

  host_boolean_bitnot<uint64_t>(CudaStreams(streams), lwe_array, bitnot_buffer,
                                bsks, typed_ksks);
}
71+
72+
// Destroys the boolean_bitnot_buffer<uint64_t> stored in *mem_ptr_void: calls
// its release(), deletes it and resets the handle to nullptr.
//
// Because the handle is nulled on exit, a second cleanup call used to
// dereference a null pointer. A null handle (or null slot) is now treated as
// "nothing to free", mirroring `delete nullptr`.
void cleanup_cuda_boolean_bitnot(CudaStreamsFFI streams,
                                 int8_t **mem_ptr_void) {

  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr) {
    return;
  }

  boolean_bitnot_buffer<uint64_t> *mem_ptr =
      (boolean_bitnot_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}
81+
382
uint64_t scratch_cuda_bitop_64(
483
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
584
uint32_t polynomial_size, uint32_t big_lwe_dimension,
@@ -19,6 +98,17 @@ uint64_t scratch_cuda_bitop_64(
1998
lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
2099
}
21100

101+
// 64-bit entry point for a bitwise NOT that takes no scratch buffer and no
// keys. Delegates to host_bitnot<uint64_t> with the three moduli, then calls
// cuda_synchronize_stream on stream(0) / gpu_index(0) before returning.
void cuda_bitnot_ciphertext_64(CudaStreamsFFI streams,
                               CudaRadixCiphertextFFI *radix_ciphertext,
                               uint32_t ct_message_modulus,
                               uint32_t param_message_modulus,
                               uint32_t param_carry_modulus) {
  CudaStreams device_streams(streams);

  host_bitnot<uint64_t>(device_streams, radix_ciphertext, ct_message_modulus,
                        param_message_modulus, param_carry_modulus);

  // Synchronise on the first stream before handing control back.
  cuda_synchronize_stream(device_streams.stream(0),
                          device_streams.gpu_index(0));
}
111+
22112
void cuda_bitop_ciphertext_64(CudaStreamsFFI streams,
23113
CudaRadixCiphertextFFI *lwe_array_out,
24114
CudaRadixCiphertextFFI const *lwe_array_1,

0 commit comments

Comments
 (0)