Skip to content

Commit 5111ae1

Browse files
committed
refactor(gpu): moving vector_comparisons's functions to the backend
1 parent 3cfbaa4 commit 5111ae1

File tree

10 files changed

+1128
-565
lines changed

10 files changed

+1128
-565
lines changed

backends/tfhe-cuda-backend/cuda/include/integer/integer.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,6 +1004,46 @@ void cuda_unchecked_index_of_clear_64(
10041004

10051005
void cleanup_cuda_unchecked_index_of_clear_64(CudaStreamsFFI streams,
10061006
int8_t **mem_ptr_void);
1007+
1008+
// FFI declarations for the unchecked "all slices equal" operation.
//
// Scratch entry point: receives the stream set, an out-pointer (`mem_ptr`),
// the LWE/GLWE dimensions, the keyswitch and PBS decomposition parameters,
// the number of inputs and blocks, the moduli, the PBS type and the noise
// reduction type.
// NOTE(review): presumably allocates the buffer later passed as `mem` and
// returns the tracked allocation size; confirm against the implementation.
uint64_t scratch_cuda_unchecked_all_eq_slices_64(
1009+
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
1010+
uint32_t polynomial_size, uint32_t big_lwe_dimension,
1011+
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
1012+
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
1013+
uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
1014+
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
1015+
PBS_MS_REDUCTION_T noise_reduction_type);
1016+
1017+
// Run entry point: `match_ct` is the only non-const ciphertext argument, so it
// receives the result; `lhs` and `rhs` are read-only inputs. `mem` is the
// scratch buffer, `bsks` / `ksks` are arrays of key pointers.
// NOTE(review): `num_inputs` / `num_blocks` presumably must match the values
// given to the scratch call; verify against the implementation.
void cuda_unchecked_all_eq_slices_64(CudaStreamsFFI streams,
1018+
CudaRadixCiphertextFFI *match_ct,
1019+
CudaRadixCiphertextFFI const *lhs,
1020+
CudaRadixCiphertextFFI const *rhs,
1021+
uint32_t num_inputs, uint32_t num_blocks,
1022+
int8_t *mem, void *const *bsks,
1023+
void *const *ksks);
1024+
1025+
// Cleanup entry point, taking the scratch pointer by address.
// NOTE(review): presumably releases what the scratch call allocated; confirm.
void cleanup_cuda_unchecked_all_eq_slices_64(CudaStreamsFFI streams,
1026+
int8_t **mem_ptr_void);
1027+
1028+
// FFI declarations for the unchecked "contains sub-slice" operation.
//
// Scratch entry point: same parameter layout as the other scratch functions
// declared here, except that it takes two element counts (`num_lhs`,
// `num_rhs`) in addition to `num_blocks`.
// NOTE(review): presumably allocates the buffer later passed as `mem` and
// returns the tracked allocation size; confirm against the implementation.
uint64_t scratch_cuda_unchecked_contains_sub_slice_64(
1029+
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
1030+
uint32_t polynomial_size, uint32_t big_lwe_dimension,
1031+
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
1032+
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
1033+
uint32_t num_lhs, uint32_t num_rhs, uint32_t num_blocks,
1034+
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
1035+
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
1036+
1037+
// Run entry point: `match_ct` is the only non-const ciphertext argument, so it
// receives the result; `lhs` and `rhs` are read-only inputs. Only `num_rhs`
// is passed here (no `num_lhs`).
// NOTE(review): the number of lhs elements is presumably recovered from the
// scratch buffer or from `lhs` itself; verify against the implementation.
void cuda_unchecked_contains_sub_slice_64(CudaStreamsFFI streams,
1038+
CudaRadixCiphertextFFI *match_ct,
1039+
CudaRadixCiphertextFFI const *lhs,
1040+
CudaRadixCiphertextFFI const *rhs,
1041+
uint32_t num_rhs, uint32_t num_blocks,
1042+
int8_t *mem, void *const *bsks,
1043+
void *const *ksks);
1044+
1045+
// Cleanup entry point, taking the scratch pointer by address.
// NOTE(review): presumably releases what the scratch call allocated; confirm.
void cleanup_cuda_unchecked_contains_sub_slice_64(CudaStreamsFFI streams,
1046+
int8_t **mem_ptr_void);
10071047
} // extern C
10081048

10091049
#endif // CUDA_INTEGER_H
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
#pragma once
2+
#include "helper_multi_gpu.h"
3+
#include "integer/comparison.h"
4+
#include "integer/radix_ciphertext.cuh"
5+
#include "integer_utilities.h"
6+
7+
// Upper bound on the number of sub-stream sets (and matching per-set equality
// buffers) created by int_unchecked_all_eq_slices_buffer. The count actually
// used is min(this value, num_inputs), raised to 1 when that would be 0.
const uint32_t MAX_STREAMS_FOR_VECTOR_COMPARISON = 8;
8+
9+
template <typename Torus> struct int_unchecked_all_eq_slices_buffer {
10+
int_radix_params params;
11+
bool allocate_gpu_memory;
12+
uint32_t num_inputs;
13+
14+
int_comparison_buffer<Torus> **eq_buffers;
15+
int_comparison_buffer<Torus> *reduction_buffer;
16+
17+
CudaRadixCiphertextFFI *packed_results;
18+
19+
CudaStreams active_streams;
20+
21+
CudaStreams *sub_streams;
22+
cudaEvent_t incoming_event;
23+
cudaEvent_t *outgoing_events;
24+
uint32_t num_streams;
25+
26+
int_unchecked_all_eq_slices_buffer(CudaStreams streams,
27+
int_radix_params params,
28+
uint32_t num_inputs, uint32_t num_blocks,
29+
bool allocate_gpu_memory,
30+
uint64_t &size_tracker) {
31+
this->params = params;
32+
this->allocate_gpu_memory = allocate_gpu_memory;
33+
this->num_inputs = num_inputs;
34+
35+
uint32_t num_streams_to_use =
36+
std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_COMPARISON, num_inputs);
37+
if (num_streams_to_use == 0)
38+
num_streams_to_use = 1;
39+
40+
this->num_streams = num_streams_to_use;
41+
this->active_streams = streams.active_gpu_subset(num_blocks);
42+
43+
uint32_t num_gpus = active_streams.count();
44+
45+
this->sub_streams = nullptr;
46+
this->outgoing_events = nullptr;
47+
this->incoming_event = nullptr;
48+
49+
if (num_streams_to_use > 0) {
50+
this->sub_streams = new CudaStreams[num_streams_to_use];
51+
for (uint32_t i = 0; i < num_streams_to_use; ++i) {
52+
this->sub_streams[i].create_on_same_gpus(active_streams);
53+
}
54+
}
55+
56+
if (num_gpus > 0) {
57+
this->incoming_event = cuda_create_event(active_streams.gpu_index(0));
58+
}
59+
60+
uint32_t total_events = num_streams_to_use * num_gpus;
61+
if (total_events > 0) {
62+
this->outgoing_events = new cudaEvent_t[total_events];
63+
for (uint32_t s = 0; s < num_streams_to_use; ++s) {
64+
for (uint32_t g = 0; g < num_gpus; ++g) {
65+
this->outgoing_events[s * num_gpus + g] =
66+
cuda_create_event(active_streams.gpu_index(g));
67+
}
68+
}
69+
}
70+
71+
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
72+
for (uint32_t i = 0; i < num_streams; i++) {
73+
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
74+
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
75+
size_tracker);
76+
}
77+
78+
this->reduction_buffer =
79+
new int_comparison_buffer<Torus>(streams, EQ, params, num_inputs, false,
80+
allocate_gpu_memory, size_tracker);
81+
82+
this->packed_results = new CudaRadixCiphertextFFI;
83+
create_zero_radix_ciphertext_async<Torus>(
84+
streams.stream(0), streams.gpu_index(0), this->packed_results,
85+
num_inputs, params.big_lwe_dimension, size_tracker,
86+
allocate_gpu_memory);
87+
}
88+
89+
void release(CudaStreams streams) {
90+
for (uint32_t i = 0; i < num_streams; i++) {
91+
eq_buffers[i]->release(streams);
92+
delete eq_buffers[i];
93+
}
94+
delete[] eq_buffers;
95+
96+
this->reduction_buffer->release(streams);
97+
delete this->reduction_buffer;
98+
99+
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
100+
this->packed_results,
101+
this->allocate_gpu_memory);
102+
delete this->packed_results;
103+
104+
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
105+
106+
if (this->outgoing_events && this->sub_streams) {
107+
for (uint32_t s = 0; s < this->num_streams; ++s) {
108+
for (uint32_t g = 0; g < active_streams.count(); ++g) {
109+
cuda_event_destroy(
110+
this->outgoing_events[s * active_streams.count() + g],
111+
this->sub_streams[s].gpu_index(g));
112+
}
113+
}
114+
delete[] this->outgoing_events;
115+
this->outgoing_events = nullptr;
116+
}
117+
118+
if (this->incoming_event && this->sub_streams) {
119+
cuda_event_destroy(this->incoming_event,
120+
this->sub_streams[0].gpu_index(0));
121+
this->incoming_event = nullptr;
122+
}
123+
124+
if (this->sub_streams) {
125+
for (uint32_t i = 0; i < this->num_streams; ++i) {
126+
this->sub_streams[i].release();
127+
}
128+
delete[] this->sub_streams;
129+
this->sub_streams = nullptr;
130+
}
131+
132+
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
133+
}
134+
};
135+
136+
template <typename Torus> struct int_unchecked_contains_sub_slice_buffer {
137+
int_radix_params params;
138+
bool allocate_gpu_memory;
139+
uint32_t num_windows;
140+
141+
int_unchecked_all_eq_slices_buffer<Torus> *all_eq_buffer;
142+
CudaRadixCiphertextFFI *packed_results;
143+
int_comparison_buffer<Torus> *final_reduction_buffer;
144+
145+
int_unchecked_contains_sub_slice_buffer(CudaStreams streams,
146+
int_radix_params params,
147+
uint32_t num_lhs, uint32_t num_rhs,
148+
uint32_t num_blocks,
149+
bool allocate_gpu_memory,
150+
uint64_t &size_tracker) {
151+
this->params = params;
152+
this->allocate_gpu_memory = allocate_gpu_memory;
153+
this->num_windows = num_lhs - num_rhs + 1;
154+
155+
this->all_eq_buffer = new int_unchecked_all_eq_slices_buffer<Torus>(
156+
streams, params, num_rhs, num_blocks, allocate_gpu_memory,
157+
size_tracker);
158+
159+
this->packed_results = new CudaRadixCiphertextFFI;
160+
create_zero_radix_ciphertext_async<Torus>(
161+
streams.stream(0), streams.gpu_index(0), this->packed_results,
162+
this->num_windows, params.big_lwe_dimension, size_tracker,
163+
allocate_gpu_memory);
164+
165+
this->final_reduction_buffer = new int_comparison_buffer<Torus>(
166+
streams, EQ, params, this->num_windows, false, allocate_gpu_memory,
167+
size_tracker);
168+
}
169+
170+
void release(CudaStreams streams) {
171+
this->all_eq_buffer->release(streams);
172+
delete this->all_eq_buffer;
173+
174+
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
175+
this->packed_results,
176+
this->allocate_gpu_memory);
177+
delete this->packed_results;
178+
179+
this->final_reduction_buffer->release(streams);
180+
delete this->final_reduction_buffer;
181+
182+
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
183+
}
184+
};

0 commit comments

Comments
 (0)