Skip to content

Commit 28ef35b

Browse files
committed
overlap copies
1 parent b3ff768 commit 28ef35b

File tree

4 files changed

+35
-32
lines changed

backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -53,8 +53,8 @@ __host__ void host_integer_radix_bitop_kb(
5353
}
5454

5555
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
56-
streams, lwe_array_out, lwe_array_1, lwe_array_2,
57-
bsks, ksks, ms_noise_reduction_key, lut,
56+
streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks,
57+
ms_noise_reduction_key, lut,
5858
lwe_array_out->num_radix_blocks * lwe_array_out->num_radix_ciphertexts,
5959
lut->params.message_modulus);
6060

Lines changed: 18 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,11 @@
1+
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
12
use benchmark::utilities::{write_to_json, OperatorType};
2-
use criterion::{Criterion};
3+
use criterion::Criterion;
34
use rand::prelude::*;
4-
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
5+
use tfhe::array::GpuFheUint64Array;
56
use tfhe::keycache::NamedParam;
67
use tfhe::prelude::*;
7-
use tfhe::{
8-
ClientKey, CompressedServerKey,
9-
};
10-
use tfhe::array::GpuFheUint64Array;
8+
use tfhe::{ClientKey, CompressedServerKey};
119

1210
#[cfg(feature = "gpu")]
1311
fn main() {
@@ -16,7 +14,7 @@ fn main() {
1614
let config = ConfigBuilder::with_custom_parameters(
1715
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
1816
)
19-
.build();
17+
.build();
2018
let cks = ClientKey::generate(config);
2119
let compressed_sks = CompressedServerKey::new(&cks);
2220

@@ -27,16 +25,23 @@ fn main() {
2725
let array_dim = 32;
2826
let num_elems = array_dim * array_dim;
2927
let mut rng = thread_rng();
30-
let clear_xs = (0..num_elems as u64).map(| _ | rng.gen::<u64>()).collect::<Vec<_>>();
31-
let clear_ys = (0..num_elems as u64).map(| _ | rng.gen::<u64>()).collect::<Vec<_>>();
32-
33-
let xs = GpuFheUint64Array::try_encrypt((clear_xs.as_slice(), vec![array_dim, array_dim]), &cks).unwrap();
34-
let ys = GpuFheUint64Array::try_encrypt((clear_ys.as_slice(), vec![array_dim, array_dim]), &cks).unwrap();
28+
let clear_xs = (0..num_elems as u64)
29+
.map(|_| rng.gen::<u64>())
30+
.collect::<Vec<_>>();
31+
let clear_ys = (0..num_elems as u64)
32+
.map(|_| rng.gen::<u64>())
33+
.collect::<Vec<_>>();
34+
35+
let xs =
36+
GpuFheUint64Array::try_encrypt((clear_xs.as_slice(), vec![array_dim, array_dim]), &cks)
37+
.unwrap();
38+
let ys =
39+
GpuFheUint64Array::try_encrypt((clear_ys.as_slice(), vec![array_dim, array_dim]), &cks)
40+
.unwrap();
3541

3642
let mut c = Criterion::default().configure_from_args();
3743
let bench_id = format!("bench::hlapi::array::cuda::bitand::");
3844
c.bench_function(&bench_id, |b| {
39-
4045
b.iter(|| {
4146
let _ = &xs & &ys;
4247
})
@@ -54,6 +59,5 @@ fn main() {
5459
vec![],
5560
);
5661

57-
5862
c.final_summary();
5963
}

tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs

Lines changed: 14 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -104,12 +104,19 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
104104
.map(|list| list.0.lwe_ciphertext_count.0)
105105
.sum(),
106106
);
107-
108107
assert_ne!(
109108
lwe_ciphertext_count.0, 0,
110109
"Empty iterator of CudaLweCiphertextList"
111110
);
112111

112+
let stream_count = lwe_ciphertext_count.0.max(6);
113+
let mut new_streams: Vec<CudaStreams> = Vec::with_capacity(stream_count);
114+
115+
for _ in 0..stream_count {
116+
let stream = CudaStreams::new_single_gpu(streams.gpu_indexes[0]);
117+
new_streams.push(stream);
118+
}
119+
113120
let first_item = cuda_ciphertexts_list_vec.next().unwrap();
114121
let lwe_dimension = first_item.lwe_dimension();
115122
let mut d_vec = CudaVec::new(
@@ -123,25 +130,20 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
123130
* std::mem::size_of::<T>();
124131
// Concatenate gpu_index memory
125132
unsafe {
126-
cuda_memcpy_async_gpu_to_gpu(
127-
ptr,
128-
first_item.0.d_vec.as_c_ptr(0),
129-
size as u64,
130-
streams.ptr[0],
131-
streams.gpu_indexes[0].get(),
132-
);
133-
ptr = ptr.wrapping_byte_add(size);
134-
for list in cuda_ciphertexts_list_vec {
133+
for (i, list) in cuda_ciphertexts_list_vec.enumerate() {
135134
cuda_memcpy_async_gpu_to_gpu(
136135
ptr,
137136
list.0.d_vec.as_c_ptr(0),
138137
size as u64,
139-
streams.ptr[0],
140-
streams.gpu_indexes[0].get(),
138+
new_streams[i % stream_count].ptr[0],
139+
new_streams[i % stream_count].gpu_indexes[0].get(),
141140
);
142141
ptr = ptr.wrapping_byte_add(size);
143142
}
144143
}
144+
for s in new_streams.iter() {
145+
s.synchronize();
146+
}
145147

146148
let cuda_lwe_list = CudaLweList {
147149
d_vec,

tfhe/src/integer/gpu/mod.rs

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -7761,10 +7761,7 @@ pub unsafe fn unchecked_bitop_vec_radix_kb_assign<T: UnsignedInteger, B: Numeric
77617761
keyswitch_key.ptr.as_ptr(),
77627762
&raw const ms_noise_reduction_key_ffi,
77637763
);
7764-
cleanup_cuda_integer_bitop(
7765-
streams.ffi(),
7766-
std::ptr::addr_of_mut!(mem_ptr),
7767-
);
7764+
cleanup_cuda_integer_bitop(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));
77687765
update_noise_degree(radix_lwe_left, &cuda_ffi_radix_lwe_left);
77697766
streams.synchronize();
77707767
}

0 commit comments

Comments
 (0)