zama-ai
diff --git a/‎backends/tfhe-cuda-backend/cuda/include/integer/cast.h‎
Lines changed: 52 additions & 0 deletions b/‎backends/tfhe-cuda-backend/cuda/include/integer/cast.h‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎backends/tfhe-cuda-backend/cuda/include/integer/integer.h‎
Lines changed: 44 additions & 0 deletions b/‎backends/tfhe-cuda-backend/cuda/include/integer/integer.h‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h‎
Lines changed: 96 additions & 0 deletions b/‎backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h‎
Lines changed: 96 additions & 0 deletions
diff --git a/‎backends/tfhe-cuda-backend/cuda/src/integer/cast.cu‎
Lines changed: 52 additions & 0 deletions b/‎backends/tfhe-cuda-backend/cuda/src/integer/cast.cu‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎backends/tfhe-cuda-backend/cuda/src/integer/cast.cuh‎
Lines changed: 69 additions & 0 deletions b/‎backends/tfhe-cuda-backend/cuda/src/integer/cast.cuh‎
Lines changed: 69 additions & 0 deletions
@@ -75,3 +75,55 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
     cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
   }
 };
+
+// Scratch state for casting a radix ciphertext to an unsigned radix
+// ciphertext.
+//
+// The struct owns up to two optional sub-buffers:
+//  - `prop_buffer`   : created only when `requires_full_propagate` is set.
+//  - `extend_buffer` : created only when the cast widens a signed input, i.e.
+//                      `input_is_signed && target_num_blocks > num_input_blocks`.
+// A sub-buffer that is not needed stays `nullptr`. Call `release()` before
+// deleting the struct; it does not free the sub-buffers on its own.
+template <typename Torus> struct int_cast_to_unsigned_buffer {
+  int_radix_params params;
+  bool allocate_gpu_memory;
+
+  // Whether a full propagation is run on the input before the cast.
+  bool requires_full_propagate;
+  // Whether the cast needs the sign-extension sub-buffer.
+  bool requires_sign_extension;
+
+  int_fullprop_buffer<Torus> *prop_buffer;
+  int_extend_radix_with_sign_msb_buffer<Torus> *extend_buffer;
+
+  // Builds the sub-buffers required by the requested cast. `size_tracker` is
+  // forwarded to every sub-buffer constructor so they can account for what
+  // they allocate.
+  int_cast_to_unsigned_buffer(CudaStreams streams, int_radix_params params,
+                              uint32_t num_input_blocks,
+                              uint32_t target_num_blocks, bool input_is_signed,
+                              bool requires_full_propagate,
+                              bool allocate_gpu_memory,
+                              uint64_t &size_tracker)
+      : params(params), allocate_gpu_memory(allocate_gpu_memory),
+        requires_full_propagate(requires_full_propagate),
+        requires_sign_extension(input_is_signed &&
+                                target_num_blocks > num_input_blocks),
+        prop_buffer(nullptr), extend_buffer(nullptr) {
+
+    // The propagation buffer is created first, then the extension buffer.
+    if (this->requires_full_propagate) {
+      this->prop_buffer = new int_fullprop_buffer<Torus>(
+          streams, params, allocate_gpu_memory, size_tracker);
+    }
+
+    if (!this->requires_sign_extension)
+      return;
+
+    const uint32_t extra_blocks = target_num_blocks - num_input_blocks;
+    this->extend_buffer = new int_extend_radix_with_sign_msb_buffer<Torus>(
+        streams, params, num_input_blocks, extra_blocks, allocate_gpu_memory,
+        size_tracker);
+  }
+
+  // Releases and deletes whichever sub-buffers were created, then waits on
+  // the first stream.
+  void release(CudaStreams streams) {
+    if (this->prop_buffer != nullptr) {
+      this->prop_buffer->release(streams);
+      delete this->prop_buffer;
+    }
+
+    if (this->extend_buffer != nullptr) {
+      this->extend_buffer->release(streams);
+      delete this->extend_buffer;
+    }
+
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+  }
+};
@@ -569,6 +569,10 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
                               CudaRadixCiphertextFFI const *input,
                               CudaStreamsFFI streams);
 
+// Copies `output->num_radix_blocks` blocks from `input` into `output`,
+// starting at block 0 of both, then synchronizes the first stream of
+// `streams`. Panics if `input` has fewer blocks than `output`.
+// NOTE(review): presumably block 0 is the least-significant block, so the
+// blocks dropped are the most-significant ones — confirm.
+void trim_radix_blocks_msb_64(CudaRadixCiphertextFFI *output,
+                              CudaRadixCiphertextFFI const *input,
+                              CudaStreamsFFI streams);
+
 uint64_t scratch_cuda_apply_noise_squashing(
     CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -850,6 +854,46 @@ void cuda_unchecked_match_value_64(
 
 void cleanup_cuda_unchecked_match_value_64(CudaStreamsFFI streams,
                                            int8_t **mem_ptr_void);
+
+// Allocates the scratch buffer used by cuda_cast_to_unsigned_64 and stores
+// it in `*mem_ptr` as an opaque pointer.
+//
+// The cryptographic parameters (dimensions, levels, base logs, moduli,
+// PBS type, noise reduction type) are packed into an `int_radix_params`.
+// `num_input_blocks`, `target_num_blocks`, `input_is_signed` and
+// `requires_full_propagate` decide which sub-buffers are created.
+//
+// Returns the value accumulated by the buffer's size tracker
+// (presumably a byte count of GPU memory — TODO confirm).
+uint64_t scratch_cuda_cast_to_unsigned_64(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_input_blocks, uint32_t target_num_blocks, bool input_is_signed,
+    bool requires_full_propagate, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);
+
+// Casts `input` to an unsigned radix ciphertext and writes it to `output`.
+//
+// `mem_ptr` must be the buffer produced by scratch_cuda_cast_to_unsigned_64.
+// `input` is not const: when the buffer was created with
+// `requires_full_propagate`, a full propagation is run on it in place
+// before the cast.
+// `target_num_blocks` and `input_is_signed` should match the values given
+// at scratch time, since the scratch call decides which sub-buffers exist.
+void cuda_cast_to_unsigned_64(CudaStreamsFFI streams,
+                              CudaRadixCiphertextFFI *output,
+                              CudaRadixCiphertextFFI *input, int8_t *mem_ptr,
+                              uint32_t target_num_blocks, bool input_is_signed,
+                              void *const *bsks, void *const *ksks);
+
+// Releases and deletes the buffer created by
+// scratch_cuda_cast_to_unsigned_64, then sets `*mem_ptr_void` to nullptr.
+void cleanup_cuda_cast_to_unsigned_64(CudaStreamsFFI streams,
+                                      int8_t **mem_ptr_void);
+
+// Scratch entry point for the "unchecked match value or" operation.
+// NOTE(review): the definition is not visible here. Presumably it builds an
+// int_unchecked_match_value_or_buffer<uint64_t>, stores it in `*mem_ptr`
+// and returns the tracked allocation size — confirm against the .cu file.
+// NOTE(review): `max_output_is_zero` is declared as uint32_t here, while
+// the buffer struct stores it as a bool — confirm this is intended.
+uint64_t scratch_cuda_unchecked_match_value_or_64(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_matches, uint32_t num_input_blocks,
+    uint32_t num_match_packed_blocks, uint32_t num_final_blocks,
+    uint32_t max_output_is_zero, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);
+
+// Compute entry point for the "unchecked match value or" operation.
+// NOTE(review): the definition is not visible here. Judging by the `h_`
+// prefix, `h_match_inputs`, `h_match_outputs` and `h_or_value` are
+// presumably host pointers, and `mem` is presumably the buffer returned by
+// scratch_cuda_unchecked_match_value_or_64 — confirm against the .cu file.
+void cuda_unchecked_match_value_or_64(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
+    CudaRadixCiphertextFFI const *lwe_array_in_ct,
+    const uint64_t *h_match_inputs, const uint64_t *h_match_outputs,
+    const uint64_t *h_or_value, int8_t *mem, void *const *bsks,
+    void *const *ksks);
+
+// Cleanup entry point for the "unchecked match value or" operation.
+// NOTE(review): the definition is not visible here. Presumably it releases
+// and deletes the buffer behind `*mem_ptr_void` and resets it to nullptr,
+// like the other cleanup_* entry points — confirm.
+void cleanup_cuda_unchecked_match_value_or_64(CudaStreamsFFI streams,
+                                              int8_t **mem_ptr_void);
 } // extern C
 
 #endif // CUDA_INTEGER_H
@@ -1,4 +1,5 @@
 #pragma once
+#include "cast.h"
 #include "integer/comparison.h"
 #include "integer/radix_ciphertext.cuh"
 #include "integer_utilities.h"
@@ -593,3 +594,98 @@ template <typename Torus> struct int_unchecked_match_buffer {
     delete this->packed_selectors_ct;
   }
 };
+
+// Scratch state for the "unchecked match value or" operation.
+//
+// Owns:
+//  - `match_buffer`     : an int_unchecked_match_buffer sub-buffer.
+//  - `cmux_buffer`      : an int_cmux_buffer over `num_final_blocks` blocks
+//                         whose predicate is `x == 1`.
+//  - `tmp_match_result` : `num_final_blocks` blocks, allocated on the GPU
+//                         only when `max_output_is_zero` is false. The host
+//                         struct itself is always allocated.
+//  - `tmp_match_bool`   : a single block.
+//  - `tmp_or_value`     : `num_final_blocks` blocks.
+//  - `d_or_value`       : `num_final_blocks` Torus values, allocated through
+//                         the size-tracking allocator.
+// Call `release()` before deleting the struct.
+template <typename Torus> struct int_unchecked_match_value_or_buffer {
+  int_radix_params params;
+  bool allocate_gpu_memory;
+
+  uint32_t num_matches;
+  uint32_t num_input_blocks;
+  uint32_t num_match_packed_blocks;
+  uint32_t num_final_blocks;
+  bool max_output_is_zero;
+
+  int_unchecked_match_buffer<Torus> *match_buffer;
+  int_cmux_buffer<Torus> *cmux_buffer;
+
+  CudaRadixCiphertextFFI *tmp_match_result;
+  CudaRadixCiphertextFFI *tmp_match_bool;
+  CudaRadixCiphertextFFI *tmp_or_value;
+
+  Torus *d_or_value;
+
+  // Creates the sub-buffers and temporaries. Everything is enqueued on the
+  // first stream / GPU of `streams`. `size_tracker` is forwarded to every
+  // allocation so it can be accounted for.
+  int_unchecked_match_value_or_buffer(
+      CudaStreams streams, int_radix_params params, uint32_t num_matches,
+      uint32_t num_input_blocks, uint32_t num_match_packed_blocks,
+      uint32_t num_final_blocks, bool max_output_is_zero,
+      bool allocate_gpu_memory, uint64_t &size_tracker) {
+    this->params = params;
+    this->allocate_gpu_memory = allocate_gpu_memory;
+    this->num_matches = num_matches;
+    this->num_input_blocks = num_input_blocks;
+    this->num_match_packed_blocks = num_match_packed_blocks;
+    this->num_final_blocks = num_final_blocks;
+    this->max_output_is_zero = max_output_is_zero;
+
+    this->match_buffer = new int_unchecked_match_buffer<Torus>(
+        streams, params, num_matches, num_input_blocks, num_match_packed_blocks,
+        max_output_is_zero, allocate_gpu_memory, size_tracker);
+
+    this->cmux_buffer = new int_cmux_buffer<Torus>(
+        streams, [](Torus x) -> Torus { return x == 1; }, params,
+        num_final_blocks, allocate_gpu_memory, size_tracker);
+
+    this->tmp_match_result = new CudaRadixCiphertextFFI;
+    this->tmp_match_bool = new CudaRadixCiphertextFFI;
+    this->tmp_or_value = new CudaRadixCiphertextFFI;
+
+    this->d_or_value = (Torus *)cuda_malloc_with_size_tracking_async(
+        num_final_blocks * sizeof(Torus), streams.stream(0),
+        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
+
+    // No GPU storage is created for the match result when
+    // `max_output_is_zero` is set; `release()` mirrors this condition.
+    if (!max_output_is_zero) {
+      create_zero_radix_ciphertext_async<Torus>(
+          streams.stream(0), streams.gpu_index(0), this->tmp_match_result,
+          num_final_blocks, params.big_lwe_dimension, size_tracker,
+          allocate_gpu_memory);
+    }
+
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->tmp_match_bool, 1,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->tmp_or_value,
+        num_final_blocks, params.big_lwe_dimension, size_tracker,
+        allocate_gpu_memory);
+  }
+
+  // Releases every resource created by the constructor, then waits on the
+  // first stream.
+  void release(CudaStreams streams) {
+    this->match_buffer->release(streams);
+    delete this->match_buffer;
+
+    this->cmux_buffer->release(streams);
+    delete this->cmux_buffer;
+
+    if (!max_output_is_zero) {
+      release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                     this->tmp_match_result,
+                                     this->allocate_gpu_memory);
+    }
+    delete this->tmp_match_result;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->tmp_match_bool,
+                                   this->allocate_gpu_memory);
+    delete this->tmp_match_bool;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->tmp_or_value,
+                                   this->allocate_gpu_memory);
+    delete this->tmp_or_value;
+
+    // `d_or_value` came from the size-tracking allocator, so it is freed
+    // with the matching size-tracking drop, gated on `allocate_gpu_memory`
+    // like the radix ciphertexts above.
+    cuda_drop_with_size_tracking_async(this->d_or_value, streams.stream(0),
+                                       streams.gpu_index(0),
+                                       this->allocate_gpu_memory);
+
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+  }
+};
@@ -18,6 +18,15 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
   cuda_synchronize_stream(cuda_streams.stream(0), cuda_streams.gpu_index(0));
 }
 
+// C entry point (64-bit torus) for host_trim_radix_blocks_msb.
+// Wraps the FFI stream handle, enqueues the trim, then blocks until the
+// first stream has finished.
+void trim_radix_blocks_msb_64(CudaRadixCiphertextFFI *output,
+                              CudaRadixCiphertextFFI const *input,
+                              CudaStreamsFFI streams) {
+  CudaStreams cuda_streams(streams);
+
+  host_trim_radix_blocks_msb<uint64_t>(output, input, cuda_streams);
+
+  // The host function only enqueues work; wait for it here.
+  cuda_synchronize_stream(cuda_streams.stream(0), cuda_streams.gpu_index(0));
+}
+
 uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
     CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
@@ -64,3 +73,46 @@ void cleanup_cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
   delete mem_ptr;
   *mem_ptr_void = nullptr;
 }
+
+// C entry point (64-bit torus) that allocates the cast-to-unsigned scratch
+// buffer. Packs the scalar parameters into an int_radix_params, forwards to
+// scratch_cuda_cast_to_unsigned<uint64_t> and returns its result.
+// The new buffer is written to `*mem_ptr`.
+uint64_t scratch_cuda_cast_to_unsigned_64(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_input_blocks, uint32_t target_num_blocks, bool input_is_signed,
+    bool requires_full_propagate, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {
+
+  // View the opaque FFI slot as a slot for the typed buffer pointer.
+  auto **buffer_slot =
+      reinterpret_cast<int_cast_to_unsigned_buffer<uint64_t> **>(mem_ptr);
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          big_lwe_dimension, small_lwe_dimension, ks_level,
+                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
+                          message_modulus, carry_modulus, noise_reduction_type);
+
+  return scratch_cuda_cast_to_unsigned<uint64_t>(
+      CudaStreams(streams), buffer_slot, params, num_input_blocks,
+      target_num_blocks, input_is_signed, requires_full_propagate,
+      allocate_gpu_memory);
+}
+
+// C entry point (64-bit torus) for host_cast_to_unsigned.
+// Reinterprets the opaque scratch pointer and the keyswitch-key array as
+// their typed forms, then forwards every argument unchanged.
+void cuda_cast_to_unsigned_64(CudaStreamsFFI streams,
+                              CudaRadixCiphertextFFI *output,
+                              CudaRadixCiphertextFFI *input, int8_t *mem_ptr,
+                              uint32_t target_num_blocks, bool input_is_signed,
+                              void *const *bsks, void *const *ksks) {
+
+  auto *cast_buffer = (int_cast_to_unsigned_buffer<uint64_t> *)mem_ptr;
+  auto typed_ksks = (uint64_t **)ksks;
+
+  host_cast_to_unsigned<uint64_t>(CudaStreams(streams), output, input,
+                                  cast_buffer, target_num_blocks,
+                                  input_is_signed, bsks, typed_ksks);
+}
+
+// C entry point that tears down the cast-to-unsigned scratch buffer:
+// releases its resources, deletes it, and resets the caller's slot to
+// nullptr.
+void cleanup_cuda_cast_to_unsigned_64(CudaStreamsFFI streams,
+                                      int8_t **mem_ptr_void) {
+  auto *cast_buffer =
+      reinterpret_cast<int_cast_to_unsigned_buffer<uint64_t> *>(*mem_ptr_void);
+
+  cast_buffer->release(CudaStreams(streams));
+  delete cast_buffer;
+
+  *mem_ptr_void = nullptr;
+}
@@ -36,6 +36,23 @@ __host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
       input->num_radix_blocks);
 }
 
+// Fills `output_radix` with its own block count's worth of blocks taken
+// from `input_radix`, using offset 0 and `output_radix->num_radix_blocks`
+// as the slice arguments on both sides.
+// Panics when `input_radix` holds fewer blocks than `output_radix`.
+// The copy is only enqueued on the first stream; the caller synchronizes.
+template <typename Torus>
+__host__ void
+host_trim_radix_blocks_msb(CudaRadixCiphertextFFI *output_radix,
+                           const CudaRadixCiphertextFFI *input_radix,
+                           CudaStreams streams) {
+
+  PANIC_IF_FALSE(input_radix->num_radix_blocks >=
+                     output_radix->num_radix_blocks,
+                 "Cuda error: input radix ciphertext has fewer blocks than "
+                 "required to keep");
+
+  // Number of blocks that survive the trim.
+  const auto kept_blocks = output_radix->num_radix_blocks;
+
+  copy_radix_ciphertext_slice_async<Torus>(
+      streams.stream(0), streams.gpu_index(0), output_radix, 0, kept_blocks,
+      input_radix, 0, kept_blocks);
+}
+
 template <typename Torus>
 __host__ uint64_t scratch_extend_radix_with_sign_msb(
     CudaStreams streams, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
@@ -91,4 +108,56 @@ __host__ void host_extend_radix_with_sign_msb(
   POP_RANGE()
 }
 
+// Allocates an int_cast_to_unsigned_buffer<Torus>, publishes it through
+// `*mem_ptr`, and returns the size accumulated by the buffer's constructor
+// in its size tracker.
+template <typename Torus>
+uint64_t scratch_cuda_cast_to_unsigned(
+    CudaStreams streams, int_cast_to_unsigned_buffer<Torus> **mem_ptr,
+    int_radix_params params, uint32_t num_input_blocks,
+    uint32_t target_num_blocks, bool input_is_signed,
+    bool requires_full_propagate, bool allocate_gpu_memory) {
+
+  uint64_t tracked_size = 0;
+
+  auto *cast_buffer = new int_cast_to_unsigned_buffer<Torus>(
+      streams, params, num_input_blocks, target_num_blocks, input_is_signed,
+      requires_full_propagate, allocate_gpu_memory, tracked_size);
+  *mem_ptr = cast_buffer;
+
+  return tracked_size;
+}
+
+// Casts `input` to an unsigned radix ciphertext and writes it to `output`.
+//
+// Steps:
+//  1. If the scratch buffer was built with `requires_full_propagate`, a
+//     full propagation is run on `input` IN PLACE.
+//  2. `target_num_blocks` is compared with `input->num_radix_blocks`:
+//       - larger  : extend with the sign (signed input) or with trivial
+//                   zero blocks (unsigned input);
+//       - smaller : trim via host_trim_radix_blocks_msb;
+//       - equal   : plain block copy.
+//
+// `mem_ptr->extend_buffer` only exists when the scratch call was made for a
+// widening cast of a signed input. If the call-time arguments ask for sign
+// extension but the buffer was not allocated for it, this function panics
+// rather than handing a null buffer to the extension routine.
+template <typename Torus>
+__host__ void
+host_cast_to_unsigned(CudaStreams streams, CudaRadixCiphertextFFI *output,
+                      CudaRadixCiphertextFFI *input,
+                      int_cast_to_unsigned_buffer<Torus> *mem_ptr,
+                      uint32_t target_num_blocks, bool input_is_signed,
+                      void *const *bsks, Torus *const *ksks) {
+
+  uint32_t current_num_blocks = input->num_radix_blocks;
+
+  if (mem_ptr->requires_full_propagate) {
+    host_full_propagate_inplace<Torus>(streams, input, mem_ptr->prop_buffer,
+                                       ksks, bsks, current_num_blocks);
+  }
+
+  if (target_num_blocks > current_num_blocks) {
+    if (input_is_signed) {
+      // The branch taken depends on call-time values, whereas the buffer
+      // was sized from scratch-time values; make a mismatch explicit.
+      PANIC_IF_FALSE(mem_ptr->extend_buffer != nullptr,
+                     "Cuda error: sign extension requested but the cast "
+                     "buffer was not allocated for it");
+
+      uint32_t num_blocks_to_add = target_num_blocks - current_num_blocks;
+      host_extend_radix_with_sign_msb<Torus>(
+          streams, output, input, mem_ptr->extend_buffer, num_blocks_to_add,
+          bsks, (Torus **)ksks);
+    } else {
+      host_extend_radix_with_trivial_zero_blocks_msb<Torus>(output, input,
+                                                            streams);
+    }
+
+  } else if (target_num_blocks < current_num_blocks) {
+    host_trim_radix_blocks_msb<Torus>(output, input, streams);
+
+  } else {
+    copy_radix_ciphertext_slice_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), output, 0, current_num_blocks,
+        input, 0, current_num_blocks);
+  }
+}
+
 #endif