zama-ai
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h b/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
Lines changed: 149 additions & 0 deletions
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Lines changed: 47 additions & 6 deletions
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
Lines changed: 90 additions & 0 deletions
@@ -1,6 +1,111 @@
 #pragma once
 #include "integer_utilities.h"
 
+// Scratch state for a bitwise AND / OR / XOR applied to boolean blocks.
+//
+// The constructor builds:
+//  - `lut`: the bivariate LUT for the requested `op`,
+//  - `message_extract_lut`: a LUT computing x % message_modulus, only when
+//    `is_unchecked` is false (it stays nullptr otherwise),
+//  - `tmp_lwe_left` / `tmp_lwe_right`: two radix ciphertexts of
+//    `lwe_ciphertext_count` blocks created through
+//    create_zero_radix_ciphertext_async.
+// No destructor is defined here; call release() before deleting the buffer.
+template <typename Torus> struct boolean_bitop_buffer {
+
+  int_radix_params params;
+  // Owning pointers start out null so that a buffer built with
+  // `is_unchecked == true` never carries an indeterminate
+  // `message_extract_lut`, and so that release() can tell what is still live.
+  int_radix_lut<Torus> *lut = nullptr;
+  int_radix_lut<Torus> *message_extract_lut = nullptr;
+
+  CudaRadixCiphertextFFI *tmp_lwe_left = nullptr;
+  CudaRadixCiphertextFFI *tmp_lwe_right = nullptr;
+
+  BITOP_TYPE op;
+  bool unchecked;
+  bool gpu_memory_allocated;
+
+  // streams: the LUTs are generated on stream 0 / GPU index 0 and then
+  //   broadcast to streams.active_gpu_subset(lwe_ciphertext_count).
+  // op: must be BITAND, BITOR or BITXOR; any other value reaches PANIC.
+  // is_unchecked: when true, no message-extraction LUT is created.
+  // allocate_gpu_memory: forwarded to every allocation helper below.
+  // size_tracker: handed by reference to every allocation helper
+  //   (presumably accumulates the allocated size - TODO confirm in helpers).
+  boolean_bitop_buffer(CudaStreams streams, BITOP_TYPE op, bool is_unchecked,
+                       int_radix_params params, uint32_t lwe_ciphertext_count,
+                       bool allocate_gpu_memory, uint64_t &size_tracker) {
+    gpu_memory_allocated = allocate_gpu_memory;
+    this->op = op;
+    this->params = params;
+    auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
+    this->unchecked = is_unchecked;
+    switch (op) {
+    case BITAND:
+    case BITOR:
+    case BITXOR:
+      lut = new int_radix_lut<Torus>(streams, params, 1, lwe_ciphertext_count,
+                                     allocate_gpu_memory, size_tracker);
+      {
+        auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus {
+          if (op == BITOP_TYPE::BITAND) {
+            // AND
+            return lhs & rhs;
+          } else if (op == BITOP_TYPE::BITOR) {
+            // OR
+            return lhs | rhs;
+          } else {
+            // XOR
+            return lhs ^ rhs;
+          }
+        };
+
+        // BooleanBlock can have degree 0 or 1. when ct is 0 path is hardcoded,
+        // only lut for degree = 1 is generated
+        // NOTE(review): the literal 2 is the "factor" argument of the
+        // *_with_factor generator; presumably the packing factor for
+        // one-bit operands - confirm against the generator.
+        generate_device_accumulator_bivariate_with_factor<Torus>(
+            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+            lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
+            params.polynomial_size, params.message_modulus,
+            params.carry_modulus, lut_bivariate_f, 2, gpu_memory_allocated);
+        lut->broadcast_lut(active_streams);
+      }
+      break;
+    default:
+      PANIC("Boolean bitwise operation type is not specified");
+    }
+
+    if (!unchecked) {
+      message_extract_lut =
+          new int_radix_lut<Torus>(streams, params, 1, lwe_ciphertext_count,
+                                   gpu_memory_allocated, size_tracker);
+      auto lut_f_message_extract = [params](Torus x) -> Torus {
+        return x % params.message_modulus;
+      };
+
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          message_extract_lut->get_lut(0, 0),
+          message_extract_lut->get_degree(0),
+          message_extract_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_message_extract, gpu_memory_allocated);
+      message_extract_lut->broadcast_lut(active_streams);
+    }
+    tmp_lwe_left = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), tmp_lwe_left,
+        lwe_ciphertext_count, params.big_lwe_dimension, size_tracker,
+        allocate_gpu_memory);
+    tmp_lwe_right = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), tmp_lwe_right,
+        lwe_ciphertext_count, params.big_lwe_dimension, size_tracker,
+        allocate_gpu_memory);
+  }
+
+  // Frees everything the constructor allocated, then calls
+  // cuda_synchronize_stream on stream 0 / GPU index 0.
+  // Every pointer is reset to nullptr once freed, so a second call skips the
+  // frees and only performs the final synchronization.
+  void release(CudaStreams streams) {
+    if (message_extract_lut != nullptr) {
+      message_extract_lut->release(streams);
+      delete message_extract_lut;
+      message_extract_lut = nullptr;
+    }
+
+    if (tmp_lwe_left != nullptr) {
+      release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                     tmp_lwe_left, gpu_memory_allocated);
+    }
+    if (tmp_lwe_right != nullptr) {
+      release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                     tmp_lwe_right, gpu_memory_allocated);
+    }
+    // delete on a null pointer is a no-op, so no guard is needed here.
+    delete tmp_lwe_left;
+    delete tmp_lwe_right;
+    tmp_lwe_left = nullptr;
+    tmp_lwe_right = nullptr;
+
+    if (lut != nullptr) {
+      lut->release(streams);
+      delete lut;
+      lut = nullptr;
+    }
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+  }
+};
+
 template <typename Torus> struct int_bitop_buffer {
 
   int_radix_params params;
@@ -81,6 +186,50 @@ template <typename Torus> struct int_bitop_buffer {
     cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
   }
 };
+
+// Scratch state for the bitwise NOT of boolean blocks.
+//
+// The only resource held is `message_extract_lut`, a LUT computing
+// x % message_modulus. It is created when `is_unchecked` is false and stays
+// nullptr otherwise. No destructor is defined here; call release() before
+// deleting the buffer.
+template <typename Torus> struct boolean_bitnot_buffer {
+  int_radix_params params;
+  // Null-initialized so the pointer is never indeterminate when the buffer is
+  // built with `is_unchecked == true`.
+  int_radix_lut<Torus> *message_extract_lut = nullptr;
+  bool gpu_memory_allocated;
+  bool unchecked;
+
+  // streams: the LUT is generated on stream 0 / GPU index 0 and then broadcast
+  //   to streams.active_gpu_subset(lwe_ciphertext_count).
+  // is_unchecked: when true, nothing is allocated.
+  // allocate_gpu_memory: forwarded to the LUT constructor and generator.
+  // size_tracker: handed by reference to the LUT constructor
+  //   (presumably accumulates the allocated size - TODO confirm).
+  boolean_bitnot_buffer(CudaStreams streams, int_radix_params params,
+                        uint32_t lwe_ciphertext_count, bool is_unchecked,
+                        bool allocate_gpu_memory, uint64_t &size_tracker) {
+    gpu_memory_allocated = allocate_gpu_memory;
+    unchecked = is_unchecked;
+    this->params = params;
+
+    auto message_modulus = params.message_modulus;
+
+    if (!unchecked) {
+      message_extract_lut =
+          new int_radix_lut<Torus>(streams, params, 1, lwe_ciphertext_count,
+                                   gpu_memory_allocated, size_tracker);
+      auto lut_f_message_extract = [message_modulus](Torus x) -> Torus {
+        return x % message_modulus;
+      };
+
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          message_extract_lut->get_lut(0, 0),
+          message_extract_lut->get_degree(0),
+          message_extract_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_message_extract, gpu_memory_allocated);
+      auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
+      message_extract_lut->broadcast_lut(active_streams);
+    }
+  }
+
+  // Frees the message-extraction LUT if one was created, then calls
+  // cuda_synchronize_stream on stream 0 / GPU index 0. The pointer is reset to
+  // nullptr, so a second call only performs the synchronization.
+  void release(CudaStreams streams) {
+    if (message_extract_lut != nullptr) {
+      message_extract_lut->release(streams);
+      delete message_extract_lut;
+      message_extract_lut = nullptr;
+    }
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+  }
+};
+
 void update_degrees_after_bitand(uint64_t *output_degrees,
                                  uint64_t *lwe_array_1_degrees,
                                  uint64_t *lwe_array_2_degrees,
 
@@ -280,6 +280,47 @@ void cuda_scalar_comparison_ciphertext_64(
 void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
                                      int8_t **mem_ptr_void);
 
+// Scratch step for the boolean bitwise AND/OR/XOR (64-bit torus).
+// The implementation packs the scalar parameters into an int_radix_params and
+// forwards to scratch_cuda_boolean_bitop<uint64_t>, returning its result
+// unchanged (presumably the tracked allocation size - TODO confirm).
+// `mem_ptr` is reinterpreted as boolean_bitop_buffer<uint64_t> **.
+// Note: lwe_ciphertext_count precedes message_modulus here, whereas
+// scratch_cuda_boolean_bitnot_64 takes it after pbs_type.
+uint64_t scratch_cuda_boolean_bitop_64(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
+    bool is_unchecked, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);
+
+// Runs the boolean bitwise operation on lwe_array_1 and lwe_array_2, writing
+// to lwe_array_out, by forwarding to host_boolean_bitop<uint64_t>.
+// `mem_ptr` is reinterpreted as boolean_bitop_buffer<uint64_t> * (the handle
+// set up by scratch_cuda_boolean_bitop_64) and `ksks` as uint64_t **.
+void cuda_boolean_bitop_ciphertext_64(CudaStreamsFFI streams,
+                                      CudaRadixCiphertextFFI *lwe_array_out,
+                                      CudaRadixCiphertextFFI const *lwe_array_1,
+                                      CudaRadixCiphertextFFI const *lwe_array_2,
+                                      int8_t *mem_ptr, void *const *bsks,
+                                      void *const *ksks);
+
+// Calls release() on the boolean_bitop_buffer<uint64_t> stored in
+// *mem_ptr_void, deletes it and sets *mem_ptr_void to nullptr.
+void cleanup_cuda_boolean_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void);
+
+// Scratch step for the boolean bitwise NOT (64-bit torus).
+// The implementation packs the scalar parameters into an int_radix_params and
+// forwards to scratch_cuda_boolean_bitnot<uint64_t>, returning its result
+// unchanged (presumably the tracked allocation size - TODO confirm).
+// `mem_ptr` is reinterpreted as boolean_bitnot_buffer<uint64_t> **.
+// Note: lwe_ciphertext_count comes after pbs_type here, whereas
+// scratch_cuda_boolean_bitop_64 takes it before message_modulus.
+uint64_t scratch_cuda_boolean_bitnot_64(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t lwe_ciphertext_count, bool is_unchecked, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);
+
+// Applies the boolean bitwise NOT to `lwe_array` by forwarding to
+// host_boolean_bitnot<uint64_t>. There is no separate output argument
+// (presumably the operation is in place - TODO confirm in host_boolean_bitnot).
+// `mem_ptr` is reinterpreted as boolean_bitnot_buffer<uint64_t> * and `ksks`
+// as uint64_t **.
+void cuda_boolean_bitnot_ciphertext_64(CudaStreamsFFI streams,
+                                       CudaRadixCiphertextFFI *lwe_array,
+                                       int8_t *mem_ptr, void *const *bsks,
+                                       void *const *ksks);
+
+// Calls release() on the boolean_bitnot_buffer<uint64_t> stored in
+// *mem_ptr_void, deletes it and sets *mem_ptr_void to nullptr.
+void cleanup_cuda_boolean_bitnot(CudaStreamsFFI streams, int8_t **mem_ptr_void);
+
+// Bitwise NOT on a radix ciphertext that needs no scratch buffer or keys:
+// forwards to host_bitnot<uint64_t> and then calls cuda_synchronize_stream on
+// stream 0 / GPU index 0 before returning.
+void cuda_bitnot_ciphertext_64(CudaStreamsFFI streams,
+                               CudaRadixCiphertextFFI *radix_ciphertext,
+                               uint32_t ct_message_modulus,
+                               uint32_t param_message_modulus,
+                               uint32_t param_carry_modulus);
+
 uint64_t scratch_cuda_bitop_64(
     CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t big_lwe_dimension,
@@ -289,19 +330,19 @@ uint64_t scratch_cuda_bitop_64(
     uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
     bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
 
+// Bitwise operation between a radix ciphertext and clear blocks.
+// NOTE(review): the definition is not visible from this header; presumably
+// `clear_blocks` and `h_clear_blocks` are the device and host copies of the
+// same `num_clear_blocks` scalar blocks - confirm against the implementation.
+void cuda_scalar_bitop_ciphertext_64(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
+    CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
+    void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks);
+
 void cuda_bitop_ciphertext_64(CudaStreamsFFI streams,
                               CudaRadixCiphertextFFI *lwe_array_out,
                               CudaRadixCiphertextFFI const *lwe_array_1,
                               CudaRadixCiphertextFFI const *lwe_array_2,
                               int8_t *mem_ptr, void *const *bsks,
                               void *const *ksks);
 
-void cuda_scalar_bitop_ciphertext_64(
-    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
-    void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
-
 void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void);
 
 uint64_t scratch_cuda_cmux_64(CudaStreamsFFI streams, int8_t **mem_ptr,
 
@@ -1,5 +1,84 @@
 #include "integer/bitwise_ops.cuh"
 
+// 64-bit entry point for the scratch step of boolean AND/OR/XOR.
+//
+// Bundles the individual parameters into an int_radix_params and hands off to
+// scratch_cuda_boolean_bitop<uint64_t>; that call's return value is returned
+// as is. `mem_ptr` is viewed as boolean_bitop_buffer<uint64_t> **.
+uint64_t scratch_cuda_boolean_bitop_64(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
+    bool is_unchecked, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {
+
+  // Typed view of the opaque handle slot supplied by the caller.
+  auto **buffer_slot = (boolean_bitop_buffer<uint64_t> **)mem_ptr;
+
+  int_radix_params radix_params(
+      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
+      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
+      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
+
+  return scratch_cuda_boolean_bitop<uint64_t>(
+      CudaStreams(streams), buffer_slot, lwe_ciphertext_count, radix_params,
+      op_type, is_unchecked, allocate_gpu_memory);
+}
+
+// 64-bit entry point that runs the boolean AND/OR/XOR.
+//
+// Recovers the typed scratch buffer from `mem_ptr`, views `ksks` as
+// uint64_t **, and delegates to host_boolean_bitop<uint64_t>.
+void cuda_boolean_bitop_ciphertext_64(CudaStreamsFFI streams,
+                                      CudaRadixCiphertextFFI *lwe_array_out,
+                                      CudaRadixCiphertextFFI const *lwe_array_1,
+                                      CudaRadixCiphertextFFI const *lwe_array_2,
+                                      int8_t *mem_ptr, void *const *bsks,
+                                      void *const *ksks) {
+
+  auto *bitop_buffer = (boolean_bitop_buffer<uint64_t> *)mem_ptr;
+  auto **keyswitch_keys = (uint64_t **)(ksks);
+
+  host_boolean_bitop<uint64_t>(CudaStreams(streams), lwe_array_out,
+                               lwe_array_1, lwe_array_2, bitop_buffer, bsks,
+                               keyswitch_keys);
+}
+
+// Releases and destroys the boolean_bitop_buffer<uint64_t> stored in
+// *mem_ptr_void, then resets *mem_ptr_void to nullptr.
+//
+// Returns immediately when mem_ptr_void or *mem_ptr_void is null. Because the
+// handle is nulled on success, cleaning up the same handle twice is harmless
+// instead of dereferencing a null pointer.
+void cleanup_cuda_boolean_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
+
+  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr) {
+    return;
+  }
+
+  boolean_bitop_buffer<uint64_t> *mem_ptr =
+      (boolean_bitop_buffer<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release(CudaStreams(streams));
+  delete mem_ptr;
+  *mem_ptr_void = nullptr;
+}
+
+// 64-bit entry point for the scratch step of boolean NOT.
+//
+// Bundles the individual parameters into an int_radix_params and hands off to
+// scratch_cuda_boolean_bitnot<uint64_t>; that call's return value is returned
+// as is. `mem_ptr` is viewed as boolean_bitnot_buffer<uint64_t> **.
+uint64_t scratch_cuda_boolean_bitnot_64(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t lwe_ciphertext_count, bool is_unchecked, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {
+
+  // Typed view of the opaque handle slot supplied by the caller.
+  auto **buffer_slot = (boolean_bitnot_buffer<uint64_t> **)mem_ptr;
+
+  int_radix_params radix_params(
+      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
+      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
+      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
+
+  return scratch_cuda_boolean_bitnot<uint64_t>(
+      CudaStreams(streams), buffer_slot, radix_params, lwe_ciphertext_count,
+      is_unchecked, allocate_gpu_memory);
+}
+
+// 64-bit entry point that runs the boolean NOT on `lwe_array`.
+//
+// Recovers the typed scratch buffer from `mem_ptr`, views `ksks` as
+// uint64_t **, and delegates to host_boolean_bitnot<uint64_t>.
+void cuda_boolean_bitnot_ciphertext_64(CudaStreamsFFI streams,
+                                       CudaRadixCiphertextFFI *lwe_array,
+                                       int8_t *mem_ptr, void *const *bsks,
+                                       void *const *ksks) {
+  auto *bitnot_buffer = (boolean_bitnot_buffer<uint64_t> *)mem_ptr;
+  auto **keyswitch_keys = (uint64_t **)(ksks);
+
+  host_boolean_bitnot<uint64_t>(CudaStreams(streams), lwe_array,
+                                bitnot_buffer, bsks, keyswitch_keys);
+}
+
+// Releases and destroys the boolean_bitnot_buffer<uint64_t> stored in
+// *mem_ptr_void, then resets *mem_ptr_void to nullptr.
+//
+// Returns immediately when mem_ptr_void or *mem_ptr_void is null. Because the
+// handle is nulled on success, cleaning up the same handle twice is harmless
+// instead of dereferencing a null pointer.
+void cleanup_cuda_boolean_bitnot(CudaStreamsFFI streams,
+                                 int8_t **mem_ptr_void) {
+
+  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr) {
+    return;
+  }
+
+  boolean_bitnot_buffer<uint64_t> *mem_ptr =
+      (boolean_bitnot_buffer<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release(CudaStreams(streams));
+  delete mem_ptr;
+  *mem_ptr_void = nullptr;
+}
+
 uint64_t scratch_cuda_bitop_64(
     CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t big_lwe_dimension,
@@ -19,6 +98,17 @@ uint64_t scratch_cuda_bitop_64(
       lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
 }
 
+// 64-bit entry point for the bitwise NOT of a radix ciphertext.
+//
+// Delegates to host_bitnot<uint64_t> and then calls cuda_synchronize_stream
+// on stream 0 / GPU index 0 before returning to the caller.
+void cuda_bitnot_ciphertext_64(CudaStreamsFFI streams,
+                               CudaRadixCiphertextFFI *radix_ciphertext,
+                               uint32_t ct_message_modulus,
+                               uint32_t param_message_modulus,
+                               uint32_t param_carry_modulus) {
+  CudaStreams stream_set(streams);
+
+  host_bitnot<uint64_t>(stream_set, radix_ciphertext, ct_message_modulus,
+                        param_message_modulus, param_carry_modulus);
+
+  auto first_stream = stream_set.stream(0);
+  auto first_gpu = stream_set.gpu_index(0);
+  cuda_synchronize_stream(first_stream, first_gpu);
+}
+
 void cuda_bitop_ciphertext_64(CudaStreamsFFI streams,
                               CudaRadixCiphertextFFI *lwe_array_out,
                               CudaRadixCiphertextFFI const *lwe_array_1,