Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ __pycache__/
venv
.venv
/build/
/.cache/
/.vscode/
/.claude/
/cmake-build-debug/
/cmake-build-release/
/cmake-build*/
Expand Down Expand Up @@ -76,6 +79,7 @@ pdxearch.egg-info
/benchmarks/datasets/queries
/benchmarks/datasets/selection_vectors
/benchmarks/datasets/ground_truth_filtered
/benchmarks/results/DEFAULT/*.csv

/benchmarks/gt_filtered

Expand All @@ -101,6 +105,8 @@ cmake_install.cmake
/benchmarks/BenchmarkPDXIVF
/benchmarks/BenchmarkFiltered
/benchmarks/BenchmarkSpecialFilters
/benchmarks/BenchmarkInsertion
/benchmarks/BenchmarkWorkload

# Test binaries (but keep the committed test data)
*.bin
Expand Down
2 changes: 0 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,9 @@ We are actively developing PDX and accepting contributions! Any kind of PR is we
These are our current priorities:

**Features**:
- Inserts and Updates (wip).
- Out-of-core execution (disk-based setting).
- Implement multi-threading capabilities.
- Add PDX to the [VIBE benchmark](https://vector-index-bench.github.io/).
- Create documentation.

**Improvements**:
- Regression tests on CI.
Expand Down
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
- ⚡ [**Sub-millisecond similarity search**](https://www.lkuffo.com/sub-milisecond-similarity-search-with-pdx/), up to [**10x faster**](./BENCHMARKING.md#two-level-ivf-ivf2-) than FAISS IVF.
- ⚡ Up to [**30x faster**](./BENCHMARKING.md#exhaustive-search--ivf) exhaustive search.
- 🔍 Efficient [**filtered search**](https://github.com/cwida/PDX/issues/7).
- ⚙️ Fast and reliable [**index maintenance**](https://github.com/cwida/PDX/pull/13).
- Query latency competitive with HNSW, with the ease of use of IVF.


## Our secret sauce

[PDX](https://ir.cwi.nl/pub/35044/35044.pdf) is a data layout that **transposes** vectors in a column-major order. This layout unleashes the true potential of dimension pruning.
Expand All @@ -42,14 +44,20 @@ query = ... # Numpy 1D array
d = 1024
knn = 20

# Build
index = IndexPDXIVFTreeSQ8(num_dimensions=d)
index.build(data)

# Search
ids, dists = index.search(query, knn)

# Maintenance
index.append(row_id_to_insert, new_embedding)
index.delete(row_id_to_delete)

```

`IndexPDXIVFTreeSQ8` is our fastest index that will give you the best performance. It is a two-level IVF index with 8-bit quantization.
`IndexPDXIVFTreeSQ8` is our fastest index that will give you the best performance alongside lightweight maintenance. It is a two-level IVF index with 8-bit quantization.

Check our [examples](./examples/) for fully working examples in Python and our [benchmarks](./benchmarks) for fully working examples in C++. We support Flat (`float32`) and Quantized (`8-bit`) indexes, as well as the most common distance metrics.

Expand Down
6 changes: 6 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@ add_executable(BenchmarkEndToEnd pdx_end_to_end.cpp)
add_executable(BenchmarkSerialization pdx_serialization.cpp)
add_executable(BenchmarkFiltered pdx_filtered.cpp)
add_executable(BenchmarkSpecialFilters pdx_special_filtered.cpp)
add_executable(BenchmarkInsertion pdx_insertion.cpp)
add_executable(BenchmarkWorkload pdx_workload.cpp)

target_link_libraries(BenchmarkPDXIVF ${BENCH_COMMON_LIBS})
target_link_libraries(BenchmarkEndToEnd ${BENCH_COMMON_LIBS})
target_link_libraries(BenchmarkSerialization ${BENCH_COMMON_LIBS})
target_link_libraries(BenchmarkFiltered ${BENCH_COMMON_LIBS})
target_link_libraries(BenchmarkSpecialFilters ${BENCH_COMMON_LIBS})
target_link_libraries(BenchmarkInsertion ${BENCH_COMMON_LIBS})
target_link_libraries(BenchmarkWorkload ${BENCH_COMMON_LIBS})

add_custom_target(benchmarks
DEPENDS
Expand All @@ -26,4 +30,6 @@ add_custom_target(benchmarks
BenchmarkSerialization
BenchmarkFiltered
BenchmarkSpecialFilters
BenchmarkInsertion
BenchmarkWorkload
)
10 changes: 9 additions & 1 deletion benchmarks/benchmark_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ class TicToc {
};

// Raw binary data paths (SuperKMeans convention: data_<name>.bin / data_<name>_test.bin)
inline std::string RAW_DATA_DIR = std::string{CMAKE_SOURCE_DIR} + "/../SuperKMeans/benchmarks/data";
inline std::string RAW_DATA_DIR =
std::string{CMAKE_SOURCE_DIR} + "/../../SuperKMeans/benchmarks/data";
// NOTE(review): RAW_DATA_DIR above now climbs two levels ("/../../") while
// this path still climbs only one ("/../") — confirm the asymmetry is
// intentional and not a missed update in the same change.
inline std::string GROUND_TRUTH_JSON_DIR =
std::string{CMAKE_SOURCE_DIR} + "/../SuperKMeans/benchmarks/ground_truth";

Expand Down Expand Up @@ -88,6 +89,13 @@ struct PhasesRuntime {
size_t end_to_end{0};
};

// Kind of operation a benchmark workload step performs on the index.
// NOTE(review): the enumerator DELETE collides with the DELETE macro from
// <winnt.h> if this header is ever compiled on Windows — confirm portability.
enum class StepType { BUILD, INSERT, DELETE };

// One step of a scripted benchmark workload: apply `type` to a slice of the
// dataset whose size is `proportion` of the total embedding count N.
struct WorkloadStep {
StepType type;
float proportion; // fraction of total dataset size N
};

class BenchmarkUtils {
public:
inline static std::string PDX_DATA =
Expand Down
3 changes: 2 additions & 1 deletion benchmarks/pdx_end_to_end.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
#include <iomanip>
#include <iostream>
#include <memory>
#include <numeric>
#include <vector>

#include "benchmark_utils.hpp"
#include "pdx/index.hpp"
#include "pdx/profiler.hpp"
#include "pdx/utils.hpp"

template <typename IndexT>
Expand Down Expand Up @@ -106,6 +106,7 @@ void RunBenchmark(
runtimes[j + l * NUM_MEASURE_RUNS] = {clock.accum_time};
}
}
PDX::Profiler::Get().PrintHierarchical();

BenchmarkMetadata results_metadata = {
dataset,
Expand Down
225 changes: 225 additions & 0 deletions benchmarks/pdx_insertion.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
// BENCHMARK_TIME enables timing instrumentation. It must be defined as the
// plain value `true`: the previous `= true` made the macro expand to the
// tokens `= true`, which is invalid wherever the macro is used in an
// expression (e.g. `#if BENCHMARK_TIME` or `if (BENCHMARK_TIME)`).
#ifndef BENCHMARK_TIME
#define BENCHMARK_TIME true
#endif

#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <vector>

#include "benchmark_utils.hpp"
#include "pdx/index.hpp"
#include "pdx/profiler.hpp"
#include "pdx/utils.hpp"

/// Builds an IndexT on the first `proportion_to_build` fraction of the
/// dataset, appends the remaining embeddings one by one via Append(), then
/// measures recall and per-query search latency for each nprobe value and
/// appends the results to the INSERTION_PDX.csv results file.
///
/// @param info                Raw dataset metadata (dimensions, sizes, metric).
/// @param dataset             Dataset name recorded in the results metadata.
/// @param algorithm           Algorithm label recorded in the results metadata.
/// @param data                Row-major float buffer of n * d values.
/// @param queries             Row-major float buffer of n_queries * d values.
/// @param nprobes_to_use      IVF nprobe values to benchmark.
/// @param proportion_to_build Fraction of n indexed up front; the remainder
///                            is inserted one row at a time.
template <typename IndexT>
void RunBenchmark(
    const RawDatasetInfo& info,
    const std::string& dataset,
    const std::string& algorithm,
    const float* data,
    const float* queries,
    const std::vector<size_t>& nprobes_to_use,
    const float proportion_to_build
) {
    const size_t d = info.num_dimensions;
    const size_t n = info.num_embeddings;
    const size_t n_queries = info.num_queries;
    uint8_t KNN = BenchmarkUtils::KNN;
    size_t NUM_MEASURE_RUNS = BenchmarkUtils::NUM_MEASURE_RUNS;
    std::string RESULTS_PATH = BENCHMARK_UTILS.RESULTS_DIR_PATH + "INSERTION_PDX.csv";

    const size_t n_build = static_cast<size_t>(n * proportion_to_build);
    const size_t n_insert = n - n_build;

    PDX::PDXIndexConfig index_config{
        .num_dimensions = static_cast<uint32_t>(d),
        .distance_metric = info.distance_metric,
        .seed = 42,
        .normalize = true,
        .sampling_fraction = 1.0f
    };

    // Build phase: index the first n_build embeddings. The split is driven by
    // proportion_to_build (default 0.75), not a fixed 75%.
    TicToc clock;
    std::cout << "Building index with " << n_build << " / " << n << " embeddings...\n";
    clock.Reset();
    clock.Tic();
    IndexT pdx_index(index_config);
    pdx_index.BuildIndex(data, n_build);
    clock.Toc();
    std::cout << "Build time: " << clock.GetMilliseconds() << " ms\n";
    std::cout << "Clusters: " << pdx_index.GetNumClusters() << "\n";
    std::cout << "Index in-memory size: " << std::fixed << std::setprecision(2)
              << static_cast<double>(pdx_index.GetInMemorySizeInBytes()) / (1024.0 * 1024.0)
              << " MB\n";

    // Insert phase: append the remaining embeddings one at a time.
    std::cout << "Inserting " << n_insert << " embeddings...\n";
    clock.Reset();
    clock.Tic();
    for (size_t i = 0; i < n_insert; ++i) {
        const size_t row_id = n_build + i;
        // Report progress only occasionally: flushing stdout on every row
        // inside the timed loop would dominate the measured insertion time.
        if (i % 1024 == 0 || i + 1 == n_insert) {
            std::cout << "Inserting embedding " << row_id << " / " << n - 1 << "\r" << std::flush;
        }
        pdx_index.Append(row_id, data + row_id * d);
    }
    clock.Toc();
    std::cout << "Insertion time: " << clock.GetMilliseconds() << " ms\n";
    // Guard the average: float rounding of n * proportion_to_build can in
    // principle leave n_insert == 0, which would divide by zero here.
    if (n_insert > 0) {
        std::cout << "Avg insertion time: " << clock.GetMilliseconds() / n_insert
                  << " ms/embedding\n";
    }
    std::cout << "Clusters after insertion: " << pdx_index.GetNumClusters() << "\n";
    std::cout << "Index in-memory size after insertion: " << std::fixed << std::setprecision(2)
              << static_cast<double>(pdx_index.GetInMemorySizeInBytes()) / (1024.0 * 1024.0)
              << " MB\n";

    PDX::Profiler::Get().PrintHierarchical();

    // Ground truth for the full dataset (the "_100_norm" file convention).
    std::string gt_path = BenchmarkUtils::GROUND_TRUTH_DATA + info.pdx_dataset_name + "_100_norm";
    auto gt_buffer = MmapFile(gt_path);
    uint32_t* int_ground_truth = reinterpret_cast<uint32_t*>(gt_buffer.get());
    std::cout << "Ground truth loaded: " << gt_path << "\n";

    for (size_t ivf_nprobe : nprobes_to_use) {
        // Probing more clusters than exist is meaningless; skip such configs.
        if (pdx_index.GetNumClusters() < ivf_nprobe)
            continue;

        pdx_index.SetNProbe(ivf_nprobe);

        // Recall pass (untimed): accumulate recall over all queries.
        float recalls = 0;
        for (size_t l = 0; l < n_queries; ++l) {
            auto result = pdx_index.Search(queries + l * d, KNN);
            BenchmarkUtils::VerifyResult<true>(recalls, result, KNN, int_ground_truth, l);
        }

        // Timing pass: NUM_MEASURE_RUNS repetitions per query.
        std::vector<PhasesRuntime> runtimes;
        runtimes.resize(NUM_MEASURE_RUNS * n_queries);
        TicToc search_clock;
        for (size_t j = 0; j < NUM_MEASURE_RUNS; ++j) {
            for (size_t l = 0; l < n_queries; ++l) {
                search_clock.Reset();
                search_clock.Tic();
                pdx_index.Search(queries + l * d, KNN);
                search_clock.Toc();
                runtimes[j + l * NUM_MEASURE_RUNS] = {search_clock.accum_time};
            }
        }

        BenchmarkMetadata results_metadata = {
            dataset,
            algorithm,
            NUM_MEASURE_RUNS,
            n_queries,
            ivf_nprobe,
            KNN,
            recalls,
        };
        BenchmarkUtils::SaveResults(runtimes, RESULTS_PATH, results_metadata);
    }
}

// Reads exactly `count` floats from a binary file into `out`. Returns false
// (after printing the reason) when the file cannot be opened or is shorter
// than expected — the original code ignored read() failures and would have
// benchmarked a partially-zeroed buffer.
static bool ReadFloatFile(const std::string& path, size_t count, std::vector<float>& out) {
    out.resize(count);
    std::ifstream file(path, std::ios::binary);
    if (!file) {
        std::cerr << "Failed to open " << path << "\n";
        return false;
    }
    file.read(reinterpret_cast<char*>(out.data()), count * sizeof(float));
    if (!file) {
        std::cerr << "Failed to read " << count * sizeof(float) << " bytes from " << path << "\n";
        return false;
    }
    return true;
}

/// Entry point. Usage: <dataset> [index_type] [nprobe] [build_fraction]
/// Loads the raw dataset and query files, validates the arguments, and runs
/// the insertion benchmark for the selected PDX tree index type.
int main(int argc, char* argv[]) {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <dataset> [index_type] [nprobe] [build_fraction]\n";
        std::cerr << "Index types: pdx_tree_f32 (default), pdx_tree_u8\n";
        std::cerr << "Available datasets:";
        for (const auto& [name, _] : RAW_DATASET_PARAMS) {
            std::cerr << " " << name;
        }
        std::cerr << "\n";
        return 1;
    }
    std::string dataset = argv[1];
    std::string index_type = (argc > 2) ? argv[2] : "pdx_tree_f32";
    // std::atoi yields 0 on garbage; a negative value would wrap to a huge
    // size_t, so clamp it to 0 ("use the default nprobe sweep").
    const int raw_nprobe = (argc > 3) ? std::atoi(argv[3]) : 0;
    size_t arg_ivf_nprobe = raw_nprobe > 0 ? static_cast<size_t>(raw_nprobe) : 0;
    float proportion_to_build = (argc > 4) ? std::atof(argv[4]) : 0.75f;

    if (proportion_to_build <= 0.0f || proportion_to_build >= 1.0f) {
        std::cerr << "Error: build_fraction must be in (0, 1). Got: " << proportion_to_build
                  << "\n";
        return 1;
    }

    if (index_type != "pdx_tree_f32" && index_type != "pdx_tree_u8") {
        std::cerr << "Error: Only pdx_tree_f32 and pdx_tree_u8 support maintenance (insertion).\n";
        std::cerr << "Got: " << index_type << "\n";
        return 1;
    }

    auto it = RAW_DATASET_PARAMS.find(dataset);
    if (it == RAW_DATASET_PARAMS.end()) {
        std::cerr << "Unknown dataset: " << dataset << "\n";
        return 1;
    }
    const auto& info = it->second;
    const size_t n = info.num_embeddings;
    const size_t d = info.num_dimensions;
    const size_t n_queries = info.num_queries;

    std::cout << "==> PDX Insertion Benchmark (Build "
              << static_cast<int>(proportion_to_build * 100) << "% + Insert "
              << static_cast<int>((1.0f - proportion_to_build) * 100) << "% + Search)\n";
    std::cout << "Dataset: " << dataset << " (n=" << n << ", d=" << d << ")\n";
    std::cout << "Index type: " << index_type << "\n";

    // Raw binary inputs follow the SuperKMeans naming convention.
    std::string data_path = RAW_DATA_DIR + "/data_" + dataset + ".bin";
    std::string query_path = RAW_DATA_DIR + "/data_" + dataset + "_test.bin";

    std::vector<float> data;
    if (!ReadFloatFile(data_path, n * d, data)) {
        return 1;
    }

    std::vector<float> queries;
    if (!ReadFloatFile(query_path, n_queries * d, queries)) {
        return 1;
    }

    std::vector<size_t> nprobes_to_use;
    if (arg_ivf_nprobe > 0) {
        nprobes_to_use = {arg_ivf_nprobe};
    } else {
        nprobes_to_use.assign(
            std::begin(BenchmarkUtils::IVF_PROBES), std::end(BenchmarkUtils::IVF_PROBES)
        );
    }

    std::string algorithm = "insertion_" + index_type;

    if (index_type == "pdx_tree_f32") {
        RunBenchmark<PDX::PDXTreeIndexF32>(
            info,
            dataset,
            algorithm,
            data.data(),
            queries.data(),
            nprobes_to_use,
            proportion_to_build
        );
    } else if (index_type == "pdx_tree_u8") {
        RunBenchmark<PDX::PDXTreeIndexU8>(
            info,
            dataset,
            algorithm,
            data.data(),
            queries.data(),
            nprobes_to_use,
            proportion_to_build
        );
    }

    return 0;
}
Loading