Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ __pycache__/
venv
.venv
/build/
/.cache/
/.vscode/
/.claude/
/cmake-build-debug/
/cmake-build-release/
/cmake-build*/
Expand Down Expand Up @@ -76,6 +79,7 @@ pdxearch.egg-info
/benchmarks/datasets/queries
/benchmarks/datasets/selection_vectors
/benchmarks/datasets/ground_truth_filtered
/benchmarks/results/DEFAULT/*.csv

/benchmarks/gt_filtered

Expand All @@ -101,6 +105,8 @@ cmake_install.cmake
/benchmarks/BenchmarkPDXIVF
/benchmarks/BenchmarkFiltered
/benchmarks/BenchmarkSpecialFilters
/benchmarks/BenchmarkInsertion
/benchmarks/BenchmarkWorkload

# Test binaries (but keep the committed test data)
*.bin
Expand Down
2 changes: 0 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,9 @@ We are actively developing PDX and accepting contributions! Any kind of PR is we
These are our current priorities:

**Features**:
- Inserts and Updates (wip).
- Out-of-core execution (disk-based setting).
- Implement multi-threading capabilities.
- Add PDX to the [VIBE benchmark](https://vector-index-bench.github.io/).
- Create documentation.

**Improvements**:
- Regression tests on CI.
Expand Down
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
- ⚡ [**Sub-millisecond similarity search**](https://www.lkuffo.com/sub-milisecond-similarity-search-with-pdx/), up to [**10x faster**](./BENCHMARKING.md#two-level-ivf-ivf2-) than FAISS IVF.
- ⚡ Up to [**30x faster**](./BENCHMARKING.md#exhaustive-search--ivf) exhaustive search.
- 🔍 Efficient [**filtered search**](https://github.com/cwida/PDX/issues/7).
- ⚙️ Fast and reliable [**index maintenance**](https://github.com/cwida/PDX/pull/13).
- Query latency competitive with HNSW, with the ease of use of IVF.


## Our secret sauce

[PDX](https://ir.cwi.nl/pub/35044/35044.pdf) is a data layout that **transposes** vectors in a column-major order. This layout unleashes the true potential of dimension pruning.
Expand All @@ -42,14 +44,20 @@ query = ... # Numpy 1D array
d = 1024
knn = 20

# Build
index = IndexPDXIVFTreeSQ8(num_dimensions=d)
index.build(data)

# Search
ids, dists = index.search(query, knn)

# Maintenance
index.append(row_id_to_insert, new_embedding)
index.delete(row_id_to_delete)

```

`IndexPDXIVFTreeSQ8` is our fastest index that will give you the best performance. It is a two-level IVF index with 8-bit quantization.
`IndexPDXIVFTreeSQ8` is our fastest index that will give you the best performance alongside lightweight maintenance. It is a two-level IVF index with 8-bit quantization.

Check our [examples](./examples/) for fully working examples in Python and our [benchmarks](./benchmarks) for fully working examples in C++. We support Flat (`float32`) and Quantized (`8-bit`) indexes, as well as the most common distance metrics.

Expand Down
6 changes: 6 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@ add_executable(BenchmarkEndToEnd pdx_end_to_end.cpp)
add_executable(BenchmarkSerialization pdx_serialization.cpp)
add_executable(BenchmarkFiltered pdx_filtered.cpp)
add_executable(BenchmarkSpecialFilters pdx_special_filtered.cpp)
add_executable(BenchmarkInsertion pdx_insertion.cpp)
add_executable(BenchmarkWorkload pdx_workload.cpp)

target_link_libraries(BenchmarkPDXIVF ${BENCH_COMMON_LIBS})
target_link_libraries(BenchmarkEndToEnd ${BENCH_COMMON_LIBS})
target_link_libraries(BenchmarkSerialization ${BENCH_COMMON_LIBS})
target_link_libraries(BenchmarkFiltered ${BENCH_COMMON_LIBS})
target_link_libraries(BenchmarkSpecialFilters ${BENCH_COMMON_LIBS})
target_link_libraries(BenchmarkInsertion ${BENCH_COMMON_LIBS})
target_link_libraries(BenchmarkWorkload ${BENCH_COMMON_LIBS})

add_custom_target(benchmarks
DEPENDS
Expand All @@ -26,4 +30,6 @@ add_custom_target(benchmarks
BenchmarkSerialization
BenchmarkFiltered
BenchmarkSpecialFilters
BenchmarkInsertion
BenchmarkWorkload
)
10 changes: 9 additions & 1 deletion benchmarks/benchmark_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ class TicToc {
};

// Raw binary data paths (SuperKMeans convention: data_<name>.bin / data_<name>_test.bin)
inline std::string RAW_DATA_DIR = std::string{CMAKE_SOURCE_DIR} + "/../SuperKMeans/benchmarks/data";
inline std::string RAW_DATA_DIR =
std::string{CMAKE_SOURCE_DIR} + "/../../SuperKMeans/benchmarks/data";
// NOTE(review): RAW_DATA_DIR above now climbs two levels ("/../../") while
// this path still climbs only one ("/../") — confirm the asymmetry is
// intentional and not a missed update in the same change.
inline std::string GROUND_TRUTH_JSON_DIR =
std::string{CMAKE_SOURCE_DIR} + "/../SuperKMeans/benchmarks/ground_truth";

Expand Down Expand Up @@ -88,6 +89,13 @@ struct PhasesRuntime {
size_t end_to_end{0};
};

// Kind of operation a benchmark workload step performs on the index.
// NOTE(review): the enumerator DELETE collides with the DELETE macro from
// <winnt.h> if this header is ever compiled on Windows — confirm portability.
enum class StepType { BUILD, INSERT, DELETE };

// One step of a scripted benchmark workload: apply `type` to a slice of the
// dataset whose size is `proportion` of the total embedding count N.
struct WorkloadStep {
StepType type;
float proportion; // fraction of total dataset size N
};

class BenchmarkUtils {
public:
inline static std::string PDX_DATA =
Expand Down
3 changes: 2 additions & 1 deletion benchmarks/pdx_end_to_end.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
#include <iomanip>
#include <iostream>
#include <memory>
#include <numeric>
#include <vector>

#include "benchmark_utils.hpp"
#include "pdx/index.hpp"
#include "pdx/profiler.hpp"
#include "pdx/utils.hpp"

template <typename IndexT>
Expand Down Expand Up @@ -106,6 +106,7 @@ void RunBenchmark(
runtimes[j + l * NUM_MEASURE_RUNS] = {clock.accum_time};
}
}
PDX::Profiler::Get().PrintHierarchical();

BenchmarkMetadata results_metadata = {
dataset,
Expand Down
225 changes: 225 additions & 0 deletions benchmarks/pdx_insertion.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
// BENCHMARK_TIME enables timing instrumentation. It must be defined as the
// plain value `true`: the previous `= true` made the macro expand to the
// tokens `= true`, which is invalid wherever the macro is used in an
// expression (e.g. `#if BENCHMARK_TIME` or `if (BENCHMARK_TIME)`).
#ifndef BENCHMARK_TIME
#define BENCHMARK_TIME true
#endif

#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <vector>

#include "benchmark_utils.hpp"
#include "pdx/index.hpp"
#include "pdx/profiler.hpp"
#include "pdx/utils.hpp"

/// Builds an IndexT on the first `proportion_to_build` fraction of the
/// dataset, appends the remaining embeddings one by one via Append(), then
/// measures recall and per-query search latency for each nprobe value and
/// appends the results to the INSERTION_PDX.csv results file.
///
/// @param info                Raw dataset metadata (dimensions, sizes, metric).
/// @param dataset             Dataset name recorded in the results metadata.
/// @param algorithm           Algorithm label recorded in the results metadata.
/// @param data                Row-major float buffer of n * d values.
/// @param queries             Row-major float buffer of n_queries * d values.
/// @param nprobes_to_use      IVF nprobe values to benchmark.
/// @param proportion_to_build Fraction of n indexed up front; the remainder
///                            is inserted one row at a time.
template <typename IndexT>
void RunBenchmark(
    const RawDatasetInfo& info,
    const std::string& dataset,
    const std::string& algorithm,
    const float* data,
    const float* queries,
    const std::vector<size_t>& nprobes_to_use,
    const float proportion_to_build
) {
    const size_t d = info.num_dimensions;
    const size_t n = info.num_embeddings;
    const size_t n_queries = info.num_queries;
    uint8_t KNN = BenchmarkUtils::KNN;
    size_t NUM_MEASURE_RUNS = BenchmarkUtils::NUM_MEASURE_RUNS;
    std::string RESULTS_PATH = BENCHMARK_UTILS.RESULTS_DIR_PATH + "INSERTION_PDX.csv";

    const size_t n_build = static_cast<size_t>(n * proportion_to_build);
    const size_t n_insert = n - n_build;

    PDX::PDXIndexConfig index_config{
        .num_dimensions = static_cast<uint32_t>(d),
        .distance_metric = info.distance_metric,
        .seed = 42,
        .normalize = true,
        .sampling_fraction = 1.0f
    };

    // Build phase: index the first n_build embeddings. The split is driven by
    // proportion_to_build (default 0.75), not a fixed 75%.
    TicToc clock;
    std::cout << "Building index with " << n_build << " / " << n << " embeddings...\n";
    clock.Reset();
    clock.Tic();
    IndexT pdx_index(index_config);
    pdx_index.BuildIndex(data, n_build);
    clock.Toc();
    std::cout << "Build time: " << clock.GetMilliseconds() << " ms\n";
    std::cout << "Clusters: " << pdx_index.GetNumClusters() << "\n";
    std::cout << "Index in-memory size: " << std::fixed << std::setprecision(2)
              << static_cast<double>(pdx_index.GetInMemorySizeInBytes()) / (1024.0 * 1024.0)
              << " MB\n";

    // Insert phase: append the remaining embeddings one at a time.
    std::cout << "Inserting " << n_insert << " embeddings...\n";
    clock.Reset();
    clock.Tic();
    for (size_t i = 0; i < n_insert; ++i) {
        const size_t row_id = n_build + i;
        // Report progress only occasionally: flushing stdout on every row
        // inside the timed loop would dominate the measured insertion time.
        if (i % 1024 == 0 || i + 1 == n_insert) {
            std::cout << "Inserting embedding " << row_id << " / " << n - 1 << "\r" << std::flush;
        }
        pdx_index.Append(row_id, data + row_id * d);
    }
    clock.Toc();
    std::cout << "Insertion time: " << clock.GetMilliseconds() << " ms\n";
    // Guard the average: float rounding of n * proportion_to_build can in
    // principle leave n_insert == 0, which would divide by zero here.
    if (n_insert > 0) {
        std::cout << "Avg insertion time: " << clock.GetMilliseconds() / n_insert
                  << " ms/embedding\n";
    }
    std::cout << "Clusters after insertion: " << pdx_index.GetNumClusters() << "\n";
    std::cout << "Index in-memory size after insertion: " << std::fixed << std::setprecision(2)
              << static_cast<double>(pdx_index.GetInMemorySizeInBytes()) / (1024.0 * 1024.0)
              << " MB\n";

    PDX::Profiler::Get().PrintHierarchical();

    // Ground truth for the full dataset (the "_100_norm" file convention).
    std::string gt_path = BenchmarkUtils::GROUND_TRUTH_DATA + info.pdx_dataset_name + "_100_norm";
    auto gt_buffer = MmapFile(gt_path);
    uint32_t* int_ground_truth = reinterpret_cast<uint32_t*>(gt_buffer.get());
    std::cout << "Ground truth loaded: " << gt_path << "\n";

    for (size_t ivf_nprobe : nprobes_to_use) {
        // Probing more clusters than exist is meaningless; skip such configs.
        if (pdx_index.GetNumClusters() < ivf_nprobe)
            continue;

        pdx_index.SetNProbe(ivf_nprobe);

        // Recall pass (untimed): accumulate recall over all queries.
        float recalls = 0;
        for (size_t l = 0; l < n_queries; ++l) {
            auto result = pdx_index.Search(queries + l * d, KNN);
            BenchmarkUtils::VerifyResult<true>(recalls, result, KNN, int_ground_truth, l);
        }

        // Timing pass: NUM_MEASURE_RUNS repetitions per query.
        std::vector<PhasesRuntime> runtimes;
        runtimes.resize(NUM_MEASURE_RUNS * n_queries);
        TicToc search_clock;
        for (size_t j = 0; j < NUM_MEASURE_RUNS; ++j) {
            for (size_t l = 0; l < n_queries; ++l) {
                search_clock.Reset();
                search_clock.Tic();
                pdx_index.Search(queries + l * d, KNN);
                search_clock.Toc();
                runtimes[j + l * NUM_MEASURE_RUNS] = {search_clock.accum_time};
            }
        }

        BenchmarkMetadata results_metadata = {
            dataset,
            algorithm,
            NUM_MEASURE_RUNS,
            n_queries,
            ivf_nprobe,
            KNN,
            recalls,
        };
        BenchmarkUtils::SaveResults(runtimes, RESULTS_PATH, results_metadata);
    }
}

// Reads exactly `count` floats from a binary file into `out`. Returns false
// (after printing the reason) when the file cannot be opened or is shorter
// than expected — the original code ignored read() failures and would have
// benchmarked a partially-zeroed buffer.
static bool ReadFloatFile(const std::string& path, size_t count, std::vector<float>& out) {
    out.resize(count);
    std::ifstream file(path, std::ios::binary);
    if (!file) {
        std::cerr << "Failed to open " << path << "\n";
        return false;
    }
    file.read(reinterpret_cast<char*>(out.data()), count * sizeof(float));
    if (!file) {
        std::cerr << "Failed to read " << count * sizeof(float) << " bytes from " << path << "\n";
        return false;
    }
    return true;
}

/// Entry point. Usage: <dataset> [index_type] [nprobe] [build_fraction]
/// Loads the raw dataset and query files, validates the arguments, and runs
/// the insertion benchmark for the selected PDX tree index type.
int main(int argc, char* argv[]) {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <dataset> [index_type] [nprobe] [build_fraction]\n";
        std::cerr << "Index types: pdx_tree_f32 (default), pdx_tree_u8\n";
        std::cerr << "Available datasets:";
        for (const auto& [name, _] : RAW_DATASET_PARAMS) {
            std::cerr << " " << name;
        }
        std::cerr << "\n";
        return 1;
    }
    std::string dataset = argv[1];
    std::string index_type = (argc > 2) ? argv[2] : "pdx_tree_f32";
    // std::atoi yields 0 on garbage; a negative value would wrap to a huge
    // size_t, so clamp it to 0 ("use the default nprobe sweep").
    const int raw_nprobe = (argc > 3) ? std::atoi(argv[3]) : 0;
    size_t arg_ivf_nprobe = raw_nprobe > 0 ? static_cast<size_t>(raw_nprobe) : 0;
    float proportion_to_build = (argc > 4) ? std::atof(argv[4]) : 0.75f;

    if (proportion_to_build <= 0.0f || proportion_to_build >= 1.0f) {
        std::cerr << "Error: build_fraction must be in (0, 1). Got: " << proportion_to_build
                  << "\n";
        return 1;
    }

    if (index_type != "pdx_tree_f32" && index_type != "pdx_tree_u8") {
        std::cerr << "Error: Only pdx_tree_f32 and pdx_tree_u8 support maintenance (insertion).\n";
        std::cerr << "Got: " << index_type << "\n";
        return 1;
    }

    auto it = RAW_DATASET_PARAMS.find(dataset);
    if (it == RAW_DATASET_PARAMS.end()) {
        std::cerr << "Unknown dataset: " << dataset << "\n";
        return 1;
    }
    const auto& info = it->second;
    const size_t n = info.num_embeddings;
    const size_t d = info.num_dimensions;
    const size_t n_queries = info.num_queries;

    std::cout << "==> PDX Insertion Benchmark (Build "
              << static_cast<int>(proportion_to_build * 100) << "% + Insert "
              << static_cast<int>((1.0f - proportion_to_build) * 100) << "% + Search)\n";
    std::cout << "Dataset: " << dataset << " (n=" << n << ", d=" << d << ")\n";
    std::cout << "Index type: " << index_type << "\n";

    // Raw binary inputs follow the SuperKMeans naming convention.
    std::string data_path = RAW_DATA_DIR + "/data_" + dataset + ".bin";
    std::string query_path = RAW_DATA_DIR + "/data_" + dataset + "_test.bin";

    std::vector<float> data;
    if (!ReadFloatFile(data_path, n * d, data)) {
        return 1;
    }

    std::vector<float> queries;
    if (!ReadFloatFile(query_path, n_queries * d, queries)) {
        return 1;
    }

    std::vector<size_t> nprobes_to_use;
    if (arg_ivf_nprobe > 0) {
        nprobes_to_use = {arg_ivf_nprobe};
    } else {
        nprobes_to_use.assign(
            std::begin(BenchmarkUtils::IVF_PROBES), std::end(BenchmarkUtils::IVF_PROBES)
        );
    }

    std::string algorithm = "insertion_" + index_type;

    if (index_type == "pdx_tree_f32") {
        RunBenchmark<PDX::PDXTreeIndexF32>(
            info,
            dataset,
            algorithm,
            data.data(),
            queries.data(),
            nprobes_to_use,
            proportion_to_build
        );
    } else if (index_type == "pdx_tree_u8") {
        RunBenchmark<PDX::PDXTreeIndexU8>(
            info,
            dataset,
            algorithm,
            data.data(),
            queries.data(),
            nprobes_to_use,
            proportion_to_build
        );
    }

    return 0;
}
Loading