Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion include/paimon/global_index/bitmap_global_index_result.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#include "paimon/visibility.h"

namespace paimon {
/// Represents a global index query result that **lazily materializes** its matching row IDs as a
/// Represents a global index query result that **lazily materializes** its matching row ids as a
/// Roaring bitmap. The underlying 64-bit Roaring bitmap is **not constructed during object
/// creation**; instead, it is built on-demand the first time GetBitmap() is called. This design
/// avoids unnecessary computation and memory allocation when the bitmap is not needed (e.g., during
Expand Down Expand Up @@ -67,6 +67,8 @@ class PAIMON_EXPORT BitmapGlobalIndexResult : public GlobalIndexResult {

Result<bool> IsEmpty() const override;

Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset) override;

std::string ToString() const override;

/// @return A non-owning, const pointer to the bitmap. The returned pointer is valid as long as
Expand Down
10 changes: 6 additions & 4 deletions include/paimon/global_index/bitmap_topk_global_index_result.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@
#include "paimon/visibility.h"

namespace paimon {
/// Represents a Top-K global index result that combines a Roaring bitmap of candidate row IDs
/// Represents a Top-K global index result that combines a Roaring bitmap of candidate row ids
/// with an array of associated relevance scores.
///
/// **Important Ordering Note**: Despite inheriting from TopKGlobalIndexResult, the results are
/// **NOT sorted by score**. Instead, both the bitmap and the score vector are ordered by
/// **ascending row ID**. This design enables efficient merging and set operations while preserving
/// **ascending row id**. This design enables efficient merging and set operations while preserving
/// row id-to-score mapping.
class PAIMON_EXPORT BitmapTopKGlobalIndexResult : public TopKGlobalIndexResult {
public:
Expand Down Expand Up @@ -74,16 +74,18 @@ class PAIMON_EXPORT BitmapTopKGlobalIndexResult : public TopKGlobalIndexResult {
Result<std::shared_ptr<GlobalIndexResult>> Or(
const std::shared_ptr<GlobalIndexResult>& other) override;

Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset) override;

Result<bool> IsEmpty() const override;

std::string ToString() const override;

/// @return A non-owning, const pointer to the bitmap. The row IDs in the bitmap are stored in
/// @return A non-owning, const pointer to the bitmap. The row ids in the bitmap are stored in
/// ascending order (as guaranteed by Roaring64 iteration).
Result<const RoaringBitmap64*> GetBitmap() const;

/// @return A const reference to a vector of float scores, where the i-th element corresponds to
/// the i-th row ID when iterating the bitmap in **ascending row ID order**.
/// the i-th row id when iterating the bitmap in **ascending row id order**.
const std::vector<float>& GetScores() const;

private:
Expand Down
8 changes: 4 additions & 4 deletions include/paimon/global_index/global_index_io_meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,17 @@
namespace paimon {
/// Metadata describing a single file entry in a global index.
struct PAIMON_EXPORT GlobalIndexIOMeta {
GlobalIndexIOMeta(const std::string& _file_name, int64_t _file_size, const Range& _row_id_range,
GlobalIndexIOMeta(const std::string& _file_name, int64_t _file_size, int64_t _range_end,
const std::shared_ptr<Bytes>& _metadata)
: file_name(_file_name),
file_size(_file_size),
row_id_range(_row_id_range),
range_end(_range_end),
metadata(_metadata) {}

std::string file_name;
int64_t file_size;
/// The inclusive range of row IDs covered by this file (i.e., [from, to]).
Range row_id_range;
/// The inclusive range end covered by this file (i.e., the last local row id).
int64_t range_end;
/// Optional binary metadata associated with the file, such as serialized
/// secondary index structures or inline index bytes.
/// May be null if no additional metadata is available.
Expand Down
18 changes: 11 additions & 7 deletions include/paimon/global_index/global_index_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,20 @@

namespace paimon {
/// Reads and evaluates filter predicates against a global file index.
/// `GlobalIndexReader` is an implementation of the `FunctionVisitor` interface
/// specialized to produce `std::shared_ptr<GlobalIndexResult>` objects.
///
/// Derived classes are expected to implement the visitor methods (e.g., `VisitEqual`,
/// `VisitIsNull`, etc.) to return index-based results that indicate which
/// row satisfy the given predicate.
///
/// @note All `GlobalIndexResult` objects returned by implementations of this class use **local row
/// ids** that start from 0 — not global row ids in the entire table.
/// The `GlobalIndexResult` can be converted to global row ids by calling `AddOffset()`.
class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor<std::shared_ptr<GlobalIndexResult>> {
public:
/// TopKPreFilter: A lightweight pre-filtering function applied **before** similarity scoring.
/// It operates solely on row_id and is typically driven by other global index, such as bitmap,
/// or range index. This filter enables early pruning of irrelevant candidates (e.g., "only
/// consider rows with label X"), significantly reducing the search space. Returns true to
/// It operates solely on **local row ids** and is typically driven by other global index, such
/// as bitmap, or range index. This filter enables early pruning of irrelevant candidates (e.g.,
/// "only consider rows with label X"), significantly reducing the search space. Returns true to
/// include the row in Top-K computation; false to exclude it.
///
/// @note Must be thread-safe.
Expand All @@ -47,7 +49,8 @@ class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor<std::shared_ptr<G
///
/// @param k Number of top results to return.
/// @param query The query vector (must match the dimensionality of the indexed vectors).
/// @param filter A pre-filter based on row_id, implemented by leveraging other global index
/// @param filter A pre-filter based on **local row ids**, implemented by leveraging other
/// global index
/// structures (e.g., bitmap index) for efficient candidate pruning.
/// @param predicate A runtime filtering condition that may involve graph traversal of
/// structured attributes. **Using this parameter often yields better
Expand All @@ -58,7 +61,8 @@ class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor<std::shared_ptr<G
/// context-aware filtering at query time.
/// @note All fields referenced in the predicate must have been materialized
/// in the index during build to ensure availability.
/// @note `VisitTopK` is thread-safe while other `VisitXXX` is not.
/// @note `VisitTopK` is thread-safe (not coroutine-safe) while other `VisitXXX` is not
/// thread-safe.
virtual Result<std::shared_ptr<TopKGlobalIndexResult>> VisitTopK(
int32_t k, const std::vector<float>& query, TopKPreFilter filter,
const std::shared_ptr<Predicate>& predicate) = 0;
Expand Down
10 changes: 7 additions & 3 deletions include/paimon/global_index/global_index_result.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
virtual int64_t Next() = 0;
};

/// Checks whether the global index result contains no matching row IDs.
/// Checks whether the global index result contains no matching row ids.
///
/// @return A `Result<bool>` where:
/// - `true` indicates the result is empty (no matching rows),
Expand All @@ -67,6 +67,10 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
virtual Result<std::shared_ptr<GlobalIndexResult>> Or(
const std::shared_ptr<GlobalIndexResult>& other);

/// Adds the given offset to each row id in current result and returns the new global index
/// result.
virtual Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset) = 0;

virtual std::string ToString() const = 0;

/// Serializes a GlobalIndexResult object into a byte array.
Expand Down Expand Up @@ -103,7 +107,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
};

/// Represents the result of a Top-K query against a global index.
/// This class encapsulates a set of top-K candidates (row ID + score pairs) and provides
/// This class encapsulates a set of top-K candidates (row id + score pairs) and provides
/// an iterator interface to traverse them.
class PAIMON_EXPORT TopKGlobalIndexResult : public GlobalIndexResult {
public:
Expand All @@ -115,7 +119,7 @@ class PAIMON_EXPORT TopKGlobalIndexResult : public GlobalIndexResult {
public:
virtual ~TopKIterator() = default;

/// Checks whether more row IDs are available.
/// Checks whether more row ids are available.
virtual bool HasNext() const = 0;

/// Retrieves the next (row_id, score) pair and advances the iterator.
Expand Down
14 changes: 7 additions & 7 deletions include/paimon/global_index/global_index_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class PAIMON_EXPORT GlobalIndexScan {
/// Creates a `GlobalIndexScan` instance for the specified table and context.
///
/// @param table_path Root directory of the table.
/// @param snapshot_id Optional snapshot ID to read from; if not provided, uses the latest.
/// @param snapshot_id Optional snapshot id to read from; if not provided, uses the latest.
/// @param partitions Optional list of specific partitions to restrict the scan scope.
/// Each map represents one partition (e.g., {"dt": "2024-06-01"}).
/// If omitted, scans all partitions.
Expand Down Expand Up @@ -65,23 +65,23 @@ class PAIMON_EXPORT GlobalIndexScan {

virtual ~GlobalIndexScan() = default;

/// Creates a scanner for the global index over the specified row ID range.
/// Creates a scanner for the global index over the specified row id range.
///
/// This method instantiates a low-level scanner that can evaluate predicates and
/// retrieve matching row IDs from the global index data corresponding to the given
/// row ID range.
/// retrieve matching row ids from the global index data corresponding to the given
/// row id range.
///
/// @param range The inclusive row ID range [start, end] for which to create the scanner.
/// @param range The inclusive row id range [start, end] for which to create the scanner.
/// The range must be fully covered by existing global index data (from
/// `GetRowRangeList()`).
/// @return A `Result` containing a range-level scanner, or an error if parse index meta fails.
virtual Result<std::shared_ptr<RowRangeGlobalIndexScanner>> CreateRangeScan(
const Range& range) = 0;

/// Returns row ID ranges covered by this global index (sorted and non-overlapping
/// Returns row id ranges covered by this global index (sorted and non-overlapping
/// ranges).
///
/// Each `Range` represents a contiguous segment of row IDs for which global index
/// Each `Range` represents a contiguous segment of row ids for which global index
/// data exists. This allows the query engine to parallelize scanning and be aware
/// of ranges that are not covered by any global index.
///
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@

namespace paimon {
/// Writes a range-level global index for a specific data split and field.
class PAIMON_EXPORT RowRangeGlobalIndexWriter {
class PAIMON_EXPORT GlobalIndexWriteTask {
public:
RowRangeGlobalIndexWriter() = delete;
~RowRangeGlobalIndexWriter() = delete;
GlobalIndexWriteTask() = delete;
~GlobalIndexWriteTask() = delete;
/// Builds and writes a global index for the specified data range.
///
/// @param table_path Path to the table root directory where index files are stored.
Expand Down
12 changes: 2 additions & 10 deletions include/paimon/global_index/row_range_global_index_scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include <memory>
#include <string>

#include "paimon/global_index/global_index_evaluator.h"
#include "paimon/global_index/global_index_reader.h"
#include "paimon/visibility.h"

Expand All @@ -29,15 +28,6 @@ class PAIMON_EXPORT RowRangeGlobalIndexScanner {
public:
virtual ~RowRangeGlobalIndexScanner() = default;

/// Creates a `GlobalIndexEvaluator` tailored to this range's index layout.
///
/// The returned evaluator can be used to assess whether a given predicate can be
/// answered using the global index data of this shard (e.g., via bitmap intersection).
///
/// @return A `Result` containing a shared pointer to the evaluator, or an error
/// if the index metadata is invalid or unsupported.
virtual Result<std::shared_ptr<GlobalIndexEvaluator>> CreateIndexEvaluator() const = 0;

/// Creates a `GlobalIndexReader` for a specific field and index type within this range.
///
/// This reader provides low-level access to the serialized index data
Expand All @@ -50,6 +40,8 @@ class PAIMON_EXPORT RowRangeGlobalIndexScanner {
/// - Successful with a null pointer if no index was built for the given field and type;
/// - An error only if loading fails (e.g., file corruption, I/O error, unsupported
/// format).
/// @note All `GlobalIndexResult` objects returned by `GlobalIndexReader` use **local row
/// ids** that start from 0 — not global row ids in the entire table.
virtual Result<std::shared_ptr<GlobalIndexReader>> CreateReader(
const std::string& field_name, const std::string& index_type) const = 0;

Expand Down
6 changes: 3 additions & 3 deletions include/paimon/schema/schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class PAIMON_EXPORT Schema {
virtual std::vector<std::string> FieldNames() const = 0;

/// Get the unique identifier of this table schema.
/// @return The schema ID
/// @return The schema id
virtual int64_t Id() const = 0;

/// Get the list of primary key field names.
Expand All @@ -65,8 +65,8 @@ class PAIMON_EXPORT Schema {
/// @return The number of buckets.
virtual int32_t NumBuckets() const = 0;

/// Get the highest field ID assigned in this schema.
/// @return The maximum field ID.
/// Get the highest field id assigned in this schema.
/// @return The maximum field id.
virtual int32_t HighestFieldId() const = 0;

/// Get the table-level options associated with this schema.
Expand Down
6 changes: 3 additions & 3 deletions include/paimon/utils/bucket_id_calculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ struct ArrowArray;
namespace paimon {
class MemoryPool;

/// Calculator for determining bucket IDs based on the given bucket keys.
/// Calculator for determining bucket ids based on the given bucket keys.
///
/// @note `BucketIdCalculator` is compatible with the Java implementation and uses
/// hash-based distribution to ensure even data distribution across buckets.
Expand All @@ -47,10 +47,10 @@ class PAIMON_EXPORT BucketIdCalculator {
/// @param num_buckets Number of buckets.
static Result<std::unique_ptr<BucketIdCalculator>> Create(bool is_pk_table,
int32_t num_buckets);
/// Calculate bucket IDs for the given bucket keys.
/// Calculate bucket ids for the given bucket keys.
/// @param bucket_keys Arrow struct array containing the bucket key values.
/// @param bucket_schema Arrow schema describing the structure of bucket_keys.
/// @param bucket_ids Output array to store calculated bucket IDs.
/// @param bucket_ids Output array to store calculated bucket ids.
/// @note 1. bucket_keys is a struct array, the order of fields needs to be consistent with
/// "bucket-key" options in table schema. 2. bucket_keys and bucket_schema match each other. 3.
/// bucket_ids is allocated enough space, at least >= bucket_keys->length
Expand Down
3 changes: 3 additions & 0 deletions include/paimon/utils/roaring_bitmap32.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,9 @@ class PAIMON_EXPORT RoaringBitmap32 {
/// Fast union multiple bitmaps.
static RoaringBitmap32 FastUnion(const std::vector<RoaringBitmap32>& inputs);

class RoaringBitmap64;
friend class RoaringBitmap64;

private:
void* roaring_bitmap_ = nullptr;
};
Expand Down
4 changes: 4 additions & 0 deletions include/paimon/utils/roaring_bitmap64.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "paimon/memory/bytes.h"
#include "paimon/memory/memory_pool.h"
#include "paimon/status.h"
#include "paimon/utils/roaring_bitmap32.h"
#include "paimon/visibility.h"

namespace paimon {
Expand All @@ -42,6 +43,9 @@ class PAIMON_EXPORT RoaringBitmap64 {
RoaringBitmap64(RoaringBitmap64&&) noexcept;
RoaringBitmap64& operator=(RoaringBitmap64&&) noexcept;

explicit RoaringBitmap64(const RoaringBitmap32&) noexcept;
RoaringBitmap64& operator=(const RoaringBitmap32&) noexcept;

class PAIMON_EXPORT Iterator {
public:
friend class RoaringBitmap64;
Expand Down
2 changes: 1 addition & 1 deletion src/paimon/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ set(PAIMON_CORE_SRCS
core/global_index/global_index_scan.cpp
core/global_index/global_index_scan_impl.cpp
core/global_index/row_range_global_index_scanner_impl.cpp
core/global_index/row_range_global_index_writer.cpp
core/global_index/global_index_write_task.cpp
core/index/index_file_handler.cpp
core/index/global_index_meta.cpp
core/index/index_file_meta_serializer.cpp
Expand Down
18 changes: 7 additions & 11 deletions src/paimon/common/global_index/bitmap/bitmap_global_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,33 +45,29 @@ Result<std::shared_ptr<GlobalIndexReader>> BitmapGlobalIndex::CreateReader(
PAIMON_ASSIGN_OR_RAISE(
std::shared_ptr<FileIndexReader> reader,
index_->CreateReader(arrow_schema, /*start=*/0, meta.file_size, in, pool));
auto transform = [range = meta.row_id_range](const std::shared_ptr<FileIndexResult>& result)
auto transform = [range_end = meta.range_end](const std::shared_ptr<FileIndexResult>& result)
-> Result<std::shared_ptr<GlobalIndexResult>> {
return ToGlobalIndexResult(range, result);
return ToGlobalIndexResult(range_end, result);
};
return std::make_shared<FileIndexReaderWrapper>(reader, transform);
}

Result<std::shared_ptr<GlobalIndexResult>> BitmapGlobalIndex::ToGlobalIndexResult(
const Range& range, const std::shared_ptr<FileIndexResult>& result) {
int64_t range_end, const std::shared_ptr<FileIndexResult>& result) {
if (auto remain = std::dynamic_pointer_cast<Remain>(result)) {
return std::make_shared<BitmapGlobalIndexResult>([range]() -> Result<RoaringBitmap64> {
return std::make_shared<BitmapGlobalIndexResult>([range_end]() -> Result<RoaringBitmap64> {
RoaringBitmap64 bitmap;
bitmap.AddRange(range.from, range.to + 1);
bitmap.AddRange(0, range_end + 1);
return bitmap;
});
} else if (auto skip = std::dynamic_pointer_cast<Skip>(result)) {
return std::make_shared<BitmapGlobalIndexResult>(
[]() -> Result<RoaringBitmap64> { return RoaringBitmap64(); });
} else if (auto bitmap_result = std::dynamic_pointer_cast<BitmapIndexResult>(result)) {
return std::make_shared<BitmapGlobalIndexResult>(
[range, bitmap_result]() -> Result<RoaringBitmap64> {
[bitmap_result]() -> Result<RoaringBitmap64> {
PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap32* bitmap, bitmap_result->GetBitmap());
RoaringBitmap64 bitmap64;
for (auto iter = bitmap->Begin(); iter != bitmap->End(); ++iter) {
bitmap64.Add(range.from + (*iter));
}
return bitmap64;
return RoaringBitmap64(*bitmap);
});
}
return Status::Invalid(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class BitmapGlobalIndex : public GlobalIndexer {

private:
static Result<std::shared_ptr<GlobalIndexResult>> ToGlobalIndexResult(
const Range& range, const std::shared_ptr<FileIndexResult>& result);
int64_t range_end, const std::shared_ptr<FileIndexResult>& result);

private:
std::shared_ptr<BitmapFileIndex> index_;
Expand Down
Loading
Loading