Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions src/core/json_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ mod tests {
let mut term = Term::from_field_json_path(field, "color", false);
term.append_type_and_str("red");

assert_eq!(term.serialized_term(), b"\x00\x00\x00\x01jcolor\x00sred")
assert_eq!(term.serialized_value_bytes(), b"color\x00sred".to_vec())
}

#[test]
Expand All @@ -416,8 +416,8 @@ mod tests {
term.append_type_and_fast_value(-4i64);

assert_eq!(
term.serialized_term(),
b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
term.serialized_value_bytes(),
b"color\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc".to_vec()
)
}

Expand All @@ -428,8 +428,8 @@ mod tests {
term.append_type_and_fast_value(4u64);

assert_eq!(
term.serialized_term(),
b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
term.serialized_value_bytes(),
b"color\x00u\x00\x00\x00\x00\x00\x00\x00\x04".to_vec()
)
}

Expand All @@ -439,8 +439,8 @@ mod tests {
let mut term = Term::from_field_json_path(field, "color", false);
term.append_type_and_fast_value(4.0f64);
assert_eq!(
term.serialized_term(),
b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
term.serialized_value_bytes(),
b"color\x00f\xc0\x10\x00\x00\x00\x00\x00\x00".to_vec()
)
}

Expand All @@ -450,8 +450,8 @@ mod tests {
let mut term = Term::from_field_json_path(field, "color", false);
term.append_type_and_fast_value(true);
assert_eq!(
term.serialized_term(),
b"\x00\x00\x00\x01jcolor\x00o\x00\x00\x00\x00\x00\x00\x00\x01"
term.serialized_value_bytes(),
b"color\x00o\x00\x00\x00\x00\x00\x00\x00\x01".to_vec()
)
}

Expand Down
80 changes: 55 additions & 25 deletions src/indexer/indexing_term.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,21 @@ use std::net::Ipv6Addr;
use columnar::MonotonicallyMappableToU128;

use crate::fastfield::FastValue;
use crate::schema::{Field, Type};
use crate::schema::Field;

/// Term represents the value that the token can take.
/// It's a serialized representation over different types.
/// IndexingTerm is used to represent a term during indexing.
/// It's a serialized representation over field and value.
///
/// It actually wraps a `Vec<u8>`. The first 5 bytes are metadata.
/// 4 bytes are the field id, and the last byte is the type.
/// It actually wraps a `Vec<u8>`. The first 4 bytes are the field.
///
/// The serialized value `ValueBytes` is considered everything after the 4 first bytes (term id).
/// We serialize the field, because we index everything in a single
/// global term dictionary during indexing.
#[derive(Clone)]
pub(crate) struct IndexingTerm<B = Vec<u8>>(B)
where B: AsRef<[u8]>;

/// The number of bytes used as metadata by `Term`.
const TERM_METADATA_LENGTH: usize = 5;
const TERM_METADATA_LENGTH: usize = 4;

impl IndexingTerm {
/// Create a new Term with a buffer with a given capacity.
Expand All @@ -31,21 +31,20 @@ impl IndexingTerm {
/// Use `clear_with_field_and_type` in that case.
///
/// Sets field and the type.
pub(crate) fn set_field_and_type(&mut self, field: Field, typ: Type) {
pub(crate) fn set_field(&mut self, field: Field) {
assert!(self.is_empty());
self.0[0..4].clone_from_slice(field.field_id().to_be_bytes().as_ref());
self.0[4] = typ.to_code();
}

/// Is empty if there are no value bytes.
pub fn is_empty(&self) -> bool {
self.0.len() == TERM_METADATA_LENGTH
}

/// Removes the value_bytes and set the field and type code.
pub(crate) fn clear_with_field_and_type(&mut self, typ: Type, field: Field) {
/// Removes the value_bytes and set the field
pub(crate) fn clear_with_field(&mut self, field: Field) {
self.truncate_value_bytes(0);
self.set_field_and_type(field, typ);
self.set_field(field);
}

/// Sets a u64 value in the term.
Expand Down Expand Up @@ -122,6 +121,23 @@ impl IndexingTerm {
impl<B> IndexingTerm<B>
where B: AsRef<[u8]>
{
/// Wraps serialized term bytes.
///
/// The input buffer is expected to be the concatenation of the big endian encoded field id
/// followed by the serialized value bytes (type tag + payload).
#[inline]
pub fn wrap(serialized_term: B) -> IndexingTerm<B> {
debug_assert!(serialized_term.as_ref().len() >= TERM_METADATA_LENGTH);
IndexingTerm(serialized_term)
}

/// Returns the field this term belongs to.
#[inline]
pub fn field(&self) -> Field {
let field_id_bytes: [u8; 4] = self.0.as_ref()[..4].try_into().unwrap();
Field::from_field_id(u32::from_be_bytes(field_id_bytes))
}

/// Returns the serialized representation of Term.
/// This includes field_id, value type and value.
///
Expand All @@ -136,49 +152,63 @@ where B: AsRef<[u8]>
#[cfg(test)]
mod tests {

use super::IndexingTerm;
use crate::schema::*;

#[test]
pub fn test_term_str() {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("text", STRING);
let title_field = schema_builder.add_text_field("title", STRING);
let term = Term::from_field_text(title_field, "test");
let mut term = IndexingTerm::with_capacity(0);
term.set_field(title_field);
term.set_bytes(b"test");
assert_eq!(term.field(), title_field);
assert_eq!(term.typ(), Type::Str);
assert_eq!(term.value().as_str(), Some("test"))
assert_eq!(term.serialized_term(), b"\x00\x00\x00\x01test".to_vec())
}

/// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
/// <field> + <type byte> + <value len>
///
/// - <field> is a big endian encoded u32 field id
/// - <type_byte>'s most significant bit expresses whether the term is a json term or not The
/// remaining 7 bits are used to encode the type of the value. If this is a JSON term, the
/// type is the type of the leaf of the json.
/// - <value> is, if this is not the json term, a binary representation specific to the type.
/// If it is a JSON Term, then it is prepended with the path that leads to this leaf value.
const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
const FAST_VALUE_TERM_LEN: usize = 4 + 8;

#[test]
pub fn test_term_u64() {
let mut schema_builder = Schema::builder();
let count_field = schema_builder.add_u64_field("count", INDEXED);
let term = Term::from_field_u64(count_field, 983u64);
let mut term = IndexingTerm::with_capacity(0);
term.set_field(count_field);
term.set_u64(983u64);
assert_eq!(term.field(), count_field);
assert_eq!(term.typ(), Type::U64);
assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN);
assert_eq!(term.value().as_u64(), Some(983u64))
}

#[test]
pub fn test_term_bool() {
let mut schema_builder = Schema::builder();
let bool_field = schema_builder.add_bool_field("bool", INDEXED);
let term = Term::from_field_bool(bool_field, true);
let term = {
let mut term = IndexingTerm::with_capacity(0);
term.set_field(bool_field);
term.set_bool(true);
term
};
assert_eq!(term.field(), bool_field);
assert_eq!(term.typ(), Type::Bool);
assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN);
assert_eq!(term.value().as_bool(), Some(true))
}

#[test]
pub fn indexing_term_wrap_extracts_field() {
let field = Field::from_field_id(7u32);
let mut term = IndexingTerm::with_capacity(0);
term.set_field(field);
term.append_bytes(b"abc");

let wrapped = IndexingTerm::wrap(term.serialized_term());
assert_eq!(wrapped.field(), field);
assert_eq!(wrapped.serialized_term(), term.serialized_term());
}
}
2 changes: 1 addition & 1 deletion src/indexer/segment_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ impl SegmentWriter {
let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx);
let postings_writer: &mut dyn PostingsWriter =
self.per_field_postings_writers.get_for_field_mut(field);
term_buffer.clear_with_field_and_type(field_entry.field_type().value_type(), field);
term_buffer.clear_with_field(field);

match field_entry.field_type() {
FieldType::Facet(_) => {
Expand Down
7 changes: 4 additions & 3 deletions src/postings/json_postings_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use crate::indexer::path_to_unordered_id::OrderedPathId;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
use crate::postings::{FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::{Field, Type, ValueBytes};
use crate::schema::{Field, Type};
use crate::tokenizer::TokenStream;
use crate::DocId;

Expand Down Expand Up @@ -79,8 +79,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer.truncate(term_path_len);
term_buffer.append_bytes(term);

let json_value = ValueBytes::wrap(term);
let typ = json_value.typ();
let typ = Type::from_code(term[0]).expect("Invalid type code in JSON term");
if typ == Type::Str {
SpecializedPostingsWriter::<Rec>::serialize_one_term(
term_buffer.as_bytes(),
Expand All @@ -107,6 +106,8 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
}
}

/// Helper to build the JSON term bytes that land in the term dictionary.
/// Format: `[json path utf8][JSON_END_OF_PATH][type tag][payload]`
struct JsonTermSerializer(Vec<u8>);
impl JsonTermSerializer {
/// Appends a JSON path to the Term.
Expand Down
8 changes: 4 additions & 4 deletions src/postings/postings_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use crate::postings::recorder::{BufferLender, Recorder};
use crate::postings::{
FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter,
};
use crate::schema::{Field, Schema, Term, Type};
use crate::schema::{Field, Schema, Type};
use crate::tokenizer::{Token, TokenStream, MAX_TOKEN_LEN};
use crate::DocId;

Expand Down Expand Up @@ -59,14 +59,14 @@ pub(crate) fn serialize_postings(
let mut term_offsets: Vec<(Field, OrderedPathId, &[u8], Addr)> =
Vec::with_capacity(ctx.term_index.len());
term_offsets.extend(ctx.term_index.iter().map(|(key, addr)| {
let field = Term::wrap(key).field();
let field = IndexingTerm::wrap(key).field();
if schema.get_field_entry(field).field_type().value_type() == Type::Json {
let byte_range_path = 5..5 + 4;
let byte_range_path = 4..4 + 4;
let unordered_id = u32::from_be_bytes(key[byte_range_path.clone()].try_into().unwrap());
let path_id = unordered_id_to_ordered_id[unordered_id as usize];
(field, path_id, &key[byte_range_path.end..], addr)
} else {
(field, 0.into(), &key[5..], addr)
(field, 0.into(), &key[4..], addr)
}
}));
// Sort by field, path, and term
Expand Down
Loading
Loading