diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index ade65fa110..7f2094e53a 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -406,7 +406,7 @@ mod tests { let mut term = Term::from_field_json_path(field, "color", false); term.append_type_and_str("red"); - assert_eq!(term.serialized_term(), b"\x00\x00\x00\x01jcolor\x00sred") + assert_eq!(term.serialized_value_bytes(), b"color\x00sred".to_vec()) } #[test] @@ -416,8 +416,8 @@ mod tests { term.append_type_and_fast_value(-4i64); assert_eq!( - term.serialized_term(), - b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc" + term.serialized_value_bytes(), + b"color\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc".to_vec() ) } @@ -428,8 +428,8 @@ mod tests { term.append_type_and_fast_value(4u64); assert_eq!( - term.serialized_term(), - b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04" + term.serialized_value_bytes(), + b"color\x00u\x00\x00\x00\x00\x00\x00\x00\x04".to_vec() ) } @@ -439,8 +439,8 @@ mod tests { let mut term = Term::from_field_json_path(field, "color", false); term.append_type_and_fast_value(4.0f64); assert_eq!( - term.serialized_term(), - b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00" + term.serialized_value_bytes(), + b"color\x00f\xc0\x10\x00\x00\x00\x00\x00\x00".to_vec() ) } @@ -450,8 +450,8 @@ mod tests { let mut term = Term::from_field_json_path(field, "color", false); term.append_type_and_fast_value(true); assert_eq!( - term.serialized_term(), - b"\x00\x00\x00\x01jcolor\x00o\x00\x00\x00\x00\x00\x00\x00\x01" + term.serialized_value_bytes(), + b"color\x00o\x00\x00\x00\x00\x00\x00\x00\x01".to_vec() ) } diff --git a/src/indexer/indexing_term.rs b/src/indexer/indexing_term.rs index e4ca2b2192..01282357e4 100644 --- a/src/indexer/indexing_term.rs +++ b/src/indexer/indexing_term.rs @@ -3,21 +3,21 @@ use std::net::Ipv6Addr; use columnar::MonotonicallyMappableToU128; use crate::fastfield::FastValue; -use crate::schema::{Field, Type}; +use crate::schema::Field; -/// Term represents the value that the token can take. -/// It's a serialized representation over different types. +/// IndexingTerm is used to represent a term during indexing. +/// It's a serialized representation over field and value. /// -/// It actually wraps a `Vec`. The first 5 bytes are metadata. -/// 4 bytes are the field id, and the last byte is the type. +/// It actually wraps a `Vec`. The first 4 bytes are the field. /// -/// The serialized value `ValueBytes` is considered everything after the 4 first bytes (term id). +/// We serialize the field, because we index everything in a single +/// global term dictionary during indexing. #[derive(Clone)] pub(crate) struct IndexingTerm>(B) where B: AsRef<[u8]>; /// The number of bytes used as metadata by `Term`. -const TERM_METADATA_LENGTH: usize = 5; +const TERM_METADATA_LENGTH: usize = 4; impl IndexingTerm { /// Create a new Term with a buffer with a given capacity. @@ -31,10 +31,9 @@ impl IndexingTerm { /// Use `clear_with_field_and_type` in that case. /// /// Sets field and the type. - pub(crate) fn set_field_and_type(&mut self, field: Field, typ: Type) { + pub(crate) fn set_field(&mut self, field: Field) { assert!(self.is_empty()); self.0[0..4].clone_from_slice(field.field_id().to_be_bytes().as_ref()); - self.0[4] = typ.to_code(); } /// Is empty if there are no value bytes. @@ -42,10 +41,10 @@ impl IndexingTerm { self.0.len() == TERM_METADATA_LENGTH } - /// Removes the value_bytes and set the field and type code. - pub(crate) fn clear_with_field_and_type(&mut self, typ: Type, field: Field) { + /// Removes the value_bytes and set the field + pub(crate) fn clear_with_field(&mut self, field: Field) { self.truncate_value_bytes(0); - self.set_field_and_type(field, typ); + self.set_field(field); } /// Sets a u64 value in the term. @@ -122,6 +121,23 @@ impl IndexingTerm { impl IndexingTerm where B: AsRef<[u8]> { + /// Wraps serialized term bytes. + /// + /// The input buffer is expected to be the concatenation of the big endian encoded field id + /// followed by the serialized value bytes (type tag + payload). + #[inline] + pub fn wrap(serialized_term: B) -> IndexingTerm { + debug_assert!(serialized_term.as_ref().len() >= TERM_METADATA_LENGTH); + IndexingTerm(serialized_term) + } + + /// Returns the field this term belongs to. + #[inline] + pub fn field(&self) -> Field { + let field_id_bytes: [u8; 4] = self.0.as_ref()[..4].try_into().unwrap(); + Field::from_field_id(u32::from_be_bytes(field_id_bytes)) + } + /// Returns the serialized representation of Term. /// This includes field_id, value type and value. /// @@ -136,6 +152,7 @@ where B: AsRef<[u8]> #[cfg(test)] mod tests { + use super::IndexingTerm; use crate::schema::*; #[test] @@ -143,42 +160,55 @@ mod tests { let mut schema_builder = Schema::builder(); schema_builder.add_text_field("text", STRING); let title_field = schema_builder.add_text_field("title", STRING); - let term = Term::from_field_text(title_field, "test"); + let mut term = IndexingTerm::with_capacity(0); + term.set_field(title_field); + term.set_bytes(b"test"); assert_eq!(term.field(), title_field); - assert_eq!(term.typ(), Type::Str); - assert_eq!(term.value().as_str(), Some("test")) + assert_eq!(term.serialized_term(), b"\x00\x00\x00\x01test".to_vec()) } /// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term. /// + + /// /// - is a big endian encoded u32 field id - /// - 's most significant bit expresses whether the term is a json term or not The - /// remaining 7 bits are used to encode the type of the value. If this is a JSON term, the - /// type is the type of the leaf of the json. /// - is, if this is not the json term, a binary representation specific to the type. /// If it is a JSON Term, then it is prepended with the path that leads to this leaf value. - const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8; + const FAST_VALUE_TERM_LEN: usize = 4 + 8; #[test] pub fn test_term_u64() { let mut schema_builder = Schema::builder(); let count_field = schema_builder.add_u64_field("count", INDEXED); - let term = Term::from_field_u64(count_field, 983u64); + let mut term = IndexingTerm::with_capacity(0); + term.set_field(count_field); + term.set_u64(983u64); assert_eq!(term.field(), count_field); - assert_eq!(term.typ(), Type::U64); assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN); - assert_eq!(term.value().as_u64(), Some(983u64)) } #[test] pub fn test_term_bool() { let mut schema_builder = Schema::builder(); let bool_field = schema_builder.add_bool_field("bool", INDEXED); - let term = Term::from_field_bool(bool_field, true); + let term = { + let mut term = IndexingTerm::with_capacity(0); + term.set_field(bool_field); + term.set_bool(true); + term + }; assert_eq!(term.field(), bool_field); - assert_eq!(term.typ(), Type::Bool); assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN); - assert_eq!(term.value().as_bool(), Some(true)) + } + + #[test] + pub fn indexing_term_wrap_extracts_field() { + let field = Field::from_field_id(7u32); + let mut term = IndexingTerm::with_capacity(0); + term.set_field(field); + term.append_bytes(b"abc"); + + let wrapped = IndexingTerm::wrap(term.serialized_term()); + assert_eq!(wrapped.field(), field); + assert_eq!(wrapped.serialized_term(), term.serialized_term()); } } diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index cfeea1177b..494936ef07 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -171,7 +171,7 @@ impl SegmentWriter { let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx); let postings_writer: &mut dyn PostingsWriter = self.per_field_postings_writers.get_for_field_mut(field); - term_buffer.clear_with_field_and_type(field_entry.field_type().value_type(), field); + term_buffer.clear_with_field(field); match field_entry.field_type() { FieldType::Facet(_) => { diff --git a/src/postings/json_postings_writer.rs b/src/postings/json_postings_writer.rs index 477b73ba66..99de174467 100644 --- a/src/postings/json_postings_writer.rs +++ b/src/postings/json_postings_writer.rs @@ -8,7 +8,7 @@ use crate::indexer::path_to_unordered_id::OrderedPathId; use crate::postings::postings_writer::SpecializedPostingsWriter; use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder}; use crate::postings::{FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter}; -use crate::schema::{Field, Type, ValueBytes}; +use crate::schema::{Field, Type}; use crate::tokenizer::TokenStream; use crate::DocId; @@ -79,8 +79,7 @@ impl PostingsWriter for JsonPostingsWriter { term_buffer.truncate(term_path_len); term_buffer.append_bytes(term); - let json_value = ValueBytes::wrap(term); - let typ = json_value.typ(); + let typ = Type::from_code(term[0]).expect("Invalid type code in JSON term"); if typ == Type::Str { SpecializedPostingsWriter::::serialize_one_term( term_buffer.as_bytes(), @@ -107,6 +106,8 @@ impl PostingsWriter for JsonPostingsWriter { } } +/// Helper to build the JSON term bytes that land in the term dictionary. +/// Format: `[json path utf8][JSON_END_OF_PATH][type tag][payload]` struct JsonTermSerializer(Vec); impl JsonTermSerializer { /// Appends a JSON path to the Term. diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 1bf8262b9d..c7a94ecef0 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -11,7 +11,7 @@ use crate::postings::recorder::{BufferLender, Recorder}; use crate::postings::{ FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter, }; -use crate::schema::{Field, Schema, Term, Type}; +use crate::schema::{Field, Schema, Type}; use crate::tokenizer::{Token, TokenStream, MAX_TOKEN_LEN}; use crate::DocId; @@ -59,14 +59,14 @@ pub(crate) fn serialize_postings( let mut term_offsets: Vec<(Field, OrderedPathId, &[u8], Addr)> = Vec::with_capacity(ctx.term_index.len()); term_offsets.extend(ctx.term_index.iter().map(|(key, addr)| { - let field = Term::wrap(key).field(); + let field = IndexingTerm::wrap(key).field(); if schema.get_field_entry(field).field_type().value_type() == Type::Json { - let byte_range_path = 5..5 + 4; + let byte_range_path = 4..4 + 4; let unordered_id = u32::from_be_bytes(key[byte_range_path.clone()].try_into().unwrap()); let path_id = unordered_id_to_ordered_id[unordered_id as usize]; (field, path_id, &key[byte_range_path.end..], addr) } else { - (field, 0.into(), &key[5..], addr) + (field, 0.into(), &key[4..], addr) } })); // Sort by field, path, and term diff --git a/src/schema/term.rs b/src/schema/term.rs index 2dd78b82ae..e1e4f02e42 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -1,10 +1,11 @@ -use std::hash::{Hash, Hasher}; +use std::hash::Hash; use std::net::Ipv6Addr; use std::{fmt, str}; use columnar::MonotonicallyMappableToU128; use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP_STR}; use common::JsonPathWriter; +use serde::{Deserialize, Serialize}; use super::date_time_options::DATE_TIME_PRECISION_INDEXED; use super::{Field, Schema}; @@ -16,23 +17,54 @@ use crate::DateTime; /// Term represents the value that the token can take. /// It's a serialized representation over different types. /// -/// It actually wraps a `Vec`. The first 5 bytes are metadata. -/// 4 bytes are the field id, and the last byte is the type. -/// -/// The serialized value `ValueBytes` is considered everything after the 4 first bytes (term id). -#[derive(Clone)] -pub struct Term>(B) -where B: AsRef<[u8]>; +/// A term is composed of Field and the serialized value bytes. +/// The serialized value bytes themselves start with a one byte type tag followed by the payload. +#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)] +pub struct Term { + field: Field, + serialized_value_bytes: Vec, +} -/// The number of bytes used as metadata by `Term`. -const TERM_METADATA_LENGTH: usize = 5; +/// The number of bytes used as metadata when serializing a term. +const TERM_TYPE_TAG_LEN: usize = 1; impl Term { + /// Takes a serialized term and wraps it as a Term. + /// First 4 bytes are the field id + #[deprecated( + note = "we want to avoid working on the serialized representation directly, replace with \ + typed API calls (add more if needed) or use serde to serialize/deserialize" + )] + pub fn wrap(serialized: &[u8]) -> Term { + let field_id_bytes: [u8; 4] = serialized[0..4].try_into().unwrap(); + let field_id = u32::from_be_bytes(field_id_bytes); + Term { + field: Field::from_field_id(field_id), + serialized_value_bytes: serialized[4..].to_vec(), + } + } + + /// Returns the serialized representation of the term. + /// First 4 bytes are the field id + #[deprecated( + note = "we want to avoid working on the serialized representation directly, replace with \ + typed API calls (add more if needed) or use serde to serialize/deserialize" + )] + pub fn serialized_term(&self) -> Vec { + let mut serialized = Vec::with_capacity(4 + self.serialized_value_bytes.len()); + serialized.extend(self.field.field_id().to_be_bytes().as_ref()); + serialized.extend_from_slice(&self.serialized_value_bytes); + serialized + } + /// Create a new Term with a buffer with a given capacity. pub fn with_capacity(capacity: usize) -> Term { - let mut data = Vec::with_capacity(TERM_METADATA_LENGTH + capacity); - data.resize(TERM_METADATA_LENGTH, 0u8); - Term(data) + let mut data = Vec::with_capacity(TERM_TYPE_TAG_LEN + capacity); + data.resize(TERM_TYPE_TAG_LEN, 0u8); + Term { + field: Field::from_field_id(0u32), + serialized_value_bytes: data, + } } /// Creates a term from a json path. @@ -89,7 +121,7 @@ impl Term { fn with_bytes_and_field_and_payload(typ: Type, field: Field, bytes: &[u8]) -> Term { let mut term = Self::with_capacity(bytes.len()); term.set_field_and_type(field, typ); - term.0.extend_from_slice(bytes); + term.serialized_value_bytes.extend_from_slice(bytes); term } @@ -105,13 +137,13 @@ impl Term { /// Sets field and the type. pub(crate) fn set_field_and_type(&mut self, field: Field, typ: Type) { assert!(self.is_empty()); - self.0[0..4].clone_from_slice(field.field_id().to_be_bytes().as_ref()); - self.0[4] = typ.to_code(); + self.field = field; + self.serialized_value_bytes[0] = typ.to_code(); } /// Is empty if there are no value bytes. pub fn is_empty(&self) -> bool { - self.0.len() == TERM_METADATA_LENGTH + self.serialized_value_bytes.len() == TERM_TYPE_TAG_LEN } /// Builds a term given a field, and a `Ipv6Addr`-value @@ -177,7 +209,7 @@ impl Term { /// Removes the value_bytes and set the type code. pub fn clear_with_type(&mut self, typ: Type) { self.truncate_value_bytes(0); - self.0[4] = typ.to_code(); + self.serialized_value_bytes[0] = typ.to_code(); } /// Append a type marker + fast value to a term. @@ -185,9 +217,10 @@ impl Term { /// /// It will not clear existing bytes. pub fn append_type_and_fast_value(&mut self, val: T) { - self.0.push(T::to_type().to_code()); + self.serialized_value_bytes.push(T::to_type().to_code()); let value = val.to_u64(); - self.0.extend(value.to_be_bytes().as_ref()); + self.serialized_value_bytes + .extend(value.to_be_bytes().as_ref()); } /// Append a string type marker + string to a term. @@ -195,24 +228,25 @@ impl Term { /// /// It will not clear existing bytes. pub fn append_type_and_str(&mut self, val: &str) { - self.0.push(Type::Str.to_code()); - self.0.extend(val.as_bytes().as_ref()); + self.serialized_value_bytes.push(Type::Str.to_code()); + self.serialized_value_bytes.extend(val.as_bytes().as_ref()); } /// Sets the value of a `Bytes` field. pub fn set_bytes(&mut self, bytes: &[u8]) { self.truncate_value_bytes(0); - self.0.extend(bytes); + self.serialized_value_bytes.extend(bytes); } /// Truncates the value bytes of the term. Value and field type stays the same. pub fn truncate_value_bytes(&mut self, len: usize) { - self.0.truncate(len + TERM_METADATA_LENGTH); + self.serialized_value_bytes + .truncate(len + TERM_TYPE_TAG_LEN); } /// The length of the bytes. pub fn len_bytes(&self) -> usize { - self.0.len() - TERM_METADATA_LENGTH + self.serialized_value_bytes.len() - TERM_TYPE_TAG_LEN } /// Appends value bytes to the Term. @@ -220,18 +254,9 @@ impl Term { /// This function returns the segment that has just been added. #[inline] pub fn append_bytes(&mut self, bytes: &[u8]) -> &mut [u8] { - let len_before = self.0.len(); - self.0.extend_from_slice(bytes); - &mut self.0[len_before..] - } -} - -impl Term -where B: AsRef<[u8]> -{ - /// Wraps a object holding bytes - pub fn wrap(data: B) -> Term { - Term(data) + let len_before = self.serialized_value_bytes.len(); + self.serialized_value_bytes.extend_from_slice(bytes); + &mut self.serialized_value_bytes[len_before..] } /// Return the type of the term. @@ -241,8 +266,7 @@ where B: AsRef<[u8]> /// Returns the field. pub fn field(&self) -> Field { - let field_id_bytes: [u8; 4] = (&self.0.as_ref()[..4]).try_into().unwrap(); - Field::from_field_id(u32::from_be_bytes(field_id_bytes)) + self.field } /// Returns the serialized representation of the value. @@ -252,23 +276,13 @@ where B: AsRef<[u8]> /// If the term is a u64, its value is encoded according /// to `byteorder::BigEndian`. pub fn serialized_value_bytes(&self) -> &[u8] { - &self.0.as_ref()[TERM_METADATA_LENGTH..] + &self.serialized_value_bytes[TERM_TYPE_TAG_LEN..] } /// Returns the value of the term. /// address or JSON path + value. (this does not include the field.) pub fn value(&self) -> ValueBytes<&[u8]> { - ValueBytes::wrap(&self.0.as_ref()[4..]) - } - - /// Returns the serialized representation of Term. - /// This includes field_id, value type and value. - /// - /// Do NOT rely on this byte representation in the index. - /// This value is likely to change in the future. - #[inline] - pub fn serialized_term(&self) -> &[u8] { - self.0.as_ref() + ValueBytes::wrap(self.serialized_value_bytes.as_ref()) } } @@ -452,10 +466,7 @@ where B: AsRef<[u8]> } } - /// Returns the serialized representation of Term. - /// - /// Do NOT rely on this byte representation in the index. - /// This value is likely to change in the future. + /// Returns the serialized representation of the value bytes including the type tag. pub fn as_serialized(&self) -> &[u8] { self.0.as_ref() } @@ -508,40 +519,6 @@ where B: AsRef<[u8]> } } -impl Ord for Term -where B: AsRef<[u8]> -{ - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.serialized_term().cmp(other.serialized_term()) - } -} - -impl PartialOrd for Term -where B: AsRef<[u8]> -{ - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl PartialEq for Term -where B: AsRef<[u8]> -{ - fn eq(&self, other: &Self) -> bool { - self.serialized_term() == other.serialized_term() - } -} - -impl Eq for Term where B: AsRef<[u8]> {} - -impl Hash for Term -where B: AsRef<[u8]> -{ - fn hash(&self, state: &mut H) { - self.0.as_ref().hash(state) - } -} - fn write_opt(f: &mut fmt::Formatter, val_opt: Option) -> fmt::Result { if let Some(val) = val_opt { write!(f, "{val:?}")?; @@ -549,13 +526,11 @@ fn write_opt(f: &mut fmt::Formatter, val_opt: Option) -> Ok(()) } -impl fmt::Debug for Term -where B: AsRef<[u8]> -{ +impl fmt::Debug for Term { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let field_id = self.field().field_id(); + let field_id = self.field.field_id(); write!(f, "Term(field={field_id}, ")?; - let value_bytes = ValueBytes::wrap(&self.0.as_ref()[4..]); + let value_bytes = ValueBytes::wrap(&self.serialized_value_bytes); value_bytes.debug_value_bytes(f)?; write!(f, ")",)?; Ok(()) @@ -578,17 +553,6 @@ mod tests { assert_eq!(term.value().as_str(), Some("test")) } - /// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term. - /// + + - /// - /// - is a big endian encoded u32 field id - /// - 's most significant bit expresses whether the term is a json term or not The - /// remaining 7 bits are used to encode the type of the value. If this is a JSON term, the - /// type is the type of the leaf of the json. - /// - is, if this is not the json term, a binary representation specific to the type. - /// If it is a JSON Term, then it is prepended with the path that leads to this leaf value. - const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8; - #[test] pub fn test_term_u64() { let mut schema_builder = Schema::builder(); @@ -596,7 +560,7 @@ mod tests { let term = Term::from_field_u64(count_field, 983u64); assert_eq!(term.field(), count_field); assert_eq!(term.typ(), Type::U64); - assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN); + assert_eq!(term.serialized_value_bytes().len(), 8); assert_eq!(term.value().as_u64(), Some(983u64)) } @@ -607,7 +571,7 @@ mod tests { let term = Term::from_field_bool(bool_field, true); assert_eq!(term.field(), bool_field); assert_eq!(term.typ(), Type::Bool); - assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN); + assert_eq!(term.serialized_value_bytes().len(), 8); assert_eq!(term.value().as_bool(), Some(true)) } }