Skip to content

Commit 23ba011

Browse files
committed
feat!: support FixedSizeList<Struct>
Prior to this commit, FixedSizeList was only supported with primitive element types (e.g., FSL<Float32> for vectors). This adds structural encoding support for FSL<Struct>, enabling use cases like fixed-size arrays of bounding boxes, coordinate tuples, or other structured data.

Key changes:
- New `FixedSizeListStructuralEncoder` that encodes FSL validity to rep/def and delegates child encoding to the struct encoder
- New `StructuralFixedSizeListScheduler` that scales row ranges by the FSL dimension when scheduling reads
- New `StructuralFixedSizeListDecoder` that reconstructs FSL arrays from child data and rep/def validity

A key challenge is "garbage filtering": unlike variable-length lists, which can omit children under null entries, FSL children always exist. When an FSL row is null, any nested list-like types within its children contain undefined "garbage" data. The encoder normalizes these to empty null lists before encoding.
1 parent 5e6a460 commit 23ba011

9 files changed

Lines changed: 926 additions & 36 deletions

File tree

rust/lance-core/src/datatypes.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,10 @@ impl LogicalType {
9191
self.0 == "large_list" || self.0 == "large_list.struct"
9292
}
9393

94+
fn is_fixed_size_list_struct(&self) -> bool {
95+
self.0.starts_with("fixed_size_list:struct:")
96+
}
97+
9498
fn is_struct(&self) -> bool {
9599
self.0 == "struct"
96100
}

rust/lance-core/src/datatypes/field.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,16 @@ impl Field {
165165
lt if lt.is_large_list() => {
166166
DataType::LargeList(Arc::new(ArrowField::from(&self.children[0])))
167167
}
168+
lt if lt.is_fixed_size_list_struct() => {
169+
// Parse size from "fixed_size_list:struct:N"
170+
let size: i32 =
171+
lt.0.split(':')
172+
.next_back()
173+
.expect("fixed_size_list:struct logical type missing size suffix")
174+
.parse()
175+
.expect("fixed_size_list:struct logical type has invalid size");
176+
DataType::FixedSizeList(Arc::new(ArrowField::from(&self.children[0])), size)
177+
}
168178
lt if lt.is_struct() => {
169179
DataType::Struct(self.children.iter().map(ArrowField::from).collect())
170180
}
@@ -1076,6 +1086,9 @@ impl TryFrom<&ArrowField> for Field {
10761086
.collect::<Result<_>>()?,
10771087
DataType::List(item) => vec![Self::try_from(item.as_ref())?],
10781088
DataType::LargeList(item) => vec![Self::try_from(item.as_ref())?],
1089+
DataType::FixedSizeList(item, _) if matches!(item.data_type(), DataType::Struct(_)) => {
1090+
vec![Self::try_from(item.as_ref())?]
1091+
}
10791092
DataType::Map(entries, keys_sorted) => {
10801093
// TODO: We only support keys_sorted=false for now,
10811094
// because converting a rust arrow map field to the python arrow field will

rust/lance-datagen/src/generator.rs

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use arrow_array::{
1515
make_array,
1616
types::{ArrowDictionaryKeyType, BinaryType, ByteArrayType, Utf8Type},
1717
Array, BinaryArray, FixedSizeBinaryArray, FixedSizeListArray, Float32Array, LargeListArray,
18-
LargeStringArray, ListArray, NullArray, OffsetSizeTrait, PrimitiveArray, RecordBatch,
18+
LargeStringArray, ListArray, MapArray, NullArray, OffsetSizeTrait, PrimitiveArray, RecordBatch,
1919
RecordBatchOptions, RecordBatchReader, StringArray, StructArray,
2020
};
2121
use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, SchemaRef};
@@ -1712,6 +1712,84 @@ impl ArrayGenerator for RandomListGenerator {
17121712
}
17131713
}
17141714

1715+
#[derive(Debug)]
1716+
struct RandomMapGenerator {
1717+
field: Arc<Field>,
1718+
entries_field: Arc<Field>,
1719+
keys_gen: Box<dyn ArrayGenerator>,
1720+
values_gen: Box<dyn ArrayGenerator>,
1721+
lengths_gen: Box<dyn ArrayGenerator>,
1722+
}
1723+
1724+
impl RandomMapGenerator {
1725+
fn new(keys_gen: Box<dyn ArrayGenerator>, values_gen: Box<dyn ArrayGenerator>) -> Self {
1726+
let entries_fields = Fields::from(vec![
1727+
Field::new("keys", keys_gen.data_type().clone(), false),
1728+
Field::new("values", values_gen.data_type().clone(), true),
1729+
]);
1730+
let entries_field = Arc::new(Field::new(
1731+
"entries",
1732+
DataType::Struct(entries_fields),
1733+
false,
1734+
));
1735+
let map_type = DataType::Map(entries_field.clone(), false);
1736+
let field = Arc::new(Field::new("", map_type, true));
1737+
let lengths_dist = Uniform::new_inclusive(0_i32, 4).unwrap();
1738+
let lengths_gen = rand_with_distribution::<Int32Type, Uniform<i32>>(lengths_dist);
1739+
1740+
Self {
1741+
field,
1742+
entries_field,
1743+
keys_gen,
1744+
values_gen,
1745+
lengths_gen,
1746+
}
1747+
}
1748+
}
1749+
1750+
impl ArrayGenerator for RandomMapGenerator {
1751+
fn generate(
1752+
&mut self,
1753+
length: RowCount,
1754+
rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1755+
) -> Result<Arc<dyn Array>, ArrowError> {
1756+
let lengths = self.lengths_gen.generate(length, rng)?;
1757+
let lengths = lengths.as_primitive::<Int32Type>();
1758+
let total_entries = lengths.values().iter().sum::<i32>() as u64;
1759+
let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1760+
1761+
let keys = self.keys_gen.generate(RowCount::from(total_entries), rng)?;
1762+
let values = self
1763+
.values_gen
1764+
.generate(RowCount::from(total_entries), rng)?;
1765+
1766+
let entries = StructArray::new(
1767+
Fields::from(vec![
1768+
Field::new("keys", keys.data_type().clone(), false),
1769+
Field::new("values", values.data_type().clone(), true),
1770+
]),
1771+
vec![keys, values],
1772+
None,
1773+
);
1774+
1775+
Ok(Arc::new(MapArray::try_new(
1776+
self.entries_field.clone(),
1777+
offsets,
1778+
entries,
1779+
None,
1780+
false,
1781+
)?))
1782+
}
1783+
1784+
fn data_type(&self) -> &DataType {
1785+
self.field.data_type()
1786+
}
1787+
1788+
fn element_size_bytes(&self) -> Option<ByteCount> {
1789+
None
1790+
}
1791+
}
1792+
17151793
#[derive(Debug)]
17161794
struct NullArrayGenerator {}
17171795

@@ -2754,6 +2832,12 @@ pub mod array {
27542832
Box::new(RandomListGenerator::new(item_gen, is_large))
27552833
}
27562834

2835+
pub fn rand_map(key_type: &DataType, value_type: &DataType) -> Box<dyn ArrayGenerator> {
2836+
let keys_gen = rand_type(key_type);
2837+
let values_gen = rand_type(value_type);
2838+
Box::new(RandomMapGenerator::new(keys_gen, values_gen))
2839+
}
2840+
27572841
pub fn rand_struct(fields: Fields) -> Box<dyn ArrayGenerator> {
27582842
let child_gens = fields
27592843
.iter()
@@ -2797,6 +2881,14 @@ pub mod array {
27972881
DataType::FixedSizeBinary(size) => rand_fsb(*size),
27982882
DataType::List(child) => rand_list(child.data_type(), false),
27992883
DataType::LargeList(child) => rand_list(child.data_type(), true),
2884+
DataType::Map(entries_field, _) => {
2885+
let DataType::Struct(fields) = entries_field.data_type() else {
2886+
panic!("Map entries field must be a struct");
2887+
};
2888+
let key_type = fields[0].data_type();
2889+
let value_type = fields[1].data_type();
2890+
rand_map(key_type, value_type)
2891+
}
28002892
DataType::Duration(unit) => match unit {
28012893
TimeUnit::Second => rand::<DurationSecondType>(),
28022894
TimeUnit::Millisecond => rand::<DurationMillisecondType>(),

rust/lance-encoding/src/decoder.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ use tracing::instrument;
238238
use crate::compression::{DecompressionStrategy, DefaultDecompressionStrategy};
239239
use crate::data::DataBlock;
240240
use crate::encoder::EncodedBatch;
241+
use crate::encodings::logical::fixed_size_list::StructuralFixedSizeListScheduler;
241242
use crate::encodings::logical::list::StructuralListScheduler;
242243
use crate::encodings::logical::map::StructuralMapScheduler;
243244
use crate::encodings::logical::primitive::StructuralPrimitiveFieldScheduler;
@@ -774,6 +775,20 @@ impl CoreFieldDecoderStrategy {
774775
Ok(Box::new(StructuralListScheduler::new(child_scheduler))
775776
as Box<dyn StructuralFieldScheduler>)
776777
}
778+
DataType::FixedSizeList(inner, dimension)
779+
if matches!(inner.data_type(), DataType::Struct(_)) =>
780+
{
781+
let child = field
782+
.children
783+
.first()
784+
.expect("FixedSizeList field must have a child");
785+
let child_scheduler =
786+
self.create_structural_field_scheduler(child, column_infos)?;
787+
Ok(Box::new(StructuralFixedSizeListScheduler::new(
788+
child_scheduler,
789+
*dimension,
790+
)) as Box<dyn StructuralFieldScheduler>)
791+
}
777792
DataType::Map(_, keys_sorted) => {
778793
// TODO: We only support keys_sorted=false for now,
779794
// because converting a rust arrow map field to the python arrow field will

rust/lance-encoding/src/encoder.rs

Lines changed: 54 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ use crate::compression::{CompressionStrategy, DefaultCompressionStrategy};
2929
use crate::compression_config::CompressionParams;
3030
use crate::decoder::PageEncoding;
3131
use crate::encodings::logical::blob::{BlobStructuralEncoder, BlobV2StructuralEncoder};
32+
use crate::encodings::logical::fixed_size_list::FixedSizeListStructuralEncoder;
3233
use crate::encodings::logical::list::ListStructuralEncoder;
3334
use crate::encodings::logical::map::MapStructuralEncoder;
3435
use crate::encodings::logical::primitive::PrimitiveStructuralEncoder;
@@ -345,37 +346,39 @@ impl StructuralEncodingStrategy {
345346
}
346347

347348
fn is_primitive_type(data_type: &DataType) -> bool {
348-
matches!(
349-
data_type,
350-
DataType::Boolean
351-
| DataType::Date32
352-
| DataType::Date64
353-
| DataType::Decimal128(_, _)
354-
| DataType::Decimal256(_, _)
355-
| DataType::Duration(_)
356-
| DataType::Float16
357-
| DataType::Float32
358-
| DataType::Float64
359-
| DataType::Int16
360-
| DataType::Int32
361-
| DataType::Int64
362-
| DataType::Int8
363-
| DataType::Interval(_)
364-
| DataType::Null
365-
| DataType::Time32(_)
366-
| DataType::Time64(_)
367-
| DataType::Timestamp(_, _)
368-
| DataType::UInt16
369-
| DataType::UInt32
370-
| DataType::UInt64
371-
| DataType::UInt8
372-
| DataType::FixedSizeBinary(_)
373-
| DataType::FixedSizeList(_, _)
374-
| DataType::Binary
375-
| DataType::LargeBinary
376-
| DataType::Utf8
377-
| DataType::LargeUtf8,
378-
)
349+
match data_type {
350+
DataType::FixedSizeList(inner, _) => Self::is_primitive_type(inner.data_type()),
351+
_ => matches!(
352+
data_type,
353+
DataType::Boolean
354+
| DataType::Date32
355+
| DataType::Date64
356+
| DataType::Decimal128(_, _)
357+
| DataType::Decimal256(_, _)
358+
| DataType::Duration(_)
359+
| DataType::Float16
360+
| DataType::Float32
361+
| DataType::Float64
362+
| DataType::Int16
363+
| DataType::Int32
364+
| DataType::Int64
365+
| DataType::Int8
366+
| DataType::Interval(_)
367+
| DataType::Null
368+
| DataType::Time32(_)
369+
| DataType::Time64(_)
370+
| DataType::Timestamp(_, _)
371+
| DataType::UInt16
372+
| DataType::UInt32
373+
| DataType::UInt64
374+
| DataType::UInt8
375+
| DataType::FixedSizeBinary(_)
376+
| DataType::Binary
377+
| DataType::LargeBinary
378+
| DataType::Utf8
379+
| DataType::LargeUtf8,
380+
),
381+
}
379382
}
380383

381384
fn do_create_field_encoder(
@@ -450,6 +453,26 @@ impl StructuralEncodingStrategy {
450453
child_encoder,
451454
)))
452455
}
456+
DataType::FixedSizeList(inner, _)
457+
if matches!(inner.data_type(), DataType::Struct(_)) =>
458+
{
459+
// Complex FixedSizeList needs structural encoding
460+
let child = field
461+
.children
462+
.first()
463+
.expect("FixedSizeList should have a child");
464+
let child_encoder = self.do_create_field_encoder(
465+
_encoding_strategy_root,
466+
child,
467+
column_index,
468+
options,
469+
root_field_metadata,
470+
)?;
471+
Ok(Box::new(FixedSizeListStructuralEncoder::new(
472+
options.keep_original_array,
473+
child_encoder,
474+
)))
475+
}
453476
DataType::Map(_, keys_sorted) => {
454477
// TODO: We only support keys_sorted=false for now,
455478
// because converting a rust arrow map field to the python arrow field will

rust/lance-encoding/src/encodings/logical.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// SPDX-FileCopyrightText: Copyright The Lance Authors
33

44
pub mod blob;
5+
pub mod fixed_size_list;
56
pub mod list;
67
pub mod map;
78
pub mod primitive;

0 commit comments

Comments
 (0)