Skip to content

Commit a68f0d8

Browse files
Hash UnionArrays
1 parent 2b3b220 commit a68f0d8

File tree

1 file changed

+118
-1
lines changed

1 file changed

+118
-1
lines changed

datafusion/common/src/hash_utils.rs

Lines changed: 118 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ use arrow::{downcast_dictionary_array, downcast_primitive_array};
2828
use crate::cast::{
2929
as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
3030
as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
31-
as_string_array, as_string_view_array, as_struct_array,
31+
as_string_array, as_string_view_array, as_struct_array, as_union_array,
3232
};
3333
use crate::error::Result;
3434
#[cfg(not(feature = "force_hash_collisions"))]
@@ -329,6 +329,40 @@ where
329329
Ok(())
330330
}
331331

332+
#[cfg(not(feature = "force_hash_collisions"))]
333+
fn hash_union_array(
334+
array: &UnionArray,
335+
random_state: &RandomState,
336+
hashes_buffer: &mut [u64],
337+
) -> Result<()> {
338+
let DataType::Union(union_fields, _mode) = array.data_type() else {
339+
unreachable!()
340+
};
341+
342+
let mut child_hashes = vec![None; 128];
343+
for (type_id, _field) in union_fields.iter() {
344+
let child = array.child(type_id);
345+
let mut child_hash_buffer = vec![0; child.len()];
346+
create_hashes([child], random_state, &mut child_hash_buffer)?;
347+
348+
child_hashes[type_id as usize] = Some(child_hash_buffer);
349+
}
350+
351+
#[allow(clippy::needless_range_loop)]
352+
for i in 0..array.len() {
353+
let type_id = array.type_id(i);
354+
let child_offset = array.value_offset(i);
355+
356+
let child_hash = &child_hashes[type_id as usize]
357+
.as_ref()
358+
.expect("invalid type_id");
359+
360+
hashes_buffer[i] = child_hash[child_offset];
361+
}
362+
363+
Ok(())
364+
}
365+
332366
#[cfg(not(feature = "force_hash_collisions"))]
333367
fn hash_fixed_list_array(
334368
array: &FixedSizeListArray,
@@ -409,6 +443,10 @@ fn hash_single_array(
409443
let array = as_fixed_size_list_array(array)?;
410444
hash_fixed_list_array(array, random_state, hashes_buffer)?;
411445
}
446+
DataType::Union(_, _) => {
447+
let array = as_union_array(array)?;
448+
hash_union_array(array, random_state, hashes_buffer)?;
449+
}
412450
_ => {
413451
// This is internal because we should have caught this before.
414452
return _internal_err!(
@@ -1000,4 +1038,83 @@ mod tests {
10001038

10011039
assert_eq!(hashes1, hashes2);
10021040
}
1041+
1042+
#[test]
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_sparse_union_arrays() {
    // Sparse union over an int child and a string child: both children span
    // the full length, and row `i` reads slot `i` of the child chosen by its
    // type id.
    // Logical contents: [int(5), str("foo"), int(10), int(5)]
    let ints = Int32Array::from(vec![Some(5), None, Some(10), Some(5)]);
    let strings = StringArray::from(vec![None, Some("foo"), None, None]);
    let children = vec![Arc::new(ints) as ArrayRef, Arc::new(strings) as ArrayRef];

    let fields = [
        (0, Arc::new(Field::new("a", DataType::Int32, true))),
        (1, Arc::new(Field::new("b", DataType::Utf8, true))),
    ]
    .into_iter()
    .collect();

    let type_ids = vec![0_i8, 1, 0, 0].into();

    let union = UnionArray::try_new(fields, type_ids, None, children).unwrap();
    let union_ref = Arc::new(union) as ArrayRef;

    let random_state = RandomState::with_seeds(0, 0, 0, 0);
    let mut hashes = vec![0; union_ref.len()];
    create_hashes(&[union_ref], &random_state, &mut hashes).unwrap();

    // Same type id (int) and same value (5) in rows 0 and 3.
    assert_eq!(hashes[0], hashes[3]);
    // Same type id, different values: int 5 (row 0) vs int 10 (row 2).
    assert_ne!(hashes[0], hashes[2]);
    // Different type ids: int (row 0) vs string (row 1).
    assert_ne!(hashes[0], hashes[1]);
}
1078+
1079+
#[test]
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_dense_union_arrays() {
    // Dense union over an int child and a string child: each child stores only
    // its own values and the offsets buffer maps rows to child slots.
    // Logical contents: [67, "norm", 100, "macdonald", 67]
    let ints = Int32Array::from(vec![67, 100, 67]);
    let strings = StringArray::from(vec!["norm", "macdonald"]);
    let children = vec![Arc::new(ints) as ArrayRef, Arc::new(strings) as ArrayRef];

    let fields = [
        (0, Arc::new(Field::new("a", DataType::Int32, false))),
        (1, Arc::new(Field::new("b", DataType::Utf8, false))),
    ]
    .into_iter()
    .collect();

    let type_ids = vec![0, 1, 0, 1, 0].into();
    let offsets = vec![0, 0, 1, 1, 2].into();

    let union = UnionArray::try_new(fields, type_ids, Some(offsets), children).unwrap();
    let union_ref = Arc::new(union) as ArrayRef;

    let random_state = RandomState::with_seeds(0, 0, 0, 0);
    let mut hashes = vec![0; union_ref.len()];
    create_hashes(&[union_ref], &random_state, &mut hashes).unwrap();

    // Row 0 (67) vs row 1 ("norm")
    assert_ne!(hashes[0], hashes[1]);
    // Row 0 (67) vs row 2 (100)
    assert_ne!(hashes[0], hashes[2]);
    // Row 1 ("norm") vs row 3 ("macdonald")
    assert_ne!(hashes[1], hashes[3]);
    // Row 2 (100) vs row 3 ("macdonald")
    assert_ne!(hashes[2], hashes[3]);
    // Row 0 (67) vs row 4 (67)
    assert_eq!(hashes[0], hashes[4]);
}
10031120
}

0 commit comments

Comments
 (0)