@@ -28,7 +28,7 @@ use arrow::{downcast_dictionary_array, downcast_primitive_array};
2828use crate :: cast:: {
2929 as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
3030 as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
31- as_string_array, as_string_view_array, as_struct_array,
31+ as_string_array, as_string_view_array, as_struct_array, as_union_array ,
3232} ;
3333use crate :: error:: Result ;
3434#[ cfg( not( feature = "force_hash_collisions" ) ) ]
@@ -329,6 +329,40 @@ where
329329 Ok ( ( ) )
330330}
331331
332+ #[ cfg( not( feature = "force_hash_collisions" ) ) ]
333+ fn hash_union_array (
334+ array : & UnionArray ,
335+ random_state : & RandomState ,
336+ hashes_buffer : & mut [ u64 ] ,
337+ ) -> Result < ( ) > {
338+ let DataType :: Union ( union_fields, _mode) = array. data_type ( ) else {
339+ unreachable ! ( )
340+ } ;
341+
342+ let mut child_hashes = vec ! [ None ; 128 ] ;
343+ for ( type_id, _field) in union_fields. iter ( ) {
344+ let child = array. child ( type_id) ;
345+ let mut child_hash_buffer = vec ! [ 0 ; child. len( ) ] ;
346+ create_hashes ( [ child] , random_state, & mut child_hash_buffer) ?;
347+
348+ child_hashes[ type_id as usize ] = Some ( child_hash_buffer) ;
349+ }
350+
351+ #[ allow( clippy:: needless_range_loop) ]
352+ for i in 0 ..array. len ( ) {
353+ let type_id = array. type_id ( i) ;
354+ let child_offset = array. value_offset ( i) ;
355+
356+ let child_hash = & child_hashes[ type_id as usize ]
357+ . as_ref ( )
358+ . expect ( "invalid type_id" ) ;
359+
360+ hashes_buffer[ i] = child_hash[ child_offset] ;
361+ }
362+
363+ Ok ( ( ) )
364+ }
365+
332366#[ cfg( not( feature = "force_hash_collisions" ) ) ]
333367fn hash_fixed_list_array (
334368 array : & FixedSizeListArray ,
@@ -409,6 +443,10 @@ fn hash_single_array(
409443 let array = as_fixed_size_list_array( array) ?;
410444 hash_fixed_list_array( array, random_state, hashes_buffer) ?;
411445 }
446+ DataType :: Union ( _, _) => {
447+ let array = as_union_array( array) ?;
448+ hash_union_array( array, random_state, hashes_buffer) ?;
449+ }
412450 _ => {
413451 // This is internal because we should have caught this before.
414452 return _internal_err!(
@@ -1000,4 +1038,83 @@ mod tests {
10001038
10011039 assert_eq ! ( hashes1, hashes2) ;
10021040 }
1041+
1042+ #[ test]
1043+ #[ cfg( not( feature = "force_hash_collisions" ) ) ]
1044+ fn create_hashes_for_sparse_union_arrays ( ) {
1045+ // Create a sparse union array with int and string types
1046+ // In sparse mode, row i uses child_array[i]
1047+ // Logical array: [int(5), str("foo"), int(10), int(5)]
1048+ let int_array = Int32Array :: from ( vec ! [ Some ( 5 ) , None , Some ( 10 ) , Some ( 5 ) ] ) ;
1049+ let str_array = StringArray :: from ( vec ! [ None , Some ( "foo" ) , None , None ] ) ;
1050+
1051+ let type_ids = vec ! [ 0_i8 , 1 , 0 , 0 ] . into ( ) ;
1052+ let children = vec ! [
1053+ Arc :: new( int_array) as ArrayRef ,
1054+ Arc :: new( str_array) as ArrayRef ,
1055+ ] ;
1056+
1057+ let union_fields = [
1058+ ( 0 , Arc :: new ( Field :: new ( "a" , DataType :: Int32 , true ) ) ) ,
1059+ ( 1 , Arc :: new ( Field :: new ( "b" , DataType :: Utf8 , true ) ) ) ,
1060+ ]
1061+ . into_iter ( )
1062+ . collect ( ) ;
1063+
1064+ let array = UnionArray :: try_new ( union_fields, type_ids, None , children) . unwrap ( ) ;
1065+ let array_ref = Arc :: new ( array) as ArrayRef ;
1066+
1067+ let random_state = RandomState :: with_seeds ( 0 , 0 , 0 , 0 ) ;
1068+ let mut hashes = vec ! [ 0 ; array_ref. len( ) ] ;
1069+ create_hashes ( & [ array_ref] , & random_state, & mut hashes) . unwrap ( ) ;
1070+
1071+ // Rows 0 and 3 both have type_id=0 (int) with value 5
1072+ assert_eq ! ( hashes[ 0 ] , hashes[ 3 ] ) ;
1073+ // Row 0 (int 5) vs Row 2 (int 10) - different values
1074+ assert_ne ! ( hashes[ 0 ] , hashes[ 2 ] ) ;
1075+ // Row 0 (int) vs Row 1 (string) - different types
1076+ assert_ne ! ( hashes[ 0 ] , hashes[ 1 ] ) ;
1077+ }
1078+
1079+ #[ test]
1080+ #[ cfg( not( feature = "force_hash_collisions" ) ) ]
1081+ fn create_hashes_for_dense_union_arrays ( ) {
1082+ // creates a dense union array with int and string types
1083+ // [67, "norm", 100, "macdonald", 67]
1084+ let int_array = Int32Array :: from ( vec ! [ 67 , 100 , 67 ] ) ;
1085+ let str_array = StringArray :: from ( vec ! [ "norm" , "macdonald" ] ) ;
1086+
1087+ let type_ids = vec ! [ 0 , 1 , 0 , 1 , 0 ] . into ( ) ;
1088+ let offsets = vec ! [ 0 , 0 , 1 , 1 , 2 ] . into ( ) ;
1089+ let children = vec ! [
1090+ Arc :: new( int_array) as ArrayRef ,
1091+ Arc :: new( str_array) as ArrayRef ,
1092+ ] ;
1093+
1094+ let union_fields = [
1095+ ( 0 , Arc :: new ( Field :: new ( "a" , DataType :: Int32 , false ) ) ) ,
1096+ ( 1 , Arc :: new ( Field :: new ( "b" , DataType :: Utf8 , false ) ) ) ,
1097+ ]
1098+ . into_iter ( )
1099+ . collect ( ) ;
1100+
1101+ let array =
1102+ UnionArray :: try_new ( union_fields, type_ids, Some ( offsets) , children) . unwrap ( ) ;
1103+ let array_ref = Arc :: new ( array) as ArrayRef ;
1104+
1105+ let random_state = RandomState :: with_seeds ( 0 , 0 , 0 , 0 ) ;
1106+ let mut hashes = vec ! [ 0 ; array_ref. len( ) ] ;
1107+ create_hashes ( & [ array_ref] , & random_state, & mut hashes) . unwrap ( ) ;
1108+
1109+ // 67 vs "norm"
1110+ assert_ne ! ( hashes[ 0 ] , hashes[ 1 ] ) ;
1111+ // 67 vs 100
1112+ assert_ne ! ( hashes[ 0 ] , hashes[ 2 ] ) ;
1113+ // "norm" vs "macdonald"
1114+ assert_ne ! ( hashes[ 1 ] , hashes[ 3 ] ) ;
1115+ // 100 vs "macdonald"
1116+ assert_ne ! ( hashes[ 2 ] , hashes[ 3 ] ) ;
1117+ // 67 vs 67
1118+ assert_eq ! ( hashes[ 0 ] , hashes[ 4 ] ) ;
1119+ }
10031120}
0 commit comments