Skip to content

Commit f4fdf54

Browse files
authored
[Improvement](hash) Remove nullable when _serialize_null_into_key is false and add UInt72 (#58316)
Performance (TPC-DS): q97 8.8s -> 8.3s, q2 9s -> 8.7s.

This pull request adds support for a new fixed-width hash key type, `UInt72`, across the codebase. This enables more efficient handling of hash keys that fit in 72 bits (wider than a `UInt64` but no larger than `sizeof(UInt72)`), which would otherwise be promoted to the 128-bit key type, in various data processing components, including aggregation, joins, sets, partitioning, and dictionary hash maps. The changes involve updating type variants, hash key type detection, and hash functions to accommodate the new type.

### Hash Key Type Support

* Introduced the new `UInt72` struct (defined in `uint128.h`) and added it to the hash key type enumeration (`HashKeyType::fixed72`) and type detection logic in `hash_key_type.h`, allowing the system to recognize and use 72-bit hash keys. [[1]](diffhunk://#diff-0dea38f1f0f0f99ad74d97d77e100557d743ad599b3f5f75c825baf9c13ecdbfR64-R72) [[2]](diffhunk://#diff-4f1fb8a89cd0e13a719c3427b1ae7581b42cb7325755a3ceac4c44bdc64bd144R40) [[3]](diffhunk://#diff-4f1fb8a89cd0e13a719c3427b1ae7581b42cb7325755a3ceac4c44bdc64bd144R63-R64)

### Variant and Method Updates

* Added `UInt72`-based variants to all major hash table, aggregation, distinct, set, partition, and dictionary hash map method variant definitions and their corresponding initialization logic, ensuring that all relevant components can utilize the new key type. [[1]](diffhunk://#diff-50d8f62236d4e1f81d52e945edee5377b7b22d52e04128eea2c8b7f679b37254R84-R87) [[2]](diffhunk://#diff-50d8f62236d4e1f81d52e945edee5377b7b22d52e04128eea2c8b7f679b37254R141-R144) [[3]](diffhunk://#diff-62ad0a1cb1b62de5393935298725cfd2e9766215bdd7653d84cd1fd5e7f59fe3R108-R111) [[4]](diffhunk://#diff-62ad0a1cb1b62de5393935298725cfd2e9766215bdd7653d84cd1fd5e7f59fe3R160-R163) [[5]](diffhunk://#diff-66cf4052118abf5abbef2e0d9193df3c35a46f70db35853c5884d56d4118a963L69-R70) [[6]](diffhunk://#diff-66cf4052118abf5abbef2e0d9193df3c35a46f70db35853c5884d56d4118a963R107-R110) [[7]](diffhunk://#diff-c557434b23ebbb39ef2851b7926d61af5be4bf8f56b83a92b98f9a574f805a90R143-R146) [[8]](diffhunk://#diff-c557434b23ebbb39ef2851b7926d61af5be4bf8f56b83a92b98f9a574f805a90R203-R206) [[9]](diffhunk://#diff-8b095a1e764b3856129d9fd06fb9122a7e9eb16bc5c293d8dcaa4ff841a587edR70) [[10]](diffhunk://#diff-8b095a1e764b3856129d9fd06fb9122a7e9eb16bc5c293d8dcaa4ff841a587edR109-R112) [[11]](diffhunk://#diff-60243aa7720001b0983bd282c74f77c8a8542a9a6fed08d80061c4f25847b650L50-R52) [[12]](diffhunk://#diff-60243aa7720001b0983bd282c74f77c8a8542a9a6fed08d80061c4f25847b650R87-R89)

### Hash Function Implementation

* Implemented a specialized CRC32 hash function for `UInt72` in `hash.h`, ensuring proper hashing behavior for the new type.

### Code Generation and Instantiation

* Updated template instantiations and code generation macros to include `FixedKeyHashTableContext<vectorized::UInt72>`, ensuring that join and hash table probing logic supports the new key type.

### Minor Logic Adjustment

* Refactored build key column handling in hash join to correctly manage nullable and non-nullable types in `hashjoin_build_sink.cpp`: when `_serialize_null_into_key[i]` is true the key data type is made nullable as before; otherwise the nullable wrapper is now removed from the key data type, because in that case null values are represented by the null map rather than inside the key.
1 parent 1bb4669 commit f4fdf54

File tree

13 files changed

+79
-19
lines changed

13 files changed

+79
-19
lines changed

be/src/pipeline/common/agg_utils.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,10 @@ using AggregatedMethodVariants = std::variant<
8181
vectorized::MethodSingleNullableColumn<
8282
vectorized::MethodStringNoCache<AggregatedDataWithNullableShortStringKey>>,
8383
vectorized::MethodKeysFixed<AggData<vectorized::UInt64>>,
84+
vectorized::MethodKeysFixed<AggData<vectorized::UInt72>>,
8485
vectorized::MethodKeysFixed<AggData<vectorized::UInt128>>,
85-
vectorized::MethodKeysFixed<AggData<vectorized::UInt256>>,
86-
vectorized::MethodKeysFixed<AggData<vectorized::UInt136>>>;
86+
vectorized::MethodKeysFixed<AggData<vectorized::UInt136>>,
87+
vectorized::MethodKeysFixed<AggData<vectorized::UInt256>>>;
8788

8889
struct AggregatedDataVariants
8990
: public DataVariants<AggregatedMethodVariants, vectorized::MethodSingleNullableColumn,
@@ -137,6 +138,10 @@ struct AggregatedDataVariants
137138
method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt64>>>(
138139
get_key_sizes(data_types));
139140
break;
141+
case HashKeyType::fixed72:
142+
method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt72>>>(
143+
get_key_sizes(data_types));
144+
break;
140145
case HashKeyType::fixed128:
141146
method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt128>>>(
142147
get_key_sizes(data_types));

be/src/pipeline/common/distinct_agg_utils.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,9 +105,10 @@ using DistinctMethodVariants = std::variant<
105105
vectorized::MethodSingleNullableColumn<vectorized::MethodStringNoCache<
106106
vectorized::DataWithNullKey<DistinctDataWithShortStringKey>>>,
107107
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt64>>,
108+
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>,
108109
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>,
109-
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt256>>,
110-
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt136>>>;
110+
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt136>>,
111+
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt256>>>;
111112

112113
struct DistinctDataVariants
113114
: public DataVariants<DistinctMethodVariants, vectorized::MethodSingleNullableColumn,
@@ -156,6 +157,10 @@ struct DistinctDataVariants
156157
method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt64>>>(
157158
get_key_sizes(data_types));
158159
break;
160+
case HashKeyType::fixed72:
161+
method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>>(
162+
get_key_sizes(data_types));
163+
break;
159164
case HashKeyType::fixed128:
160165
method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>>(
161166
get_key_sizes(data_types));

be/src/pipeline/common/join_utils.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ using HashTableVariants = std::variant<
6666
DirectPrimaryTypeHashTableContext<vectorized::UInt32>,
6767
DirectPrimaryTypeHashTableContext<vectorized::UInt64>,
6868
DirectPrimaryTypeHashTableContext<vectorized::UInt128>,
69-
FixedKeyHashTableContext<vectorized::UInt64>, FixedKeyHashTableContext<vectorized::UInt128>,
69+
FixedKeyHashTableContext<vectorized::UInt64>, FixedKeyHashTableContext<vectorized::UInt72>,
70+
FixedKeyHashTableContext<vectorized::UInt128>,
7071
FixedKeyHashTableContext<vectorized::UInt136>,
7172
FixedKeyHashTableContext<vectorized::UInt256>, MethodOneString>;
7273

@@ -103,6 +104,10 @@ struct JoinDataVariants {
103104
method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt64>>(
104105
get_key_sizes(data_types));
105106
break;
107+
case HashKeyType::fixed72:
108+
method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt72>>(
109+
get_key_sizes(data_types));
110+
break;
106111
case HashKeyType::fixed128:
107112
method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt128>>(
108113
get_key_sizes(data_types));

be/src/pipeline/common/partition_sort_utils.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,9 +140,10 @@ using PartitionedMethodVariants = std::variant<
140140
PartitionDataSingleNullable<vectorized::UInt128>,
141141
PartitionDataSingleNullable<vectorized::UInt256>,
142142
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt64>>,
143+
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt72>>,
143144
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>,
144-
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt256>>,
145145
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt136>>,
146+
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt256>>,
146147
vectorized::MethodStringNoCache<PartitionDataWithShortStringKey>,
147148
vectorized::MethodSingleNullableColumn<vectorized::MethodStringNoCache<
148149
vectorized::DataWithNullKey<PartitionDataWithShortStringKey>>>>;
@@ -199,6 +200,10 @@ struct PartitionedHashMapVariants
199200
method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt64>>>(
200201
get_key_sizes(data_types));
201202
break;
203+
case HashKeyType::fixed72:
204+
method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt72>>>(
205+
get_key_sizes(data_types));
206+
break;
202207
case HashKeyType::fixed128:
203208
method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>>(
204209
get_key_sizes(data_types));

be/src/pipeline/common/set_utils.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ using SetHashTableVariants =
6767
SetPrimaryTypeHashTableContext<vectorized::UInt128>,
6868
SetPrimaryTypeHashTableContext<vectorized::UInt256>,
6969
SetFixedKeyHashTableContext<vectorized::UInt64>,
70+
SetFixedKeyHashTableContext<vectorized::UInt72>,
7071
SetFixedKeyHashTableContext<vectorized::UInt128>,
7172
SetFixedKeyHashTableContext<vectorized::UInt256>,
7273
SetFixedKeyHashTableContext<vectorized::UInt136>>;
@@ -105,6 +106,10 @@ struct SetDataVariants
105106
method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt64>>(
106107
get_key_sizes(data_types));
107108
break;
109+
case HashKeyType::fixed72:
110+
method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt72>>(
111+
get_key_sizes(data_types));
112+
break;
108113
case HashKeyType::fixed128:
109114
method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt128>>(
110115
get_key_sizes(data_types));

be/src/pipeline/exec/hashjoin_build_sink.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -450,9 +450,11 @@ Status HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state,
450450
/// For 'null safe equal' join,
451451
/// the build key column maybe be converted to nullable from non-nullable.
452452
if (p._serialize_null_into_key[i]) {
453-
data_type = vectorized::make_nullable(data_type);
453+
data_types.emplace_back(vectorized::make_nullable(data_type));
454+
} else {
455+
// in this case, we use nullmap to represent null value
456+
data_types.emplace_back(vectorized::remove_nullable(data_type));
454457
}
455-
data_types.emplace_back(std::move(data_type));
456458
}
457459
if (_build_expr_ctxs.size() == 1) {
458460
p._should_keep_hash_key_column = true;

be/src/pipeline/exec/join/process_hash_table_probe_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,7 @@ struct ExtractType<T(U)> {
803803
INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext<vectorized::UInt128>)); \
804804
INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext<vectorized::UInt256>)); \
805805
INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt64>)); \
806+
INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt72>)); \
806807
INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt128>)); \
807808
INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt136>)); \
808809
INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt256>)); \

be/src/vec/common/hash_table/hash.h

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -189,20 +189,27 @@ struct HashCRC32<wide::Int256> {
189189
}
190190
};
191191

192+
#include "common/compile_check_avoid_begin.h"
193+
194+
template <>
195+
struct HashCRC32<doris::vectorized::UInt72> {
196+
size_t operator()(const doris::vectorized::UInt72& x) const {
197+
doris::vectorized::UInt64 crc = -1ULL;
198+
crc = _mm_crc32_u8(crc, x.a);
199+
crc = _mm_crc32_u64(crc, x.b);
200+
return crc;
201+
}
202+
};
203+
192204
template <>
193205
struct HashCRC32<doris::vectorized::UInt136> {
194206
size_t operator()(const doris::vectorized::UInt136& x) const {
195-
#if defined(__SSE4_2__) || defined(__aarch64__)
196207
doris::vectorized::UInt64 crc = -1ULL;
197-
#include "common/compile_check_avoid_begin.h"
198-
//_mm_crc32_u8 does not provide a u64 interface, so there is an unavoidable conversion from u64 to u32 here.
199208
crc = _mm_crc32_u8(crc, x.a);
200-
#include "common/compile_check_avoid_end.h"
201209
crc = _mm_crc32_u64(crc, x.b);
202210
crc = _mm_crc32_u64(crc, x.c);
203211
return crc;
204-
#else
205-
return Hash128to64({Hash128to64({x.a, x.b}), x.c});
206-
#endif
207212
}
208213
};
214+
215+
#include "common/compile_check_avoid_end.h"

be/src/vec/common/hash_table/hash_key_type.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ enum class HashKeyType {
3737
int256_key,
3838
string_key,
3939
fixed64,
40+
fixed72,
4041
fixed128,
4142
fixed136,
4243
fixed256
@@ -59,6 +60,8 @@ inline HashKeyType get_hash_key_type_with_fixed(size_t size) {
5960
using namespace vectorized;
6061
if (size <= sizeof(UInt64)) {
6162
return HashKeyType::fixed64;
63+
} else if (size <= sizeof(UInt72)) {
64+
return HashKeyType::fixed72;
6265
} else if (size <= sizeof(UInt128)) {
6366
return HashKeyType::fixed128;
6467
} else if (size <= sizeof(UInt136)) {

be/src/vec/common/uint128.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,15 @@ struct UInt128TrivialHash {
6161

6262
using UInt256 = wide::UInt256;
6363

64+
#pragma pack(1)
65+
struct UInt72 {
66+
UInt8 a;
67+
UInt64 b;
68+
69+
bool operator==(const UInt72& rhs) const { return a == rhs.a && b == rhs.b; }
70+
};
71+
#pragma pack()
72+
6473
#pragma pack(1)
6574
struct UInt136 {
6675
UInt8 a;

0 commit comments

Comments
 (0)