Skip to content

Commit d50a794

Browse files
[opt](ann index) Make chunk size of index train configurable (#58645)
### What problem does this PR solve? Previous PR: #57623. The current granularity for index training and data ingestion is hard-coded to 1M, which makes index construction unnecessarily slow in some scenarios. It should be configurable so that it can be reduced when appropriate. For example, when there are 1M vectors to add and the stream load batch size is set to 0.3M, the load is split into roughly 3 stream load requests. If a request carrying 0.3M vectors ends up with only 1 thread doing the add, the whole load process will be very slow. Typical CPU usage in that case looks like this: <img width="1902" height="552" alt="image" src="https://github.com/user-attachments/assets/65728e56-f333-4bd5-a54a-8c12d01668f1" /> We need to make the batch size configurable so that we can adjust it when needed. For example, when the batch size is set to 30K, we get a higher average CPU usage, like this: <img width="1890" height="554" alt="image" src="https://github.com/user-attachments/assets/7d664b0e-b017-4a2e-bed8-e40f56ff97b7" /> **The default value is still 1M; a small batch size will hurt the recall of the HNSW index.**
1 parent 7ce7925 commit d50a794

File tree

4 files changed

+18
-6
lines changed

4 files changed

+18
-6
lines changed

be/src/common/config.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1612,6 +1612,12 @@ DEFINE_mInt32(max_segment_partial_column_cache_size, "100");
16121612
DEFINE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction, "true");
16131613
DEFINE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction, "true");
16141614

1615+
// Chunk size for ANN/vector index building per training/adding batch
1616+
// 1M by default.
1617+
DEFINE_mInt64(ann_index_build_chunk_size, "1000000");
1618+
DEFINE_Validator(ann_index_build_chunk_size,
1619+
[](const int64_t config) -> bool { return config > 0; });
1620+
16151621
DEFINE_mBool(enable_wal_tde, "false");
16161622

16171623
DEFINE_mBool(print_stack_when_cache_miss, "false");

be/src/common/config.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1674,6 +1674,8 @@ DECLARE_mInt64(max_csv_line_reader_output_buffer_size);
16741674
DECLARE_Int32(omp_threads_limit);
16751675
// The capacity of segment partial column cache, used to cache column readers for each segment.
16761676
DECLARE_mInt32(max_segment_partial_column_cache_size);
1677+
// Chunk size for ANN/vector index building per training/adding batch
1678+
DECLARE_mInt64(ann_index_build_chunk_size);
16771679

16781680
DECLARE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction);
16791681
DECLARE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction);

be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ Status AnnIndexColumnWriter::init() {
7878
index_type, build_parameter.dim, metric_type, build_parameter.max_degree,
7979
build_parameter.ef_construction, quantizer);
8080

81-
size_t block_size = CHUNK_SIZE * build_parameter.dim;
81+
size_t block_size = AnnIndexColumnWriter::chunk_size() * build_parameter.dim;
8282
_float_array.reserve(block_size);
8383

8484
return Status::OK();
@@ -110,7 +110,7 @@ Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* val
110110

111111
const float* p = reinterpret_cast<const float*>(value_ptr);
112112

113-
const size_t full_elements = CHUNK_SIZE * dim;
113+
const size_t full_elements = AnnIndexColumnWriter::chunk_size() * dim;
114114
size_t remaining_elements = num_rows * dim;
115115
size_t src_offset = 0;
116116
while (remaining_elements > 0) {
@@ -122,8 +122,10 @@ Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* val
122122
remaining_elements -= elements_to_add;
123123

124124
if (_float_array.size() == full_elements) {
125-
RETURN_IF_ERROR(_vector_index->train(CHUNK_SIZE, _float_array.data()));
126-
RETURN_IF_ERROR(_vector_index->add(CHUNK_SIZE, _float_array.data()));
125+
RETURN_IF_ERROR(
126+
_vector_index->train(AnnIndexColumnWriter::chunk_size(), _float_array.data()));
127+
RETURN_IF_ERROR(
128+
_vector_index->add(AnnIndexColumnWriter::chunk_size(), _float_array.data()));
127129
_float_array.clear();
128130
}
129131
}

be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,13 @@ namespace doris::segment_v2 {
4040
#include "common/compile_check_begin.h"
4141
class AnnIndexColumnWriter : public IndexColumnWriter {
4242
public:
43+
static inline int64_t chunk_size() {
4344
#ifdef BE_TEST
44-
static constexpr int64_t CHUNK_SIZE = 10;
45+
return 10;
4546
#else
46-
static constexpr int64_t CHUNK_SIZE = 1'000'000;
47+
return config::ann_index_build_chunk_size;
4748
#endif
49+
}
4850
static constexpr const char* INDEX_TYPE = "index_type";
4951
static constexpr const char* METRIC_TYPE = "metric_type";
5052
static constexpr const char* DIM = "dim";

0 commit comments

Comments
 (0)