Skip to content

Commit c893dd9

Browse files
authored
[opt](memory) implement freeing memory column by column for partial update (apache#58275)
### What problem does this PR solve? #### Background Currently, in the partial-update scenario, the pages of the column writers are released only after all memtable blocks have been written. The process is as follows: 1. create all column_writer instances 2. append to all column_writer instances (significant memory expansion) 3. finish and free memory. #### Problem This implementation causes significant memory expansion when flushing a memtable (the more columns there are, the more pronounced it is), so a load can easily be cancelled by the memory manager when the table is very wide (e.g. 5000 columns). #### Solution This PR frees memory column by column for partial updates to solve the problem. The process is as follows: 1. create all column_writer instances 2. append to each column_writer and free its memory, column by column. #### Test results Tested on machines with 16 cores and 64 GB of memory, using a table with 5000 columns: - Before this PR, `VerticalSegmentWriter::_create_column_writer` accounts for nearly 20% of memory usage. - After the fix, `VerticalSegmentWriter::_create_column_writer` occupies hardly any memory.
1 parent 29428ff commit c893dd9

File tree

2 files changed

+20
-20
lines changed

2 files changed

+20
-20
lines changed

be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,19 @@ Status VerticalSegmentWriter::_probe_key_for_mow(
447447
return Status::OK();
448448
}
449449

450+
Status VerticalSegmentWriter::_finalize_column_writer_and_update_meta(size_t cid) {
451+
RETURN_IF_ERROR(_column_writers[cid]->finish());
452+
RETURN_IF_ERROR(_column_writers[cid]->write_data());
453+
454+
auto* column_meta = _column_writers[cid]->get_column_meta();
455+
column_meta->set_compressed_data_bytes(
456+
_column_writers[cid]->get_total_compressed_data_pages_bytes());
457+
column_meta->set_uncompressed_data_bytes(
458+
_column_writers[cid]->get_total_uncompressed_data_pages_bytes());
459+
column_meta->set_raw_data_bytes(_column_writers[cid]->get_raw_data_bytes());
460+
return Status::OK();
461+
}
462+
450463
Status VerticalSegmentWriter::_partial_update_preconditions_check(size_t row_pos,
451464
bool is_flexible_update) {
452465
if (!_is_mow()) {
@@ -536,6 +549,7 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da
536549
}
537550
RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(),
538551
data.num_rows));
552+
RETURN_IF_ERROR(_finalize_column_writer_and_update_meta(cid));
539553
}
540554

541555
bool has_default_or_nullable = false;
@@ -625,6 +639,7 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da
625639
}
626640
RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(),
627641
data.num_rows));
642+
RETURN_IF_ERROR(_finalize_column_writer_and_update_meta(cid));
628643
}
629644

630645
_num_rows_updated += stats.num_rows_updated;
@@ -722,6 +737,7 @@ Status VerticalSegmentWriter::_append_block_with_flexible_partial_content(
722737
RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(),
723738
data.num_rows));
724739
DCHECK(_column_writers[cid]->get_next_rowid() == _num_rows_written + data.num_rows);
740+
RETURN_IF_ERROR(_finalize_column_writer_and_update_meta(cid));
725741
}
726742

727743
// 5. generate read plan
@@ -767,6 +783,7 @@ Status VerticalSegmentWriter::_append_block_with_flexible_partial_content(
767783
RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(),
768784
data.num_rows));
769785
DCHECK(_column_writers[cid]->get_next_rowid() == _num_rows_written + data.num_rows);
786+
RETURN_IF_ERROR(_finalize_column_writer_and_update_meta(cid));
770787
}
771788

772789
_num_rows_updated += stats.num_rows_updated;
@@ -928,17 +945,6 @@ Status VerticalSegmentWriter::write_batch() {
928945
RETURN_IF_ERROR(_append_block_with_partial_content(data, full_block));
929946
}
930947
}
931-
for (auto& column_writer : _column_writers) {
932-
RETURN_IF_ERROR(column_writer->finish());
933-
RETURN_IF_ERROR(column_writer->write_data());
934-
935-
auto* column_meta = column_writer->get_column_meta();
936-
column_meta->set_compressed_data_bytes(
937-
column_writer->get_total_compressed_data_pages_bytes());
938-
column_meta->set_uncompressed_data_bytes(
939-
column_writer->get_total_uncompressed_data_pages_bytes());
940-
column_meta->set_raw_data_bytes(column_writer->get_raw_data_bytes());
941-
}
942948
return Status::OK();
943949
}
944950
// Row column should be filled here when it's a directly write from memtable
@@ -988,15 +994,7 @@ Status VerticalSegmentWriter::write_batch() {
988994
return Status::Error<DISK_REACH_CAPACITY_LIMIT>("disk {} exceed capacity limit.",
989995
_data_dir->path_hash());
990996
}
991-
RETURN_IF_ERROR(_column_writers[cid]->finish());
992-
RETURN_IF_ERROR(_column_writers[cid]->write_data());
993-
994-
auto* column_meta = _column_writers[cid]->get_column_meta();
995-
column_meta->set_compressed_data_bytes(
996-
_column_writers[cid]->get_total_compressed_data_pages_bytes());
997-
column_meta->set_uncompressed_data_bytes(
998-
_column_writers[cid]->get_total_uncompressed_data_pages_bytes());
999-
column_meta->set_raw_data_bytes(_column_writers[cid]->get_raw_data_bytes());
997+
RETURN_IF_ERROR(_finalize_column_writer_and_update_meta(cid));
1000998
}
1001999

10021000
for (auto& data : _batched_blocks) {

be/src/olap/rowset/segment_v2/vertical_segment_writer.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,8 @@ class VerticalSegmentWriter {
198198
vectorized::IOlapColumnDataAccessor* seq_column, size_t num_rows, bool need_sort);
199199
Status _generate_short_key_index(std::vector<vectorized::IOlapColumnDataAccessor*>& key_columns,
200200
size_t num_rows, const std::vector<size_t>& short_key_pos);
201+
Status _finalize_column_writer_and_update_meta(size_t cid);
202+
201203
bool _is_mow();
202204
bool _is_mow_with_cluster_key();
203205

0 commit comments

Comments
 (0)