Skip to content

Commit e53b03e

Browse files
committed
[#29343] YSQL: Make sure each batch has enough of PgsqlOps
Summary: We could not reproduce the issue reported by #29343 in a controlled environment, however after core dump analysis we figured out that while making batches to fetch data from the base table by ybctids the PgDocReadOp instance did not have sufficient number PgsqlOps/ReadReqs. There was one PgsqlOp/ReadReq while the table had 2 tablets. In 8fa242d where the issue was allegedly introduced we changed the logic of PgsqlOps/ReadReqs repopulation after cleanup between the batches. We create as many PgsqlOps/ReadReqs as there were in the previous batch, assuming that the number of the PgsqlOps/ReadReqs needed does not change between the batches. Apparently that assumption was incorrect, so here we recalculate the number for each batch and make sure we have that number of the PgsqlOps/ReadReqs created. Test Plan: Make sure there is no regression in Jenkins Since the issue was only reproduced in perf environment, rerun perf tests on the diff Reviewers: jason Reviewed By: jason Subscribers: yql Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D48246
1 parent 1dfa0f5 commit e53b03e

File tree

1 file changed

+19
-24
lines changed

1 file changed

+19
-24
lines changed

src/yb/yql/pggate/pg_doc_op.cc

Lines changed: 19 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1235,11 +1235,9 @@ void PgDocReadOp::InitializeYbctidOperators() {
12351235
DCHECK(pgsql_ops_.empty());
12361236
pgsql_op_arena_ = SharedThreadSafeArena();
12371237
}
1238-
if (pgsql_ops_.empty()) {
1239-
ClonePgsqlOps(table_->GetPartitionListSize());
1240-
} else {
1241-
ResetInactivePgsqlOps();
1242-
}
1238+
ResetInactivePgsqlOps();
1239+
// Make sure we have one operation per partition
1240+
ClonePgsqlOps(table_->GetPartitionListSize());
12431241
}
12441242

12451243
Result<bool> PgDocReadOp::BindExprsRegular(
@@ -1346,27 +1344,27 @@ Result<bool> PgDocReadOp::PopulateNextHashPermutationOps() {
13461344
// Permutations and operations are reset between parallel ranges, in this case we need to clone
13471345
// a new set of operations.
13481346
if (!hash_permutations_) {
1347+
DCHECK(pgsql_ops_.empty());
13491348
hash_permutations_.emplace(InitializeHashPermutationStates());
1349+
// Setup arena to control memory footprint
1350+
if (!pgsql_op_arena_) {
1351+
pgsql_op_arena_ = SharedThreadSafeArena();
1352+
}
13501353
} else {
13511354
ResetInactivePgsqlOps();
13521355
}
1356+
13531357
// Continue with operations if any are still in flight
13541358
if (active_op_count_ > 0) {
13551359
return false;
13561360
}
13571361

1358-
// Setup arena to control memory footprint
1359-
if (!pgsql_op_arena_) {
1360-
pgsql_op_arena_ = SharedThreadSafeArena();
1361-
}
13621362
// Setup operations
1363-
if (pgsql_ops_.empty()) {
1364-
auto max_op_count = std::min(hash_permutations_->Size(),
1365-
IsHashBatchingEnabled() ?
1366-
table_->GetPartitionListSize() :
1367-
implicit_cast<size_t>(FLAGS_ysql_request_limit));
1368-
ClonePgsqlOps(max_op_count);
1369-
}
1363+
auto max_op_count = std::min(hash_permutations_->Size(),
1364+
IsHashBatchingEnabled() ?
1365+
table_->GetPartitionListSize() :
1366+
implicit_cast<size_t>(FLAGS_ysql_request_limit));
1367+
ClonePgsqlOps(max_op_count);
13701368
if (IsHashBatchingEnabled()) {
13711369
const auto batch_count = table_->GetPartitionList();
13721370
std::vector<std::pair<bool, LWPgsqlExpressionPB*>> partition_batches(
@@ -1650,15 +1648,12 @@ void PgDocReadOp::SetReadTimeForBackfill() {
16501648

16511649
void PgDocReadOp::ResetInactivePgsqlOps() {
16521650
auto op_count = pgsql_ops_.size();
1653-
if (pgsql_op_arena_ && active_op_count_ == 0) {
1651+
if (pgsql_op_arena_ && active_op_count_ == 0 && op_count > 0) {
16541652
// All past operations are done, can perform full reset to release memory
1655-
if (op_count > 0) {
1656-
VLOG_WITH_FUNC(3) << "do full reset";
1657-
ResetResultStream();
1658-
pgsql_ops_.clear();
1659-
pgsql_op_arena_->Reset(ResetMode::kKeepLast);
1660-
ClonePgsqlOps(op_count);
1661-
}
1653+
VLOG_WITH_FUNC(3) << "do full reset";
1654+
ResetResultStream();
1655+
pgsql_ops_.clear();
1656+
pgsql_op_arena_->Reset(ResetMode::kKeepLast);
16621657
return;
16631658
}
16641659
// Clear the existing requests.

0 commit comments

Comments
 (0)