Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 33 additions & 22 deletions src/chk/chk_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ chk_pending_free(struct btr_instance *tins, struct btr_record *rec, void *args)
ABT_mutex_unlock(cpr->cpr_mutex);
} else {
ABT_mutex_unlock(cpr->cpr_mutex);
chk_pending_destroy(cpr);
chk_pending_destroy(NULL, false, cpr);
}
}

Expand Down Expand Up @@ -899,6 +899,30 @@ chk_pool_shard_cleanup(struct chk_instance *ins)
}
}

int
chk_pending_lookup(struct chk_instance *ins, uint64_t seq, bool locked,
struct chk_pending_rec **cpr)
{
d_iov_t kiov;
d_iov_t riov;
int rc;

d_iov_set(&riov, NULL, 0);
d_iov_set(&kiov, &seq, sizeof(seq));

if (!locked)
ABT_rwlock_rdlock(ins->ci_abt_lock);
rc = dbtree_lookup(ins->ci_pending_hdl, &kiov, &riov);
if (!locked)
ABT_rwlock_unlock(ins->ci_abt_lock);
if (rc == 0)
*cpr = (struct chk_pending_rec *)riov.iov_buf;
else
*cpr = NULL;

return rc;
}

int
chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head, uuid_t uuid,
uint64_t seq, uint32_t rank, uint32_t cla, struct chk_pending_rec **cpr)
Expand Down Expand Up @@ -951,14 +975,16 @@ chk_pending_del(struct chk_instance *ins, uint64_t seq, bool locked, struct chk_

if (!locked)
ABT_rwlock_wrlock(ins->ci_abt_lock);
rc = dbtree_delete(ins->ci_pending_hdl, BTR_PROBE_EQ, &kiov, &riov);
rc = dbtree_delete(ins->ci_pending_hdl, BTR_PROBE_EQ, &kiov, cpr == NULL ? NULL : &riov);
if (!locked)
ABT_rwlock_unlock(ins->ci_abt_lock);

if (rc == 0)
*cpr = (struct chk_pending_rec *)riov.iov_buf;
else
*cpr = NULL;
if (cpr != NULL) {
if (rc == 0)
*cpr = (struct chk_pending_rec *)riov.iov_buf;
else
*cpr = NULL;
}

D_CDEBUG(rc != 0, DLOG_ERR, DLOG_DBG,
"Del pending record with gen "DF_X64", seq "DF_X64": "DF_RC"\n",
Expand Down Expand Up @@ -996,28 +1022,13 @@ chk_pending_wakeup(struct chk_instance *ins, struct chk_pending_rec *cpr)
ABT_mutex_unlock(cpr->cpr_mutex);
} else {
ABT_mutex_unlock(cpr->cpr_mutex);
chk_pending_destroy(cpr);
chk_pending_destroy(ins, false, cpr);
}
}

return rc;
}

void
chk_pending_destroy(struct chk_pending_rec *cpr)
{
D_ASSERT(d_list_empty(&cpr->cpr_pool_link));
D_ASSERT(d_list_empty(&cpr->cpr_rank_link));

if (cpr->cpr_cond != ABT_COND_NULL)
ABT_cond_free(&cpr->cpr_cond);

if (cpr->cpr_mutex != ABT_MUTEX_NULL)
ABT_mutex_free(&cpr->cpr_mutex);

D_FREE(cpr);
}

int
chk_prop_prepare(d_rank_t leader, uint32_t flags, uint32_t policy_nr, struct chk_policy *policies,
d_rank_list_t *ranks, struct chk_property *prop)
Expand Down
43 changes: 21 additions & 22 deletions src/chk/chk_engine.c
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ chk_engine_post_repair(struct chk_pool_rec *cpr, int *result, bool update)
*result = 0;

if (*result != 0) {
chk_ins_set_fail(cpr->cpr_ins, cbk->cb_phase);
if (cpr->cpr_ins->ci_prop.cp_flags & CHK__CHECK_FLAG__CF_FAILOUT) {
cbk->cb_time.ct_stop_time = time(NULL);
cbk->cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_FAILED;
Expand Down Expand Up @@ -1198,10 +1199,13 @@ chk_engine_cont_target_label_empty(struct chk_cont_rec *ccr)
static inline bool
chk_engine_cont_cs_label_empty(struct chk_cont_rec *ccr)
{
if (daos_iov_empty(&ccr->ccr_label_cs))
d_iov_t *label = &ccr->ccr_label_cs;

if (daos_iov_empty(label))
return true;

if (strncmp(DAOS_PROP_NO_CO_LABEL, ccr->ccr_label_cs.iov_buf, DAOS_PROP_LABEL_MAX_LEN) == 0)
if (strlen(DAOS_PROP_NO_CO_LABEL) == label->iov_len &&
strncmp(DAOS_PROP_NO_CO_LABEL, label->iov_buf, label->iov_len) == 0)
return true;

return false;
Expand Down Expand Up @@ -1573,8 +1577,8 @@ chk_engine_cont_label_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg

ccr = riov.iov_buf;
if (ccr->ccr_label_prop == NULL ||
strncmp(key->iov_buf, ccr->ccr_label_prop->dpp_entries[0].dpe_str,
DAOS_PROP_LABEL_MAX_LEN) != 0)
key->iov_len != strlen(ccr->ccr_label_prop->dpp_entries[0].dpe_str) ||
strncmp(key->iov_buf, ccr->ccr_label_prop->dpp_entries[0].dpe_str, key->iov_len) != 0)
rc = daos_iov_copy(&ccr->ccr_label_cs, key);
else
ccr->ccr_label_checked = 1;
Expand Down Expand Up @@ -3147,13 +3151,12 @@ chk_engine_pool_mbs(uint64_t gen, uuid_t uuid, uint32_t phase, const char *label
static int
chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision)
{
struct chk_instance *ins = chk_engine;
struct chk_pending_rec *cpr = NULL;
struct chk_pending_rec *tmp = NULL;
struct chk_pool_rec *pool = NULL;
d_iov_t kiov;
d_iov_t riov;
int rc;
struct chk_instance *ins = chk_engine;
struct chk_pending_rec *cpr = NULL;
struct chk_pool_rec *pool = NULL;
d_iov_t kiov;
d_iov_t riov;
int rc;

D_ASSERT(cru->cru_pool != NULL);

Expand Down Expand Up @@ -3189,14 +3192,9 @@ chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision)
cru->cru_detail_nr, cru->cru_details, *seq);
if (unlikely(rc == -DER_AGAIN)) {
D_ASSERT(cru->cru_act == CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT);
D_ASSERT(cpr != NULL);

rc = chk_pending_del(ins, *seq, false, &tmp);
if (rc == 0)
D_ASSERT(tmp == NULL);
else if (rc != -DER_NONEXIST)
goto log;

chk_pending_destroy(cpr);
chk_pending_destroy(ins, false, cpr);
cpr = NULL;

goto new_seq;
Expand Down Expand Up @@ -3241,11 +3239,12 @@ chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision)
goto again;

out:
if (pool != NULL && pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING)
pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING;

if (cpr != NULL)
chk_pending_destroy(cpr);
chk_pending_destroy(ins, false, cpr);

if (pool != NULL && pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING &&
d_list_empty(&pool->cpr_pending_list))
pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING;

return rc;
}
Expand Down
24 changes: 22 additions & 2 deletions src/chk/chk_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,9 @@ int chk_pool_add_shard(daos_handle_t hdl, d_list_t *head, uuid_t uuid, d_rank_t

void chk_pool_shard_cleanup(struct chk_instance *ins);

int chk_pending_lookup(struct chk_instance *ins, uint64_t seq, bool locked,
struct chk_pending_rec **cpr);

int chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head, uuid_t uuid,
uint64_t seq, uint32_t rank, uint32_t cla, struct chk_pending_rec **cpr);

Expand All @@ -731,8 +734,6 @@ int chk_pending_del(struct chk_instance *ins, uint64_t seq, bool locked,

int chk_pending_wakeup(struct chk_instance *ins, struct chk_pending_rec *cpr);

void chk_pending_destroy(struct chk_pending_rec *cpr);

int chk_prop_prepare(d_rank_t leader, uint32_t flags, uint32_t policy_nr,
struct chk_policy *policies, d_rank_list_t *ranks, struct chk_property *prop);

Expand Down Expand Up @@ -960,6 +961,25 @@ chk_destroy_tree(daos_handle_t *toh, struct btr_root *root)
}
}

static inline void
chk_pending_destroy(struct chk_instance *ins, bool locked, struct chk_pending_rec *cpr)
{
if (d_list_empty(&cpr->cpr_pool_link)) {
D_ASSERT(d_list_empty(&cpr->cpr_rank_link));

if (cpr->cpr_cond != ABT_COND_NULL)
ABT_cond_free(&cpr->cpr_cond);

if (cpr->cpr_mutex != ABT_MUTEX_NULL)
ABT_mutex_free(&cpr->cpr_mutex);

D_FREE(cpr);
} else {
cpr->cpr_busy = 0;
chk_pending_del(ins, cpr->cpr_seq, locked, NULL);
}
}

static inline void
chk_destroy_pending_tree(struct chk_instance *ins)
{
Expand Down
31 changes: 17 additions & 14 deletions src/chk/chk_leader.c
Original file line number Diff line number Diff line change
Expand Up @@ -3392,12 +3392,10 @@ chk_leader_act_internal(struct chk_instance *ins, uint64_t seq, uint32_t act, bo
d_iov_t riov;
int rc;

rc = chk_pending_del(ins, seq, locked, &pending);
rc = chk_pending_lookup(ins, seq, locked, &pending);
if (rc != 0)
goto out;

D_ASSERT(pending->cpr_busy);

if (pending->cpr_on_leader) {
ABT_mutex_lock(pending->cpr_mutex);
/*
Expand All @@ -3410,24 +3408,30 @@ chk_leader_act_internal(struct chk_instance *ins, uint64_t seq, uint32_t act, bo

if (cla != NULL)
*cla = pending->cpr_class;

chk_pending_del(ins, seq, locked, &pending);
} else {
d_iov_set(&riov, NULL, 0);
d_iov_set(&kiov, pending->cpr_uuid, sizeof(uuid_t));
rc = dbtree_lookup(ins->ci_pool_hdl, &kiov, &riov);
if (rc == 0) {
if (rc == 0)
pool = (struct chk_pool_rec *)riov.iov_buf;
if (pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING)
pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING;
} else {
else
rc = 0;
}

/* For locked case, check engines have already processed related interaction. */
if (!locked)
rc = chk_act_remote(ins->ci_ranks, ins->ci_bk.cb_gen, seq,
pending->cpr_class, act, pending->cpr_rank, for_all);

chk_pending_destroy(pending);
if (rc == 0) {
chk_pending_destroy(ins, locked, pending);

if (pool != NULL &&
pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING &&
d_list_empty(&pool->cpr_pending_list))
pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING;
}
}

out:
Expand Down Expand Up @@ -3629,14 +3633,13 @@ chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision)
goto again;

out:
if ((rc != 0 || decision != NULL) && cpr != NULL)
chk_pending_destroy(ins, false, cpr);

if (pool != NULL && pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING &&
(rc != 0 || (cpr != NULL &&
cpr->cpr_action != CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT)))
d_list_empty(&pool->cpr_pending_list))
pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING;

if ((rc != 0 || decision != NULL) && cpr != NULL)
chk_pending_destroy(cpr);

return rc;
}

Expand Down
4 changes: 4 additions & 0 deletions src/chk/chk_upcall.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2022 Intel Corporation.
* (C) Copyright 2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -83,6 +84,9 @@ chk_report_upcall(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, int re
time_t tm = time(NULL);
int rc;

if (DAOS_FAIL_CHECK(DAOS_CHK_REPORT_FAILURE))
return -DER_IO;

report.seq = seq;
report.class_ = cla;
report.action = act;
Expand Down
3 changes: 2 additions & 1 deletion src/include/daos/common.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2015-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -924,6 +924,7 @@ enum {
#define DAOS_CHK_FAIL_REPORT_POOL1 (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0xb7)
#define DAOS_CHK_FAIL_REPORT_POOL2 (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0xb8)
#define DAOS_CHK_ENGINE_DEATH (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0xb9)
#define DAOS_CHK_REPORT_FAILURE (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0xbc)

#define DAOS_MGMT_FAIL_CREATE_QUERY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0xe0)

Expand Down
Loading
Loading