Skip to content

Commit b0590f4

Browse files
daltonbohning and tanabarr
authored and committed
DAOS-18428 test: verify auto recovery policy
Test-tag: RbldAutoRecoveryPolicy
Test-repeat: 10
Skip-unit-tests: true
Skip-fault-injection-test: true

Signed-off-by: Dalton Bohning <dalton.bohning@hpe.com>
1 parent b9c419d commit b0590f4

7 files changed

Lines changed: 586 additions & 8 deletions

File tree

src/control/lib/control/system.go

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
//
22
// (C) Copyright 2020-2024 Intel Corporation.
3-
// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
3+
// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
44
//
55
// SPDX-License-Identifier: BSD-2-Clause-Patent
66
//
@@ -1318,6 +1318,7 @@ func SystemRebuildManage(ctx context.Context, rpcClient UnaryInvoker, req *Syste
13181318
type SystemSelfHealEvalReq struct {
13191319
unaryRequest
13201320
msRequest
1321+
retryableRequest
13211322
}
13221323

13231324
// SystemSelfHealEvalResp contains the response.
@@ -1341,6 +1342,10 @@ func SystemSelfHealEval(ctx context.Context, rpcClient UnaryInvoker, req *System
13411342
req.setRPC(func(ctx context.Context, conn *grpc.ClientConn) (proto.Message, error) {
13421343
return mgmtpb.NewMgmtSvcClient(conn).SystemSelfHealEval(ctx, pbReq)
13431344
})
1345+
req.retryTestFn = func(err error, _ uint) bool {
1346+
return (system.IsUnavailable(err) || IsRetryableConnErr(err) ||
1347+
system.IsNotLeader(err) || system.IsNotReplica(err))
1348+
}
13441349

13451350
rpcClient.Debugf("DAOS system self-heal eval request: %s", pbUtil.Debug(pbReq))
13461351
ur, err := rpcClient.InvokeUnaryRPC(ctx, req)

src/control/system/errors.go

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
//
22
// (C) Copyright 2020-2024 Intel Corporation.
3-
// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
3+
// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
44
//
55
// SPDX-License-Identifier: BSD-2-Clause-Patent
66
//
@@ -17,6 +17,8 @@ import (
1717
"github.com/pkg/errors"
1818

1919
"github.com/daos-stack/daos/src/control/build"
20+
"github.com/daos-stack/daos/src/control/fault"
21+
"github.com/daos-stack/daos/src/control/fault/code"
2022
"github.com/daos-stack/daos/src/control/lib/ranklist"
2123
)
2224

@@ -25,6 +27,7 @@ var (
2527
ErrRaftUnavail = errors.New("raft service unavailable (not started yet?)")
2628
ErrUninitialized = errors.New("system is uninitialized (storage format required?)")
2729
ErrLeaderStepUpInProgress = errors.New("leader step-up in progress (try again)")
30+
ErrEngineNotStarted = errors.New("instance not started or not responding on dRPC")
2831
)
2932

3033
// IsNotReady is a convenience function for checking if an error
@@ -39,8 +42,10 @@ func IsUnavailable(err error) bool {
3942
if err == nil {
4043
return false
4144
}
42-
cause := errors.Cause(err).Error()
43-
return strings.Contains(cause, ErrRaftUnavail.Error()) || strings.Contains(cause, ErrLeaderStepUpInProgress.Error())
45+
cause := errors.Cause(err)
46+
return strings.Contains(cause.Error(), ErrRaftUnavail.Error()) ||
47+
strings.Contains(cause.Error(), ErrLeaderStepUpInProgress.Error()) ||
48+
fault.IsFaultCode(cause, code.ServerDataPlaneNotStarted)
4449
}
4550

4651
// IsEmptyGroupMap returns a boolean indicating whether or not the

0 commit comments

Comments
 (0)