diff --git a/src/control/lib/control/system.go b/src/control/lib/control/system.go
index 5c5fd8e4eb0..3c77374d932 100644
--- a/src/control/lib/control/system.go
+++ b/src/control/lib/control/system.go
@@ -1,6 +1,6 @@
 //
 // (C) Copyright 2020-2024 Intel Corporation.
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
+// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -1318,6 +1318,7 @@ func SystemRebuildManage(ctx context.Context, rpcClient UnaryInvoker, req *Syste
 type SystemSelfHealEvalReq struct {
 	unaryRequest
 	msRequest
+	retryableRequest
 }
 
 // SystemSelfHealEvalResp contains the response.
@@ -1341,6 +1342,17 @@ func SystemSelfHealEval(ctx context.Context, rpcClient UnaryInvoker, req *System
 	req.setRPC(func(ctx context.Context, conn *grpc.ClientConn) (proto.Message, error) {
 		return mgmtpb.NewMgmtSvcClient(conn).SystemSelfHealEval(ctx, pbReq)
 	})
+	// Retry when the error is one that system.IsUnavailable,
+	// IsRetryableConnErr, system.IsNotLeader or system.IsNotReplica accepts.
+	req.retryTestFn = func(err error, _ uint) bool {
+		switch {
+		case system.IsUnavailable(err), IsRetryableConnErr(err),
+			system.IsNotLeader(err), system.IsNotReplica(err):
+			return true
+		default:
+			return false
+		}
+	}
 
 	rpcClient.Debugf("DAOS system self-heal eval request: %s", pbUtil.Debug(pbReq))
 	ur, err := rpcClient.InvokeUnaryRPC(ctx, req)
diff --git a/src/control/lib/control/system_test.go b/src/control/lib/control/system_test.go
index 7d12cc04529..d43db611704 100644
--- a/src/control/lib/control/system_test.go
+++ b/src/control/lib/control/system_test.go
@@ -1,6 +1,6 @@
 //
 // (C) Copyright 2020-2024 Intel Corporation.
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
+// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -19,6 +19,8 @@ import (
 	mgmtpb "github.com/daos-stack/daos/src/control/common/proto/mgmt"
 	sharedpb "github.com/daos-stack/daos/src/control/common/proto/shared"
 	"github.com/daos-stack/daos/src/control/common/test"
+	"github.com/daos-stack/daos/src/control/fault"
+	"github.com/daos-stack/daos/src/control/fault/code"
 	"github.com/daos-stack/daos/src/control/lib/hostlist"
 	"github.com/daos-stack/daos/src/control/lib/ranklist"
 	"github.com/daos-stack/daos/src/control/logging"
@@ -2075,3 +2077,90 @@ func TestControl_SystemSelfHealEval(t *testing.T) {
 		})
 	}
 }
+
+// TestControl_SystemSelfHealEval_RetryableErrors checks that the call succeeds
+// when the first MS response carries one of the listed errors and the second
+// response is a success.
+func TestControl_SystemSelfHealEval_RetryableErrors(t *testing.T) {
+	retryableErrs := map[string]error{
+		"system unavailable":     system.ErrRaftUnavail,
+		"leader step-up":         system.ErrLeaderStepUpInProgress,
+		"connection closed":      FaultConnectionClosed(""),
+		"connection refused":     FaultConnectionRefused(""),
+		"not leader":             &system.ErrNotLeader{LeaderHint: "host1", Replicas: []string{"host2"}},
+		"not replica":            &system.ErrNotReplica{Replicas: []string{"host1", "host2"}},
+		"data plane not started": &fault.Fault{Code: code.ServerDataPlaneNotStarted},
+	}
+
+	for name, firstErr := range retryableErrs {
+		t.Run(name, func(t *testing.T) {
+			logger, logBuf := logging.NewTestLogger(name)
+			defer test.ShowBufferOnFailure(t, logBuf)
+
+			// Queue the error under test first, followed by a success.
+			responses := []*UnaryResponse{
+				MockMSResponse("", firstErr, nil),
+				MockMSResponse("", nil, &mgmtpb.DaosResp{}),
+			}
+			invoker := NewMockInvoker(logger, &MockInvokerConfig{UnaryResponseSet: responses})
+
+			gotResp, gotErr := SystemSelfHealEval(test.Context(t), invoker, &SystemSelfHealEvalReq{})
+			if gotErr != nil {
+				t.Fatalf("unexpected error: %v", gotErr)
+			}
+
+			wantResp := &SystemSelfHealEvalResp{}
+			diff := cmp.Diff(wantResp, gotResp, cmpopts.IgnoreUnexported(SystemSelfHealEvalResp{}))
+			if diff != "" {
+				t.Fatalf("unexpected response (-want, +got):\n%s\n", diff)
+			}
+		})
+	}
+}
+
+// TestControl_SystemSelfHealEval_NonRetryableErrors checks that each listed
+// error in the first MS response is what the caller gets back, even though a
+// success response is queued behind it.
+func TestControl_SystemSelfHealEval_NonRetryableErrors(t *testing.T) {
+	type testCase struct {
+		testErr error
+		expErr  error
+	}
+
+	for name, tc := range map[string]testCase{
+		"system uninitialized": {
+			testErr: system.ErrUninitialized,
+			expErr:  system.ErrUninitialized,
+		},
+		"generic error": {
+			testErr: errors.New("something went wrong"),
+			expErr:  errors.New("something went wrong"),
+		},
+		"connection bad host": {
+			testErr: FaultConnectionBadHost("badhost"),
+			expErr:  FaultConnectionBadHost("badhost"),
+		},
+		"connection no route": {
+			testErr: FaultConnectionNoRoute("10.0.0.1"),
+			expErr:  FaultConnectionNoRoute("10.0.0.1"),
+		},
+		"member exists": {
+			testErr: system.ErrRankExists(1),
+			expErr:  system.ErrRankExists(1),
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			logger, logBuf := logging.NewTestLogger(name)
+			defer test.ShowBufferOnFailure(t, logBuf)
+
+			responses := []*UnaryResponse{
+				MockMSResponse("", tc.testErr, nil),
+				MockMSResponse("", nil, &mgmtpb.DaosResp{}),
+			}
+			invoker := NewMockInvoker(logger, &MockInvokerConfig{UnaryResponseSet: responses})
+
+			_, gotErr := SystemSelfHealEval(test.Context(t), invoker, &SystemSelfHealEvalReq{})
+			test.CmpErr(t, tc.expErr, gotErr)
+		})
+	}
+}
diff --git a/src/control/system/errors.go b/src/control/system/errors.go
index 509bee13906..335a255bf2f 100644
--- a/src/control/system/errors.go
+++ b/src/control/system/errors.go
@@ -1,6 +1,6 @@
 //
 // (C) Copyright 2020-2024 Intel Corporation.
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
+// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -17,6 +17,8 @@ import (
 	"github.com/pkg/errors"
 
 	"github.com/daos-stack/daos/src/control/build"
+	"github.com/daos-stack/daos/src/control/fault"
+	"github.com/daos-stack/daos/src/control/fault/code"
 	"github.com/daos-stack/daos/src/control/lib/ranklist"
 )
 
@@ -39,8 +41,13 @@ func IsUnavailable(err error) bool {
 	if err == nil {
 		return false
 	}
-	cause := errors.Cause(err).Error()
-	return strings.Contains(cause, ErrRaftUnavail.Error()) || strings.Contains(cause, ErrLeaderStepUpInProgress.Error())
+	// Unwrap once; the message checks and the fault-code check below all
+	// look at the same root cause.
+	cause := errors.Cause(err)
+	msg := cause.Error()
+	return strings.Contains(msg, ErrRaftUnavail.Error()) ||
+		strings.Contains(msg, ErrLeaderStepUpInProgress.Error()) ||
+		fault.IsFaultCode(cause, code.ServerDataPlaneNotStarted)
 }
 
 // IsEmptyGroupMap returns a boolean indicating whether or not the
diff --git a/src/control/system/errors_test.go b/src/control/system/errors_test.go
index d2ea4eda1ab..02c896a1b86 100644
--- a/src/control/system/errors_test.go
+++ b/src/control/system/errors_test.go
@@ -1,5 +1,6 @@
 //
 // (C) Copyright 2024 Intel Corporation.
+// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -12,6 +13,8 @@ import ( "github.com/pkg/errors" "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/fault" + "github.com/daos-stack/daos/src/control/fault/code" ) func TestSystem_Errors_IsNotReady(t *testing.T) { @@ -79,12 +82,32 @@ func TestSystem_Errors_IsUnavailable(t *testing.T) { err: ErrLeaderStepUpInProgress, expResult: true, }, + "data plane not started": { + err: &fault.Fault{Code: code.ServerDataPlaneNotStarted}, + expResult: true, + }, + "wrapped data plane not started": { + err: errors.Wrap(&fault.Fault{Code: code.ServerDataPlaneNotStarted}, "wrapped error"), + expResult: true, + }, "uninitialized not unavailable": { err: ErrUninitialized, }, "something else": { err: errors.New("something is wrong"), }, + "member exists not unavailable": { + err: ErrRankExists(1), + }, + "member not found not unavailable": { + err: ErrMemberRankNotFound(1), + }, + "pool not found not unavailable": { + err: ErrPoolRankNotFound(1), + }, + "different fault code not unavailable": { + err: &fault.Fault{Code: code.ClientUnknown}, + }, } { t.Run(name, func(t *testing.T) { test.AssertEqual(t, tc.expResult, IsUnavailable(tc.err), "")