diff --git a/src/control/lib/control/system.go b/src/control/lib/control/system.go index 5c5fd8e4eb0..3c77374d932 100644 --- a/src/control/lib/control/system.go +++ b/src/control/lib/control/system.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -1318,6 +1318,7 @@ func SystemRebuildManage(ctx context.Context, rpcClient UnaryInvoker, req *Syste type SystemSelfHealEvalReq struct { unaryRequest msRequest + retryableRequest } // SystemSelfHealEvalResp contains the response. @@ -1341,6 +1342,10 @@ func SystemSelfHealEval(ctx context.Context, rpcClient UnaryInvoker, req *System req.setRPC(func(ctx context.Context, conn *grpc.ClientConn) (proto.Message, error) { return mgmtpb.NewMgmtSvcClient(conn).SystemSelfHealEval(ctx, pbReq) }) + req.retryTestFn = func(err error, _ uint) bool { + return (system.IsUnavailable(err) || IsRetryableConnErr(err) || + system.IsNotLeader(err) || system.IsNotReplica(err)) + } rpcClient.Debugf("DAOS system self-heal eval request: %s", pbUtil.Debug(pbReq)) ur, err := rpcClient.InvokeUnaryRPC(ctx, req) diff --git a/src/control/system/errors.go b/src/control/system/errors.go index 509bee13906..3ed11dbf577 100644 --- a/src/control/system/errors.go +++ b/src/control/system/errors.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -17,6 +17,8 @@ import ( "github.com/pkg/errors" "github.com/daos-stack/daos/src/control/build" + "github.com/daos-stack/daos/src/control/fault" + "github.com/daos-stack/daos/src/control/fault/code" "github.com/daos-stack/daos/src/control/lib/ranklist" ) @@ -25,6 +27,7 @@ var ( ErrRaftUnavail = errors.New("raft service unavailable (not started yet?)") ErrUninitialized = errors.New("system is uninitialized (storage format required?)") ErrLeaderStepUpInProgress = errors.New("leader step-up in progress (try again)") + ErrEngineNotStarted = errors.New("instance not started or not responding on dRPC") ) // IsNotReady is a convenience function for checking if an error @@ -39,8 +42,10 @@ func IsUnavailable(err error) bool { if err == nil { return false } - cause := errors.Cause(err).Error() - return strings.Contains(cause, ErrRaftUnavail.Error()) || strings.Contains(cause, ErrLeaderStepUpInProgress.Error()) + cause := errors.Cause(err) + return strings.Contains(cause.Error(), ErrRaftUnavail.Error()) || + strings.Contains(cause.Error(), ErrLeaderStepUpInProgress.Error()) || + fault.IsFaultCode(cause, code.ServerDataPlaneNotStarted) } // IsEmptyGroupMap returns a boolean indicating whether or not the diff --git a/src/tests/ftest/rebuild/auto_recovery_policy.py b/src/tests/ftest/rebuild/auto_recovery_policy.py new file mode 100644 index 00000000000..9699ec352fc --- /dev/null +++ b/src/tests/ftest/rebuild/auto_recovery_policy.py @@ -0,0 +1,427 @@ +""" + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +import re +import time +from functools import partial + +from apricot import TestWithServers +from data_utils import assert_val_in_list +from exception_utils import CommandFailure +from general_utils import list_to_str + + +class RbldAutoRecoveryPolicy(TestWithServers): + """Rebuild test cases featuring IOR. + + This class contains tests for pool rebuild that feature I/O going on + during the rebuild using IOR. + + :avocado: recursive + """ + + def test_rebuild_auto_recovery_policy(self): + """Jira ID: DAOS-17420. + + Test Description: Verify Rebuild Auto Recovery Policy + + :avocado: tags=all,full_regression + :avocado: tags=vm + :avocado: tags=pool,rebuild,self_heal + :avocado: tags=RbldAutoRecoveryPolicy,test_rebuild_auto_recovery_policy + """ + self.log_step('Setup pool') + pool = self.get_pool(connect=False) + + # Run just the scenarios requested, or all by default + total_scenarios = 0 + scenarios_passed = 0 + scenarios_to_verify = set(self.params.get('scenarios_to_verify', '/run/test/*', ['all'])) + for method in filter(lambda x: '_verify_scenario_' in x, dir(self)): + total_scenarios += 1 + scenario_number = int(method.split('_')[-1]) + if scenario_number not in scenarios_to_verify and 'all' not in scenarios_to_verify: + self.log.warning('Skipping scenario %s', scenario_number) + continue + self.log.info('Running scenario %s', scenario_number) + scenario_method = getattr(self, method) + scenario_method(pool) + scenarios_passed += 1 + + self.log_step('Destroy pool') + pool.destroy() + + self.log_step(f'Test passed on {scenarios_passed}/{total_scenarios} scenarios') + + def _verify_scenario_1(self, pool): + """Scenario 1: System Creation and default self_heal. + + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + self.log_step('Scenario 1 - Verify default system self_heal policy') + response = dmg.system_get_prop(properties='self_heal')['response'] + actual_value = response[0]['value'] + expected_value = 'exclude;pool_exclude;pool_rebuild' + if actual_value != expected_value: + self.fail( + f'Expected system self_heal policy to be {expected_value}, ' + f'but got {actual_value}') + + self.log_step('Scenario 1 - Verify default pool self_heal policy') + response = pool.get_prop(name='self_heal')['response'] + actual_value = response[0]['value'] + expected_value = 'exclude;rebuild' + if actual_value != expected_value: + self.fail( + f'Expected pool self_heal policy to be {expected_value}, ' + f'but got {actual_value}') + + def _verify_scenario_2(self, pool): + """Scenario 2: Disabling and Enabling Self-Heal. + + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + # Get 2 distinct sets of ranks to stop + all_ranks = list(self.server_managers[0].ranks.keys()) + ranks_x = sorted(self.random.sample(all_ranks, k=1)) + ranks_y = sorted(self.random.sample(list(set(all_ranks) - set(ranks_x)), k=1)) + + self.log_step('Scenario 2 - Disable system self_heal') + dmg.system_set_prop('self_heal:none') + + self.log_step('Scenario 2 - Stop a rank and verify it is not excluded') + dmg.system_stop(ranks=ranks_x) + self.server_managers[0].update_expected_states(ranks_x, 'stopped') + self._wait_detection_delay() + self._verify_rank_state(ranks_x, 'stopped') + + self.log_step( + 'Scenario 2 - Enable system self_heal and invoke dmg system self-heal eval') + dmg.system_set_prop('self_heal:exclude;pool_exclude;pool_rebuild') + for idx in range(3): + try: + dmg.system_self_heal_eval() + except CommandFailure: + raise + # TODO workaround - see DAOS-18427 + # self.log.error('dmg system self-heal eval failed - retrying in 1s') + # if idx == 2: + # raise + # time.sleep(1) + self.server_managers[0].update_expected_states(ranks_x, ['stopped', 'excluded']) + + self.log_step('Scenario 2 - Verify ranks are excluded and rebuilt in the pool') + self._verify_rank_state(ranks_x, 'excluded') + pool.wait_for_rebuild_to_start(interval=1) + pool.wait_for_rebuild_to_end(interval=3) + + self.log_step( + 'Scenario 2 - Stop another rank and verify it is excluded and rebuilt in the pool') + dmg.system_stop(ranks=ranks_y) + self.server_managers[0].update_expected_states(ranks_y, ['stopped', 'excluded']) + pool.wait_for_rebuild_to_start(interval=1) + pool.wait_for_rebuild_to_end(interval=3) + self._verify_rank_state(ranks_y, 'excluded') + + self.log_step( + 'Scenario 2 - Reintegrate stopped ranks to bring system back to original state') + stopped_ranks_str = list_to_str(ranks_x + ranks_y) + dmg.system_start(stopped_ranks_str) + dmg.system_reintegrate(stopped_ranks_str) + self.server_managers[0].update_expected_states(ranks_x + ranks_y, ['joined']) + pool.wait_for_rebuild_to_start(interval=1) + pool.wait_for_rebuild_to_end(interval=3) + self._verify_rank_state(all_ranks, 'joined') + + def _verify_scenario_3(self, pool): + """Scenario 3: Online System Maintenance. + + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + # Get a random rank to stop + all_ranks = list(self.server_managers[0].ranks.keys()) + ranks_x = sorted(self.random.sample(all_ranks, k=1)) + + self.log_step('Scenario 3 - Set system.self_heal.pool_rebuild = disabled') + dmg.system_set_prop('self_heal:exclude;pool_exclude') + dmg.system_get_prop(properties='self_heal') + + self.log_step('Scenario 3 - Stop a rank and verify it is excluded without rebuild') + dmg.system_stop(ranks=ranks_x) + self.server_managers[0].update_expected_states(ranks_x, ['stopped', 'excluded']) + self._wait_detection_delay() + self._verify_rank_state(ranks_x, 'excluded') + pool.verify_query({ + 'disabled_ranks': ranks_x, + 'rebuild': { + 'state': partial(assert_val_in_list, allowed_list=['done', 'idle'])}}) + # Targets should be down but not down_out + pool.verify_query_targets_state(ranks_x, 'down') + + self.log_step('Scenario 3 - Restart the rank and make sure it rejoins') + dmg.system_start(ranks=ranks_x) + self.server_managers[0].update_expected_states(ranks_x, ['joined']) + self._verify_rank_state(all_ranks, 'joined', tries=5, delay=3) + + self.log_step('Scenario 3 - Reintegrate the rank and wait for rebuild') + dmg.system_reintegrate(list_to_str(ranks_x)) + self.server_managers[0].update_expected_states(ranks_x, ['joined']) + pool.wait_for_rebuild_to_start(interval=1) + pool.wait_for_rebuild_to_end(interval=3) + + # The pool version changes after exclusion, + # but should not changed after resetting self_heal + self.log.info('Save current pool version') + pool_version = pool.query()['response']['version'] + + self.log_step('Scenario 3 - Reset system self_heal to default') + dmg.system_set_prop('self_heal:exclude;pool_exclude;pool_rebuild') + + self.log_step('Scenario 3 - Verify dmg system self-heal eval does not trigger rebuild') + dmg.system_self_heal_eval() + self._wait_detection_delay() + pool.verify_query({ + 'disabled_ranks': [], + 'rebuild': { + 'state': partial(assert_val_in_list, allowed_list=['done', 'idle'])}, + 'version': pool_version}) + + def _verify_scenario_4(self, pool): + """Scenario 4: Offline System Maintenance. + + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + # Get a list of all ranks + all_ranks = list(self.server_managers[0].ranks.keys()) + + self.log_step('Scenario 4 - Disable system self_heal') + dmg.system_set_prop('self_heal:none') + + # We expect the pool version to stay the same through this scenario since + # there are no exclusions or rebuilds + self.log.info('Save current pool version') + pool_version = pool.query()['response']['version'] + + self.log_step( + 'Scenario 4 - Stop more ranks than the pool RF and verify there are no exclusions') + pool_rf = int(re.findall(r'rd_fac:([0-9]+)', pool.properties.value)[0]) + self.assertGreater( + len(all_ranks), pool_rf, 'Not enough ranks to stop more than pool RF') + ranks_over_rf = sorted(self.random.sample(all_ranks, k=pool_rf + 1)) + dmg.system_stop(ranks=list_to_str(ranks_over_rf)) + self.server_managers[0].update_expected_states(ranks_over_rf, ['stopped']) + self._wait_detection_delay() + self._verify_rank_state(ranks_over_rf, 'stopped') + + self.log_step('Scenario 4 - Restart the stopped ranks and make sure they rejoin') + dmg.system_start(ranks=list_to_str(ranks_over_rf)) + self.server_managers[0].update_expected_states(ranks_over_rf, ['joined']) + self._verify_rank_state(all_ranks, 'joined', tries=5, delay=3) + + self.log_step('Scenario 4 - Reset system self_heal to default') + dmg.system_set_prop('self_heal:exclude;pool_exclude;pool_rebuild') + + self.log_step('Scenario 4 - Verify dmg system self-heal eval does not trigger rebuild') + dmg.system_self_heal_eval() + self._wait_detection_delay() + pool.verify_query({ + 'disabled_ranks': [], + 'rebuild': { + 'state': partial(assert_val_in_list, allowed_list=['done', 'idle'])}, + 'version': pool_version}) + + def _verify_scenario_5(self, pool): + """Scenario 5: Normal System Restart. + + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + # Get a list of all ranks + all_ranks = list(self.server_managers[0].ranks.keys()) + + self.log_step('Scenario 5 - Disable system self_heal') + dmg.system_set_prop('self_heal:none') + + # We expect the pool version to stay the same through this scenario since + # there are no exclusions or rebuilds + self.log.info('Save current pool version') + pool_version = pool.query()['response']['version'] + + self.log_step('Scenario 5 - Stop the system and verify no ranks are excluded') + dmg.system_stop() + self.server_managers[0].update_expected_states(all_ranks, ['stopped']) + self._wait_detection_delay() + self._verify_rank_state(all_ranks, 'stopped') + + self.log_step('Scenario 5 - Restart the system and make sure all ranks rejoin') + dmg.system_start() + self.server_managers[0].update_expected_states(all_ranks, ['joined']) + self._verify_rank_state(all_ranks, 'joined', tries=5, delay=3) + + self.log_step('Scenario 5 - Reset system self_heal to default') + dmg.system_set_prop('self_heal:exclude;pool_exclude;pool_rebuild') + + self.log_step('Scenario 5 - Verify dmg system self-heal eval does not trigger rebuild') + dmg.system_self_heal_eval() + self._wait_detection_delay() + pool.verify_query({ + 'disabled_ranks': [], + 'rebuild': { + 'state': partial(assert_val_in_list, allowed_list=['done', 'idle'])}, + 'version': pool_version}) + + def _verify_scenario_6(self, pool): + """Scenario 6: Unexpected System Restart. + + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + # Get a list of all ranks except 1 + all_ranks = list(self.server_managers[0].ranks.keys()) + all_ranks_minus_1 = sorted(self.random.sample(all_ranks, k=len(all_ranks) - 1)) + + self.log_step('Scenario 6 - Simulate restart with dmg system stop') + # We expect the pool version to stay the same through this scenario since + # there are no exclusions or rebuilds + self.log.info('Save current pool version') + pool_version = pool.query()['response']['version'] + dmg.system_stop() + self.server_managers[0].update_expected_states(all_ranks, ['stopped']) + + self.log_step('Scenario 6 - Start all but 1 rank and immediately disable self-heal') + dmg.system_start(ranks=list_to_str(all_ranks_minus_1)) + self.server_managers[0].update_expected_states(all_ranks_minus_1, ['joined']) + dmg.system_set_prop('self_heal:none') + + self.log_step('Scenario 6 - Verify all but 1 rank rejoins') + self._verify_rank_state(all_ranks_minus_1, 'joined', tries=5, delay=3) + + self.log_step('Scenario 6 - Restart the last rank and make sure it rejoins') + dmg.system_start() + self.server_managers[0].update_expected_states(all_ranks, ['joined']) + self._verify_rank_state(all_ranks, 'joined', tries=5, delay=3) + + self.log_step('Scenario 6 - Reset system self_heal to default') + dmg.system_set_prop('self_heal:exclude;pool_exclude;pool_rebuild') + + self.log_step('Scenario 6 - Verify dmg system self-heal eval does not trigger rebuild') + dmg.system_self_heal_eval() + self._wait_detection_delay() + pool.verify_query({ + 'disabled_ranks': [], + 'rebuild': { + 'state': partial(assert_val_in_list, allowed_list=['done', 'idle'])}, + 'version': pool_version}) + + def _verify_scenario_7(self, pool): + """Scenario 7: Problematic Pools. + + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + # Get a random rank to stop + all_ranks = list(self.server_managers[0].ranks.keys()) + ranks_x = sorted(self.random.sample(all_ranks, k=1)) + + self.log_step('Scenario 7 - Create a second pool') + pool2 = self.get_pool(connect=False) + + self.log_step('Scenario 7 - Disable self_heal rebuild on just the second pool') + pool2.set_prop('self_heal:exclude') + pool2.query() + + self.log_step('Scenario 7 - Stop a rank and wait for the detection delay') + dmg.system_stop(ranks=ranks_x) + self.server_managers[0].update_expected_states(ranks_x, ['stopped', 'excluded']) + self._wait_detection_delay() + + self.log_step( + 'Scenario 7 - Verify the rank is excluded and rebuilds in first pool only') + self._verify_rank_state(ranks_x, 'excluded') + pool.wait_for_rebuild_to_start(interval=1) + pool.wait_for_rebuild_to_end(interval=3) + pool.verify_query({ + 'disabled_ranks': ranks_x, + 'rebuild': { + 'state': 'done'}}) + self.log_step( + 'Scenario 7 - Verify the rank is excluded and does not rebuild in second pool') + pool2.verify_query({ + 'disabled_ranks': ranks_x, + 'rebuild': { + 'state': partial(assert_val_in_list, allowed_list=['done', 'idle'])}}) + # Targets should be down but not down_out + pool2.verify_query_targets_state(ranks_x, 'down') + + self.log_step( + 'Scenario 7 - Reintegrate stopped ranks to bring system back to original state') + stopped_ranks_str = list_to_str(ranks_x) + dmg.system_start(stopped_ranks_str) + dmg.system_reintegrate(stopped_ranks_str) + self.server_managers[0].update_expected_states(ranks_x, ['joined']) + pool.wait_for_rebuild_to_start(interval=1) + pool.wait_for_rebuild_to_end(interval=3) + self._verify_rank_state(all_ranks, 'joined') + + self.log_step('Scenario 7 - Destroy second pool') + pool2.destroy() + + def _verify_rank_state(self, ranks, state, tries=1, delay=3): + """Verify the state of the given ranks. + + Args: + ranks (list): The list of ranks to verify. + state (str): The expected state of the ranks. + tries (int, optional): Number of attempts to verify the state. Defaults to 1. + delay (int, optional): Delay between attempts in seconds. Defaults to 3. + """ + for current_try in range(tries): + current_state = self.server_managers[0].get_current_state() + + # All ranks are in expected state + if set(current_state[rank]['state'] for rank in ranks) == {state}: + return + + # Retry + if current_try < tries - 1: + self.log.info( + 'Not all ranks are in expected state %s. Retrying in %s seconds...', + state, delay) + time.sleep(delay) + continue + + # Final attempt failed + for rank in ranks: + if current_state[rank]['state'] != state: + self.fail( + f'Expected rank {rank} to be in state {state}, ' + f'but current state is {current_state[rank]["state"]}') + + def _wait_detection_delay(self): + """Wait for the detection delay.""" + # TODO calculate this. hard-coded for now + # The detection delay shall be a couple of SWIM periods (1s) + SWIM suspicion timeout (20s) + # + CRT_EVENT_DELAY (1s) + some margin of error (?) + detection_delay = 30 + self.log.info('Waiting for detection delay of %s seconds', detection_delay) + time.sleep(detection_delay) diff --git a/src/tests/ftest/rebuild/auto_recovery_policy.yaml b/src/tests/ftest/rebuild/auto_recovery_policy.yaml new file mode 100644 index 00000000000..a1338d630a7 --- /dev/null +++ b/src/tests/ftest/rebuild/auto_recovery_policy.yaml @@ -0,0 +1,30 @@ +hosts: + test_servers: 7 + test_clients: 1 + +timeout: 900 + +skip_add_log_msg: true + +server_config: + name: daos_server + engines_per_host: 1 + engines: + 0: + targets: 4 + nr_xs_helpers: 0 + storage: + 0: + class: ram + scm_mount: /mnt/daos + system_ram_reserved: 1 + +pool: + size: 10G + properties: rd_fac:2 + pool_query_timeout: 30 + register_cleanup: False # if something goes wrong, this will likely timeout + +test: + scenarios_to_verify: + - all diff --git a/src/tests/ftest/util/dmg_utils.py b/src/tests/ftest/util/dmg_utils.py index 88d4b26784c..10e56a616a1 100644 --- a/src/tests/ftest/util/dmg_utils.py +++ b/src/tests/ftest/util/dmg_utils.py @@ -1157,6 +1157,21 @@ def system_exclude(self, ranks=None, rank_hosts=None): return self._get_json_result( ("system", "exclude"), ranks=ranks, rank_hosts=rank_hosts) + def system_get_prop(self, properties=None): + """Call dmg system get-prop. + + Args: + properties (str, optional): Comma separated properties to get. + + Raises: + CommandFailure: if the command fails. + + Returns: + dict: the dmg json command output converted to a python dictionary + + """ + return self._get_json_result(("system", "get-prop"), properties=properties) + def system_leader_query(self): """Call dmg system leader-query. @@ -1268,6 +1283,33 @@ def system_rebuild_stop(self, verbose=False, force=False): return self._get_json_result( ("system", "rebuild", "stop"), verbose=verbose, force=force) + def system_self_heal_eval(self): + """Call dmg system self-heal eval. + + Raises: + CommandFailure: if the command fails. + + Returns: + dict: the dmg json command output converted to a python dictionary + + """ + return self._get_json_result(("system", "self-heal", "eval")) + + def system_set_prop(self, properties=None): + """Call dmg system set-prop. + + Args: + properties (str): properties in the form of key:val[,key:val...] + + Raises: + CommandFailure: if the command fails. + + Returns: + dict: the dmg json command output converted to a python dictionary + + """ + return self._get_json_result(("system", "set-prop"), properties=properties) + def system_start(self, ranks=None, ignore_admin_excluded=False): """Start the system. diff --git a/src/tests/ftest/util/dmg_utils_base.py b/src/tests/ftest/util/dmg_utils_base.py index be13a06c107..96ddc889625 100644 --- a/src/tests/ftest/util/dmg_utils_base.py +++ b/src/tests/ftest/util/dmg_utils_base.py @@ -1,6 +1,6 @@ """ (C) Copyright 2020-2024 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -885,6 +885,8 @@ def get_sub_command_class(self): self.sub_command_class = self.EraseSubCommand() elif self.sub_command.value == "exclude": self.sub_command_class = self.ExcludeSubCommand() + elif self.sub_command.value == "get-prop": + self.sub_command_class = self.GetPropSubCommand() elif self.sub_command.value == "leader-query": self.sub_command_class = self.LeaderQuerySubCommand() elif self.sub_command.value == "list-pools": @@ -895,6 +897,10 @@ def get_sub_command_class(self): self.sub_command_class = self.RebuildSubCommand() elif self.sub_command.value == "reintegrate": self.sub_command_class = self.ReintegrateSubCommand() + elif self.sub_command.value == "self-heal": + self.sub_command_class = self.SelfHealSubCommand() + elif self.sub_command.value == "set-prop": + self.sub_command_class = self.SetPropSubCommand() elif self.sub_command.value == "start": self.sub_command_class = self.StartSubCommand() elif self.sub_command.value == "stop": @@ -945,6 +951,14 @@ def __init__(self): self.ranks = FormattedParameter("--ranks={}") self.rank_hosts = FormattedParameter("--rank-hosts={}") + class GetPropSubCommand(CommandWithParameters): + """Defines an object for the dmg system get-prop command.""" + + def __init__(self): + """Create a dmg system get-prop command object.""" + super().__init__("/run/dmg/system/get-prop/*", "get-prop") + self.properties = BasicParameter(None, position=1) + class LeaderQuerySubCommand(CommandWithParameters): """Defines an object for the dmg system leader-query command.""" @@ -1011,6 +1025,36 @@ def __init__(self): self.verbose = FormattedParameter("--verbose", False) self.force = FormattedParameter("--force", False) + class SelfHealSubCommand(CommandWithSubCommand): + """Defines an object for the dmg system self-heal command.""" + + def __init__(self): + """Create a dmg system self-heal command object.""" + super().__init__("/run/dmg/system/self-heal/*", "self-heal") + + def get_sub_command_class(self): + # pylint: disable=redefined-variable-type + """Get the dmg system sub command object.""" + if self.sub_command.value == "eval": + self.sub_command_class = self.EvalSubCommand() + else: + self.sub_command_class = None + + class EvalSubCommand(CommandWithParameters): + """Defines an object for the dmg system self-heal eval command.""" + + def __init__(self): + """Create a dmg system self-heal eval command object.""" + super().__init__("/run/dmg/system/self-heal/eval/*", "eval") + + class SetPropSubCommand(CommandWithParameters): + """Defines an object for the dmg system set-prop command.""" + + def __init__(self): + """Create a dmg system set-prop command object.""" + super().__init__("/run/dmg/system/set-prop/*", "set-prop") + self.properties = BasicParameter(None, position=1) + class StartSubCommand(CommandWithParameters): """Defines an object for the dmg system start command.""" diff --git a/src/tests/ftest/util/test_utils_pool.py b/src/tests/ftest/util/test_utils_pool.py index 591c4fe59ae..36539943abc 100644 --- a/src/tests/ftest/util/test_utils_pool.py +++ b/src/tests/ftest/util/test_utils_pool.py @@ -96,8 +96,9 @@ def add_pool(test, namespace=POOL_NAMESPACE, create=True, connect=True, dmg=None # Add a step to remove this pool when the test completes and ensure their is enough time for the # pool destroy to be attempted - accounting for a possible dmg command timeout - test.increment_timeout(POOL_TIMEOUT_INCREMENT) - test.register_cleanup(remove_pool, test=test, pool=pool) + if pool.register_cleanup.value is True: + test.increment_timeout(POOL_TIMEOUT_INCREMENT) + test.register_cleanup(remove_pool, test=test, pool=pool) return pool @@ -306,6 +307,8 @@ def __init__(self, context, dmg_command, label_generator=None, namespace=POOL_NA # Parameter to control running 'dmg storage query usage --show_usable' if pool create fails self.query_on_create_error = BasicParameter(None, False) + self.register_cleanup = BasicParameter(True, True) # call register_cleanup by default + self.pool = None self.info = None self.svc_ranks = None @@ -724,7 +727,7 @@ def set_prop(self, *args, **kwargs): dict: json output of dmg pool set-prop command """ - return self.dmg.pool_set_prop(pool=self.identifier, *args, **kwargs) + return self.dmg.pool_set_prop(self.identifier, *args, **kwargs) @fail_on(CommandFailure) def get_prop(self, *args, **kwargs): @@ -1623,3 +1626,25 @@ def verify_query(self, expected_response, use_cached_query=False): response = self.query_data['response'] assert_dict_subset(expected_response, response) + + def verify_query_targets_state(self, ranks, expected_target_state): + """Verify all targets are in the expected state with dmg pool query-targets. + + Args: + ranks (list): The list of ranks to verify. + expected_target_state (str): The expected target state. + + Raises: + AssertionError: if the targets are not in the expected state + + """ + for rank in ranks: + self.log.info( + 'Verifying targets on rank %s are in state %s', rank, expected_target_state) + response = self.query_targets(rank=rank)['response'] + infos = response['Infos'] + for target, info in enumerate(infos): + if info['target_state'] != expected_target_state: + raise AssertionError( + f'Expected target {target} to be in state {expected_target_state}, ' + f'but current state is {info["target_state"]}')