Skip to content

Commit

Permalink
DAOS-15698 test: Update the reset faulty device logic for nvme_utils.…
Browse files Browse the repository at this point in the history
…py (#15430)

MD-on-SSD cluster: Setting a device faulty causes the corresponding engine to die (its rank becomes excluded). Tell the server manager that the rank is excluded and do not reset the faulty device.
Non-MD-on-SSD cluster: Reset the faulty device by calling `dmg storage led identify --reset`. The command is expected to succeed and all engines are expected to be joined.

Test-tag: test_nvme_fault test_nvme_fault_reintegration test_vmd_led_faulty test_disk_failure_recover
Skip-func-hw-test-medium-vmd: false
Skip-func-hw-test-large-md-on-ssd: false

Signed-off-by: Makito Kano <[email protected]>
  • Loading branch information
shimizukko authored and phender committed Jan 21, 2025
1 parent ba3ec5a commit 2d1efe4
Showing 1 changed file with 13 additions and 7 deletions.
20 changes: 13 additions & 7 deletions src/tests/ftest/util/nvme_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,32 +44,38 @@ def set_device_faulty(test, dmg, server, uuid, pool=None, has_sys_xs=False, **kw
Args:
test (Test): avocado test class
dmg (DmgCommand): a DmgCommand class instance
server (NodeSet): host on which to issue the dmg storage set nvme-faulty
server (NodeSet): host on which to issue the dmg storage set nvme-faulty. Must be one host.
uuid (str): the device UUID
pool (TestPool, optional): pool used to wait for rebuild to start/complete if specified.
Defaults to None.
has_sys_xs (bool, optional): the device's has_sys_xs property value. Defaults to False.
kwargs (dict, optional): named arguments to pass to the DmgCommand.storage_set_faulty.
Returns:
dict: the json response from the dmg storage set-faulty command.
dict: the json response from the dmg storage set-faulty command. None if has_sys_xs is True.
"""
kwargs['host'] = server
kwargs['uuid'] = uuid
response = None
try:
response = get_dmg_response(dmg.storage_set_faulty, **kwargs)
if has_sys_xs:
test.fail("Setting a sys_xs device faulty should fail.")
except CommandFailure as error:
if not has_sys_xs:
test.fail(str(error))

# Update the expected status of any stopped/excluded ranks
if has_sys_xs:
ranks = [test.server_managers[-1].ranks[server]]
rank_to_host = test.server_managers[-1].ranks
ranks = []
for rank, host in rank_to_host.items():
if host == str(server):
ranks.append(rank)
test.server_managers[-1].update_expected_states(ranks, ["stopped", "excluded"])

# Add a tearDown method to reset the faulty device
test.register_cleanup(reset_fault_device, dmg=dmg, server=server, uuid=uuid)
else:
# Add a tearDown method to reset the faulty device
test.register_cleanup(reset_fault_device, dmg=dmg, server=server, uuid=uuid)

if pool:
# Wait for rebuild to start
Expand Down

0 comments on commit 2d1efe4

Please sign in to comment.