diff --git a/src/tests/ftest/soak/harassers.yaml b/src/tests/ftest/soak/harassers.yaml
index af7065fb3d88..57cb3e4f348e 100644
--- a/src/tests/ftest/soak/harassers.yaml
+++ b/src/tests/ftest/soak/harassers.yaml
@@ -3,7 +3,7 @@ hosts:
   test_servers: 8
   # servers if a server partition is defined
   # server_partition: daos_server
-  client_partition: daos_client
+  # client_partition: daos_client
   # client_reservation: daos-test
 orterun:
   allow_run_as_root: true
diff --git a/src/tests/ftest/soak/smoke.yaml b/src/tests/ftest/soak/smoke.yaml
index ca1d4fb7a4c9..4c5594b4ab8a 100644
--- a/src/tests/ftest/soak/smoke.yaml
+++ b/src/tests/ftest/soak/smoke.yaml
@@ -3,7 +3,7 @@ hosts:
   test_servers: 4
   # servers if a server partition is defined
   # server_partition: daos_server
-  client_partition: daos_client
+  # client_partition: daos_client
   # client_reservation: daos-test
 orterun:
   allow_run_as_root: true
diff --git a/src/tests/ftest/soak/stress.yaml b/src/tests/ftest/soak/stress.yaml
index 15a6a3033a3b..38527d95042c 100644
--- a/src/tests/ftest/soak/stress.yaml
+++ b/src/tests/ftest/soak/stress.yaml
@@ -3,7 +3,7 @@ hosts:
   test_servers: 8
   # servers if a server partition is defined
   # server_partition: daos_server
-  client_partition: daos_client
+  # client_partition: daos_client
   # client_reservation: daos-test
 orterun:
   allow_run_as_root: true
@@ -138,7 +138,6 @@ ior_stress:
     mount_dir: "/tmp/soak_dfuse_ior/"
     disable_caching: true
     thread_count: 8
-    cores: '0-7'
 fio_stress:
   api:
     - POSIX
@@ -180,7 +179,6 @@ fio_stress:
     mount_dir: "/tmp/soak_dfuse_fio/"
     disable_caching: true
     thread_count: 8
-    cores: '0-7'
 daos_racer:
   runtime: 120
 vpic_stress:
@@ -217,7 +215,6 @@ lammps_stress:
     mount_dir: "/tmp/soak_dfuse_lammps/"
     disable_caching: true
     thread_count: 8
-    cores: '0-7'
   oclass:
     - ["EC_2P1GX", "RP_2GX"]
 mdtest_stress:
@@ -258,7 +255,6 @@ mdtest_stress:
     mount_dir: "/tmp/soak_dfuse_mdtest/"
     disable_caching: true
     thread_count: 8
-    cores: '0-7'
 macsio_stress:
   job_timeout: 30
   nodesperjob:
@@ -289,7 +285,6 @@ macsio_stress:
     mount_dir: "/tmp/soak_dfuse_macsio/"
     disable_caching: true
     thread_count: 8
-    cores: '0-7'
 datamover_stress:
   job_timeout: 10
   nodesperjob:
diff --git a/src/tests/ftest/util/soak_test_base.py b/src/tests/ftest/util/soak_test_base.py
index 879e142e8728..e3cda5fce7f2 100644
--- a/src/tests/ftest/util/soak_test_base.py
+++ b/src/tests/ftest/util/soak_test_base.py
@@ -145,6 +145,9 @@ def pre_tear_down(self):
             if not run_local(self.log, cmd, timeout=120).passed:
                 # Exception was raised due to a non-zero exit status
                 errors.append(f"Failed to cancel jobs {self.failed_job_id_list}")
+        elif self.job_scheduler != "slurm":
+            cmd = "pkill jobscript"
+            run_remote(self.log, self.hostlist_clients, cmd)
         if self.all_failed_jobs:
             errors.append("SOAK FAILED: The following jobs failed {} ".format(
                 " ,".join(str(j_id) for j_id in self.all_failed_jobs)))
diff --git a/src/tests/ftest/util/soak_utils.py b/src/tests/ftest/util/soak_utils.py
index 616677f8516d..016577be8681 100644
--- a/src/tests/ftest/util/soak_utils.py
+++ b/src/tests/ftest/util/soak_utils.py
@@ -22,7 +22,7 @@
 from daos_racer_utils import DaosRacerCommand
 from data_mover_utils import DcpCommand, FsCopy
 from dfuse_utils import get_dfuse
-from dmg_utils import get_storage_query_device_info
+from dmg_utils import get_storage_query_device_info, get_storage_query_device_uuids
 from duns_utils import format_path
 from exception_utils import CommandFailure
 from fio_utils import FioCommand
@@ -573,8 +573,10 @@ def launch_vmd_identify_check(self, name, results, args):
         results (queue): multiprocessing queue
         args (queue): multiprocessing queue
     """
+    # pylint: disable=too-many-nested-blocks
     status = True
     failing_vmd = []
+    dmg = self.get_dmg_command().copy()
     device_info = get_storage_query_device_info(self.dmg_command)
     uuid_list = [device['uuid'] for device in device_info]
     # limit the number of leds to blink to 1024
@@ -583,22 +585,27 @@ def launch_vmd_identify_check(self, name, results, args):
     else:
         uuids = uuid_list
     self.log.info("VMD device UUIDs: %s", uuids)
-
-    for uuid in uuids:
-        # Blink led
-        self.dmg_command.storage_led_identify(ids=uuid, timeout=2)
-        # check if led is blinking
-        result = self.dmg_command.storage_led_check(ids=uuid)
-        # determine if leds are blinking as expected
-        for value in list(result['response']['host_storage_map'].values()):
-            if value['storage']['smd_info']['devices']:
-                for device in value['storage']['smd_info']['devices']:
-                    if device['ctrlr']['led_state'] != "QUICK_BLINK":
-                        failing_vmd.append([device['ctrlr']['pci_addr'], value['hosts']])
-                        status = False
-    # reset leds to previous state
-    for uuid in uuids:
-        self.dmg_command.storage_led_identify(ids=uuid, reset=True)
+    host_uuids = get_storage_query_device_uuids(self.dmg_command)
+    for host, uuid_dict in host_uuids.items():
+        uuid_list = sorted(uuid_dict.keys())
+        self.log.info("Devices on host %s: %s", host, uuid_list)
+        # Now check whether the random uuid belongs to a particular host.
+        for uuid in uuids:
+            if uuid in uuid_list:
+                dmg.hostlist = host
+                # Blink led
+                dmg.storage_led_identify(ids=uuid, timeout=2)
+                # check if led is blinking
+                result = dmg.storage_led_check(ids=uuid)
+                # determine if leds are blinking as expected
+                for value in list(result['response']['host_storage_map'].values()):
+                    if value['storage']['smd_info']['devices']:
+                        for device in value['storage']['smd_info']['devices']:
+                            if device['ctrlr']['led_state'] != "QUICK_BLINK":
+                                failing_vmd.append([device['ctrlr']['pci_addr'], value['hosts']])
+                                status = False
+                # reset leds to previous state
+                dmg.storage_led_identify(ids=uuid, reset=True)
     params = {"name": name,
               "status": status,
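
Reviewer note (not part of the patch): below is a minimal standalone sketch of the per-host LED flow that the launch_vmd_identify_check() change introduces, for readers who want to see the logic outside the soak harness. The helper name check_vmd_leds, the host_uuids/uuids_to_blink parameters, and the plain logger are illustrative assumptions; dmg stands in for the DmgCommand copy the patch creates, and only the storage_led_identify/storage_led_check calls and the host_storage_map layout come from the diff itself.

```python
# Illustrative sketch only -- mirrors the per-host loop added by this patch.
# "dmg" is assumed to behave like the DmgCommand copy from the diff; the
# helper name and its parameters are hypothetical, not part of the patch.
import logging


def check_vmd_leds(dmg, host_uuids, uuids_to_blink, log=None):
    """Blink each selected VMD LED on its owning host and verify QUICK_BLINK.

    host_uuids maps a host (or host set) to the device UUIDs it owns, e.g.
    what get_storage_query_device_uuids() returns in the patch.
    Returns (status, failing_vmd), where failing_vmd lists [pci_addr, hosts]
    entries for LEDs that did not blink as expected.
    """
    log = log or logging.getLogger(__name__)
    status = True
    failing_vmd = []
    for host, uuid_list in host_uuids.items():
        log.info("Devices on host %s: %s", host, sorted(uuid_list))
        for uuid in uuids_to_blink:
            if uuid not in uuid_list:
                continue
            # Target only the host that owns this device before blinking.
            dmg.hostlist = host
            dmg.storage_led_identify(ids=uuid, timeout=2)
            result = dmg.storage_led_check(ids=uuid)
            # Every controller reported for this device should be QUICK_BLINK.
            for value in result['response']['host_storage_map'].values():
                for device in value['storage']['smd_info']['devices'] or []:
                    if device['ctrlr']['led_state'] != "QUICK_BLINK":
                        failing_vmd.append([device['ctrlr']['pci_addr'], value['hosts']])
                        status = False
            # Restore the LED to its previous state.
            dmg.storage_led_identify(ids=uuid, reset=True)
    return status, failing_vmd
```

The key design point the patch makes, and the sketch preserves, is that dmg.hostlist is retargeted per host before each identify/check call, so LED commands are only sent to the server that actually owns the device rather than broadcast to every host.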