include correct soak updates for 2.6
Required-githooks: true

Signed-off-by: Maureen Jean <[email protected]>
mjean308 authored and daltonbohning committed Jan 27, 2025
1 parent e3106d8 commit bd2fb4f
Showing 5 changed files with 30 additions and 25 deletions.
2 changes: 1 addition & 1 deletion src/tests/ftest/soak/harassers.yaml
@@ -3,7 +3,7 @@ hosts:
   test_servers: 8
   # servers if a server partition is defined
   # server_partition: daos_server
-  client_partition: daos_client
+  # client_partition: daos_client
   # client_reservation: daos-test
 orterun:
   allow_run_as_root: true
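
The same one-line change is repeated in smoke.yaml and stress.yaml below: client_partition is commented out, so the 2.6 soak runs stop assuming a dedicated Slurm client partition. A minimal standalone sketch of the resulting fallback (hypothetical helper and host names, not the ftest API):

    # Sketch only: "params" stands in for the parsed "hosts" section of the YAML.
    def resolve_clients(params, launch_clients):
        partition = params.get("client_partition")  # None once commented out
        if partition is None:
            return launch_clients  # fall back to the hosts given to launch.py
        return lookup_slurm_partition(partition)  # hypothetical Slurm query

    def lookup_slurm_partition(name):
        return [name + "-node-1"]  # placeholder; real code would call sinfo

    print(resolve_clients({"test_servers": 8}, ["client-1", "client-2"]))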
2 changes: 1 addition & 1 deletion src/tests/ftest/soak/smoke.yaml
@@ -3,7 +3,7 @@ hosts:
   test_servers: 4
   # servers if a server partition is defined
   # server_partition: daos_server
-  client_partition: daos_client
+  # client_partition: daos_client
   # client_reservation: daos-test
 orterun:
   allow_run_as_root: true
7 changes: 1 addition & 6 deletions src/tests/ftest/soak/stress.yaml
@@ -3,7 +3,7 @@ hosts:
   test_servers: 8
   # servers if a server partition is defined
   # server_partition: daos_server
-  client_partition: daos_client
+  # client_partition: daos_client
   # client_reservation: daos-test
 orterun:
   allow_run_as_root: true
@@ -138,7 +138,6 @@ ior_stress:
     mount_dir: "/tmp/soak_dfuse_ior/"
     disable_caching: true
     thread_count: 8
-    cores: '0-7'
 fio_stress:
   api:
     - POSIX
@@ -180,7 +179,6 @@ fio_stress:
     mount_dir: "/tmp/soak_dfuse_fio/"
     disable_caching: true
     thread_count: 8
-    cores: '0-7'
 daos_racer:
   runtime: 120
 vpic_stress:
@@ -217,7 +215,6 @@ lammps_stress:
     mount_dir: "/tmp/soak_dfuse_lammps/"
     disable_caching: true
     thread_count: 8
-    cores: '0-7'
   oclass:
     - ["EC_2P1GX", "RP_2GX"]
 mdtest_stress:
@@ -258,7 +255,6 @@ mdtest_stress:
     mount_dir: "/tmp/soak_dfuse_mdtest/"
     disable_caching: true
     thread_count: 8
-    cores: '0-7'
 macsio_stress:
   job_timeout: 30
   nodesperjob:
@@ -289,7 +285,6 @@ macsio_stress:
     mount_dir: "/tmp/soak_dfuse_macsio/"
     disable_caching: true
     thread_count: 8
-    cores: '0-7'
 datamover_stress:
   job_timeout: 10
   nodesperjob:
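
Every cores: '0-7' entry under a dfuse section is also dropped, so 2.6 soak no longer pins dfuse to the first eight cores. Assuming cores maps to a taskset-style CPU binding (an assumption; this diff does not show the consumer), the effect can be sketched as:

    import shlex

    def build_dfuse_cmd(mount_dir, cores=None):
        # Sketch only: dfuse flags are illustrative, not exhaustive.
        cmd = "dfuse --mountpoint=" + shlex.quote(mount_dir)
        if cores:  # e.g. '0-7'; never set after this change
            cmd = "taskset -c {} {}".format(cores, cmd)  # assumed binding
        return cmd

    print(build_dfuse_cmd("/tmp/soak_dfuse_ior/"))         # new: no pinning
    print(build_dfuse_cmd("/tmp/soak_dfuse_ior/", "0-7"))  # old behavior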
3 changes: 3 additions & 0 deletions src/tests/ftest/util/soak_test_base.py
@@ -145,6 +145,9 @@ def pre_tear_down(self):
             if not run_local(self.log, cmd, timeout=120).passed:
                 # Exception was raised due to a non-zero exit status
                 errors.append(f"Failed to cancel jobs {self.failed_job_id_list}")
+        elif self.job_scheduler != "slurm":
+            cmd = "pkill jobscript"
+            run_remote(self.log, self.hostlist_clients, cmd)
         if self.all_failed_jobs:
             errors.append("SOAK FAILED: The following jobs failed {} ".format(
                 " ,".join(str(j_id) for j_id in self.all_failed_jobs)))
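
The new elif gives non-Slurm runs a teardown path of their own: scancel only exists under Slurm, so stray job scripts are killed directly on the client nodes instead. A standalone sketch of the same idea (hypothetical host names, plain ssh in place of the ftest run_remote helper):

    import subprocess

    def kill_stray_jobscripts(client_hosts):
        for host in client_hosts:
            # pkill exits 1 when nothing matched, which is harmless here
            subprocess.run(["ssh", host, "pkill", "jobscript"], check=False)

    kill_stray_jobscripts(["client-1", "client-2"])  # hypothetical hosts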
41 changes: 24 additions & 17 deletions src/tests/ftest/util/soak_utils.py
@@ -22,7 +22,7 @@
 from daos_racer_utils import DaosRacerCommand
 from data_mover_utils import DcpCommand, FsCopy
 from dfuse_utils import get_dfuse
-from dmg_utils import get_storage_query_device_info
+from dmg_utils import get_storage_query_device_info, get_storage_query_device_uuids
 from duns_utils import format_path
 from exception_utils import CommandFailure
 from fio_utils import FioCommand
@@ -573,8 +573,10 @@ def launch_vmd_identify_check(self, name, results, args):
         results (queue): multiprocessing queue
         args (queue): multiprocessing queue
     """
+    # pylint: disable=too-many-nested-blocks
     status = True
     failing_vmd = []
+    dmg = self.get_dmg_command().copy()
     device_info = get_storage_query_device_info(self.dmg_command)
     uuid_list = [device['uuid'] for device in device_info]
     # limit the number of leds to blink to 1024
@@ -583,22 +585,27 @@
     else:
         uuids = uuid_list
     self.log.info("VMD device UUIDs: %s", uuids)
 
-    for uuid in uuids:
-        # Blink led
-        self.dmg_command.storage_led_identify(ids=uuid, timeout=2)
-        # check if led is blinking
-        result = self.dmg_command.storage_led_check(ids=uuid)
-        # determine if leds are blinking as expected
-        for value in list(result['response']['host_storage_map'].values()):
-            if value['storage']['smd_info']['devices']:
-                for device in value['storage']['smd_info']['devices']:
-                    if device['ctrlr']['led_state'] != "QUICK_BLINK":
-                        failing_vmd.append([device['ctrlr']['pci_addr'], value['hosts']])
-                        status = False
-    # reset leds to previous state
-    for uuid in uuids:
-        self.dmg_command.storage_led_identify(ids=uuid, reset=True)
+    host_uuids = get_storage_query_device_uuids(self.dmg_command)
+    for host, uuid_dict in host_uuids.items():
+        uuid_list = sorted(uuid_dict.keys())
+        self.log.info("Devices on host %s: %s", host, uuid_list)
+        # Now check whether the random uuid belongs to a particular host.
+        for uuid in uuids:
+            if uuid in uuid_list:
+                dmg.hostlist = host
+                # Blink led
+                dmg.storage_led_identify(ids=uuid, timeout=2)
+                # check if led is blinking
+                result = dmg.storage_led_check(ids=uuid)
+                # determine if leds are blinking as expected
+                for value in list(result['response']['host_storage_map'].values()):
+                    if value['storage']['smd_info']['devices']:
+                        for device in value['storage']['smd_info']['devices']:
+                            if device['ctrlr']['led_state'] != "QUICK_BLINK":
+                                failing_vmd.append([device['ctrlr']['pci_addr'], value['hosts']])
+                                status = False
+                # reset leds to previous state
+                dmg.storage_led_identify(ids=uuid, reset=True)
 
     params = {"name": name,
               "status": status,

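The rewritten loop routes each LED command to the host that owns the device, via dmg.hostlist, instead of broadcasting every UUID through self.dmg_command. The per-host mapping shape is implied by the loop itself; a standalone sketch with fabricated data:

    # Fabricated data shaped like the get_storage_query_device_uuids() result.
    host_uuids = {
        "server-1": {"uuid-a": {}, "uuid-b": {}},
        "server-2": {"uuid-c": {}},
    }
    sampled = ["uuid-c", "uuid-a"]  # stands in for the capped random sample

    for host, uuid_dict in host_uuids.items():
        owned = sorted(uuid_dict.keys())
        for uuid in sampled:
            if uuid in owned:
                # one invocation per owning host, mirroring dmg.hostlist = host
                print("dmg -l {} storage led identify {}".format(host, uuid))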