
Commit

DAOS-14103 test: remove performance_test_base subprocess (#13601) (#13685)

Don't run ior and mdtest in a subprocess since it isn't needed.
The variants that needed this aren't run and need to be reworked in a
new context.

Also remove support for multiple iterations since it was never used.

Signed-off-by: Dalton Bohning <[email protected]>
daltonbohning authored Feb 1, 2024
1 parent 2f7ca75 commit 4c14427
Showing 5 changed files with 39 additions and 192 deletions.
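
For context, here is a minimal sketch of the execution pattern this commit removes versus the one it keeps. It uses plain Python subprocess as a stand-in for the ftest job manager; the function names are hypothetical illustrations, not part of the DAOS test framework.

import subprocess

def run_tool_as_subprocess(cmd):
    """Old pattern: launch the tool in a background process so a server
    rank could be stopped mid-IO, then wait and scrape stdout afterwards."""
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True)
    output, _ = proc.communicate()  # waits for exit and drains stdout
    if proc.returncode != 0:
        raise RuntimeError("command failed")
    return output

def run_tool_directly(cmd):
    """New pattern: run to completion and consume the result directly."""
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    return result.stdout

Since the stop-a-rank variants were the only consumers of the subprocess path, dropping it lets run_ior_with_pool and execute_mdtest simply return the completed result, as the diffs below show.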
48 changes: 0 additions & 48 deletions src/tests/ftest/performance/ior_easy.py
@@ -57,54 +57,6 @@ def test_performance_ior_easy_dfuse_ec_16p2gx(self):
"""
self.run_performance_ior(namespace="/run/ior_dfuse_ec_16p2gx/*")

def test_performance_ior_easy_dfs_ec_4p2gx_stop_write(self):
"""Test Description: Run IOR Easy, DFS, EC_4P2GX, stop a rank during write
:avocado: tags=all,manual
:avocado: tags=hw,medium
:avocado: tags=performance,performance_ior,performance_ior_easy,performance_dfs
:avocado: tags=IorEasy,test_performance_ior_easy_dfs_ec_4p2gx_stop_write
"""
self.run_performance_ior(
namespace="/run/ior_dfs_ec_4p2gx/*",
stop_delay_write=0.5)

def test_performance_ior_easy_dfs_ec_4p2gx_stop_read(self):
"""Test Description: Run IOR Easy, DFS, EC_4P2GX, stop a rank during read.
:avocado: tags=all,manual
:avocado: tags=hw,medium
:avocado: tags=performance,performance_ior,performance_ior_easy,performance_dfs
:avocado: tags=IorEasy,test_performance_ior_easy_dfs_ec_4p2gx_stop_read
"""
self.run_performance_ior(
namespace="/run/ior_dfs_ec_4p2gx/*",
stop_delay_read=0.5)

def test_performance_ior_easy_dfs_ec_16p2gx_stop_write(self):
"""Test Description: Run IOR Easy, DFS, EC_16P2GX, stop a rank during write.
:avocado: tags=all,manual
:avocado: tags=hw,medium
:avocado: tags=performance,performance_ior,performance_ior_easy,performance_dfs
:avocado: tags=IorEasy,test_performance_ior_easy_dfs_ec_16p2gx_stop_write
"""
self.run_performance_ior(
namespace="/run/ior_dfs_ec_16p2gx/*",
stop_delay_write=0.5)

def test_performance_ior_easy_dfs_ec_16p2gx_stop_read(self):
"""Test Description: Run IOR Easy, DFS, EC_16P2GX, stop a rank during read.
:avocado: tags=all,manual
:avocado: tags=hw,medium
:avocado: tags=performance,performance_ior,performance_ior_easy,performance_dfs
:avocado: tags=IorEasy,test_performance_ior_easy_dfs_ec_16p2gx_stop_read
"""
self.run_performance_ior(
namespace="/run/ior_dfs_ec_16p2gx/*",
stop_delay_read=0.5)

def test_performance_ior_easy_hdf5_sx(self):
"""Test Description: Run IOR Easy, HDF5, SX.
26 changes: 1 addition & 25 deletions src/tests/ftest/performance/mdtest_easy.py
@@ -1,5 +1,5 @@
'''
(C) Copyright 2019-2022 Intel Corporation.
(C) Copyright 2019-2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent
'''
@@ -43,27 +43,3 @@ def test_performance_mdtest_easy_dfuse_s1(self):
:avocado: tags=MdtestEasy,test_performance_mdtest_easy_dfuse_s1,dfuse
"""
self.run_performance_mdtest(namespace="/run/mdtest_dfuse_s1/*")

def test_performance_mdtest_easy_dfs_ec_4p2g1_stop(self):
"""Test Description: Run MDTest Easy, DFS, EC_4P2G1, stop a rank.
:avocado: tags=all,manual
:avocado: tags=hw,medium
:avocado: tags=performance,performance_mdtest,performance_mdtest_easy,performance_dfs
:avocado: tags=MdtestEasy,test_performance_mdtest_easy_dfs_ec_4p2g1_stop
"""
self.run_performance_mdtest(
namespace="/run/mdtest_dfs_ec_4p2g1/*",
stop_delay=0.5)

def test_performance_mdtest_easy_dfs_ec_16p2g1_stop(self):
"""Test Description: Run MDTest Easy, DFS, EC_16P2G1, stop a rank.
:avocado: tags=all,manual
:avocado: tags=hw,medium
:avocado: tags=performance,performance_mdtest,performance_mdtest_easy,performance_dfs
:avocado: tags=MdtestEasy,test_performance_mdtest_easy_dfs_ec_16p2g1_stop
"""
self.run_performance_mdtest(
namespace="/run/mdtest_dfs_ec_16p2g1/*",
stop_delay=0.5)
2 changes: 1 addition & 1 deletion src/tests/ftest/rebuild/widely_striped.py
@@ -42,7 +42,7 @@ def test_rebuild_widely_striped(self):
:avocado: tags=all,full_regression
:avocado: tags=hw,large
:avocado: tags=rebuild
:avocado: tags=rebuild,mdtest
:avocado: tags=RbldWidelyStriped,test_rebuild_widely_striped
"""
# set params
14 changes: 10 additions & 4 deletions src/tests/ftest/util/mdtest_test_base.py
@@ -1,5 +1,5 @@
"""
(C) Copyright 2020-2023 Intel Corporation.
(C) Copyright 2020-2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent
"""
@@ -67,6 +67,9 @@ def execute_mdtest(self, out_queue=None, display_space=True):
Args:
out_queue (queue, optional): Pass any exceptions in a queue. Defaults to None.
display_space (bool, optional): Whether to display the pool space. Defaults to True.
Returns:
object: result of job manager run
"""
# Create a pool if one does not already exist
if self.pool is None:
@@ -83,17 +86,20 @@ def execute_mdtest(self, out_queue=None, display_space=True):
self.mdtest_cmd.test_dir.update(self.dfuse.mount_dir.value)

# Run Mdtest
self.run_mdtest(self.get_mdtest_job_manager_command(self.manager),
self.processes, display_space=display_space, out_queue=out_queue)
out = self.run_mdtest(
self.get_mdtest_job_manager_command(self.manager),
self.processes, display_space=display_space, out_queue=out_queue)

if self.subprocess:
return
return out

# reset self.container if dfs_destroy is True or None.
if self.mdtest_cmd.dfs_destroy is not False:
self.container = None
self.stop_dfuse()

return out

def get_mdtest_job_manager_command(self, mpi_type):
"""Get the MPI job manager command for Mdtest.
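
With execute_mdtest now returning the job manager result, callers can parse metrics straight from the completed run instead of scraping a subprocess. A minimal usage sketch, assuming the ftest utility imports (the import path for MdtestMetrics is an assumption) and an avocado-style result object with a stdout_text attribute, mirroring the performance_test_base.py change below:

from mdtest_test_base import MdtestBase
from mdtest_utils import MdtestMetrics  # assumed import path, for illustration

class ExampleMdtest(MdtestBase):
    """Hypothetical test showing the new return-value usage."""

    def test_example(self):
        # execute_mdtest() now returns the completed job manager result
        # rather than leaving a subprocess to be waited on and scraped.
        result = self.execute_mdtest(display_space=False)
        metrics = MdtestMetrics(result.stdout_text)
        if not metrics:
            self.fail("Failed to get mdtest metrics")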
141 changes: 27 additions & 114 deletions src/tests/ftest/util/performance_test_base.py
@@ -1,5 +1,5 @@
"""
(C) Copyright 2018-2023 Intel Corporation.
(C) Copyright 2018-2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent
"""
@@ -9,7 +9,6 @@
import oclass_utils
from avocado.core.exceptions import TestFail
from exception_utils import CommandFailure
from general_utils import get_subprocess_stdout
from ior_test_base import IorTestBase
from ior_utils import IorMetrics
from mdtest_test_base import MdtestBase
@@ -217,9 +216,7 @@ def verify_system_status(self, pool=None, container=None):
if pool:
funcs.append(pool.set_query_data)
if container:
funcs.append(
lambda: self.log.info(
self.daos_cmd.container_query(container.pool.identifier, container.uuid)))
funcs.append(container.query)

first_error = None
for func in funcs:
@@ -254,45 +251,21 @@ def verify_oclass_engine_count(self, oclass, fail=True):
return False
return True

def restart_servers(self):
"""Restart the servers."""
self.log.info("Restarting servers")
self.dmg_cmd.system_stop(True)
if self.dmg_cmd.result.exit_status != 0:
self.fail("Failed to stop servers")
time.sleep(5)
self.dmg_cmd.system_start()
if self.dmg_cmd.result.exit_status != 0:
self.fail("Failed to start servers")
self.server_managers[0].detect_engine_start()

def _run_performance_ior_single(self, stop_rank_s=None, intercept=None):
def _run_performance_ior_single(self, intercept=None):
"""Run a single IOR execution.
Args:
stop_rank_s (float, optional): stop a rank this many seconds after starting IOR.
Default is None, which does not stop a rank.
intercept (str, optional): path to interception library.
"""
# Always run as a subprocess so we can stop ranks during IO
self.subprocess = True

self.run_ior_with_pool(
create_pool=False,
create_cont=False,
intercept=intercept,
display_space=False,
stop_dfuse=False
)
if stop_rank_s is not None:
time.sleep(stop_rank_s)
self.server_managers[0].stop_random_rank(self.d_log, force=True, exclude_ranks=[0])
ior_returncode = self.job_manager.process.wait()
try:
if ior_returncode != 0:
self.fail("IOR failed")
ior_output = get_subprocess_stdout(self.job_manager.process)
ior_output = self.run_ior_with_pool(
create_pool=False,
create_cont=False,
intercept=intercept,
display_space=False,
stop_dfuse=False
)
ior_metrics = self.ior_cmd.get_ior_metrics(ior_output)
for metrics in ior_metrics:
if metrics[0] == "write":
@@ -309,9 +282,7 @@ def _run_performance_ior_single(self, stop_rank_s=None, intercept=None):
# Try this even if IOR failed because it could give us useful info
self.verify_system_status(self.pool, self.container)

def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_write=None,
stop_delay_read=None, num_iterations=1,
restart_between_iterations=True):
def run_performance_ior(self, namespace=None, use_intercept=True):
"""Run an IOR performance test.
Write and Read are run separately.
@@ -321,26 +292,8 @@ def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_wri
Defaults to None, which uses default IOR namespace.
use_intercept (bool, optional): whether to use the interception library with dfuse.
Defaults to True.
stop_delay_write (float, optional): fraction of stonewall time after which to stop a
rank during write phase. Must be between 0 and 1. Default is None.
stop_delay_read (float, optional): fraction of stonewall time after which to stop a
rank during read phase. Must be between 0 and 1. Default is None.
num_iterations (int, optional): number of times to run the tests.
Default is 1.
restart_between_iterations (int, optional): whether to restart the servers between
iterations. Default is True.
"""
if stop_delay_write is not None and (stop_delay_write < 0 or stop_delay_write > 1):
self.fail("stop_delay_write must be between 0 and 1")
if stop_delay_read is not None and (stop_delay_read < 0 or stop_delay_read > 1):
self.fail("stop_delay_read must be between 0 and 1")
if stop_delay_write is not None and stop_delay_read is not None:
# This isn't straightforward, because stopping a rank during write degrades
# performance, so read tries to read the same number of bytes as write,
# but might finish before the rank is stopped.
self.fail("stop_delay_write and stop_delay_read cannot be used together")

if namespace is not None:
self.ior_cmd.namespace = namespace
self.ior_cmd.get_params(self)
@@ -351,13 +304,6 @@ def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_wri
else:
intercept = None

# Calculate both stop delays upfront since read phase will remove stonewall
stop_rank_write_s = stop_rank_read_s = None
if stop_delay_write and self.ior_cmd.sw_deadline.value:
stop_rank_write_s = stop_delay_write * self.ior_cmd.sw_deadline.value
if stop_delay_read and self.ior_cmd.sw_deadline.value:
stop_rank_read_s = stop_delay_read * self.ior_cmd.sw_deadline.value

# Save write and read params for switching
write_flags = self.params.get("write_flags", self.ior_cmd.namespace)
read_flags = self.params.get("read_flags", self.ior_cmd.namespace)
@@ -376,7 +322,7 @@ def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_wri
# Set the container redundancy factor to match the oclass
cont_rf = oclass_utils.extract_redundancy_factor(self.ior_cmd.dfs_oclass.value)

# Create pool and container upfront for flexibility and so rank stop timing is accurate
# Create pool and container upfront for flexibility
self.pool = self.get_pool(connect=False)
params = {}
if self.ior_cmd.dfs_oclass.value:
@@ -391,50 +337,33 @@ def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_wri
self.container.create()
self.update_ior_cmd_with_pool(False)

for iteration in range(num_iterations):
if restart_between_iterations and iteration > 0:
self.restart_servers()

self.log.info("Running IOR write (%s)", str(iteration))
self.ior_cmd.flags.update(write_flags)
self._run_performance_ior_single(stop_rank_write_s, intercept)

# Manually stop dfuse after ior write completes
self.stop_dfuse()

# Wait for rebuild if we stopped a rank
if stop_rank_write_s:
self.pool.wait_for_rebuild_to_end()
self.log_step("Running IOR write")
self.ior_cmd.flags.update(write_flags)
self._run_performance_ior_single(intercept)

# Wait between write and read
self.phase_barrier()
# Manually stop dfuse after ior write completes
self.stop_dfuse()

self.log.info("Running IOR read (%s)", str(iteration))
self.ior_cmd.flags.update(read_flags)
self._run_performance_ior_single(stop_rank_read_s, intercept)
# Wait between write and read
self.phase_barrier()

# Manually stop dfuse after ior read completes
self.stop_dfuse()
self.log_step("Running IOR read")
self.ior_cmd.flags.update(read_flags)
self._run_performance_ior_single(intercept)

# Wait for rebuild if we stopped a rank
if stop_rank_read_s:
self.pool.wait_for_rebuild_to_end()
# Manually stop dfuse after ior read completes
self.stop_dfuse()

self._log_daos_metrics()

def run_performance_mdtest(self, namespace=None, stop_delay=None):
def run_performance_mdtest(self, namespace=None):
"""Run an MDTest performance test.
Args:
namespace (str, optional): namespace for MDTest parameters in the yaml.
Defaults to None, which uses default MDTest namespace.
stop_delay (float, optional): fraction of stonewall time after which to stop a
rank. Must be between 0 and 1. Defaults to None.
"""
if stop_delay is not None and (stop_delay < 0 or stop_delay > 1):
self.fail("stop_delay must be between 0 and 1")

if namespace is not None:
self.mdtest_cmd.namespace = namespace
self.mdtest_cmd.get_params(self)
@@ -445,8 +374,6 @@ def run_performance_mdtest(self, namespace=None, stop_delay=None):
if self.mdtest_cmd.api.value not in ('DFS', 'POSIX'):
self.fail("Only DFS API supported")

stop_rank_s = (stop_delay or 0) * (self.mdtest_cmd.stonewall_timer.value or 0)

self._log_performance_params("MDTEST")

self.verify_oclass_engine_count(self.mdtest_cmd.dfs_oclass.value)
@@ -484,20 +411,10 @@ def run_performance_mdtest(self, namespace=None, stop_delay=None):
# Never let execute_mdtest automatically destroy the container
self.mdtest_cmd.dfs_destroy.update(False)

# Always run as a subprocess so we can stop ranks during IO
self.subprocess = True

self.log.info("Running MDTEST")
self.execute_mdtest(display_space=False)
if stop_rank_s:
time.sleep(stop_rank_s)
self.server_managers[0].stop_random_rank(self.d_log, force=True, exclude_ranks=[0])
mdtest_returncode = self.job_manager.process.wait()
try:
if mdtest_returncode != 0:
self.fail("mdtest failed")
mdtest_output = get_subprocess_stdout(self.job_manager.process)
mdtest_metrics = MdtestMetrics(mdtest_output)
mdtest_result = self.execute_mdtest(display_space=False)
mdtest_metrics = MdtestMetrics(mdtest_result.stdout_text)
if not mdtest_metrics:
self.fail("Failed to get mdtest metrics")
log_list = []
Expand All @@ -523,8 +440,4 @@ def run_performance_mdtest(self, namespace=None, stop_delay=None):
# Manually stop dfuse after mdtest completes
self.stop_dfuse()

# Wait for rebuild if we stopped a rank
if stop_rank_s:
self.pool.wait_for_rebuild_to_end()

self._log_daos_metrics()
