From 4c144279a08095bbc5b19ca8991dc5d3415b12cd Mon Sep 17 00:00:00 2001
From: Dalton Bohning
Date: Thu, 1 Feb 2024 11:56:28 -0800
Subject: [PATCH] DAOS-14103 test: remove performance_test_base subprocess (#13601) (#13685)

Don't run ior and mdtest in a subprocess since it isn't needed.
The variants that needed this aren't run and need to be reworked in a new context.
Also remove support for multiple iterations since it was never used.

Signed-off-by: Dalton Bohning
---
 src/tests/ftest/performance/ior_easy.py       | 48 ------
 src/tests/ftest/performance/mdtest_easy.py    | 26 +---
 src/tests/ftest/rebuild/widely_striped.py     |  2 +-
 src/tests/ftest/util/mdtest_test_base.py      | 14 +-
 src/tests/ftest/util/performance_test_base.py | 141 ++++--------------
 5 files changed, 39 insertions(+), 192 deletions(-)

diff --git a/src/tests/ftest/performance/ior_easy.py b/src/tests/ftest/performance/ior_easy.py
index f95d83c34f8..2be1f898341 100644
--- a/src/tests/ftest/performance/ior_easy.py
+++ b/src/tests/ftest/performance/ior_easy.py
@@ -57,54 +57,6 @@ def test_performance_ior_easy_dfuse_ec_16p2gx(self):
         """
         self.run_performance_ior(namespace="/run/ior_dfuse_ec_16p2gx/*")
 
-    def test_performance_ior_easy_dfs_ec_4p2gx_stop_write(self):
-        """Test Description: Run IOR Easy, DFS, EC_4P2GX, stop a rank during write
-
-        :avocado: tags=all,manual
-        :avocado: tags=hw,medium
-        :avocado: tags=performance,performance_ior,performance_ior_easy,performance_dfs
-        :avocado: tags=IorEasy,test_performance_ior_easy_dfs_ec_4p2gx_stop_write
-        """
-        self.run_performance_ior(
-            namespace="/run/ior_dfs_ec_4p2gx/*",
-            stop_delay_write=0.5)
-
-    def test_performance_ior_easy_dfs_ec_4p2gx_stop_read(self):
-        """Test Description: Run IOR Easy, DFS, EC_4P2GX, stop a rank during read.
-
-        :avocado: tags=all,manual
-        :avocado: tags=hw,medium
-        :avocado: tags=performance,performance_ior,performance_ior_easy,performance_dfs
-        :avocado: tags=IorEasy,test_performance_ior_easy_dfs_ec_4p2gx_stop_read
-        """
-        self.run_performance_ior(
-            namespace="/run/ior_dfs_ec_4p2gx/*",
-            stop_delay_read=0.5)
-
-    def test_performance_ior_easy_dfs_ec_16p2gx_stop_write(self):
-        """Test Description: Run IOR Easy, DFS, EC_16P2GX, stop a rank during write.
-
-        :avocado: tags=all,manual
-        :avocado: tags=hw,medium
-        :avocado: tags=performance,performance_ior,performance_ior_easy,performance_dfs
-        :avocado: tags=IorEasy,test_performance_ior_easy_dfs_ec_16p2gx_stop_write
-        """
-        self.run_performance_ior(
-            namespace="/run/ior_dfs_ec_16p2gx/*",
-            stop_delay_write=0.5)
-
-    def test_performance_ior_easy_dfs_ec_16p2gx_stop_read(self):
-        """Test Description: Run IOR Easy, DFS, EC_16P2GX, stop a rank during read.
-
-        :avocado: tags=all,manual
-        :avocado: tags=hw,medium
-        :avocado: tags=performance,performance_ior,performance_ior_easy,performance_dfs
-        :avocado: tags=IorEasy,test_performance_ior_easy_dfs_ec_16p2gx_stop_read
-        """
-        self.run_performance_ior(
-            namespace="/run/ior_dfs_ec_16p2gx/*",
-            stop_delay_read=0.5)
-
     def test_performance_ior_easy_hdf5_sx(self):
         """Test Description: Run IOR Easy, HDF5, SX.
 
diff --git a/src/tests/ftest/performance/mdtest_easy.py b/src/tests/ftest/performance/mdtest_easy.py
index 99ae476f02b..ba08938553c 100644
--- a/src/tests/ftest/performance/mdtest_easy.py
+++ b/src/tests/ftest/performance/mdtest_easy.py
@@ -1,5 +1,5 @@
 '''
-  (C) Copyright 2019-2022 Intel Corporation.
+  (C) Copyright 2019-2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -43,27 +43,3 @@ def test_performance_mdtest_easy_dfuse_s1(self): :avocado: tags=MdtestEasy,test_performance_mdtest_easy_dfuse_s1,dfuse """ self.run_performance_mdtest(namespace="/run/mdtest_dfuse_s1/*") - - def test_performance_mdtest_easy_dfs_ec_4p2g1_stop(self): - """Test Description: Run MDTest Easy, DFS, EC_4P2G1, stop a rank. - - :avocado: tags=all,manual - :avocado: tags=hw,medium - :avocado: tags=performance,performance_mdtest,performance_mdtest_easy,performance_dfs - :avocado: tags=MdtestEasy,test_performance_mdtest_easy_dfs_ec_4p2g1_stop - """ - self.run_performance_mdtest( - namespace="/run/mdtest_dfs_ec_4p2g1/*", - stop_delay=0.5) - - def test_performance_mdtest_easy_dfs_ec_16p2g1_stop(self): - """Test Description: Run MDTest Easy, DFS, EC_16P2G1, stop a rank. - - :avocado: tags=all,manual - :avocado: tags=hw,medium - :avocado: tags=performance,performance_mdtest,performance_mdtest_easy,performance_dfs - :avocado: tags=MdtestEasy,test_performance_mdtest_easy_dfs_ec_16p2g1_stop - """ - self.run_performance_mdtest( - namespace="/run/mdtest_dfs_ec_16p2g1/*", - stop_delay=0.5) diff --git a/src/tests/ftest/rebuild/widely_striped.py b/src/tests/ftest/rebuild/widely_striped.py index 470926df05d..e5d88e6d220 100644 --- a/src/tests/ftest/rebuild/widely_striped.py +++ b/src/tests/ftest/rebuild/widely_striped.py @@ -42,7 +42,7 @@ def test_rebuild_widely_striped(self): :avocado: tags=all,full_regression :avocado: tags=hw,large - :avocado: tags=rebuild + :avocado: tags=rebuild,mdtest :avocado: tags=RbldWidelyStriped,test_rebuild_widely_striped """ # set params diff --git a/src/tests/ftest/util/mdtest_test_base.py b/src/tests/ftest/util/mdtest_test_base.py index 8646d4ef0ac..21931c22031 100644 --- a/src/tests/ftest/util/mdtest_test_base.py +++ b/src/tests/ftest/util/mdtest_test_base.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -67,6 +67,9 @@ def execute_mdtest(self, out_queue=None, display_space=True): Args: out_queue (queue, optional): Pass any exceptions in a queue. Defaults to None. display_space (bool, optional): Whether to display the pool space. Defaults to True. + + Returns: + object: result of job manager run """ # Create a pool if one does not already exist if self.pool is None: @@ -83,17 +86,20 @@ def execute_mdtest(self, out_queue=None, display_space=True): self.mdtest_cmd.test_dir.update(self.dfuse.mount_dir.value) # Run Mdtest - self.run_mdtest(self.get_mdtest_job_manager_command(self.manager), - self.processes, display_space=display_space, out_queue=out_queue) + out = self.run_mdtest( + self.get_mdtest_job_manager_command(self.manager), + self.processes, display_space=display_space, out_queue=out_queue) if self.subprocess: - return + return out # reset self.container if dfs_destroy is True or None. if self.mdtest_cmd.dfs_destroy is not False: self.container = None self.stop_dfuse() + return out + def get_mdtest_job_manager_command(self, mpi_type): """Get the MPI job manager command for Mdtest. diff --git a/src/tests/ftest/util/performance_test_base.py b/src/tests/ftest/util/performance_test_base.py index bf7a24907ef..4fdb8ae06d4 100644 --- a/src/tests/ftest/util/performance_test_base.py +++ b/src/tests/ftest/util/performance_test_base.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -9,7 +9,6 @@ import oclass_utils from avocado.core.exceptions import TestFail from exception_utils import CommandFailure -from general_utils import get_subprocess_stdout from ior_test_base import IorTestBase from ior_utils import IorMetrics from mdtest_test_base import MdtestBase @@ -217,9 +216,7 @@ def verify_system_status(self, pool=None, container=None): if pool: funcs.append(pool.set_query_data) if container: - funcs.append( - lambda: self.log.info( - self.daos_cmd.container_query(container.pool.identifier, container.uuid))) + funcs.append(container.query) first_error = None for func in funcs: @@ -254,45 +251,21 @@ def verify_oclass_engine_count(self, oclass, fail=True): return False return True - def restart_servers(self): - """Restart the servers.""" - self.log.info("Restarting servers") - self.dmg_cmd.system_stop(True) - if self.dmg_cmd.result.exit_status != 0: - self.fail("Failed to stop servers") - time.sleep(5) - self.dmg_cmd.system_start() - if self.dmg_cmd.result.exit_status != 0: - self.fail("Failed to start servers") - self.server_managers[0].detect_engine_start() - - def _run_performance_ior_single(self, stop_rank_s=None, intercept=None): + def _run_performance_ior_single(self, intercept=None): """Run a single IOR execution. Args: - stop_rank_s (float, optional): stop a rank this many seconds after starting IOR. - Default is None, which does not stop a rank. intercept (str, optional): path to interception library. """ - # Always run as a subprocess so we can stop ranks during IO - self.subprocess = True - - self.run_ior_with_pool( - create_pool=False, - create_cont=False, - intercept=intercept, - display_space=False, - stop_dfuse=False - ) - if stop_rank_s is not None: - time.sleep(stop_rank_s) - self.server_managers[0].stop_random_rank(self.d_log, force=True, exclude_ranks=[0]) - ior_returncode = self.job_manager.process.wait() try: - if ior_returncode != 0: - self.fail("IOR failed") - ior_output = get_subprocess_stdout(self.job_manager.process) + ior_output = self.run_ior_with_pool( + create_pool=False, + create_cont=False, + intercept=intercept, + display_space=False, + stop_dfuse=False + ) ior_metrics = self.ior_cmd.get_ior_metrics(ior_output) for metrics in ior_metrics: if metrics[0] == "write": @@ -309,9 +282,7 @@ def _run_performance_ior_single(self, stop_rank_s=None, intercept=None): # Try this even if IOR failed because it could give us useful info self.verify_system_status(self.pool, self.container) - def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_write=None, - stop_delay_read=None, num_iterations=1, - restart_between_iterations=True): + def run_performance_ior(self, namespace=None, use_intercept=True): """Run an IOR performance test. Write and Read are ran separately. @@ -321,26 +292,8 @@ def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_wri Defaults to None, which uses default IOR namespace. use_intercept (bool, optional): whether to use the interception library with dfuse. Defaults to True. - stop_delay_write (float, optional): fraction of stonewall time after which to stop a - rank during write phase. Must be between 0 and 1. Default is None. - stop_delay_read (float, optional): fraction of stonewall time after which to stop a - rank during read phase. Must be between 0 and 1. Default is None. - num_iterations (int, optional): number of times to run the tests. - Default is 1. 
- restart_between_iterations (int, optional): whether to restart the servers between - iterations. Default is True. """ - if stop_delay_write is not None and (stop_delay_write < 0 or stop_delay_write > 1): - self.fail("stop_delay_write must be between 0 and 1") - if stop_delay_read is not None and (stop_delay_read < 0 or stop_delay_read > 1): - self.fail("stop_delay_read must be between 0 and 1") - if stop_delay_write is not None and stop_delay_read is not None: - # This isn't straightforward, because stopping a rank during write degrades - # performance, so read tries to read the same number of bytes as write, - # but might finish before the rank is stopped. - self.fail("stop_delay_write and stop_delay_read cannot be used together") - if namespace is not None: self.ior_cmd.namespace = namespace self.ior_cmd.get_params(self) @@ -351,13 +304,6 @@ def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_wri else: intercept = None - # Calculate both stop delays upfront since read phase will remove stonewall - stop_rank_write_s = stop_rank_read_s = None - if stop_delay_write and self.ior_cmd.sw_deadline.value: - stop_rank_write_s = stop_delay_write * self.ior_cmd.sw_deadline.value - if stop_delay_read and self.ior_cmd.sw_deadline.value: - stop_rank_read_s = stop_delay_read * self.ior_cmd.sw_deadline.value - # Save write and read params for switching write_flags = self.params.get("write_flags", self.ior_cmd.namespace) read_flags = self.params.get("read_flags", self.ior_cmd.namespace) @@ -376,7 +322,7 @@ def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_wri # Set the container redundancy factor to match the oclass cont_rf = oclass_utils.extract_redundancy_factor(self.ior_cmd.dfs_oclass.value) - # Create pool and container upfront for flexibility and so rank stop timing is accurate + # Create pool and container upfront for flexibility self.pool = self.get_pool(connect=False) params = {} if self.ior_cmd.dfs_oclass.value: @@ -391,50 +337,33 @@ def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_wri self.container.create() self.update_ior_cmd_with_pool(False) - for iteration in range(num_iterations): - if restart_between_iterations and iteration > 0: - self.restart_servers() - - self.log.info("Running IOR write (%s)", str(iteration)) - self.ior_cmd.flags.update(write_flags) - self._run_performance_ior_single(stop_rank_write_s, intercept) - - # Manually stop dfuse after ior write completes - self.stop_dfuse() - - # Wait for rebuild if we stopped a rank - if stop_rank_write_s: - self.pool.wait_for_rebuild_to_end() + self.log_step("Running IOR write") + self.ior_cmd.flags.update(write_flags) + self._run_performance_ior_single(intercept) - # Wait between write and read - self.phase_barrier() + # Manually stop dfuse after ior write completes + self.stop_dfuse() - self.log.info("Running IOR read (%s)", str(iteration)) - self.ior_cmd.flags.update(read_flags) - self._run_performance_ior_single(stop_rank_read_s, intercept) + # Wait between write and read + self.phase_barrier() - # Manually stop dfuse after ior read completes - self.stop_dfuse() + self.log_step("Running IOR read") + self.ior_cmd.flags.update(read_flags) + self._run_performance_ior_single(intercept) - # Wait for rebuild if we stopped a rank - if stop_rank_read_s: - self.pool.wait_for_rebuild_to_end() + # Manually stop dfuse after ior read completes + self.stop_dfuse() self._log_daos_metrics() - def run_performance_mdtest(self, namespace=None, stop_delay=None): + def 
run_performance_mdtest(self, namespace=None): """Run an MDTest performance test. Args: namespace (str, optional): namespace for MDTest parameters in the yaml. Defaults to None, which uses default MDTest namespace. - stop_delay (float, optional): fraction of stonewall time after which to stop a - rank. Must be between 0 and 1. Defaults to None. """ - if stop_delay is not None and (stop_delay < 0 or stop_delay > 1): - self.fail("stop_delay must be between 0 and 1") - if namespace is not None: self.mdtest_cmd.namespace = namespace self.mdtest_cmd.get_params(self) @@ -445,8 +374,6 @@ def run_performance_mdtest(self, namespace=None, stop_delay=None): if self.mdtest_cmd.api.value not in ('DFS', 'POSIX'): self.fail("Only DFS API supported") - stop_rank_s = (stop_delay or 0) * (self.mdtest_cmd.stonewall_timer.value or 0) - self._log_performance_params("MDTEST") self.verify_oclass_engine_count(self.mdtest_cmd.dfs_oclass.value) @@ -484,20 +411,10 @@ def run_performance_mdtest(self, namespace=None, stop_delay=None): # Never let execute_mdtest automatically destroy the container self.mdtest_cmd.dfs_destroy.update(False) - # Always run as a subprocess so we can stop ranks during IO - self.subprocess = True - self.log.info("Running MDTEST") - self.execute_mdtest(display_space=False) - if stop_rank_s: - time.sleep(stop_rank_s) - self.server_managers[0].stop_random_rank(self.d_log, force=True, exclude_ranks=[0]) - mdtest_returncode = self.job_manager.process.wait() try: - if mdtest_returncode != 0: - self.fail("mdtest failed") - mdtest_output = get_subprocess_stdout(self.job_manager.process) - mdtest_metrics = MdtestMetrics(mdtest_output) + mdtest_result = self.execute_mdtest(display_space=False) + mdtest_metrics = MdtestMetrics(mdtest_result.stdout_text) if not mdtest_metrics: self.fail("Failed to get mdtest metrics") log_list = [] @@ -523,8 +440,4 @@ def run_performance_mdtest(self, namespace=None, stop_delay=None): # Manually stop dfuse after mdtest completes self.stop_dfuse() - # Wait for rebuild if we stopped a rank - if stop_rank_s: - self.pool.wait_for_rebuild_to_end() - self._log_daos_metrics()
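
For reviewers, a minimal sketch (not part of the patch) of the call pattern the tests move to: the benchmark runs in the foreground and metrics are parsed from the returned result instead of a polled subprocess. The class `ExamplePerfTest` and the `mdtest_utils` import path are assumptions made for illustration; `execute_mdtest`, `MdtestMetrics`, `run_ior_with_pool`, and `get_ior_metrics` are the names used in the patched files.

```python
from mdtest_utils import MdtestMetrics                  # import path assumed for this sketch
from performance_test_base import PerformanceTestBase   # base class from the patched module


class ExamplePerfTest(PerformanceTestBase):
    """Hypothetical test illustrating the foreground (non-subprocess) flow."""

    def example_mdtest_metrics(self):
        # execute_mdtest() now returns the job manager result, so stdout is
        # read from the result object rather than from a background process.
        result = self.execute_mdtest(display_space=False)
        metrics = MdtestMetrics(result.stdout_text)
        if not metrics:
            self.fail("Failed to get mdtest metrics")
        return metrics

    def example_ior_metrics(self):
        # run_ior_with_pool() likewise returns the captured IOR output,
        # which feeds straight into the existing metrics parser.
        output = self.run_ior_with_pool(create_pool=False, create_cont=False)
        return self.ior_cmd.get_ior_metrics(output)
```

Because the output comes back directly from the run, the returncode polling and the `get_subprocess_stdout` helper are no longer needed, which is why that import is dropped from performance_test_base.py.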