From 1969be3b541c9c86deacc06f16694cad43df7d41 Mon Sep 17 00:00:00 2001
From: Dalton Bohning
Date: Tue, 3 Dec 2024 17:44:24 +0000
Subject: [PATCH] DAOS-16845 test: dynamically wait for pool aggregation

Add TestPool.verify_space to dynamically wait for pool space to reach
an expected state, such as after aggregation completes. Update tests to
use the new function.

Test-tag: test_enospace_time_with_fg DfuseSpaceCheck
Skip-unit-tests: true
Skip-fault-injection-test: true

Signed-off-by: Dalton Bohning
---
 .../ftest/aggregation/dfuse_space_check.py | 30 +++++++---------
 src/tests/ftest/nvme/enospace.py           | 29 +++++++++++----
 src/tests/ftest/util/test_utils_pool.py    | 36 +++++++++++++++++++
 3 files changed, 71 insertions(+), 24 deletions(-)

diff --git a/src/tests/ftest/aggregation/dfuse_space_check.py b/src/tests/ftest/aggregation/dfuse_space_check.py
index 4bae72ef6c5..55f5d822ed4 100644
--- a/src/tests/ftest/aggregation/dfuse_space_check.py
+++ b/src/tests/ftest/aggregation/dfuse_space_check.py
@@ -1,11 +1,11 @@
 """
   (C) Copyright 2020-2024 Intel Corporation.
+  (C) Copyright 2025 Hewlett Packard Enterprise Development LP

   SPDX-License-Identifier: BSD-2-Clause-Patent
 """
 import os
-import time

 from dfuse_utils import get_dfuse, start_dfuse
 from ior_test_base import IorTestBase
@@ -21,8 +21,8 @@ class DfuseSpaceCheck(IorTestBase):
     def __init__(self, *args, **kwargs):
         """Initialize a DfuseSpaceCheck object."""
         super().__init__(*args, **kwargs)
-        self.initial_space = None
-        self.block_size = None
+        self.__initial_space = None
+        self.__block_size = None

     def get_nvme_free_space(self, display=True):
         """Display pool free space.
@@ -50,14 +50,10 @@ def wait_for_aggregation(self, retries=4, interval=60):
             Default is 60.

         """
-        for _ in range(retries):
-            current_space = self.get_nvme_free_space()
-            if current_space == self.initial_space:
-                return
-            time.sleep(interval)
-
-        self.log.info("Free space when test terminated: %s", current_space)
-        self.fail("Aggregation did not complete within {} seconds".format(retries * interval))
+        if not self.pool.verify_space(
+                verify_free_nvme=lambda current: current == self.__initial_space,
+                retries=retries, interval=interval):
+            self.fail(f"Aggregation did not complete within {(retries - 1) * interval} seconds")

     def write_multiple_files(self, dfuse):
         """Write multiple files.
@@ -70,9 +66,9 @@ def write_multiple_files(self, dfuse): """ file_count = 0 - while self.get_nvme_free_space(False) >= self.block_size: + while self.get_nvme_free_space(False) >= self.__block_size: file_path = os.path.join(dfuse.mount_dir.value, "file{}.txt".format(file_count)) - write_dd_cmd = "dd if=/dev/zero of={} bs={} count=1".format(file_path, self.block_size) + write_dd_cmd = f"dd if=/dev/zero of={file_path} bs={self.__block_size} count=1" result = run_remote( self.log, self.hostlist_clients, write_dd_cmd, verbose=False, timeout=300) if not result.passed: @@ -109,7 +105,7 @@ def test_dfusespacecheck(self): :avocado: tags=DfuseSpaceCheck,test_dfusespacecheck """ # get test params for cont and pool count - self.block_size = self.params.get('block_size', '/run/dfusespacecheck/*') + self.__block_size = self.params.get('block_size', '/run/dfusespacecheck/*') # Create a pool, container, and start dfuse self.create_pool() @@ -118,15 +114,15 @@ def test_dfusespacecheck(self): start_dfuse(self, dfuse, self.pool, self.container) # get nvme space before write - self.initial_space = self.get_nvme_free_space() + self.__initial_space = self.get_nvme_free_space() # Create a file as large as we can large_file = os.path.join(dfuse.mount_dir.value, 'largefile.txt') if not run_remote(self.log, self.hostlist_clients, f'touch {large_file}').passed: self.fail(f"Error creating {large_file}") - dd_count = (self.initial_space // self.block_size) + 1 + dd_count = (self.__initial_space // self.__block_size) + 1 write_dd_cmd = "dd if=/dev/zero of={} bs={} count={}".format( - large_file, self.block_size, dd_count) + large_file, self.__block_size, dd_count) run_remote(self.log, self.hostlist_clients, write_dd_cmd) # Remove the file diff --git a/src/tests/ftest/nvme/enospace.py b/src/tests/ftest/nvme/enospace.py index c7a996c110f..78f6c705b1d 100644 --- a/src/tests/ftest/nvme/enospace.py +++ b/src/tests/ftest/nvme/enospace.py @@ -1,5 +1,6 @@ ''' (C) Copyright 2020-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -464,22 +465,36 @@ def test_enospace_time_with_fg(self): """ self.log.info(self.pool.pool_percentage_used()) - # Enabled TIme mode for Aggregation. + self.log_step("Enable pool aggregation") self.pool.set_property("reclaim", "time") + self.log_step("Get initial pool free space") + initial_space = self.pool.get_pool_daos_space() + initial_free_scm = initial_space["s_free"][0] + initial_free_nvme = initial_space["s_free"][1] + self.log.info("initial_free_scm = %s", initial_free_scm) + self.log.info("initial_free_nvme = %s", initial_free_nvme) + # Repeat the test in loop. for _loop in range(10): - self.log.info("-------enospc_time_fg Loop--------- %d", _loop) + self.log_step(f"Run IOR to fill the pool - enospace_time_with_fg loop {_loop}") self.log.info(self.pool.pool_percentage_used()) # Run IOR to fill the pool. 
             log_file = f"-loop_{_loop}".join(os.path.splitext(self.client_log))
             self.run_enospace_with_bg_job(log_file)

-            # Delete all the containers
+            self.log_step(f"Delete all containers - enospace_time_with_fg loop {_loop}")
             self.delete_all_containers()
-            # Delete container will take some time to release the space
-            time.sleep(60)
-
-            # Run last IO
+            self.log_step(f"Wait for aggregation to complete - enospace_time_with_fg loop {_loop}")
+            agg_did_complete = self.pool.verify_space(
+                # verify_free_scm=lambda current: current >= initial_free_scm * 0.95,
+                # verify_free_nvme=lambda current: current >= initial_free_nvme * 0.95,
+                verify_free_scm=lambda current: current == initial_free_scm,
+                verify_free_nvme=lambda current: current == initial_free_nvme,
+                retries=8, interval=30)
+            if not agg_did_complete:
+                self.fail("Pool space not reclaimed after deleting all containers")
+
+        self.log_step("Run one more sanity IOR to fill 1%")
         self.start_ior_load(storage='SCM', operation="Auto_Write", percent=1)

     @skipForTicket("DAOS-8896")
diff --git a/src/tests/ftest/util/test_utils_pool.py b/src/tests/ftest/util/test_utils_pool.py
index 71f05bb131e..7e9b2116edf 100644
--- a/src/tests/ftest/util/test_utils_pool.py
+++ b/src/tests/ftest/util/test_utils_pool.py
@@ -1,5 +1,6 @@
 """
   (C) Copyright 2018-2024 Intel Corporation.
+  (C) Copyright 2025 Hewlett Packard Enterprise Development LP

   SPDX-License-Identifier: BSD-2-Clause-Patent
 """
@@ -1475,6 +1476,41 @@ def wait_pool_dead_ranks(self, expected, interval=1, timeout=30):

         self.log.info("Wait for dead ranks complete: dead ranks %s", expected)

+    def verify_space(self, verify_free_scm=None, verify_free_nvme=None, retries=4, interval=30):
+        """Verify pool free space, retrying until the checks pass or retries are exhausted.
+
+        Args:
+            verify_free_scm (callable, optional): function(current) to verify SCM free space.
+                Defaults to None. Must supply at least one verify_* argument.
+            verify_free_nvme (callable, optional): function(current) to verify NVMe free space.
+                Defaults to None. Must supply at least one verify_* argument.
+            retries (int, optional): maximum number of checks; the first is immediate. Default is 4.
+            interval (int, optional): seconds to wait between checks. Default is 30.
+
+        Returns:
+            bool: whether space verification succeeded within the time limit
+
+        Raises:
+            ValueError: if no verify_* argument is given
+
+        """
+        if verify_free_scm is None and verify_free_nvme is None:
+            raise ValueError("verify_free_scm or verify_free_nvme is required")
+        for retry in range(retries):
+            if retry > 0:
+                sleep(interval)
+            current_space = self.get_pool_daos_space()
+            current_free_scm = current_space["s_free"][0]
+            current_free_nvme = current_space["s_free"][1]
+            self.log.info("current_free_scm = %s", current_free_scm)
+            self.log.info("current_free_nvme = %s", current_free_nvme)
+            if verify_free_scm and not verify_free_scm(current_free_scm):
+                continue
+            if verify_free_nvme and not verify_free_nvme(current_free_nvme):
+                continue
+            return True  # all succeeded
+        return False  # out of retries
+
     def verify_uuid_directory(self, host, scm_mount):
         """Check if pool folder exist on server.
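
Below is a minimal standalone sketch of the polling pattern that TestPool.verify_space implements, for seeing the retry semantics in isolation. The get_free_space callable and the sample values are illustrative stand-ins for self.get_pool_daos_space()["s_free"]; they are not part of the patch.

import time


def verify_space(get_free_space, verify_free_scm=None, verify_free_nvme=None,
                 retries=4, interval=30):
    """Poll free space until every supplied verifier passes or retries run out.

    get_free_space stands in for TestPool.get_pool_daos_space()["s_free"] and
    must return a (scm_free, nvme_free) tuple in bytes.
    """
    if verify_free_scm is None and verify_free_nvme is None:
        raise ValueError("verify_free_scm or verify_free_nvme is required")
    for retry in range(retries):
        if retry > 0:
            # The first check is immediate; sleep only between retries.
            time.sleep(interval)
        free_scm, free_nvme = get_free_space()
        if verify_free_scm and not verify_free_scm(free_scm):
            continue
        if verify_free_nvme and not verify_free_nvme(free_nvme):
            continue
        return True
    return False


if __name__ == "__main__":
    # Simulate aggregation returning NVMe free space to its initial value
    # over three polls; interval=0 keeps the demo instant.
    initial_free_nvme = 10 * 1024 ** 3
    samples = iter([initial_free_nvme // 2, initial_free_nvme - 4096, initial_free_nvme])
    ok = verify_space(
        lambda: (0, next(samples)),
        verify_free_nvme=lambda current: current == initial_free_nvme,
        retries=4, interval=0)
    print(f"aggregation complete: {ok}")

Because the first check is immediate and sleeps happen only between attempts, the worst-case wait is (retries - 1) * interval seconds, which is what the updated DfuseSpaceCheck failure message reports.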