DAOS-16076 test: Automate dmg scale test to be run on Aurora (#14616) #15126

Merged 2 commits on Oct 10, 2024
182 changes: 182 additions & 0 deletions src/tests/ftest/control/dmg_scale.py
@@ -0,0 +1,182 @@
"""
(C) Copyright 2024 Intel Corporation.

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
from apricot import TestWithServers
from telemetry_utils import TelemetryUtils
from test_utils_pool import time_pool_create

ENGINE_POOL_METRICS_SHORT = [
"engine_pool_entries_dtx_batched_degree",
"engine_pool_entries_dtx_batched_total",
"engine_pool_ops_akey_enum",
"engine_pool_ops_akey_punch",
"engine_pool_ops_compound",
"engine_pool_ops_dkey_enum",
"engine_pool_ops_dkey_punch",
"engine_pool_ops_dtx_abort",
"engine_pool_ops_dtx_check",
"engine_pool_ops_dtx_commit",
"engine_pool_ops_dtx_refresh",
"engine_pool_ops_ec_agg",
"engine_pool_ops_ec_rep",
"engine_pool_ops_fetch",
"engine_pool_ops_key_query",
"engine_pool_ops_migrate",
"engine_pool_ops_obj_enum",
"engine_pool_ops_obj_punch",
"engine_pool_ops_obj_sync",
"engine_pool_ops_recx_enum",
"engine_pool_ops_tgt_akey_punch",
"engine_pool_ops_tgt_dkey_punch",
"engine_pool_ops_tgt_punch",
"engine_pool_ops_tgt_update",
"engine_pool_ops_update",
"engine_pool_ops_pool_connect",
"engine_pool_ops_pool_disconnect",
"engine_pool_ops_pool_evict",
"engine_pool_ops_pool_query",
"engine_pool_ops_pool_query_space",
"engine_pool_resent",
"engine_pool_restarted",
"engine_pool_retry",
"engine_pool_scrubber_busy_time",
"engine_pool_scrubber_bytes_scrubbed_current",
"engine_pool_scrubber_bytes_scrubbed_prev",
"engine_pool_scrubber_bytes_scrubbed_total",
"engine_pool_scrubber_corruption_current",
"engine_pool_scrubber_corruption_total",
"engine_pool_scrubber_csums_current",
"engine_pool_scrubber_csums_prev",
"engine_pool_scrubber_csums_total",
"engine_pool_scrubber_next_csum_scrub",
"engine_pool_scrubber_next_tree_scrub",
"engine_pool_scrubber_prev_duration",
"engine_pool_scrubber_prev_duration_max",
"engine_pool_scrubber_prev_duration_mean",
"engine_pool_scrubber_prev_duration_min",
"engine_pool_scrubber_prev_duration_stddev",
"engine_pool_scrubber_scrubber_started",
"engine_pool_scrubber_scrubs_completed",
"engine_pool_started_at",
"engine_pool_vos_aggregation_akey_deleted",
"engine_pool_vos_aggregation_akey_scanned",
"engine_pool_vos_aggregation_akey_skipped",
"engine_pool_vos_aggregation_csum_errors",
"engine_pool_vos_aggregation_deleted_ev",
"engine_pool_vos_aggregation_deleted_sv",
"engine_pool_vos_aggregation_dkey_deleted",
"engine_pool_vos_aggregation_dkey_scanned",
"engine_pool_vos_aggregation_dkey_skipped",
"engine_pool_vos_aggregation_epr_duration",
"engine_pool_vos_aggregation_epr_duration_max",
"engine_pool_vos_aggregation_epr_duration_mean",
"engine_pool_vos_aggregation_epr_duration_min",
"engine_pool_vos_aggregation_epr_duration_stddev",
"engine_pool_vos_aggregation_merged_recs",
"engine_pool_vos_aggregation_merged_size",
"engine_pool_vos_aggregation_obj_deleted",
"engine_pool_vos_aggregation_obj_scanned",
"engine_pool_vos_aggregation_obj_skipped",
"engine_pool_vos_aggregation_uncommitted",
"engine_pool_vos_space_nvme_used",
"engine_pool_vos_space_scm_used",
"engine_pool_xferred_fetch",
"engine_pool_xferred_update",
"engine_pool_EC_update_full_stripe",
"engine_pool_EC_update_partial",
"engine_pool_block_allocator_alloc_hint",
"engine_pool_block_allocator_alloc_large",
"engine_pool_block_allocator_alloc_small",
"engine_pool_block_allocator_frags_aging",
"engine_pool_block_allocator_frags_large",
"engine_pool_block_allocator_frags_small",
"engine_pool_block_allocator_free_blks",
"engine_pool_ops_key2anchor"
]


class DmgScale(TestWithServers):
    """Verify that dmg commands work as expected on a large-scale system.

    :avocado: recursive
    """

    def test_dmg_scale(self):
        """Run the following steps and manually collect the duration of each step.

        0. Format storage
        1. System query
        2. Create a 100% pool that spans all engines
        3. Pool query
        4. Pool destroy
        5. Create 49 pools spanning all the engines, each using 1/50th of the capacity
        6. Pool list
        7. Query around 80 pool metrics
        8. Destroy all 49 pools
        9. System stop
        10. System start

        Jira ID: DAOS-10508.

        :avocado: tags=all,manual
        :avocado: tags=deployment
        :avocado: tags=DmgScale,test_dmg_scale
        """
        # This is a manual test whose durations must be collected from job.log, which is
        # usually over 1 million lines, so prefix each step message with "##" to make it easy
        # to search for.
        self.log_step("## System query")
        dmg_command = self.get_dmg_command()
        dmg_command.system_query()

        self.log_step("## Create a 100% pool that spans all engines")
        pool = self.get_pool(namespace="/run/pool_100/*", create=False)
        duration = time_pool_create(log=self.log, number=1, pool=pool)
        self.log.info("## Single pool create duration = %.1f", duration)

        self.log_step("## Pool query")
        pool.query()

        self.log_step("## Pool destroy")
        pool.destroy()

        quantity = self.params.get("quantity", "/run/pool_small/*", 1)
        msg = (f"## Create {quantity} small pools spanning all the engines, together filling "
               f"up the capacity")
        self.log_step(msg)
        pool_0 = self.get_pool(namespace="/run/pool_small/*", create=False)
        duration_0 = time_pool_create(log=self.log, number=0, pool=pool_0)
        pools = [pool_0]
        durations = [duration_0]
        for count in range(1, quantity):
            pools.append(self.get_pool(create=False))
            # Use the SCM and NVMe size of the first pool for the remaining (quantity - 1)
            # pools.
            pools[-1].scm_size.update(pool_0.scm_per_rank)
            pools[-1].nvme_size.update(pool_0.nvme_per_rank)
            durations.append(time_pool_create(log=self.log, number=count, pool=pools[-1]))
            msg = (f"Pool {count} created. SCM = {pools[-1].scm_per_rank}; "
                   f"NVMe = {pools[-1].nvme_per_rank}")
            self.log.info(msg)
        self.log.info("## durations = %s", durations)
        total_duration = sum(durations)
        self.log.info("## %d pools create duration = %.1f", quantity, total_duration)

        self.log_step("## Pool list")
        dmg_command.pool_list()

        self.log_step("## Query around 80 pool metrics")
        # To save time and log volume, query telemetry on the first host only. With the ~80
        # pool metrics above, roughly 100K lines are printed per host.
        telemetry_utils = TelemetryUtils(
            dmg=dmg_command, servers=[self.server_managers[0].hosts[0]])
        telemetry_utils.get_metrics(name=",".join(ENGINE_POOL_METRICS_SHORT))

        self.log_step(f"## Destroy all {quantity} pools")
        self.destroy_pools(pools=pools)

        self.log_step("## System stop")
        self.server_managers[0].system_stop()

        self.log_step("## System start")
        self.server_managers[0].system_start()
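
For reference, the `time_pool_create` helper imported from `test_utils_pool` returns the elapsed time of a single pool create, which the test sums across all pools. A minimal sketch of the assumed behavior, not the actual DAOS implementation:

import time

def time_pool_create(log, number, pool):
    # Hypothetical sketch: time pool.create() and log/return the elapsed seconds.
    start = time.time()
    pool.create()
    duration = time.time() - start
    log.info("Pool %s create duration = %.1f", number, duration)
    return duration
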
37 changes: 37 additions & 0 deletions src/tests/ftest/control/dmg_scale.yaml
@@ -0,0 +1,37 @@
# Note: We usually use the extra yaml in aurora-tools, but that extra yaml defines test_clients
# while this test doesn't need any clients, so either update the extra yaml or pass a dummy
# client to -tc. A minimal server-only example follows this file.
hosts:
  test_servers: 256

timeout: 900

daos_server:
  pattern_timeout: 60

server_config:
  name: daos_server
  engines_per_host: 2
  engines:
    0:
      pinned_numa_node: 0
      nr_xs_helpers: 1
      fabric_iface: ib0
      fabric_iface_port: 31317
      log_file: daos_server0.log
      storage: auto
      targets: 8
    1:
      pinned_numa_node: 1
      nr_xs_helpers: 1
      fabric_iface: ib1
      fabric_iface_port: 31417
      log_file: daos_server1.log
      storage: auto
      targets: 8

pool_100:
  size: 100%
pool_small:
  size: 2%
  # If we use --size=2% during pool create, we can only create up to 49 pools.
  quantity: 49
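
As the note at the top of this file mentions, the usual aurora-tools extra yaml defines test_clients, which this test does not use. A minimal server-only extra yaml could look like the following sketch, where the host range is a placeholder rather than a real Aurora node list:

# Hypothetical extra yaml: overrides the test_servers placeholder count above.
hosts:
  test_servers:
    - aurora-daos-[0001-0256]  # placeholder hostnames; replace with actual nodes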