diff --git a/defaults/test_default.yaml b/defaults/test_default.yaml index c67f8e3bb1..283b5aeccf 100644 --- a/defaults/test_default.yaml +++ b/defaults/test_default.yaml @@ -227,3 +227,5 @@ run_fullscan: [] stress_step_duration: '15m' simulated_racks: 0 + +use_placement_group: false diff --git a/sdcm/provision/aws/utils.py b/sdcm/provision/aws/utils.py index 23e136467e..3adc80c317 100644 --- a/sdcm/provision/aws/utils.py +++ b/sdcm/provision/aws/utils.py @@ -359,3 +359,15 @@ def configure_eth1_script(): def configure_set_preserve_hostname_script(): return 'grep "preserve_hostname: true" /etc/cloud/cloud.cfg 1>/dev/null 2>&1 ' \ '|| echo "preserve_hostname: true" >> /etc/cloud/cloud.cfg\n' + + +# -----AWS Placement Group section ----- +def create_cluster_placement_groups_aws(name: str, tags: dict, region=None, dry_run=False): + ec2: EC2Client = ec2_clients[region] + result = ec2.create_placement_group( + DryRun=dry_run, GroupName=name, Strategy='cluster', + TagSpecifications=[{ + 'ResourceType': 'placement-group', + "Tags": [{"Key": key, "Value": value} for key, value in tags.items()] + + [{"Key": "Name", "Value": name}], }],) + return result diff --git a/sdcm/sct_config.py b/sdcm/sct_config.py index 05e10d9cf0..3ca88caf7b 100644 --- a/sdcm/sct_config.py +++ b/sdcm/sct_config.py @@ -626,6 +626,10 @@ class SCTConfiguration(dict): dict(name="security_group_ids", env="SCT_SECURITY_GROUP_IDS", type=str_or_list, help="AWS security groups ids to use"), + dict(name="use_placement_group", env="SCT_USE_PLACEMENT_GROUP", type=str, + help="if true, create 'cluster' placement group for the test case " + "to achieve low-latency network performance"), + dict(name="subnet_id", env="SCT_SUBNET_ID", type=str_or_list, help="AWS subnet ids to use"), @@ -2032,6 +2036,8 @@ def verify_configuration(self): self._check_partition_range_with_data_validation_correctness() self._verify_scylla_bench_mode_and_workload_parameters() + self._validate_placement_group_required_values() + def 
_replace_docker_image_latest_tag(self): docker_repo = self.get('docker_image') scylla_version = self.get('scylla_version') @@ -2113,6 +2119,14 @@ def _validate_number_of_db_nodes_divides_by_az_number(self): assert nodes_num % az_count == 0, \ f"Number of db nodes ({nodes_num}) should be divisible by number of availability zones ({az_count})" + def _validate_placement_group_required_values(self): + if self.get("use_placement_group"): + az_count = len(self.get('availability_zone').split(',')) if self.get('availability_zone') else 1 + regions_count = len(self.region_names) + assert az_count == 1 and regions_count == 1, \ + (f"Number of Regions({regions_count}) and AZ({az_count}) should be 1 " + f"when param use_placement_group is used") + def _check_per_backend_required_values(self, backend: str): if backend in self.available_backends: if backend in ('aws', 'gce') and self.get("db_type") == "cloud_scylla": diff --git a/sdcm/sct_provision/aws/__init__.py b/sdcm/sct_provision/aws/__init__.py new file mode 100644 index 0000000000..89f0bb735e --- /dev/null +++ b/sdcm/sct_provision/aws/__init__.py @@ -0,0 +1,12 @@ +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See LICENSE for more details. 
+# +# Copyright (c) 2023 ScyllaDB diff --git a/sdcm/sct_provision/aws/cluster.py b/sdcm/sct_provision/aws/cluster.py index b841840806..dd1b5f8666 100644 --- a/sdcm/sct_provision/aws/cluster.py +++ b/sdcm/sct_provision/aws/cluster.py @@ -15,8 +15,7 @@ from functools import cached_property from typing import List, Dict -from pydantic import BaseModel # pylint: disable=no-name-in-module - +from pydantic import BaseModel from sdcm import cluster from sdcm.provision.aws.instance_parameters import AWSInstanceParams from sdcm.provision.aws.provisioner import AWSInstanceProvisioner @@ -29,6 +28,7 @@ from sdcm.sct_provision.aws.user_data import ScyllaUserDataBuilder, AWSInstanceUserDataBuilder from sdcm.sct_provision.common.utils import INSTANCE_PROVISION_SPOT, INSTANCE_PROVISION_SPOT_FLEET from sdcm.test_config import TestConfig +from sdcm.provision.aws.utils import create_cluster_placement_groups_aws class ClusterNode(BaseModel): @@ -57,6 +57,7 @@ class ClusterBase(BaseModel): _NODE_NUM_PARAM_NAME = None _INSTANCE_PARAMS_BUILDER = None _USER_PARAM = None + _USE_PLACEMENT_GROUP = True @property def _provisioner(self): @@ -100,6 +101,14 @@ def _user_prefix(self): def cluster_name(self): return '%s-%s' % (cluster.prepend_user_prefix(self._user_prefix, self._cluster_postfix), self._short_id) + @property + def placement_group_name(self): + if self.params.get("use_placement_group") and self._USE_PLACEMENT_GROUP: + return '%s-%s' % ( + cluster.prepend_user_prefix(self._user_prefix, "placement_group"), self._short_id) + else: + return None + @property def _node_prefix(self): return '%s-%s' % (cluster.prepend_user_prefix(self._user_prefix, self._node_postfix), self._short_id) @@ -201,6 +210,7 @@ def _instance_parameters(self, region_id: int, availability_zone: int = 0) -> AW region_id=region_id, user_data_raw=self._user_data, availability_zone=availability_zone, + placement_group=self.placement_group_name ) return AWSInstanceParams(**params_builder.dict(exclude_none=True, 
exclude_unset=True, exclude_defaults=True)) @@ -288,6 +298,8 @@ class MonitoringCluster(ClusterBase): _NODE_NUM_PARAM_NAME = 'n_monitor_nodes' _INSTANCE_PARAMS_BUILDER = MonitorInstanceParamsBuilder _USER_PARAM = 'ami_monitor_user' + # disable placement group for monitor nodes, as they don't need low-latency network performance + _USE_PLACEMENT_GROUP = False @property def _user_data(self) -> str: @@ -297,4 +309,16 @@ def _user_data(self) -> str: ).to_string() +class PlacementGroup(ClusterBase): + + @property + def _user_data(self) -> str: + return '' + + def provision(self): + if self.placement_group_name: + create_cluster_placement_groups_aws( + name=self.placement_group_name, tags=self.common_tags, region=self._region(0)) + + ClusterNode.update_forward_refs() diff --git a/sdcm/sct_provision/aws/instance_parameters_builder.py b/sdcm/sct_provision/aws/instance_parameters_builder.py index 8ca44efd17..ad202ce146 100644 --- a/sdcm/sct_provision/aws/instance_parameters_builder.py +++ b/sdcm/sct_provision/aws/instance_parameters_builder.py @@ -30,6 +30,7 @@ class AWSInstanceParamsBuilder(AWSInstanceParamsBuilderBase, metaclass=abc.ABCMe region_id: int = Field(as_dict=False) user_data_raw: Union[str, UserDataBuilderBase] = Field(as_dict=False) availability_zone: int = 0 + placement_group: str = None _INSTANCE_TYPE_PARAM_NAME: str = None _IMAGE_ID_PARAM_NAME: str = None @@ -82,7 +83,9 @@ def InstanceType(self) -> str: # pylint: disable=invalid-name @property def Placement(self) -> Optional[AWSPlacementInfo]: # pylint: disable=invalid-name - return AWSPlacementInfo(AvailabilityZone=self._region_name + self._availability_zones[self.availability_zone]) + return AWSPlacementInfo( + AvailabilityZone=self._region_name + self._availability_zones[self.availability_zone], + GroupName=self.placement_group) @property def UserData(self) -> Optional[str]: # pylint: disable=invalid-name diff --git a/sdcm/sct_provision/aws/layout.py b/sdcm/sct_provision/aws/layout.py index 
758792b088..e474b62b35 100644 --- a/sdcm/sct_provision/aws/layout.py +++ b/sdcm/sct_provision/aws/layout.py @@ -13,7 +13,8 @@ from functools import cached_property -from sdcm.sct_provision.aws.cluster import OracleDBCluster, DBCluster, LoaderCluster, MonitoringCluster +from sdcm.sct_provision.aws.cluster import ( + OracleDBCluster, DBCluster, LoaderCluster, MonitoringCluster, PlacementGroup) from sdcm.sct_provision.common.layout import SCTProvisionLayout from sdcm.test_config import TestConfig @@ -25,6 +26,8 @@ def _test_config(self): return TestConfig() def provision(self): + if self.placement_group: + self.placement_group.provision() if self.db_cluster: self.db_cluster.provision() if self.monitoring_cluster: @@ -67,3 +70,11 @@ def cs_db_cluster(self): common_tags=self._test_config.common_tags(), test_id=self._test_config.test_id(), ) + + @cached_property + def placement_group(self): + return PlacementGroup( + params=self._params, + common_tags=self._test_config.common_tags(), + test_id=self._test_config.test_id(), + ) diff --git a/sdcm/utils/common.py b/sdcm/utils/common.py index e48a00669c..2e3502115e 100644 --- a/sdcm/utils/common.py +++ b/sdcm/utils/common.py @@ -57,6 +57,7 @@ from mypy_boto3_s3 import S3Client, S3ServiceResource from mypy_boto3_ec2 import EC2Client, EC2ServiceResource from mypy_boto3_ec2.service_resource import Image as EC2Image +from botocore.exceptions import ClientError import docker # pylint: disable=wrong-import-order; false warning because of docker import (local file vs. 
package) from google.cloud.storage import Blob as GceBlob from google.cloud.compute_v1.types import Metadata as GceMetadata, Instance as GceInstance @@ -3215,9 +3216,11 @@ def get_placement_groups(region): return placement_groups +@retrying(n=30, sleep_time=10, allowed_exceptions=(ClientError,)) def clean_placement_groups_aws(tags_dict: dict, regions=None, dry_run=False): """Remove all placement groups with specific tags in AWS.""" # pylint: disable=too-many-locals,import-outside-toplevel + tags_dict = {key: value for key, value in tags_dict.items() if key != "NodeType"} assert tags_dict, "tags_dict not provided (can't clean all instances)" if regions: aws_placement_groups = {} @@ -3240,6 +3243,6 @@ def clean_placement_groups_aws(tags_dict: dict, regions=None, dry_run=False): response = client.delete_placement_group(GroupName=name) LOGGER.debug("Done. Result: %s\n", response) except Exception as ex: # pylint: disable=broad-except - LOGGER.debug("Failed with: %s", str(ex)) - + LOGGER.info("Failed with: %s", str(ex)) + raise # ---------- diff --git a/test-cases/performance/perf-regression.100threads.30M-keys-i4i.yaml b/test-cases/performance/perf-regression.100threads.30M-keys-i4i.yaml index 3d101da964..b79f7b2d2a 100644 --- a/test-cases/performance/perf-regression.100threads.30M-keys-i4i.yaml +++ b/test-cases/performance/perf-regression.100threads.30M-keys-i4i.yaml @@ -48,3 +48,5 @@ custom_es_index: 'performancestatsv2' use_hdr_cs_histogram: true append_scylla_args: '--blocked-reactor-notify-ms 5 --abort-on-lsa-bad-alloc 1 --abort-on-seastar-bad-alloc --abort-on-internal-error 1 --abort-on-ebadf 1' + +use_placement_group: true diff --git a/vars/perfRegressionParallelPipeline.groovy b/vars/perfRegressionParallelPipeline.groovy index a92757d367..dfe5f44358 100644 --- a/vars/perfRegressionParallelPipeline.groovy +++ b/vars/perfRegressionParallelPipeline.groovy @@ -178,6 +178,26 @@ def call(Map pipelineParams) { } } + stage("Provision Resources for ${sub_test}") { + 
catchError() { + script { + wrap([$class: 'BuildUser']) { + dir('scylla-cluster-tests') { + timeout(time: 30, unit: 'MINUTES') { + if (params.backend == 'aws' || params.backend == 'azure') { + provisionResources(params, builder.region) + } else { + sh """ + echo 'Skipping because non-AWS/Azure backends are not supported' + """ + } + } + } + } + } + } + } + stage("Run ${sub_test}"){ catchError(stageResult: 'FAILURE') { wrap([$class: 'BuildUser']) {