Skip to content

Commit

Permalink
feature(aws): placement group creation
Browse files Browse the repository at this point in the history
this commit is:
1) Include provision step into the performance pipeline
2) Add the ability to create load and DB instances within one AWS placement group
and config parameter to enable this feature
3) Update i4i Throughput perf test to  use the new feature
4) Update clean_placement_groups_aws functionality
to support --post-behavior flag in sct.py clean-resources command
  • Loading branch information
temichus authored and fruch committed Sep 14, 2023
1 parent 79acac2 commit c290980
Show file tree
Hide file tree
Showing 10 changed files with 109 additions and 6 deletions.
2 changes: 2 additions & 0 deletions defaults/test_default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -227,3 +227,5 @@ run_fullscan: []

stress_step_duration: '15m'
simulated_racks: 0

use_placement_group: false
12 changes: 12 additions & 0 deletions sdcm/provision/aws/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,3 +359,15 @@ def configure_eth1_script():
def configure_set_preserve_hostname_script():
return 'grep "preserve_hostname: true" /etc/cloud/cloud.cfg 1>/dev/null 2>&1 ' \
'|| echo "preserve_hostname: true" >> /etc/cloud/cloud.cfg\n'


# -----AWS Placement Group section -----
def create_cluster_placement_groups_aws(name: str, tags: dict, region=None, dry_run=False):
ec2: EC2Client = ec2_clients[region]
result = ec2.create_placement_group(
DryRun=dry_run, GroupName=name, Strategy='cluster',
TagSpecifications=[{
'ResourceType': 'placement-group',
"Tags": [{"Key": key, "Value": value} for key, value in tags.items()] +
[{"Key": "Name", "Value": name}], }],)
return result
14 changes: 14 additions & 0 deletions sdcm/sct_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,10 @@ class SCTConfiguration(dict):
dict(name="security_group_ids", env="SCT_SECURITY_GROUP_IDS", type=str_or_list,
help="AWS security groups ids to use"),

dict(name="use_placement_group", env="SCT_USE_PLACEMENT_GROUP", type=str,
help="if true, create 'cluster' placement group for test case "
"for low-latency network performance achievement"),

dict(name="subnet_id", env="SCT_SUBNET_ID", type=str_or_list,
help="AWS subnet ids to use"),

Expand Down Expand Up @@ -2032,6 +2036,8 @@ def verify_configuration(self):
self._check_partition_range_with_data_validation_correctness()
self._verify_scylla_bench_mode_and_workload_parameters()

self._validate_placement_group_required_values()

def _replace_docker_image_latest_tag(self):
docker_repo = self.get('docker_image')
scylla_version = self.get('scylla_version')
Expand Down Expand Up @@ -2113,6 +2119,14 @@ def _validate_number_of_db_nodes_divides_by_az_number(self):
assert nodes_num % az_count == 0, \
f"Number of db nodes ({nodes_num}) should be divisible by number of availability zones ({az_count})"

def _validate_placement_group_required_values(self):
if self.get("use_placement_group"):
az_count = len(self.get('availability_zone').split(',')) if self.get('availability_zone') else 1
regions_count = len(self.region_names)
assert az_count == 1 and regions_count == 1, \
(f"Number of Regions({regions_count}) and AZ({az_count}) should be 1 "
f"when param use_placement_group is used")

def _check_per_backend_required_values(self, backend: str):
if backend in self.available_backends:
if backend in ('aws', 'gce') and self.get("db_type") == "cloud_scylla":
Expand Down
12 changes: 12 additions & 0 deletions sdcm/sct_provision/aws/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See LICENSE for more details.
#
# Copyright (c) 2023 ScyllaDB
28 changes: 26 additions & 2 deletions sdcm/sct_provision/aws/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@
from functools import cached_property
from typing import List, Dict

from pydantic import BaseModel # pylint: disable=no-name-in-module

from pydantic import BaseModel
from sdcm import cluster
from sdcm.provision.aws.instance_parameters import AWSInstanceParams
from sdcm.provision.aws.provisioner import AWSInstanceProvisioner
Expand All @@ -29,6 +28,7 @@
from sdcm.sct_provision.aws.user_data import ScyllaUserDataBuilder, AWSInstanceUserDataBuilder
from sdcm.sct_provision.common.utils import INSTANCE_PROVISION_SPOT, INSTANCE_PROVISION_SPOT_FLEET
from sdcm.test_config import TestConfig
from sdcm.provision.aws.utils import create_cluster_placement_groups_aws


class ClusterNode(BaseModel):
Expand Down Expand Up @@ -57,6 +57,7 @@ class ClusterBase(BaseModel):
_NODE_NUM_PARAM_NAME = None
_INSTANCE_PARAMS_BUILDER = None
_USER_PARAM = None
_USE_PLACEMENT_GROUP = True

@property
def _provisioner(self):
Expand Down Expand Up @@ -100,6 +101,14 @@ def _user_prefix(self):
def cluster_name(self):
return '%s-%s' % (cluster.prepend_user_prefix(self._user_prefix, self._cluster_postfix), self._short_id)

@property
def placement_group_name(self):
if self.params.get("use_placement_group") and self._USE_PLACEMENT_GROUP:
return '%s-%s' % (
cluster.prepend_user_prefix(self._user_prefix, "placement_group"), self._short_id)
else:
return None

@property
def _node_prefix(self):
return '%s-%s' % (cluster.prepend_user_prefix(self._user_prefix, self._node_postfix), self._short_id)
Expand Down Expand Up @@ -201,6 +210,7 @@ def _instance_parameters(self, region_id: int, availability_zone: int = 0) -> AW
region_id=region_id,
user_data_raw=self._user_data,
availability_zone=availability_zone,
placement_group=self.placement_group_name
)
return AWSInstanceParams(**params_builder.dict(exclude_none=True, exclude_unset=True, exclude_defaults=True))

Expand Down Expand Up @@ -288,6 +298,8 @@ class MonitoringCluster(ClusterBase):
_NODE_NUM_PARAM_NAME = 'n_monitor_nodes'
_INSTANCE_PARAMS_BUILDER = MonitorInstanceParamsBuilder
_USER_PARAM = 'ami_monitor_user'
# disable placement group for monitor nodes, because it doesn't need low-latency network performance
_USE_PLACEMENT_GROUP = False

@property
def _user_data(self) -> str:
Expand All @@ -297,4 +309,16 @@ def _user_data(self) -> str:
).to_string()


class PlacementGroup(ClusterBase):

@property
def _user_data(self) -> str:
return ''

def provision(self):
if self.placement_group_name:
create_cluster_placement_groups_aws(
name=self.placement_group_name, tags=self.common_tags, region=self._region(0))


ClusterNode.update_forward_refs()
5 changes: 4 additions & 1 deletion sdcm/sct_provision/aws/instance_parameters_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class AWSInstanceParamsBuilder(AWSInstanceParamsBuilderBase, metaclass=abc.ABCMe
region_id: int = Field(as_dict=False)
user_data_raw: Union[str, UserDataBuilderBase] = Field(as_dict=False)
availability_zone: int = 0
placement_group: str = None

_INSTANCE_TYPE_PARAM_NAME: str = None
_IMAGE_ID_PARAM_NAME: str = None
Expand Down Expand Up @@ -82,7 +83,9 @@ def InstanceType(self) -> str: # pylint: disable=invalid-name

@property
def Placement(self) -> Optional[AWSPlacementInfo]: # pylint: disable=invalid-name
return AWSPlacementInfo(AvailabilityZone=self._region_name + self._availability_zones[self.availability_zone])
return AWSPlacementInfo(
AvailabilityZone=self._region_name + self._availability_zones[self.availability_zone],
GroupName=self.placement_group)

@property
def UserData(self) -> Optional[str]: # pylint: disable=invalid-name
Expand Down
13 changes: 12 additions & 1 deletion sdcm/sct_provision/aws/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@

from functools import cached_property

from sdcm.sct_provision.aws.cluster import OracleDBCluster, DBCluster, LoaderCluster, MonitoringCluster
from sdcm.sct_provision.aws.cluster import (
OracleDBCluster, DBCluster, LoaderCluster, MonitoringCluster, PlacementGroup)
from sdcm.sct_provision.common.layout import SCTProvisionLayout
from sdcm.test_config import TestConfig

Expand All @@ -25,6 +26,8 @@ def _test_config(self):
return TestConfig()

def provision(self):
if self.placement_group:
self.placement_group.provision()
if self.db_cluster:
self.db_cluster.provision()
if self.monitoring_cluster:
Expand Down Expand Up @@ -67,3 +70,11 @@ def cs_db_cluster(self):
common_tags=self._test_config.common_tags(),
test_id=self._test_config.test_id(),
)

@cached_property
def placement_group(self):
return PlacementGroup(
params=self._params,
common_tags=self._test_config.common_tags(),
test_id=self._test_config.test_id(),
)
7 changes: 5 additions & 2 deletions sdcm/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
from mypy_boto3_s3 import S3Client, S3ServiceResource
from mypy_boto3_ec2 import EC2Client, EC2ServiceResource
from mypy_boto3_ec2.service_resource import Image as EC2Image
from botocore.exceptions import ClientError
import docker # pylint: disable=wrong-import-order; false warning because of docker import (local file vs. package)
from google.cloud.storage import Blob as GceBlob
from google.cloud.compute_v1.types import Metadata as GceMetadata, Instance as GceInstance
Expand Down Expand Up @@ -3215,9 +3216,11 @@ def get_placement_groups(region):
return placement_groups


@retrying(n=30, sleep_time=10, allowed_exceptions=(ClientError,))
def clean_placement_groups_aws(tags_dict: dict, regions=None, dry_run=False):
"""Remove all placement groups with specific tags in AWS."""
# pylint: disable=too-many-locals,import-outside-toplevel
tags_dict = {key: value for key, value in tags_dict.items() if key != "NodeType"}
assert tags_dict, "tags_dict not provided (can't clean all instances)"
if regions:
aws_placement_groups = {}
Expand All @@ -3240,6 +3243,6 @@ def clean_placement_groups_aws(tags_dict: dict, regions=None, dry_run=False):
response = client.delete_placement_group(GroupName=name)
LOGGER.debug("Done. Result: %s\n", response)
except Exception as ex: # pylint: disable=broad-except
LOGGER.debug("Failed with: %s", str(ex))

LOGGER.info("Failed with: %s", str(ex))
raise
# ----------
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,5 @@ custom_es_index: 'performancestatsv2'
use_hdr_cs_histogram: true

append_scylla_args: '--blocked-reactor-notify-ms 5 --abort-on-lsa-bad-alloc 1 --abort-on-seastar-bad-alloc --abort-on-internal-error 1 --abort-on-ebadf 1'

use_placement_group: true
20 changes: 20 additions & 0 deletions vars/perfRegressionParallelPipeline.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,26 @@ def call(Map pipelineParams) {
}
}

stage("Provision Resources for ${sub_test}") {
catchError() {
script {
wrap([$class: 'BuildUser']) {
dir('scylla-cluster-tests') {
timeout(time: 30, unit: 'MINUTES') {
if (params.backend == 'aws' || params.backend == 'azure') {
provisionResources(params, builder.region)
} else {
sh """
echo 'Skipping because non-AWS/Azure backends are not supported'
"""
}
}
}
}
}
}
}

stage("Run ${sub_test}"){
catchError(stageResult: 'FAILURE') {
wrap([$class: 'BuildUser']) {
Expand Down

0 comments on commit c290980

Please sign in to comment.