Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix ManagedNodeGroup ignoring custom AMI if no user data is configured #1562

Merged
merged 2 commits into from
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions nodejs/eks/nodegroup.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2113,7 +2113,7 @@ export function createManagedNodeGroupInternal(
// amiType, releaseVersion and version cannot be set if an AMI ID is set in a custom launch template.
// The AMI ID is set in the launch template if custom user data is required.
// See https://docs.aws.amazon.com/eks/latest/userguide/launch-templates.html#mng-ami-id-conditions
if (requiresCustomUserData(userDataArgs) || args.userData) {
if (requiresCustomUserData(userDataArgs) || args.userData || args.amiId) {
delete nodeGroupArgs.version;
delete nodeGroupArgs.releaseVersion;
delete nodeGroupArgs.amiType;
Expand Down Expand Up @@ -2228,7 +2228,9 @@ function createMNGCustomLaunchTemplate(
});

let userData: pulumi.Output<string> | undefined;
if (requiresCustomUserData(customUserDataArgs) || args.userData) {
// when amiId is provided, we need to create a custom user data script because
// EKS will not provide default user data when an AMI ID is provided.
if (requiresCustomUserData(customUserDataArgs) || args.userData || args.amiId) {
userData = pulumi
.all([
clusterMetadata,
Expand Down
121 changes: 100 additions & 21 deletions tests/internal/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ func validateCluster(t *testing.T, ctx context.Context, clusterName string, auth
err := RetryWithExponentialBackoff(ctx, 2*time.Second, func() error {
err := ValidateDeploymentHealth(t, clientset, deployment.Namespace, deployment.Name)
if err != nil {
t.Logf("Detected unhelathy deployment %q in namespace %q of cluster %s: %v", deployment.Name, deployment.Namespace, clusterName, err)
t.Logf("Detected unhealthy deployment %q in namespace %q of cluster %s: %v", deployment.Name, deployment.Namespace, clusterName, err)
labelSelector, selectorErr := findDeploymentLabelSelector(clientset, deployment.Namespace, deployment.Name, clusterName)
if selectorErr != nil {
return errors.Join(err, selectorErr)
Expand All @@ -586,7 +586,7 @@ func validateCluster(t *testing.T, ctx context.Context, clusterName string, auth
err := RetryWithExponentialBackoff(ctx, 2*time.Second, func() error {
err := ValidateDaemonSetHealth(t, clientset, daemonset.Namespace, daemonset.Name)
if err != nil {
t.Logf("Detected unhelathy daemonset %q in namespace %q of cluster %s: %v", daemonset.Name, daemonset.Namespace, clusterName, err)
t.Logf("Detected unhealthy daemonset %q in namespace %q of cluster %s: %v", daemonset.Name, daemonset.Namespace, clusterName, err)
labelSelector, selectorErr := findDaemonSetLabelSelector(clientset, daemonset.Namespace, daemonset.Name, clusterName)
if selectorErr != nil {
return errors.Join(err, selectorErr)
Expand Down Expand Up @@ -736,6 +736,36 @@ func mapClusterToNodeGroups(resources []apitype.ResourceV3) (clusterNodeGroupMap
// Examples: i-1234567f, i-0123456789abcdef0
var eC2InstanceIDPattern = regexp.MustCompile(`^i-[a-f0-9]{8,17}$`)

// GetEC2InstanceIDs extracts the EC2 instance IDs from the given list of nodes.
// It returns a map of instance IDs to nodes.
func GetEC2InstanceIDs(nodes *corev1.NodeList) map[string]*corev1.Node {
	if nodes == nil {
		return nil
	}

	instanceIDToNode := make(map[string]*corev1.Node)

	for i := range nodes.Items {
		node := &nodes.Items[i]
		// The provider ID has the form "aws:///<az>/<instance-id>".
		trimmed := strings.TrimPrefix(node.Spec.ProviderID, "aws:///")
		// Drop the availability-zone segment: keep everything after the first "/".
		// If there is no "/" at all, keep the string unchanged.
		id := trimmed
		if _, rest, found := strings.Cut(trimmed, "/"); found {
			id = rest
		}

		// The list of k8s nodes can include nodes that are not EC2 instances
		// (e.g. Fargate); those are not part of any ASGs. We already validated
		// that all nodes are ready, so we can skip the nodes that are not EC2
		// instances. Their ID is not valid input to the
		// DescribeAutoScalingInstances API used later and would cause an error.
		if eC2InstanceIDPattern.MatchString(id) {
			instanceIDToNode[id] = node
		}
	}

	return instanceIDToNode
}

// ValidateNodeGroupInstances validates that all nodes in the cluster are healthy and have successfully joined.
// It checks that each ASG has the expected number of instances and that all instances are properly registered as nodes.
func ValidateNodeGroupInstances(t *testing.T, clientset *kubernetes.Clientset, asgClient *autoscaling.Client, ec2Client *ec2.Client, clusterName string, expectedNodeGroups map[string]nodeGroupData) error {
Expand Down Expand Up @@ -774,26 +804,11 @@ func ValidateNodeGroupInstances(t *testing.T, clientset *kubernetes.Clientset, a
}

asgInstanceCounts := make(map[string]int)
instanceIDs := make([]string, 0, len(nodes.Items))
instanceIDToNode := make(map[string]*corev1.Node)

// Extract instance IDs from all nodes
for i := range nodes.Items {
node := &nodes.Items[i]
// Extract EC2 instance ID from node provider ID
// Provider ID format: aws:///az/i-1234567890abcdef0
providerID := node.Spec.ProviderID
instanceID := strings.TrimPrefix(providerID, "aws:///")
instanceID = strings.TrimPrefix(instanceID, strings.Split(instanceID, "/")[0]+"/")

// the list of k8s nodes can include nodes that are not EC2 instances (e.g. Fargate),
// those are not part of any ASGs. We already validated that all nodes are ready, so we can
// skip the nodes that are not EC2 instances. Their ID is not valid input to the
// DescribeAutoScalingInstances API used later and would cause an error.
if eC2InstanceIDPattern.MatchString(instanceID) {
instanceIDs = append(instanceIDs, instanceID)
instanceIDToNode[instanceID] = node
}
instanceIDToNode := GetEC2InstanceIDs(nodes)
instanceIDs := make([]string, 0, len(instanceIDToNode))
for instanceID := range instanceIDToNode {
instanceIDs = append(instanceIDs, instanceID)
}

// For each node, get its EC2 instance ID, find which ASG it belongs to and verify that EC2 reports the instance as running.
Expand Down Expand Up @@ -864,6 +879,70 @@ func ValidateNodeGroupInstances(t *testing.T, clientset *kubernetes.Clientset, a
return nil
}

// FindNodesWithAmiID finds the nodes in a cluster that have the given AMI ID.
// It returns a list of EC2 instance IDs.
//
// The kubeconfig is used to list the cluster's nodes; AWS credentials/region
// are picked up from the AWS_PROFILE/AWS_REGION environment variables if set.
func FindNodesWithAmiID(t *testing.T, kubeconfig any, amiID string) ([]string, error) {
	clientSet, err := clientSetFromKubeconfig(kubeconfig)
	if err != nil {
		return nil, err
	}

	nodes, err := clientSet.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
	if err != nil {
		return nil, err
	}

	instanceIDToNode := GetEC2InstanceIDs(nodes)
	if len(instanceIDToNode) == 0 {
		return nil, nil
	}

	// os.Getenv returns "" when unset, which matches the zero-value defaults
	// loadAwsDefaultConfig expects.
	region := os.Getenv("AWS_REGION")
	profile := os.Getenv("AWS_PROFILE")
	awsConfig := loadAwsDefaultConfig(t, region, profile)
	ec2Client := ec2.NewFromConfig(awsConfig)

	nodesWithAmiID := make([]string, 0, len(instanceIDToNode))
	instanceIDs := make([]string, 0, len(instanceIDToNode))
	for id := range instanceIDToNode {
		instanceIDs = append(instanceIDs, id)
	}

	// Describe instances in batches of 255 (AWS API limit).
	const batchSize = 255
	for i := 0; i < len(instanceIDs); i += batchSize {
		end := i + batchSize
		if end > len(instanceIDs) {
			end = len(instanceIDs)
		}

		describeInput := &ec2.DescribeInstancesInput{
			InstanceIds: instanceIDs[i:end],
		}
		describeResult, err := ec2Client.DescribeInstances(context.TODO(), describeInput)
		if err != nil {
			return nil, err
		}

		for _, reservation := range describeResult.Reservations {
			for _, instance := range reservation.Instances {
				// ImageId/InstanceId are SDK pointer fields; guard against nil
				// to avoid panicking on incomplete API responses.
				if instance.ImageId == nil || instance.InstanceId == nil {
					continue
				}
				if *instance.ImageId == amiID {
					nodesWithAmiID = append(nodesWithAmiID, *instance.InstanceId)
				}
			}
		}
	}

	return nodesWithAmiID, nil
}

// ValidateNodes validates the nodes in a cluster contain the expected values.
func ValidateNodes(t *testing.T, kubeconfig any, validateFn func(*corev1.NodeList)) error {
clientSet, err := clientSetFromKubeconfig(kubeconfig)
Expand Down
18 changes: 18 additions & 0 deletions tests/nodejs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package tests

import (
"context"
"fmt"
"os"
"path"
Expand Down Expand Up @@ -651,6 +652,23 @@ func TestAccManagedNodeGroupOS(t *testing.T) {
}
assert.Equal(t, 2, foundNodes, "Expected %s nodes with Nvidia GPUs", foundNodes)
}))

// Validate that the nodes with the custom AMI ID are present
customAmiId := info.Outputs["customAmiId"].(string)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
defer cancel()
err := utils.RetryWithExponentialBackoff(ctx, 10*time.Second, func() error {
nodes, err := utils.FindNodesWithAmiID(t, info.Outputs["kubeconfig"], customAmiId)
if err != nil {
return err
}
t.Logf("Found %d nodes with custom AMI ID: %v", len(nodes), nodes)
if len(nodes) != 2 {
return fmt.Errorf("expected 2 nodes with custom AMI ID, got %d", len(nodes))
}
return nil
})
require.NoError(t, err)
},
})

Expand Down
23 changes: 20 additions & 3 deletions tests/testdata/programs/managed-ng-os/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,15 @@ const managedNodeGroupBottlerocketTaints = eks.createManagedNodeGroup("bottleroc
},
});

// Create a simple AL2023 node group with a custom user data and a custom AMI
const amiId = pulumi.interpolate`/aws/service/eks/optimized-ami/${cluster.core.cluster.version}/amazon-linux-2023/x86_64/standard/recommended/image_id`.apply(name =>
// Fetch the previous minor version. This way we test that the user specified AMI is used instead of the default one.
// MNGs tolerate a skew of 1 minor version
export const customAmiId = pulumi.all([cluster.eksCluster.version]).apply(([version]) => {
const versionParts = version.split('.');
const majorVersion = parseInt(versionParts[0], 10);
const minorVersion = parseInt(versionParts[1], 10) - 1;
Copy link
Contributor Author

@flostadler flostadler Dec 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would break if k8s rolled out a new major version, but there is no indication that this will happen in the next few years, and handling it would make this computation unnecessarily complex.
If they ever do release one we can fix up the test (it'll break and not silently succeed).

const versionString = `${majorVersion}.${minorVersion}`;
return pulumi.interpolate`/aws/service/eks/optimized-ami/${versionString}/amazon-linux-2023/x86_64/standard/recommended/image_id`;
}).apply(name =>
aws.ssm.getParameter({ name }, { async: true })
).apply(result => result.value);

Expand All @@ -318,6 +325,7 @@ const customUserData = userdata.createUserData({
serviceCidr: cluster.core.cluster.kubernetesNetworkConfig.serviceIpv4Cidr,
}, `--max-pods=${increasedPodCapacity} --node-labels=increased-pod-capacity=true`);

// Create a simple AL2023 node group with a custom user data and a custom AMI
const managedNodeGroupAL2023CustomUserData = eks.createManagedNodeGroup("al-2023-mng-custom-userdata", {
...baseConfig,
operatingSystem: eks.OperatingSystem.AL2023,
Expand All @@ -326,7 +334,16 @@ const managedNodeGroupAL2023CustomUserData = eks.createManagedNodeGroup("al-2023
nodeRole: role,
diskSize: 100,
userData: customUserData,
amiId,
amiId: customAmiId,
});

const managedNodeGroupAL2023CustomAmi = eks.createManagedNodeGroup("al-2023-mng-custom-ami", {
...baseConfig,
operatingSystem: eks.OperatingSystem.AL2023,
cluster: cluster,
instanceTypes: ["t3.medium"],
nodeRole: role,
amiId: customAmiId,
});

const managedNodeGroupAL2023NodeadmExtraOptions = eks.createManagedNodeGroup("al-2023-mng-extra-options", {
Expand Down
Loading