diff --git a/nodejs/eks/nodegroup.ts b/nodejs/eks/nodegroup.ts
index 3de9fd4e6..35fc7e025 100644
--- a/nodejs/eks/nodegroup.ts
+++ b/nodejs/eks/nodegroup.ts
@@ -2113,7 +2113,7 @@ export function createManagedNodeGroupInternal(
     // amiType, releaseVersion and version cannot be set if an AMI ID is set in a custom launch template.
     // The AMI ID is set in the launch template if custom user data is required.
     // See https://docs.aws.amazon.com/eks/latest/userguide/launch-templates.html#mng-ami-id-conditions
-    if (requiresCustomUserData(userDataArgs) || args.userData) {
+    if (requiresCustomUserData(userDataArgs) || args.userData || args.amiId) {
         delete nodeGroupArgs.version;
         delete nodeGroupArgs.releaseVersion;
         delete nodeGroupArgs.amiType;
@@ -2228,7 +2228,9 @@ function createMNGCustomLaunchTemplate(
     });
 
     let userData: pulumi.Output<string> | undefined;
-    if (requiresCustomUserData(customUserDataArgs) || args.userData) {
+    // When an amiId is provided we need to create a custom user data script, because
+    // EKS does not provide default user data when an AMI ID is set in the launch template.
+    if (requiresCustomUserData(customUserDataArgs) || args.userData || args.amiId) {
         userData = pulumi
             .all([
                 clusterMetadata,
diff --git a/tests/internal/utils.go b/tests/internal/utils.go
index 071cd39ef..d06468d98 100644
--- a/tests/internal/utils.go
+++ b/tests/internal/utils.go
@@ -565,7 +565,7 @@ func validateCluster(t *testing.T, ctx context.Context, clusterName string, auth
 		err := RetryWithExponentialBackoff(ctx, 2*time.Second, func() error {
 			err := ValidateDeploymentHealth(t, clientset, deployment.Namespace, deployment.Name)
 			if err != nil {
-				t.Logf("Detected unhelathy deployment %q in namespace %q of cluster %s: %v", deployment.Name, deployment.Namespace, clusterName, err)
+				t.Logf("Detected unhealthy deployment %q in namespace %q of cluster %s: %v", deployment.Name, deployment.Namespace, clusterName, err)
 				labelSelector, selectorErr := findDeploymentLabelSelector(clientset, deployment.Namespace, deployment.Name, clusterName)
 				if selectorErr != nil {
 					return errors.Join(err, selectorErr)
@@ -586,7 +586,7 @@ func validateCluster(t *testing.T, ctx context.Context, clusterName string, auth
 		err := RetryWithExponentialBackoff(ctx, 2*time.Second, func() error {
 			err := ValidateDaemonSetHealth(t, clientset, daemonset.Namespace, daemonset.Name)
 			if err != nil {
-				t.Logf("Detected unhelathy daemonset %q in namespace %q of cluster %s: %v", daemonset.Name, daemonset.Namespace, clusterName, err)
+				t.Logf("Detected unhealthy daemonset %q in namespace %q of cluster %s: %v", daemonset.Name, daemonset.Namespace, clusterName, err)
 				labelSelector, selectorErr := findDaemonSetLabelSelector(clientset, daemonset.Namespace, daemonset.Name, clusterName)
 				if selectorErr != nil {
 					return errors.Join(err, selectorErr)
@@ -736,6 +736,36 @@ func mapClusterToNodeGroups(resources []apitype.ResourceV3) (clusterNodeGroupMap
 // Examples: i-1234567f, i-0123456789abcdef0
 var eC2InstanceIDPattern = regexp.MustCompile(`^i-[a-f0-9]{8,17}$`)
 
+// GetEC2InstanceIDs extracts the EC2 instance IDs from the given list of nodes.
+// It returns a map of instance IDs to nodes.
+func GetEC2InstanceIDs(nodes *corev1.NodeList) map[string]*corev1.Node {
+	if nodes == nil {
+		return nil
+	}
+
+	instanceIDToNode := make(map[string]*corev1.Node)
+
+	// Extract instance IDs from all nodes
+	for i := range nodes.Items {
+		node := &nodes.Items[i]
+		// Extract EC2 instance ID from node provider ID
+		// Provider ID format: aws:///az/i-1234567890abcdef0
+		providerID := node.Spec.ProviderID
+		instanceID := strings.TrimPrefix(providerID, "aws:///")
+		instanceID = strings.TrimPrefix(instanceID, strings.Split(instanceID, "/")[0]+"/")
+
+		// the list of k8s nodes can include nodes that are not EC2 instances (e.g. Fargate),
+		// those are not part of any ASGs. We already validated that all nodes are ready, so we can
+		// skip the nodes that are not EC2 instances. Their ID is not valid input to the
+		// DescribeAutoScalingInstances API used later and would cause an error.
+		if eC2InstanceIDPattern.MatchString(instanceID) {
+			instanceIDToNode[instanceID] = node
+		}
+	}
+
+	return instanceIDToNode
+}
+
 // ValidateNodeGroupInstances validates that all nodes in the cluster are healthy and have successfully joined.
 // It checks that each ASG has the expected number of instances and that all instances are properly registered as nodes.
 func ValidateNodeGroupInstances(t *testing.T, clientset *kubernetes.Clientset, asgClient *autoscaling.Client, ec2Client *ec2.Client, clusterName string, expectedNodeGroups map[string]nodeGroupData) error {
@@ -774,26 +804,11 @@ func ValidateNodeGroupInstances(t *testing.T, clientset *kubernetes.Clientset, a
 	}
 
 	asgInstanceCounts := make(map[string]int)
-	instanceIDs := make([]string, 0, len(nodes.Items))
-	instanceIDToNode := make(map[string]*corev1.Node)
-	// Extract instance IDs from all nodes
-	for i := range nodes.Items {
-		node := &nodes.Items[i]
-		// Extract EC2 instance ID from node provider ID
-		// Provider ID format: aws:///az/i-1234567890abcdef0
-		providerID := node.Spec.ProviderID
-		instanceID := strings.TrimPrefix(providerID, "aws:///")
-		instanceID = strings.TrimPrefix(instanceID, strings.Split(instanceID, "/")[0]+"/")
-
-		// the list of k8s nodes can include nodes that are not EC2 instances (e.g. Fargate),
-		// those are not part of any ASGs. We already validated that all nodes are ready, so we can
-		// skip the nodes that are not EC2 instances. Their ID is not valid input to the
-		// DescribeAutoScalingInstances API used later and would cause an error.
-		if eC2InstanceIDPattern.MatchString(instanceID) {
-			instanceIDs = append(instanceIDs, instanceID)
-			instanceIDToNode[instanceID] = node
-		}
-	}
+	instanceIDToNode := GetEC2InstanceIDs(nodes)
+	instanceIDs := make([]string, 0, len(instanceIDToNode))
+	for instanceID := range instanceIDToNode {
+		instanceIDs = append(instanceIDs, instanceID)
+	}
 
 	// For each node, get its EC2 instance ID, find which ASG it belongs to and verify that EC2 reports the instance as running.
@@ -864,6 +879,70 @@ func ValidateNodeGroupInstances(t *testing.T, clientset *kubernetes.Clientset, a
 	return nil
 }
 
+// FindNodesWithAmiID finds the nodes in a cluster that have the given AMI ID.
+// It returns a list of EC2 instance IDs.
+func FindNodesWithAmiID(t *testing.T, kubeconfig any, amiID string) ([]string, error) {
+	clientSet, err := clientSetFromKubeconfig(kubeconfig)
+	if err != nil {
+		return nil, err
+	}
+
+	nodes, err := clientSet.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
+	if err != nil {
+		return nil, err
+	}
+
+	instanceIDToNode := GetEC2InstanceIDs(nodes)
+	if len(instanceIDToNode) == 0 {
+		return nil, nil
+	}
+
+	var region string
+	var profile string
+	if p, ok := os.LookupEnv("AWS_PROFILE"); ok {
+		profile = p
+	}
+	if r, ok := os.LookupEnv("AWS_REGION"); ok {
+		region = r
+	}
+	awsConfig := loadAwsDefaultConfig(t, region, profile)
+	ec2Client := ec2.NewFromConfig(awsConfig)
+
+	nodesWithAmiID := make([]string, 0, len(instanceIDToNode))
+	// Collect the instance IDs so they can be passed to the EC2 API
+	instanceIDs := make([]string, 0, len(instanceIDToNode))
+	for id := range instanceIDToNode {
+		instanceIDs = append(instanceIDs, id)
+	}
+
+	// Describe instances in batches of 255 (AWS API limit)
+	for i := 0; i < len(instanceIDs); i += 255 {
+		end := i + 255
+		if end > len(instanceIDs) {
+			end = len(instanceIDs)
+		}
+
+		batch := instanceIDs[i:end]
+		describeInput := &ec2.DescribeInstancesInput{
+			InstanceIds: batch,
+		}
+		describeResult, err := ec2Client.DescribeInstances(context.TODO(), describeInput)
+		if err != nil {
+			return nil, err
+		}
+
+		for _, reservation := range describeResult.Reservations {
+			for _, instance := range reservation.Instances {
+				if *instance.ImageId == amiID {
+					nodesWithAmiID = append(nodesWithAmiID, *instance.InstanceId)
+				}
+			}
+		}
+	}
+
+	return nodesWithAmiID, nil
+}
+
 // ValidateNodes validates the nodes in a cluster contain the expected values.
 func ValidateNodes(t *testing.T, kubeconfig any, validateFn func(*corev1.NodeList)) error {
 	clientSet, err := clientSetFromKubeconfig(kubeconfig)
diff --git a/tests/nodejs_test.go b/tests/nodejs_test.go
index 16a5feb36..b227a6ef0 100644
--- a/tests/nodejs_test.go
+++ b/tests/nodejs_test.go
@@ -17,6 +17,7 @@ package tests
 
 import (
+	"context"
 	"fmt"
 	"os"
 	"path"
@@ -651,6 +652,23 @@ func TestAccManagedNodeGroupOS(t *testing.T) {
 				}
 				assert.Equal(t, 2, foundNodes, "Expected %s nodes with Nvidia GPUs", foundNodes)
 			}))
+
+			// Validate that the nodes with the custom AMI ID are present
+			customAmiId := info.Outputs["customAmiId"].(string)
+			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+			defer cancel()
+			err := utils.RetryWithExponentialBackoff(ctx, 10*time.Second, func() error {
+				nodes, err := utils.FindNodesWithAmiID(t, info.Outputs["kubeconfig"], customAmiId)
+				if err != nil {
+					return err
+				}
+				t.Logf("Found %d nodes with custom AMI ID: %v", len(nodes), nodes)
+				if len(nodes) != 2 {
+					return fmt.Errorf("expected 2 nodes with custom AMI ID, got %d", len(nodes))
+				}
+				return nil
+			})
+			require.NoError(t, err)
 		},
 	})
 
diff --git a/tests/testdata/programs/managed-ng-os/index.ts b/tests/testdata/programs/managed-ng-os/index.ts
index 42f159e46..45d90e36b 100644
--- a/tests/testdata/programs/managed-ng-os/index.ts
+++ b/tests/testdata/programs/managed-ng-os/index.ts
@@ -306,8 +306,15 @@ const managedNodeGroupBottlerocketTaints = eks.createManagedNodeGroup("bottleroc
     },
 });
 
-// Create a simple AL2023 node group with a custom user data and a custom AMI
-const amiId = pulumi.interpolate`/aws/service/eks/optimized-ami/${cluster.core.cluster.version}/amazon-linux-2023/x86_64/standard/recommended/image_id`.apply(name =>
+// Fetch the previous minor version. This way we test that the user-specified AMI is used instead of the default one.
+// MNGs tolerate a skew of 1 minor version
+export const customAmiId = pulumi.all([cluster.eksCluster.version]).apply(([version]) => {
+    const versionParts = version.split('.');
+    const majorVersion = parseInt(versionParts[0], 10);
+    const minorVersion = parseInt(versionParts[1], 10) - 1;
+    const versionString = `${majorVersion}.${minorVersion}`;
+    return pulumi.interpolate`/aws/service/eks/optimized-ami/${versionString}/amazon-linux-2023/x86_64/standard/recommended/image_id`;
+}).apply(name =>
     aws.ssm.getParameter({ name }, { async: true })
 ).apply(result => result.value);
 
@@ -318,6 +325,7 @@ const customUserData = userdata.createUserData({
     serviceCidr: cluster.core.cluster.kubernetesNetworkConfig.serviceIpv4Cidr,
 }, `--max-pods=${increasedPodCapacity} --node-labels=increased-pod-capacity=true`);
 
+// Create a simple AL2023 node group with a custom user data and a custom AMI
 const managedNodeGroupAL2023CustomUserData = eks.createManagedNodeGroup("al-2023-mng-custom-userdata", {
     ...baseConfig,
     operatingSystem: eks.OperatingSystem.AL2023,
@@ -326,7 +334,16 @@ const managedNodeGroupAL2023CustomUserData = eks.createManagedNodeGroup("al-2023
     nodeRole: role,
     diskSize: 100,
     userData: customUserData,
-    amiId,
+    amiId: customAmiId,
+});
+
+const managedNodeGroupAL2023CustomAmi = eks.createManagedNodeGroup("al-2023-mng-custom-ami", {
+    ...baseConfig,
+    operatingSystem: eks.OperatingSystem.AL2023,
+    cluster: cluster,
+    instanceTypes: ["t3.medium"],
+    nodeRole: role,
+    amiId: customAmiId,
 });
 
 const managedNodeGroupAL2023NodeadmExtraOptions = eks.createManagedNodeGroup("al-2023-mng-extra-options", {