Skip to content

Commit

Permalink
chore: Update GPU patterns to use new AL2023 NVIDIA AMI variant and latest EKS 1.31 (#2031)
Browse files Browse the repository at this point in the history
  • Loading branch information
bryantbiggs authored Oct 14, 2024
1 parent e7863cf commit 9ec1d47
Show file tree
Hide file tree
Showing 15 changed files with 118 additions and 111 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/streetsidesoftware/cspell-cli
rev: v8.13.3
rev: v8.15.1
hooks:
- id: cspell
args: [--exclude, 'ADOPTERS.md', --exclude, '.pre-commit-config.yaml', --exclude, '.gitignore', --exclude, '*.drawio', --exclude, 'mkdocs.yml', --exclude, '.helmignore', --exclude, '.github/workflows/*', --exclude, 'patterns/istio-multi-cluster/*', --exclude, 'patterns/blue-green-upgrade/*', --exclude, '/patterns/vpc-lattice/cross-cluster-pod-communication/*', --exclude, 'patterns/bottlerocket/*', --exclude, 'patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh']
Expand All @@ -10,7 +10,7 @@ repos:
- id: pretty-format-yaml
args: [--autofix, --indent, '2', --offset, '2', --preserve-quotes]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
Expand Down
2 changes: 1 addition & 1 deletion patterns/fargate-serverless/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started
3. Validate the `aws-logging` configMap for Fargate Fluentbit was created:

```sh
kubectl -n aws-observability get configmap aws-logging
kubectl -n aws-observability get configmap aws-logging
```

```yaml
Expand Down
2 changes: 1 addition & 1 deletion patterns/ml-capacity-block/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ This pattern demonstrates how to consume/utilize ML capacity block reservations

## Code

```terraform hl_lines="5-11 80-94 106-109 138-151"
```terraform hl_lines="5-11 93-107 119-122 161-174"
{% include "../../patterns/ml-capacity-block/eks.tf" %}
```

Expand Down
77 changes: 50 additions & 27 deletions patterns/ml-capacity-block/eks.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ variable "capacity_reservation_id" {

module "eks" {
source = "terraform-aws-modules/eks/aws"
version = "~> 20.17"
version = "~> 20.26"

cluster_name = local.name
cluster_version = "1.30"
cluster_version = "1.31"

# Give the Terraform identity admin access to the cluster
# which will allow it to deploy resources into the cluster
Expand All @@ -30,7 +30,9 @@ module "eks" {
coredns = {}
eks-pod-identity-agent = {}
kube-proxy = {}
vpc-cni = {}
vpc-cni = {
most_recent = true
}
}

# Add security group rules on the node group security group to
Expand All @@ -42,16 +44,27 @@ module "eks" {

eks_managed_node_groups = {
cbr = {
# The EKS AL2 GPU AMI provides all of the necessary components
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
# for accelerated workloads w/ EFA
ami_type = "AL2_x86_64_GPU"
instance_types = ["p5.48xlarge"]

pre_bootstrap_user_data = <<-EOT
# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
/bin/setup-local-disks raid0
EOT
ami_type = "AL2023_x86_64_NVIDIA"
instance_types = ["p5e.48xlarge"]

# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
cloudinit_pre_nodeadm = [
{
content_type = "application/node.eks.aws"
content = <<-EOT
---
apiVersion: node.eks.aws/v1alpha1
kind: NodeConfig
spec:
instance:
localStorage:
strategy: RAID0
EOT
}
]

min_size = 2
max_size = 2
Expand Down Expand Up @@ -97,7 +110,7 @@ module "eks" {
default = {
instance_types = ["m5.large"]

min_size = 1
min_size = 2
max_size = 2
desired_size = 2
}
Expand All @@ -109,21 +122,31 @@ module "eks" {
# the one that works for their use case.
self_managed_node_groups = {
cbr2 = {
# The EKS AL2 GPU AMI provides all of the necessary components
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
# for accelerated workloads w/ EFA
ami_type = "AL2_x86_64_GPU"
instance_type = "p5.48xlarge"

pre_bootstrap_user_data = <<-EOT
# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
/bin/setup-local-disks raid0
# Ensure only GPU workloads are scheduled on this node group
export KUBELET_EXTRA_ARGS='--node-labels=vpc.amazonaws.com/efa.present=true,nvidia.com/gpu.present=true \
--register-with-taints=nvidia.com/gpu=true:NoSchedule'
EOT
ami_type = "AL2023_x86_64_NVIDIA"
instance_type = "p5e.48xlarge"

# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
cloudinit_pre_nodeadm = [
{
content_type = "application/node.eks.aws"
content = <<-EOT
---
apiVersion: node.eks.aws/v1alpha1
kind: NodeConfig
spec:
instance:
localStorage:
strategy: RAID0
kubelet:
flags:
- --node-labels=vpc.amazonaws.com/efa.present=true,nvidia.com/gpu.present=true
- --register-with-taints=nvidia.com/gpu=true:NoSchedule
EOT
}
]

min_size = 2
max_size = 2
Expand Down
18 changes: 2 additions & 16 deletions patterns/ml-capacity-block/helm.tf
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,17 @@ resource "helm_release" "nvidia_device_plugin" {
name = "nvidia-device-plugin"
repository = "https://nvidia.github.io/k8s-device-plugin"
chart = "nvidia-device-plugin"
version = "0.14.5"
version = "0.16.2"
namespace = "nvidia-device-plugin"
create_namespace = true
wait = false

values = [
<<-EOT
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: 'nvidia.com/gpu.present'
operator: In
values:
- 'true'
EOT
]
}

resource "helm_release" "aws_efa_device_plugin" {
name = "aws-efa-k8s-device-plugin"
repository = "https://aws.github.io/eks-charts"
chart = "aws-efa-k8s-device-plugin"
version = "v0.5.2"
version = "v0.5.5"
namespace = "kube-system"
wait = false

Expand Down
4 changes: 2 additions & 2 deletions patterns/ml-capacity-block/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = ">= 5.57"
version = ">= 5.70"
}
helm = {
source = "hashicorp/helm"
version = ">= 2.9"
version = ">= 2.16"
}
}

Expand Down
4 changes: 2 additions & 2 deletions patterns/nvidia-gpu-efa/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ The following components are demonstrated in this pattern:

## Code

```terraform hl_lines="24-26 32-67"
```terraform hl_lines="26-28 34-80"
{% include "../../patterns/nvidia-gpu-efa/eks.tf" %}
```

```terraform hl_lines="5-47"
```terraform hl_lines="5-33"
{% include "../../patterns/nvidia-gpu-efa/helm.tf" %}
```

Expand Down
33 changes: 23 additions & 10 deletions patterns/nvidia-gpu-efa/eks.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

module "eks" {
source = "terraform-aws-modules/eks/aws"
version = "~> 20.17"
version = "~> 20.26"

cluster_name = local.name
cluster_version = "1.30"
cluster_version = "1.31"

# Give the Terraform identity admin access to the cluster
# which will allow it to deploy resources into the cluster
Expand All @@ -18,7 +18,9 @@ module "eks" {
coredns = {}
eks-pod-identity-agent = {}
kube-proxy = {}
vpc-cni = {}
vpc-cni = {
most_recent = true
}
}

# Add security group rules on the node group security group to
Expand All @@ -30,16 +32,27 @@ module "eks" {

eks_managed_node_groups = {
nvidia-efa = {
# The EKS AL2 GPU AMI provides all of the necessary components
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
# for accelerated workloads w/ EFA
ami_type = "AL2_x86_64_GPU"
ami_type = "AL2023_x86_64_NVIDIA"
instance_types = ["p5.48xlarge"]

pre_bootstrap_user_data = <<-EOT
# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
/bin/setup-local-disks raid0
EOT
# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
cloudinit_pre_nodeadm = [
{
content_type = "application/node.eks.aws"
content = <<-EOT
---
apiVersion: node.eks.aws/v1alpha1
kind: NodeConfig
spec:
instance:
localStorage:
strategy: RAID0
EOT
}
]

min_size = 2
max_size = 2
Expand Down
2 changes: 1 addition & 1 deletion patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

export MPI_JOB_NAME=efa-nccl-test
export IMAGE_URI=public.ecr.aws/hpc-cloud/nccl-tests:latest
export INSTANCE_TYPE=p5.48xlarge
export INSTANCE_TYPE=p5e.48xlarge
export NUM_WORKERS=2
export GPU_PER_WORKER=8
export EFA_PER_WORKER=32
Expand Down
18 changes: 2 additions & 16 deletions patterns/nvidia-gpu-efa/helm.tf
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,17 @@ resource "helm_release" "nvidia_device_plugin" {
name = "nvidia-device-plugin"
repository = "https://nvidia.github.io/k8s-device-plugin"
chart = "nvidia-device-plugin"
version = "0.14.5"
version = "0.16.2"
namespace = "nvidia-device-plugin"
create_namespace = true
wait = false

values = [
<<-EOT
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: 'nvidia.com/gpu.present'
operator: In
values:
- 'true'
EOT
]
}

resource "helm_release" "aws_efa_device_plugin" {
name = "aws-efa-k8s-device-plugin"
repository = "https://aws.github.io/eks-charts"
chart = "aws-efa-k8s-device-plugin"
version = "v0.5.2"
version = "v0.5.5"
namespace = "kube-system"
wait = false

Expand Down
4 changes: 2 additions & 2 deletions patterns/nvidia-gpu-efa/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = ">= 5.34"
version = ">= 5.70"
}
helm = {
source = "hashicorp/helm"
version = ">= 2.9"
version = ">= 2.16"
}
}

Expand Down
2 changes: 1 addition & 1 deletion patterns/targeted-odcr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ This pattern demonstrates how to consume/utilize on-demand capacity reservations

## Code

```terraform hl_lines="5-8 81-88 108-131"
```terraform hl_lines="5-8 94-104 124-147"
{% include "../../patterns/targeted-odcr/eks.tf" %}
```

Expand Down
37 changes: 25 additions & 12 deletions patterns/targeted-odcr/eks.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ variable "capacity_reservation_arns" {

module "eks" {
source = "terraform-aws-modules/eks/aws"
version = "~> 20.17"
version = "~> 20.26"

cluster_name = local.name
cluster_version = "1.30"
cluster_version = "1.31"

# Give the Terraform identity admin access to the cluster
# which will allow it to deploy resources into the cluster
Expand All @@ -27,7 +27,9 @@ module "eks" {
coredns = {}
eks-pod-identity-agent = {}
kube-proxy = {}
vpc-cni = {}
vpc-cni = {
most_recent = true
}
}

# Add security group rules on the node group security group to
Expand All @@ -39,16 +41,27 @@ module "eks" {

eks_managed_node_groups = {
odcr = {
# The EKS AL2 GPU AMI provides all of the necessary components
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
# for accelerated workloads w/ EFA
ami_type = "AL2_x86_64_GPU"
instance_type = "p5.48xlarge"

pre_bootstrap_user_data = <<-EOT
# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
/bin/setup-local-disks raid0
EOT
ami_type = "AL2023_x86_64_NVIDIA"
instance_types = ["p5.48xlarge"]

# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
cloudinit_pre_nodeadm = [
{
content_type = "application/node.eks.aws"
content = <<-EOT
---
apiVersion: node.eks.aws/v1alpha1
kind: NodeConfig
spec:
instance:
localStorage:
strategy: RAID0
EOT
}
]

min_size = 2
max_size = 2
Expand Down
Loading

0 comments on commit 9ec1d47

Please sign in to comment.