From f57aae7d8a82a774f6b73bb209d9067e3d71f23c Mon Sep 17 00:00:00 2001 From: Andrew Player Date: Thu, 20 Jun 2024 15:03:41 -0500 Subject: [PATCH 01/13] update bootstrap scripted and instance type for lavas --- .github/workflows/deploy-enterprise-test.yml | 2 +- apps/compute-cf.yml.j2 | 22 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/.github/workflows/deploy-enterprise-test.yml b/.github/workflows/deploy-enterprise-test.yml index 06b1a3406..9c1b16f01 100644 --- a/.github/workflows/deploy-enterprise-test.yml +++ b/.github/workflows/deploy-enterprise-test.yml @@ -87,7 +87,7 @@ jobs: job_files: >- job_spec/INSAR_ISCE_BURST.yml job_spec/SRG_GSLC_CPU.yml - instance_types: r6id.xlarge,r6id.2xlarge,r6id.4xlarge,r6id.8xlarge,r6idn.xlarge,r6idn.2xlarge,r6idn.4xlarge,r6idn.8xlarge + instance_types: g6.2xlarge default_max_vcpus: 640 expanded_max_vcpus: 640 required_surplus: 0 diff --git a/apps/compute-cf.yml.j2 b/apps/compute-cf.yml.j2 index 0cb23e152..b8f54c18d 100644 --- a/apps/compute-cf.yml.j2 +++ b/apps/compute-cf.yml.j2 @@ -61,6 +61,28 @@ Resources: cloud-init-per instance mkfs_ssd mkfs.ext4 /dev/nvme1n1 mount /dev/nvme1n1 /var/lib/docker + DRIVER_VERSION=550.54.14 + dnf install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r) kernel-modules-extra + curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run + chmod +x NVIDIA-Linux-x86_64-$DRIVER_VERSION.run + ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --tmpdir . --silent + rm ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run + + dnf install -y docker git + systemctl start docker + systemctl enable docker + usermod -aG docker ec2-user + + dnf config-manager --add-repo https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo + dnf install -y nvidia-container-toolkit + nvidia-ctk runtime configure --runtime=docker + systemctl restart docker + + dnf install -y git + + dnf clean all && rm -rf /var/cache/dnf/* + + reboot --==BOUNDARY==-- ComputeEnvironment: From 70a98fcbd738f37abfe44dafd4f0033dd8ed444b Mon Sep 17 00:00:00 2001 From: Andrew Player Date: Fri, 21 Jun 2024 12:29:13 -0500 Subject: [PATCH 02/13] Updated GSLC Job Spec for GPU Support --- job_spec/SRG_GSLC_CPU.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/job_spec/SRG_GSLC_CPU.yml b/job_spec/SRG_GSLC_CPU.yml index 856a01293..bc4667b8b 100644 --- a/job_spec/SRG_GSLC_CPU.yml +++ b/job_spec/SRG_GSLC_CPU.yml @@ -26,10 +26,11 @@ SRG_GSLC_CPU: cost: 1.0 tasks: - name: '' - image: ghcr.io/asfhyp3/hyp3-back-projection + image: ghcr.io/asfhyp3/hyp3-back-projection:0.5.2.gpu command: - ++process - back_projection + - --gpu - --bucket - '!Ref Bucket' - --bucket-prefix From bc8e2f06ad09b4cf7e093d0aa08dea5d6d8fd0fc Mon Sep 17 00:00:00 2001 From: Jake Herrmann Date: Fri, 21 Jun 2024 09:34:27 -0800 Subject: [PATCH 03/13] add gpu resource req --- apps/workflow-cf.yml.j2 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/workflow-cf.yml.j2 b/apps/workflow-cf.yml.j2 index 642d6014d..aa03ac56e 100644 --- a/apps/workflow-cf.yml.j2 +++ b/apps/workflow-cf.yml.j2 @@ -58,6 +58,8 @@ Resources: ResourceRequirements: - Type: VCPU Value: "{{ task['vcpu'] }}" + - Type: GPU + Value: 1 - Type: MEMORY Value: "{{ task['memory'] }}" Command: From e092c57aac7b67f7756b076b0cb6c7e832ee20b0 Mon Sep 17 00:00:00 2001 From: Jake Herrmann Date: Fri, 21 Jun 2024 11:29:04 -0800 Subject: [PATCH 04/13] more gpu instance types --- .github/workflows/deploy-enterprise-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-enterprise-test.yml b/.github/workflows/deploy-enterprise-test.yml index 9c1b16f01..2b7a4af14 100644 --- a/.github/workflows/deploy-enterprise-test.yml +++ b/.github/workflows/deploy-enterprise-test.yml @@ -87,7 +87,7 @@ jobs: job_files: >- job_spec/INSAR_ISCE_BURST.yml job_spec/SRG_GSLC_CPU.yml - instance_types: g6.2xlarge + instance_types: g6.2xlarge,g6.4xlarge,g4dn.2xlarge,g4dn.4xlarge default_max_vcpus: 640 expanded_max_vcpus: 640 required_surplus: 0 From ddeb39429b36ba526854e518ee20faa88b414c53 Mon Sep 17 00:00:00 2001 From: Jake Herrmann Date: Fri, 21 Jun 2024 14:14:45 -0800 Subject: [PATCH 05/13] Remove reboot from user script --- apps/compute-cf.yml.j2 | 2 -- 1 file changed, 2 deletions(-) diff --git a/apps/compute-cf.yml.j2 b/apps/compute-cf.yml.j2 index b8f54c18d..0ddb78239 100644 --- a/apps/compute-cf.yml.j2 +++ b/apps/compute-cf.yml.j2 @@ -81,8 +81,6 @@ Resources: dnf install -y git dnf clean all && rm -rf /var/cache/dnf/* - - reboot --==BOUNDARY==-- ComputeEnvironment: From d3a991762b2ea1f117e9b1d0f4b56cbe5bc25ed3 Mon Sep 17 00:00:00 2001 From: Jake Herrmann Date: Mon, 24 Jun 2024 10:17:09 -0800 Subject: [PATCH 06/13] Revert User data script changes --- apps/compute-cf.yml.j2 | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/apps/compute-cf.yml.j2 b/apps/compute-cf.yml.j2 index 0ddb78239..0cb23e152 100644 --- a/apps/compute-cf.yml.j2 +++ b/apps/compute-cf.yml.j2 @@ -61,26 +61,6 @@ Resources: cloud-init-per instance mkfs_ssd mkfs.ext4 /dev/nvme1n1 mount /dev/nvme1n1 /var/lib/docker - DRIVER_VERSION=550.54.14 - dnf install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r) kernel-modules-extra - curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run - chmod +x NVIDIA-Linux-x86_64-$DRIVER_VERSION.run - ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --tmpdir . --silent - rm ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run - - dnf install -y docker git - systemctl start docker - systemctl enable docker - usermod -aG docker ec2-user - - dnf config-manager --add-repo https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo - dnf install -y nvidia-container-toolkit - nvidia-ctk runtime configure --runtime=docker - systemctl restart docker - - dnf install -y git - - dnf clean all && rm -rf /var/cache/dnf/* --==BOUNDARY==-- ComputeEnvironment: From 461aa272426e158c2a3ee37c70775615431d45e3 Mon Sep 17 00:00:00 2001 From: Jake Herrmann Date: Mon, 24 Jun 2024 11:01:50 -0800 Subject: [PATCH 07/13] fix image tag --- job_spec/SRG_GSLC_CPU.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/job_spec/SRG_GSLC_CPU.yml b/job_spec/SRG_GSLC_CPU.yml index bc4667b8b..79589a14b 100644 --- a/job_spec/SRG_GSLC_CPU.yml +++ b/job_spec/SRG_GSLC_CPU.yml @@ -26,7 +26,8 @@ SRG_GSLC_CPU: cost: 1.0 tasks: - name: '' - image: ghcr.io/asfhyp3/hyp3-back-projection:0.5.2.gpu + image: ghcr.io/asfhyp3/hyp3-back-projection + image_tag: 0.5.2.gpu command: - ++process - back_projection From 8fe4babbb97d7f4ab7e40164e187c28a482e1442 Mon Sep 17 00:00:00 2001 From: Jake Herrmann Date: Mon, 24 Jun 2024 11:04:12 -0800 Subject: [PATCH 08/13] fix ami_id --- .github/workflows/deploy-enterprise-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-enterprise-test.yml b/.github/workflows/deploy-enterprise-test.yml index 2b7a4af14..029ccda6c 100644 --- a/.github/workflows/deploy-enterprise-test.yml +++ b/.github/workflows/deploy-enterprise-test.yml @@ -92,7 +92,7 @@ jobs: expanded_max_vcpus: 640 required_surplus: 0 security_environment: ASF - ami_id: /aws/service/ecs/optimized-ami/amazon-linux-2023/recommended/image_id + ami_id: /aws/service/ecs/optimized-ami/amazon-linux-2/gpu/recommended/image_id distribution_url: '' environment: From b0eba6a42533eda163bca7c08314ebedb3517c42 Mon Sep 17 00:00:00 2001 From: Jake Herrmann Date: Mon, 24 Jun 2024 11:05:03 -0800 Subject: [PATCH 09/13] revert instance types --- .github/workflows/deploy-enterprise-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-enterprise-test.yml b/.github/workflows/deploy-enterprise-test.yml index 029ccda6c..4fe6fdc63 100644 --- a/.github/workflows/deploy-enterprise-test.yml +++ b/.github/workflows/deploy-enterprise-test.yml @@ -87,7 +87,7 @@ jobs: job_files: >- job_spec/INSAR_ISCE_BURST.yml job_spec/SRG_GSLC_CPU.yml - instance_types: g6.2xlarge,g6.4xlarge,g4dn.2xlarge,g4dn.4xlarge + instance_types: g6.2xlarge default_max_vcpus: 640 expanded_max_vcpus: 640 required_surplus: 0 From a9019b60a5e06a60ee3bdbc3f2e55d1cca40c293 Mon Sep 17 00:00:00 2001 From: Jake Herrmann Date: Mon, 24 Jun 2024 11:30:45 -0800 Subject: [PATCH 10/13] use g5 --- .github/workflows/deploy-enterprise-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-enterprise-test.yml b/.github/workflows/deploy-enterprise-test.yml index 4fe6fdc63..7b28b2c1d 100644 --- a/.github/workflows/deploy-enterprise-test.yml +++ b/.github/workflows/deploy-enterprise-test.yml @@ -87,7 +87,7 @@ jobs: job_files: >- job_spec/INSAR_ISCE_BURST.yml job_spec/SRG_GSLC_CPU.yml - instance_types: g6.2xlarge + instance_types: g5.2xlarge default_max_vcpus: 640 expanded_max_vcpus: 640 required_surplus: 0 From 221b75bd44d7adf0a7f313d95bc8604222fac37a Mon Sep 17 00:00:00 2001 From: Jake Herrmann Date: Mon, 24 Jun 2024 11:45:16 -0800 Subject: [PATCH 11/13] back to g4dn --- .github/workflows/deploy-enterprise-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-enterprise-test.yml b/.github/workflows/deploy-enterprise-test.yml index 7b28b2c1d..3ec5df8b1 100644 --- a/.github/workflows/deploy-enterprise-test.yml +++ b/.github/workflows/deploy-enterprise-test.yml @@ -87,7 +87,7 @@ jobs: job_files: >- job_spec/INSAR_ISCE_BURST.yml job_spec/SRG_GSLC_CPU.yml - instance_types: g5.2xlarge + instance_types: g4dn.2xlarge default_max_vcpus: 640 expanded_max_vcpus: 640 required_surplus: 0 From c3bc094d1c8cd51fe15c52cf10efe19946b459bc Mon Sep 17 00:00:00 2001 From: Andrew Johnston Date: Mon, 24 Jun 2024 12:36:22 -0800 Subject: [PATCH 12/13] reduce memory reservation for GSLC jobs --- job_spec/SRG_GSLC_CPU.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/job_spec/SRG_GSLC_CPU.yml b/job_spec/SRG_GSLC_CPU.yml index 79589a14b..714fda46d 100644 --- a/job_spec/SRG_GSLC_CPU.yml +++ b/job_spec/SRG_GSLC_CPU.yml @@ -39,7 +39,7 @@ SRG_GSLC_CPU: - Ref::granules timeout: 10800 vcpu: 1 - memory: 31500 + memory: 30500 secrets: - EARTHDATA_USERNAME - EARTHDATA_PASSWORD From 56387ac3fa1a9d8fe57c4bf3968adcf63e34065b Mon Sep 17 00:00:00 2001 From: Jake Herrmann Date: Mon, 24 Jun 2024 13:03:47 -0800 Subject: [PATCH 13/13] update g6 --- .github/workflows/deploy-enterprise-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-enterprise-test.yml b/.github/workflows/deploy-enterprise-test.yml index 3ec5df8b1..4fe6fdc63 100644 --- a/.github/workflows/deploy-enterprise-test.yml +++ b/.github/workflows/deploy-enterprise-test.yml @@ -87,7 +87,7 @@ jobs: job_files: >- job_spec/INSAR_ISCE_BURST.yml job_spec/SRG_GSLC_CPU.yml - instance_types: g4dn.2xlarge + instance_types: g6.2xlarge default_max_vcpus: 640 expanded_max_vcpus: 640 required_surplus: 0