From 36ef14d6ec2a856a7af0ae8985b20dfb17242dfa Mon Sep 17 00:00:00 2001 From: Dan Scales Date: Tue, 24 Oct 2023 12:04:45 -0700 Subject: [PATCH 1/7] Increase timeout for GADM job of integrated_alerts Increase time to 6 hours from default timeout of 4 hours for GADM job of integrated_alerts. --- src/datapump/sync/sync.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/datapump/sync/sync.py b/src/datapump/sync/sync.py index 1fbe59fe..b4787248 100644 --- a/src/datapump/sync/sync.py +++ b/src/datapump/sync/sync.py @@ -206,7 +206,7 @@ def build_jobs(self, config: DatapumpConfig) -> List[Job]: if not self._should_update(latest_versions): return [] - jobs = [] + jobs: List[Job] = [] if config.dataset == "gadm": jobs.append( @@ -237,22 +237,25 @@ def build_jobs(self, config: DatapumpConfig) -> List[Job]: ), ) ) - jobs.append( - GeotrellisJob( - id=str(uuid1()), - status=JobStatus.starting, - analysis_version=config.analysis_version, - sync_version=self.sync_version, - sync_type=config.sync_type, - table=AnalysisInputTable( - dataset=config.dataset, - version=config.dataset_version, - analysis=config.analysis, - ), - features_1x1=config.metadata["features_1x1"], - geotrellis_version=config.metadata["geotrellis_version"], - ) + + job = GeotrellisJob( + id=str(uuid1()), + status=JobStatus.starting, + analysis_version=config.analysis_version, + sync_version=self.sync_version, + sync_type=config.sync_type, + table=AnalysisInputTable( + dataset=config.dataset, + version=config.dataset_version, + analysis=config.analysis, + ), + features_1x1=config.metadata["features_1x1"], + geotrellis_version=config.metadata["geotrellis_version"], ) + # Increase default timeout for gadm generated datasets. + if config.dataset == "gadm": + job.timeout_sec = 6 * 3600 + jobs.append(job) return jobs From 8faa9318fcfda9f43c58d5f67af43b572c96100e Mon Sep 17 00:00:00 2001 From: Dan Scales Date: Thu, 26 Oct 2023 11:04:13 -0700 Subject: [PATCH 2/7] Revert timeout increase for gadm The timeout was caused by a big DB ingestion job that was preventing all other DB ingestion jobs from running, so gadm timeout increase doesn't really help generally. --- src/datapump/sync/sync.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/datapump/sync/sync.py b/src/datapump/sync/sync.py index b4787248..1fbe59fe 100644 --- a/src/datapump/sync/sync.py +++ b/src/datapump/sync/sync.py @@ -206,7 +206,7 @@ def build_jobs(self, config: DatapumpConfig) -> List[Job]: if not self._should_update(latest_versions): return [] - jobs: List[Job] = [] + jobs = [] if config.dataset == "gadm": jobs.append( @@ -237,25 +237,22 @@ def build_jobs(self, config: DatapumpConfig) -> List[Job]: ), ) ) - - job = GeotrellisJob( - id=str(uuid1()), - status=JobStatus.starting, - analysis_version=config.analysis_version, - sync_version=self.sync_version, - sync_type=config.sync_type, - table=AnalysisInputTable( - dataset=config.dataset, - version=config.dataset_version, - analysis=config.analysis, - ), - features_1x1=config.metadata["features_1x1"], - geotrellis_version=config.metadata["geotrellis_version"], + jobs.append( + GeotrellisJob( + id=str(uuid1()), + status=JobStatus.starting, + analysis_version=config.analysis_version, + sync_version=self.sync_version, + sync_type=config.sync_type, + table=AnalysisInputTable( + dataset=config.dataset, + version=config.dataset_version, + analysis=config.analysis, + ), + features_1x1=config.metadata["features_1x1"], + geotrellis_version=config.metadata["geotrellis_version"], + ) ) - # Increase default timeout for gadm generated datasets. - if config.dataset == "gadm": - job.timeout_sec = 6 * 3600 - jobs.append(job) return jobs From 816e5c7aec74936c6701ac59f8bd851392a938fc Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Mon, 22 Apr 2024 16:57:16 -0400 Subject: [PATCH 3/7] Disable use of public subnets for EMR --- terraform/main.tf | 1 - terraform/modules/datapump/lambdas.tf | 1 - terraform/modules/datapump/variables.tf | 6 ------ 3 files changed, 8 deletions(-) diff --git a/terraform/main.tf b/terraform/main.tf index 91b1f600..9791f06f 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -54,7 +54,6 @@ module "datapump" { glad_path = local.glad_path emr_instance_profile_name = data.terraform_remote_state.core.outputs.emr_instance_profile_name emr_service_role_name = data.terraform_remote_state.core.outputs.emr_service_role_name - public_subnet_ids = data.terraform_remote_state.core.outputs.public_subnet_ids ec2_key_name = data.terraform_remote_state.core.outputs.key_pair_jterry_gfw gcs_secret_arn = data.terraform_remote_state.core.outputs.secrets_read-gfw-gee-export_arn read_gfw_api_secrets_policy = data.terraform_remote_state.core.outputs.secrets_read-gfw-api-token_policy_arn diff --git a/terraform/modules/datapump/lambdas.tf b/terraform/modules/datapump/lambdas.tf index 20592acc..c77f8ada 100644 --- a/terraform/modules/datapump/lambdas.tf +++ b/terraform/modules/datapump/lambdas.tf @@ -45,7 +45,6 @@ resource "aws_lambda_function" "executor" { S3_BUCKET_PIPELINE = var.pipelines_bucket S3_BUCKET_DATA_LAKE = var.data_lake_bucket GEOTRELLIS_JAR_PATH = var.geotrellis_jar_path - PUBLIC_SUBNET_IDS = jsonencode(var.public_subnet_ids) EC2_KEY_NAME = var.ec2_key_name EMR_SERVICE_ROLE = var.emr_service_role_name EMR_INSTANCE_PROFILE = var.emr_instance_profile_name diff --git a/terraform/modules/datapump/variables.tf b/terraform/modules/datapump/variables.tf index ad188ec6..922b4d49 100644 --- a/terraform/modules/datapump/variables.tf +++ b/terraform/modules/datapump/variables.tf @@ -65,12 +65,6 @@ variable "ec2_key_name" { description = "Key pair to use for SSHing into EC2" } -variable "public_subnet_ids" { - default = [] - type = list(string) - description = "Public subnet IDs to run on" -} - variable "pipelines_bucket" { type = string description = "Pipelines bucket to store intermediate results" From 0809c32f1b16a7c5bcf5027e7b1842195ff5faf7 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Tue, 23 Apr 2024 14:48:03 -0400 Subject: [PATCH 4/7] Revert "Disable use of public subnets for EMR" This reverts commit 816e5c7aec74936c6701ac59f8bd851392a938fc. --- terraform/main.tf | 1 + terraform/modules/datapump/lambdas.tf | 1 + terraform/modules/datapump/variables.tf | 6 ++++++ 3 files changed, 8 insertions(+) diff --git a/terraform/main.tf b/terraform/main.tf index 9791f06f..91b1f600 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -54,6 +54,7 @@ module "datapump" { glad_path = local.glad_path emr_instance_profile_name = data.terraform_remote_state.core.outputs.emr_instance_profile_name emr_service_role_name = data.terraform_remote_state.core.outputs.emr_service_role_name + public_subnet_ids = data.terraform_remote_state.core.outputs.public_subnet_ids ec2_key_name = data.terraform_remote_state.core.outputs.key_pair_jterry_gfw gcs_secret_arn = data.terraform_remote_state.core.outputs.secrets_read-gfw-gee-export_arn read_gfw_api_secrets_policy = data.terraform_remote_state.core.outputs.secrets_read-gfw-api-token_policy_arn diff --git a/terraform/modules/datapump/lambdas.tf b/terraform/modules/datapump/lambdas.tf index c77f8ada..20592acc 100644 --- a/terraform/modules/datapump/lambdas.tf +++ b/terraform/modules/datapump/lambdas.tf @@ -45,6 +45,7 @@ resource "aws_lambda_function" "executor" { S3_BUCKET_PIPELINE = var.pipelines_bucket S3_BUCKET_DATA_LAKE = var.data_lake_bucket GEOTRELLIS_JAR_PATH = var.geotrellis_jar_path + PUBLIC_SUBNET_IDS = jsonencode(var.public_subnet_ids) EC2_KEY_NAME = var.ec2_key_name EMR_SERVICE_ROLE = var.emr_service_role_name EMR_INSTANCE_PROFILE = var.emr_instance_profile_name diff --git a/terraform/modules/datapump/variables.tf b/terraform/modules/datapump/variables.tf index 922b4d49..ad188ec6 100644 --- a/terraform/modules/datapump/variables.tf +++ b/terraform/modules/datapump/variables.tf @@ -65,6 +65,12 @@ variable "ec2_key_name" { description = "Key pair to use for SSHing into EC2" } +variable "public_subnet_ids" { + default = [] + type = list(string) + description = "Public subnet IDs to run on" +} + variable "pipelines_bucket" { type = string description = "Pipelines bucket to store intermediate results" From cddf74a976e99256cd1bb7cc400e1e017747d6d8 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Tue, 23 Apr 2024 14:55:40 -0400 Subject: [PATCH 5/7] Use private subnet IDs for EMR jobs --- src/Dockerfile | 2 +- src/datapump/globals.py | 4 ++-- src/datapump/jobs/geotrellis.py | 4 ++-- terraform/main.tf | 2 +- terraform/modules/datapump/variables.tf | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Dockerfile b/src/Dockerfile index b90959fc..e713cc80 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -15,7 +15,7 @@ RUN pip install . -t python # to change the hash of the file and get TF to realize it needs to be # redeployed. Ticket for a better solution: # https://gfw.atlassian.net/browse/GTC-1250 -# change 1 +# change 3 RUN yum install -y zip geos-devel diff --git a/src/datapump/globals.py b/src/datapump/globals.py index 113e18c2..3592b020 100644 --- a/src/datapump/globals.py +++ b/src/datapump/globals.py @@ -30,8 +30,8 @@ class Globals(EnvSettings): s3_bucket_data_lake: str = Field(env="S3_BUCKET_DATA_LAKE") s3_glad_path: Optional[str] = Field(env="S3_GLAD_PATH") ec2_key_name: Optional[str] = Field("", env="EC2_KEY_NAME") - public_subnet_ids: List[str] = Field( - json.loads(os.environ.get("PUBLIC_SUBNET_IDS", b"[]")) + subnet_ids: List[str] = Field( + json.loads(os.environ.get("SUBNET_IDS", b"[]")) ) emr_instance_profile: Optional[str] = Field("", env="EMR_INSTANCE_PROFILE") emr_service_role: Optional[str] = Field("", env="EMR_SERVICE_ROLE") diff --git a/src/datapump/jobs/geotrellis.py b/src/datapump/jobs/geotrellis.py index 587e3c3d..94c922b0 100644 --- a/src/datapump/jobs/geotrellis.py +++ b/src/datapump/jobs/geotrellis.py @@ -909,8 +909,8 @@ def _instances(worker_count: int) -> Dict[str, Any]: if GLOBALS.ec2_key_name: instances["Ec2KeyName"] = GLOBALS.ec2_key_name - if GLOBALS.public_subnet_ids: - instances["Ec2SubnetIds"] = GLOBALS.public_subnet_ids + if GLOBALS.subnet_ids: + instances["Ec2SubnetIds"] = GLOBALS.subnet_ids return instances diff --git a/terraform/main.tf b/terraform/main.tf index 91b1f600..a21150b7 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -54,7 +54,7 @@ module "datapump" { glad_path = local.glad_path emr_instance_profile_name = data.terraform_remote_state.core.outputs.emr_instance_profile_name emr_service_role_name = data.terraform_remote_state.core.outputs.emr_service_role_name - public_subnet_ids = data.terraform_remote_state.core.outputs.public_subnet_ids + subnet_ids = data.terraform_remote_state.core.outputs.private_subnet_ids ec2_key_name = data.terraform_remote_state.core.outputs.key_pair_jterry_gfw gcs_secret_arn = data.terraform_remote_state.core.outputs.secrets_read-gfw-gee-export_arn read_gfw_api_secrets_policy = data.terraform_remote_state.core.outputs.secrets_read-gfw-api-token_policy_arn diff --git a/terraform/modules/datapump/variables.tf b/terraform/modules/datapump/variables.tf index ad188ec6..62c9364e 100644 --- a/terraform/modules/datapump/variables.tf +++ b/terraform/modules/datapump/variables.tf @@ -65,10 +65,10 @@ variable "ec2_key_name" { description = "Key pair to use for SSHing into EC2" } -variable "public_subnet_ids" { +variable "subnet_ids" { default = [] type = list(string) - description = "Public subnet IDs to run on" + description = "Subnet IDs to run on" } variable "pipelines_bucket" { From c4c48c5942b1b42c876b916c2c6e371b54de02f3 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Tue, 23 Apr 2024 15:06:12 -0400 Subject: [PATCH 6/7] Missed a reference to public_subnet_ids --- terraform/modules/datapump/lambdas.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/modules/datapump/lambdas.tf b/terraform/modules/datapump/lambdas.tf index 20592acc..ea880fd2 100644 --- a/terraform/modules/datapump/lambdas.tf +++ b/terraform/modules/datapump/lambdas.tf @@ -45,7 +45,7 @@ resource "aws_lambda_function" "executor" { S3_BUCKET_PIPELINE = var.pipelines_bucket S3_BUCKET_DATA_LAKE = var.data_lake_bucket GEOTRELLIS_JAR_PATH = var.geotrellis_jar_path - PUBLIC_SUBNET_IDS = jsonencode(var.public_subnet_ids) + SUBNET_IDS = jsonencode(var.subnet_ids) EC2_KEY_NAME = var.ec2_key_name EMR_SERVICE_ROLE = var.emr_service_role_name EMR_INSTANCE_PROFILE = var.emr_instance_profile_name From 1a04333919a67b92679fccc92f37426b1e936e0a Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Thu, 25 Apr 2024 19:07:18 +0300 Subject: [PATCH 7/7] stagger VIIRS and GLAD cron schedules --- terraform/modules/datapump/cloudwatch.tf | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/terraform/modules/datapump/cloudwatch.tf b/terraform/modules/datapump/cloudwatch.tf index 18a6e343..46b69e86 100644 --- a/terraform/modules/datapump/cloudwatch.tf +++ b/terraform/modules/datapump/cloudwatch.tf @@ -5,6 +5,13 @@ resource "aws_cloudwatch_event_rule" "everyday-11-pm-est" { tags = local.tags } +resource "aws_cloudwatch_event_rule" "everyday-10-pm-est" { + name = substr("everyday-10-pm-est${local.name_suffix}", 0, 64) + description = "Run everyday at 10 pm EST" + schedule_expression = "cron(0 6 ? * * *)" + tags = local.tags +} + resource "aws_cloudwatch_event_rule" "everyday-7-pm-est" { name = substr("everyday-7-pm-est${local.name_suffix}", 0, 64) description = "Run everyday at 7 pm EST" @@ -38,7 +45,7 @@ resource "aws_cloudwatch_event_target" "sync-deforestation-alerts" { } resource "aws_cloudwatch_event_target" "sync-glad" { - rule = aws_cloudwatch_event_rule.everyday-11-pm-est.name + rule = aws_cloudwatch_event_rule.everyday-10-pm-est.name target_id = substr("${local.project}-sync-glad${local.name_suffix}", 0, 64) arn = aws_sfn_state_machine.datapump.id input = "{\"command\": \"sync\", \"parameters\": {\"types\": [\"glad\"]}}"