From a629617dfc67d828a718c5549b6eea394c9a7da1 Mon Sep 17 00:00:00 2001 From: James Herr Date: Fri, 18 Oct 2024 09:33:17 -0500 Subject: [PATCH 01/16] WIP Azure Outage Alert Co-Authored-By: Samuel Aquino --- operations/template/alert.tf | 43 ++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 642b75089..be90032d3 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -28,6 +28,49 @@ resource "azurerm_monitor_action_group" "notify_slack_email" { } } +resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { + count = local.non_pr_environment ? 1 : 0 + name = "cdcti-${var.environment}-azure-status-alert" + location = data.azurerm_resource_group.group.location + resource_group_name = data.azurerm_resource_group.group.name + scopes = [azurerm_container_registry.registry.id] + + criteria { + category = "ServiceHealth" + levels = ["Error"] + service_health { + locations = ["East US", "Global"] + events = ["Incident"] + services = ["*"] + } + } + + action { + action_group_id = [azurerm_monitor_action_group.notify_slack_email[count.index].id] + email_subject = "FATAL: Azure Outage Alert!" + } + + description = "Alert service(s) appear to be down" + enabled = true + + lifecycle { + ignore_changes = [ + tags["business_steward"], + tags["center"], + tags["environment"], + tags["escid"], + tags["funding_source"], + tags["pii_data"], + tags["security_compliance"], + tags["security_steward"], + tags["support_group"], + tags["system"], + tags["technical_steward"], + tags["zone"] + ] + } +} + resource "azurerm_monitor_scheduled_query_rules_alert" "database_token_expired_alert" { count = local.non_pr_environment ? 1 : 0 name = "cdcti-${var.environment}-api-log-token-alert" From 07fbaf3dee15f577942d6de7eac4f05edb5e4059 Mon Sep 17 00:00:00 2001 From: James Herr Date: Fri, 18 Oct 2024 11:19:37 -0500 Subject: [PATCH 02/16] Attempt action_group_id fix Co-Authored-By: Samuel Aquino --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index be90032d3..cdae8fd21 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -46,7 +46,7 @@ resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { } action { - action_group_id = [azurerm_monitor_action_group.notify_slack_email[count.index].id] + action_group_id = azurerm_monitor_action_group.notify_slack_email[count.index].id email_subject = "FATAL: Azure Outage Alert!" } From e4ec0a45db18bf0fc6d59bd2e447852afd4a72b2 Mon Sep 17 00:00:00 2001 From: James Herr Date: Fri, 18 Oct 2024 11:22:27 -0500 Subject: [PATCH 03/16] Removed unnecessary email_subject --- operations/template/alert.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index cdae8fd21..4a4bdffde 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -47,7 +47,6 @@ resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { action { action_group_id = azurerm_monitor_action_group.notify_slack_email[count.index].id - email_subject = "FATAL: Azure Outage Alert!" } description = "Alert service(s) appear to be down" From 71c105662161953d1a6755a4c01520f9de23830c Mon Sep 17 00:00:00 2001 From: James Herr Date: Fri, 18 Oct 2024 11:41:02 -0500 Subject: [PATCH 04/16] Refactoring location --- operations/template/alert.tf | 2 +- operations/template/main.tf | 2 +- operations/template/variables.tf | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 4a4bdffde..59e98ded6 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -39,7 +39,7 @@ resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { category = "ServiceHealth" levels = ["Error"] service_health { - locations = ["East US", "Global"] + locations = var.service_health_locations events = ["Incident"] services = ["*"] } diff --git a/operations/template/main.tf b/operations/template/main.tf index ff42970ce..579a23c60 100644 --- a/operations/template/main.tf +++ b/operations/template/main.tf @@ -8,7 +8,7 @@ locals { rs_domain_prefix = "${local.selected_rs_environment_prefix}${length(local.selected_rs_environment_prefix) == 0 ? "" : "."}" higher_environment_level = var.environment == "stg" || var.environment == "prd" cdc_domain_environment = var.environment == "dev" || var.environment == "stg" || var.environment == "prd" - non_pr_environment = !strcontains(var.environment, "pr") + non_pr_environment = !strcontains(var.environment, "pr", "dev") # dev is temp while testing } data "azurerm_resource_group" "group" { diff --git a/operations/template/variables.tf b/operations/template/variables.tf index 0007ad678..bd74082b2 100644 --- a/operations/template/variables.tf +++ b/operations/template/variables.tf @@ -19,3 +19,8 @@ variable "alert_slack_email" { nullable = false sensitive = true } + +variable "service_health_locations" { + type = list(string) + default = ["East US"] +} From 1741c974e6ce30e8bd3598e6ce92b6540182430d Mon Sep 17 00:00:00 2001 From: James Herr Date: Fri, 18 Oct 2024 11:51:54 -0500 Subject: [PATCH 05/16] Remove temp change --- operations/template/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/main.tf b/operations/template/main.tf index 579a23c60..ff42970ce 100644 --- a/operations/template/main.tf +++ b/operations/template/main.tf @@ -8,7 +8,7 @@ locals { rs_domain_prefix = "${local.selected_rs_environment_prefix}${length(local.selected_rs_environment_prefix) == 0 ? "" : "."}" higher_environment_level = var.environment == "stg" || var.environment == "prd" cdc_domain_environment = var.environment == "dev" || var.environment == "stg" || var.environment == "prd" - non_pr_environment = !strcontains(var.environment, "pr", "dev") # dev is temp while testing + non_pr_environment = !strcontains(var.environment, "pr") } data "azurerm_resource_group" "group" { From f60e5fb5785601a7ff91b25fa0f6f36eaef9a893 Mon Sep 17 00:00:00 2001 From: James Herr Date: Mon, 21 Oct 2024 11:20:26 -0500 Subject: [PATCH 06/16] Attempting SKU change --- operations/template/app.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/app.tf b/operations/template/app.tf index bb7b99f81..4b7ea0ac1 100644 --- a/operations/template/app.tf +++ b/operations/template/app.tf @@ -81,7 +81,7 @@ resource "azurerm_service_plan" "plan" { resource_group_name = data.azurerm_resource_group.group.name location = data.azurerm_resource_group.group.location os_type = "Linux" - sku_name = local.higher_environment_level ? "P1v3" : "P0v3" + sku_name = local.higher_environment_level ? "P1v3" : "P1v3" zone_balancing_enabled = local.higher_environment_level # below tags are managed by CDC From 6df6a9320c119e97b2fc0843f0acac6cab497583 Mon Sep 17 00:00:00 2001 From: James Herr Date: Mon, 21 Oct 2024 11:24:09 -0500 Subject: [PATCH 07/16] Reverting SKU test --- operations/template/app.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/app.tf b/operations/template/app.tf index 4b7ea0ac1..bb7b99f81 100644 --- a/operations/template/app.tf +++ b/operations/template/app.tf @@ -81,7 +81,7 @@ resource "azurerm_service_plan" "plan" { resource_group_name = data.azurerm_resource_group.group.name location = data.azurerm_resource_group.group.location os_type = "Linux" - sku_name = local.higher_environment_level ? "P1v3" : "P1v3" + sku_name = local.higher_environment_level ? "P1v3" : "P0v3" zone_balancing_enabled = local.higher_environment_level # below tags are managed by CDC From a1a85f8cea58d7e58875c708e91befa3a1ed8dde Mon Sep 17 00:00:00 2001 From: James Herr Date: Mon, 21 Oct 2024 11:42:13 -0500 Subject: [PATCH 08/16] Removed services and fixed location string --- operations/template/alert.tf | 1 - operations/template/variables.tf | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 59e98ded6..dab40d19d 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -41,7 +41,6 @@ resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { service_health { locations = var.service_health_locations events = ["Incident"] - services = ["*"] } } diff --git a/operations/template/variables.tf b/operations/template/variables.tf index bd74082b2..33946813a 100644 --- a/operations/template/variables.tf +++ b/operations/template/variables.tf @@ -22,5 +22,5 @@ variable "alert_slack_email" { variable "service_health_locations" { type = list(string) - default = ["East US"] + default = ["EastUS"] } From 1c072b5cc373c981e2f64e76770c285e9f046f07 Mon Sep 17 00:00:00 2001 From: saquino0827 Date: Mon, 21 Oct 2024 11:49:41 -0500 Subject: [PATCH 09/16] set default to global for service health locations --- operations/template/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/variables.tf b/operations/template/variables.tf index 33946813a..1e01b3646 100644 --- a/operations/template/variables.tf +++ b/operations/template/variables.tf @@ -22,5 +22,5 @@ variable "alert_slack_email" { variable "service_health_locations" { type = list(string) - default = ["EastUS"] + default = ["global"] } From ee59f3b4c0a0df38545c630a9d1501d73d3f67fd Mon Sep 17 00:00:00 2001 From: saquino0827 Date: Mon, 21 Oct 2024 12:02:16 -0500 Subject: [PATCH 10/16] Testing different locations for azure outage alert --- operations/template/alert.tf | 4 ++-- operations/template/variables.tf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index dab40d19d..a0854617c 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -31,7 +31,7 @@ resource "azurerm_monitor_action_group" "notify_slack_email" { resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { count = local.non_pr_environment ? 1 : 0 name = "cdcti-${var.environment}-azure-status-alert" - location = data.azurerm_resource_group.group.location + location = var.service_health_locations resource_group_name = data.azurerm_resource_group.group.name scopes = [azurerm_container_registry.registry.id] @@ -39,7 +39,7 @@ resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { category = "ServiceHealth" levels = ["Error"] service_health { - locations = var.service_health_locations + locations = [var.service_health_locations] events = ["Incident"] } } diff --git a/operations/template/variables.tf b/operations/template/variables.tf index 1e01b3646..97d3ab9cc 100644 --- a/operations/template/variables.tf +++ b/operations/template/variables.tf @@ -21,6 +21,6 @@ variable "alert_slack_email" { } variable "service_health_locations" { - type = list(string) - default = ["global"] + type = string + default = "global" } From c8106f565227e5bce9a8a1ef8fcac72ef813a88e Mon Sep 17 00:00:00 2001 From: saquino0827 Date: Mon, 21 Oct 2024 12:22:40 -0500 Subject: [PATCH 11/16] Make the resource group to the scope of this alert --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index a0854617c..525ff7040 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -33,7 +33,7 @@ resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { name = "cdcti-${var.environment}-azure-status-alert" location = var.service_health_locations resource_group_name = data.azurerm_resource_group.group.name - scopes = [azurerm_container_registry.registry.id] + scopes = [data.azurerm_resource_group.group.id] criteria { category = "ServiceHealth" From fb7c529c536721e870037e247a0f4daa62fa6e29 Mon Sep 17 00:00:00 2001 From: saquino0827 Date: Mon, 21 Oct 2024 12:29:40 -0500 Subject: [PATCH 12/16] Change scope to our azure subscription Co-Authored-By: halprin --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 525ff7040..bb134bbae 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -33,7 +33,7 @@ resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { name = "cdcti-${var.environment}-azure-status-alert" location = var.service_health_locations resource_group_name = data.azurerm_resource_group.group.name - scopes = [data.azurerm_resource_group.group.id] + scopes = [data.azurerm_client_config.current.subscription_id] criteria { category = "ServiceHealth" From 925418c525bd11f9ad22c354056cd5b632789bec Mon Sep 17 00:00:00 2001 From: saquino0827 Date: Mon, 21 Oct 2024 12:43:33 -0500 Subject: [PATCH 13/16] Update scope to the correct format of subscription id Co-authored-by: halprin --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index bb134bbae..2ca6a15de 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -33,7 +33,7 @@ resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { name = "cdcti-${var.environment}-azure-status-alert" location = var.service_health_locations resource_group_name = data.azurerm_resource_group.group.name - scopes = [data.azurerm_client_config.current.subscription_id] + scopes = ["/subscriptions/${data.azurerm_client_config.current.subscription_id}"] criteria { category = "ServiceHealth" From 3844a508556ca7d141251aee4f681681b0f34d4c Mon Sep 17 00:00:00 2001 From: Bella Luz Quintero Date: Mon, 21 Oct 2024 15:44:00 -0600 Subject: [PATCH 14/16] remove string variable --- operations/template/alert.tf | 4 ++-- operations/template/variables.tf | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 2ca6a15de..2d449d9f9 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -31,7 +31,7 @@ resource "azurerm_monitor_action_group" "notify_slack_email" { resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { count = local.non_pr_environment ? 1 : 0 name = "cdcti-${var.environment}-azure-status-alert" - location = var.service_health_locations + location = "global" resource_group_name = data.azurerm_resource_group.group.name scopes = ["/subscriptions/${data.azurerm_client_config.current.subscription_id}"] @@ -39,7 +39,7 @@ resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { category = "ServiceHealth" levels = ["Error"] service_health { - locations = [var.service_health_locations] + locations = "global" events = ["Incident"] } } diff --git a/operations/template/variables.tf b/operations/template/variables.tf index 97d3ab9cc..0007ad678 100644 --- a/operations/template/variables.tf +++ b/operations/template/variables.tf @@ -19,8 +19,3 @@ variable "alert_slack_email" { nullable = false sensitive = true } - -variable "service_health_locations" { - type = string - default = "global" -} From 40b42a379fb8b920117843f657c67062781bdd5f Mon Sep 17 00:00:00 2001 From: Bella Luz Quintero Date: Mon, 21 Oct 2024 15:46:32 -0600 Subject: [PATCH 15/16] change string to array --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 2d449d9f9..0d43726c6 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -39,7 +39,7 @@ resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { category = "ServiceHealth" levels = ["Error"] service_health { - locations = "global" + locations = ["global"] events = ["Incident"] } } From 3b1b74b1a2cb2f8cc8a5e154c115c8a9e1e1e3d3 Mon Sep 17 00:00:00 2001 From: James Herr Date: Tue, 22 Oct 2024 09:27:04 -0500 Subject: [PATCH 16/16] Updated location --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 0d43726c6..109e27d02 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -31,7 +31,7 @@ resource "azurerm_monitor_action_group" "notify_slack_email" { resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { count = local.non_pr_environment ? 1 : 0 name = "cdcti-${var.environment}-azure-status-alert" - location = "global" + location = data.azurerm_resource_group.group.location resource_group_name = data.azurerm_resource_group.group.name scopes = ["/subscriptions/${data.azurerm_client_config.current.subscription_id}"]