From eca4729cd60f1cec14960a23c1a7d0ca778e6d29 Mon Sep 17 00:00:00 2001 From: Sylvie Date: Wed, 23 Oct 2024 15:22:16 -0500 Subject: [PATCH] alert when instance count is low (#1472) * alert when instance count is low * Fix non-pr env logic --------- Co-authored-by: Samuel Aquino Co-authored-by: halprin Co-authored-by: James Gilmore <109554461+GilmoreA6@users.noreply.github.com> Co-authored-by: jherrflexion <118225331+jherrflexion@users.noreply.github.com> --- operations/template/alert.tf | 41 ++++++++++++++++++++++++++++++++++++ operations/template/main.tf | 4 +++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 562629847..1d55c392d 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -161,6 +161,47 @@ resource "azurerm_monitor_metric_alert" "azure_4XX_alert" { ] } } +resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { + count = local.non_pr_environment ? 1 : 0 // only created when local.non_pr_environment is true + name = "cdcti-${var.environment}-azure-low-instance-count-alert" + resource_group_name = data.azurerm_resource_group.group.name + scopes = [azurerm_monitor_autoscale_setting.api_autoscale.id] + description = "The instance count in ${var.environment} is too low" + severity = 2 // warning + frequency = "PT1M" // Checks every 1 minute + window_size = "PT15M" // Each check looks back over the previous 15 minutes + + criteria { + metric_namespace = "Microsoft.Insights/autoscalesettings" + metric_name = "ObservedCapacity" + aggregation = "Average" + operator = "LessThanOrEqual" + threshold = azurerm_monitor_autoscale_setting.api_autoscale.profile[0].capacity[0].default - 0.5 // fires once the windowed average is at least half an instance below the profile's default capacity + } + + action { + action_group_id = azurerm_monitor_action_group.notify_slack_email[count.index].id + } + + lifecycle { + # Ignore changes to tags because the CDC sets these automagically + ignore_changes = [ + tags["business_steward"], + tags["center"], + tags["environment"], + tags["escid"], + tags["funding_source"], 
tags["pii_data"], + tags["security_compliance"], + tags["security_steward"], + tags["support_group"], + tags["system"], + tags["technical_steward"], + tags["zone"] + ] + } +} + resource "azurerm_monitor_scheduled_query_rules_alert" "ti-log-errors-alert" { count = local.non_pr_environment ? 1 : 0 name = "cdcti-${var.environment}-log-errors-alert" diff --git a/operations/template/main.tf b/operations/template/main.tf index ff42970ce..bc083cff3 100644 --- a/operations/template/main.tf +++ b/operations/template/main.tf @@ -8,7 +8,9 @@ locals { rs_domain_prefix = "${local.selected_rs_environment_prefix}${length(local.selected_rs_environment_prefix) == 0 ? "" : "."}" higher_environment_level = var.environment == "stg" || var.environment == "prd" cdc_domain_environment = var.environment == "dev" || var.environment == "stg" || var.environment == "prd" - non_pr_environment = !strcontains(var.environment, "pr") + + // If the environment looks like pr123 ("pr" followed by a digit), regexall returns at least one match; an empty result means a non-pr env, so "prd" counts as non-pr + non_pr_environment = length(regexall("^pr\\d+", var.environment)) == 0 } data "azurerm_resource_group" "group" {