From 35b73aa2fed58c945965443dff6452d480f6f223 Mon Sep 17 00:00:00 2001 From: KillianG Date: Mon, 12 Aug 2024 15:52:55 +0200 Subject: [PATCH] Support multiple service in kafka alerts Issue: ZENKO-4857 --- monitoring/kafka/alerts.test.yaml | 13 ++++++++++++- monitoring/kafka/alerts.yaml | 8 ++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/monitoring/kafka/alerts.test.yaml b/monitoring/kafka/alerts.test.yaml index 81155aaaee..799585899d 100644 --- a/monitoring/kafka/alerts.test.yaml +++ b/monitoring/kafka/alerts.test.yaml @@ -35,6 +35,7 @@ tests: description: 'Kafka: Broker count is down' exp_labels: severity: warning + service: ${service} - alertname: BrokersCountCritical eval_time: 3m exp_alerts: [] @@ -49,7 +50,7 @@ tests: description: 'Kafka: Broker count is 0' exp_labels: namespace: zenko - service: artesca-data-base-queue + service: ${service} severity: critical # ActiveControllerCritical @@ -78,6 +79,7 @@ tests: summary: 'Kafka: No active controller' exp_labels: severity: critical + service: ${service} - alertname: ActiveControllerCritical eval_time: 3m exp_alerts: [] @@ -108,6 +110,7 @@ tests: summary: 'Kafka: 1 under-replicated partitons' exp_labels: severity: critical + service: ${service} - alertname: UnderReplicatedPartitions eval_time: 3m exp_alerts: @@ -119,6 +122,7 @@ tests: summary: 'Kafka: 2 under-replicated partitons' exp_labels: severity: critical + service: ${service} # OfflinePartitons ################################################################################################## @@ -147,6 +151,7 @@ tests: summary: 'Kafka: 1 offline partitons' exp_labels: severity: critical + service: ${service} - alertname: OfflinePartitons eval_time: 3m exp_alerts: @@ -159,6 +164,7 @@ tests: summary: 'Kafka: 2 offline partitons' exp_labels: severity: critical + service: ${service} # RemainingDiskSpaceWarning ################################################################################################## @@ -198,6 +204,7 @@ tests: namespace: zenko persistentvolumeclaim: artesca-data-base-queue-1 severity: warning + service: ${service} - alertname: RemainingDiskSpaceWarning eval_time: 5d8h exp_alerts: [] @@ -225,6 +232,7 @@ tests: summary: Zookeeper Sync Disconected exp_labels: severity: warning + service: ${service} # ConsumerLagWarning ################################################################################################## @@ -274,6 +282,7 @@ tests: cluster_name: artesca-data-base-queue group: notification severity: warning + service: ${service} - alertname: ConsumerLagWarning eval_time: 20m exp_alerts: @@ -290,6 +299,7 @@ tests: cluster_name: artesca-data-base-queue group: replication severity: warning + service: ${service} - exp_annotations: description: | Kafka consumer lag has been more more than 300 seconds @@ -303,3 +313,4 @@ tests: cluster_name: artesca-data-base-queue group: notification severity: warning + service: ${service} diff --git a/monitoring/kafka/alerts.yaml b/monitoring/kafka/alerts.yaml index d5fc2aa725..0de57b0f85 100644 --- a/monitoring/kafka/alerts.yaml +++ b/monitoring/kafka/alerts.yaml @@ -34,6 +34,7 @@ groups: for: 1m labels: severity: warning + service: ${service} annotations: summary: 'Not all expected brokers are online.' description: 'Kafka: Broker count is down' @@ -44,6 +45,7 @@ groups: for: 1m labels: severity: critical + service: ${service} annotations: summary: 'No Brokers online' description: 'Kafka: Broker count is 0' @@ -53,6 +55,7 @@ groups: for: 1m labels: severity: critical + service: ${service} annotations: description: >- No broker in the cluster is reporting as the active controller in the last 1 minute interval. During steady state there should @@ -64,6 +67,7 @@ groups: for: 1m labels: severity: critical + service: ${service} annotations: description: >- Under-replicated partitions means that one or more replicas are not available. This is usually because a broker is down. Restart @@ -75,6 +79,7 @@ groups: for: 1m labels: severity: critical + service: ${service} annotations: description: >- After successful leader election, if the leader for partition dies, then the partition moves to the OfflinePartition state. @@ -91,6 +96,7 @@ groups: for: 2m labels: severity: warning + service: ${service} annotations: description: 'Kafka Broker has low disk space' summary: 'Kafka Broker has low disk space' @@ -101,6 +107,7 @@ groups: for: 1m labels: severity: warning + service: ${service} annotations: summary: 'Zookeeper Sync Disconected' description: 'Kafka Zookeeper Sync Disconected' @@ -116,6 +123,7 @@ groups: for: 5m labels: severity: warning + service: ${service} annotations: summary: 'Kafka: consumer lag is too high for {{ $labels.group }}' description: |