From 2cb74390cccaf20d36355b4996e70e1fdf4bf0a7 Mon Sep 17 00:00:00 2001 From: Maha Benzekri Date: Tue, 8 Oct 2024 19:14:45 +0200 Subject: [PATCH 1/3] fixup on MongoDbRSNotSynced expression MongoDbRSNotSynced was firing when it shouldn't: the expression summed up the members' state values, so when calculating the number of secondaries we ended up with a higher value than the expected one. To get the right value, the members are now counted instead of summed, and an additional grouping based on the instance (pod) has been introduced as well. Issue: ZENKO-4912 --- monitoring/mongodb/alerts.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monitoring/mongodb/alerts.yaml b/monitoring/mongodb/alerts.yaml index be84c062f2..cdbe2b3c8a 100644 --- a/monitoring/mongodb/alerts.yaml +++ b/monitoring/mongodb/alerts.yaml @@ -183,10 +183,10 @@ groups: - alert: MongoDbRSNotSynced expr: | - sum by (rs_nm) (mongodb_rs_members_state{namespace="${namespace}", pod=~"${service}.*", member_state="SECONDARY"}) != (${replicas} - 1) + group by(rs_nm) ( count by(rs_nm, pod) (mongodb_rs_members_state{namespace="${namespace}", pod=~"${service}.*", member_state="SECONDARY"}) != (${replicas} - 1) ) for: 10m labels: severity: warning annotations: - description: "MongoDB replica set `{{ $labels.rs_nm }}` is not in the expected state. It currently has `{{ $value }}` SECONDARY members instead of the expected number. Please ensure that all instance are running properly." + description: "MongoDB replica set `{{ $labels.rs_nm }}` is not in the expected state. It does not have the expected number of SECONDARY members. Please ensure that all instances are running properly." 
summary: MongoDB replica set out of sync From 83ae2447025b6073efc2c7e101f33e6ecda257c6 Mon Sep 17 00:00:00 2001 From: Maha Benzekri Date: Tue, 8 Oct 2024 19:15:00 +0200 Subject: [PATCH 2/3] fixup on MongoDbRSNotSynced test Issue: ZENKO-4912 --- monitoring/mongodb/alerts.test.yaml | 50 +++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/monitoring/mongodb/alerts.test.yaml b/monitoring/mongodb/alerts.test.yaml index cd1991980b..01a1adca50 100644 --- a/monitoring/mongodb/alerts.test.yaml +++ b/monitoring/mongodb/alerts.test.yaml @@ -329,27 +329,51 @@ tests: description: "MongoDB pod `data-db-mongodb-sharded-mongos-0` has been in the 'STARTUP2' state for more than 1 hour. Please ensure that the instance is running properly." summary: MongoDB node in STARTUP2 state for too long - - name: MongoDbRSNotSynced interval: 1m input_series: - - series: mongodb_rs_members_state{namespace="zenko", rs_nm="rs0", pod="data-db-mongodb-sharded-mongos-0", member_state="SECONDARY"} - values: 0x10 - - series: mongodb_rs_members_state{namespace="zenko", rs_nm="rs0", pod="data-db-mongodb-sharded-mongos-1", member_state="SECONDARY"} - values: 0x10 - - series: mongodb_rs_members_state{namespace="zenko", rs_nm="rs0", pod="data-db-mongodb-sharded-mongos-2", member_state="SECONDARY"} - values: 1x10 - + - series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard0-data-0", member_state="PRIMARY", rs_nm="data-db-mongodb-sharded-shard-0", member_idx="shard0-data-0"} + values: 1x20 + - series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard0-data-0", member_state="SECONDARY", rs_nm="data-db-mongodb-sharded-shard-0", member_idx="shard0-data-1"} + values: 2x20 + - series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard0-data-0", member_state="SECONDARY", rs_nm="data-db-mongodb-sharded-shard-0", member_idx="shard0-data-2"} + values: 2x8 stale + - series: 
mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard0-data-0", member_state="(not reachable/healthy)", rs_nm="data-db-mongodb-sharded-shard-0", member_idx="shard0-data-2"} + values: stale _x8 8x10 + - series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard0-data-1", member_state="PRIMARY", rs_nm="data-db-mongodb-sharded-shard-0", member_idx="shard0-data-0"} + values: 1x20 + - series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard0-data-1", member_state="SECONDARY", rs_nm="data-db-mongodb-sharded-shard-0", member_idx="shard0-data-1"} + values: 2x20 + - series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard0-data-1", member_state="SECONDARY", rs_nm="data-db-mongodb-sharded-shard-0", member_idx="shard0-data-2"} + values: 2x8 stale + - series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard0-data-1", member_state="(not reachable/healthy)", rs_nm="data-db-mongodb-sharded-shard-0", member_idx="shard0-data-2"} + values: stale _x8 8x10 + - series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard0-data-2", member_state="PRIMARY", rs_nm="data-db-mongodb-sharded-shard-0", member_idx="shard0-data-0"} + values: 1x8 stale + - series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard0-data-2", member_state="SECONDARY", rs_nm="data-db-mongodb-sharded-shard-0", member_idx="shard0-data-1"} + values: 2x8 stale + - series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard0-data-2", member_state="SECONDARY", rs_nm="data-db-mongodb-sharded-shard-0", member_idx="shard0-data-2"} + values: 2x8 stale + alert_rule_test: + - alertname: MongoDbRSNotSynced + eval_time: 5m + exp_alerts: [] + - alertname: MongoDbRSNotSynced eval_time: 10m + exp_alerts: [] + + - alertname: MongoDbRSNotSynced + eval_time: 18m + exp_alerts: [] + + - alertname: MongoDbRSNotSynced + 
eval_time: 19m exp_alerts: - exp_labels: severity: warning - rs_nm: rs0 + rs_nm: data-db-mongodb-sharded-shard-0 exp_annotations: - description: "MongoDB replica set `rs0` is not in the expected state. It currently has `1` SECONDARY members instead of the expected number. Please ensure that all instance are running properly." + description: "MongoDB replica set `data-db-mongodb-sharded-shard-0` is not in the expected state. It does not have the expected number of SECONDARY members. Please ensure that all instances are running properly." summary: MongoDB replica set out of sync - - - From 80ff4c7c93bda3e3b070eeea4e88c18103bd80f3 Mon Sep 17 00:00:00 2001 From: Maha Benzekri Date: Tue, 8 Oct 2024 19:15:20 +0200 Subject: [PATCH 3/3] Bump zenko to 2.10.2 Issue: ZENKO-4912 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 52c10640d3..fa9056580e 100644 --- a/VERSION +++ b/VERSION @@ -1,4 +1,4 @@ -VERSION="2.10.1" +VERSION="2.10.2" VERSION_SUFFIX=