From 92fb25ea5b577ae212f8bf8f5f1a1cb18ec16ff0 Mon Sep 17 00:00:00 2001 From: Francois Ferrand Date: Fri, 15 Dec 2023 16:23:46 +0100 Subject: [PATCH 1/6] Fix ReplicationLagWarning alert * Fix unit, metrics is in millisecond * In sharded deployment, the same `job` is used for members of a replicaset, so this rule must be filtered by pod or instance. Issue: ZENKO-4715 --- monitoring/mongodb/alerts.test.yaml | 46 +++++++++++++++++++++-------- monitoring/mongodb/alerts.yaml | 18 ++++++----- 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/monitoring/mongodb/alerts.test.yaml b/monitoring/mongodb/alerts.test.yaml index 30712c7295..4e51d51e43 100644 --- a/monitoring/mongodb/alerts.test.yaml +++ b/monitoring/mongodb/alerts.test.yaml @@ -168,30 +168,52 @@ tests: - name: ReplicationLagWarning interval: 1m input_series: - - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0", job="zenko/data-db-mongodb-sharded-shard0-data", member_idx="mongo-0", member_state="PRIMARY"} - values: 5 25 35 - - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-1", job="zenko/data-db-mongodb-sharded-shard0-data", member_idx="mongo-1", member_state="SECONDARY"} - values: 0 12 29 - - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-2", job="zenko/data-db-mongodb-sharded-shard0-data", member_idx="mongo-2", member_state="SECONDARY"} - values: 2 2 31 - - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-configsvr-1", job="zenko/data-db-mongodb-sharded-configsvr", member_idx="mongo-1", member_state="PRIMARY"} - values: 71 83 95 + - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-0.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="PRIMARY"} + values: 5 35000+1000x10 45000 + - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-1.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="SECONDARY"} + values: 0 24000+1000x10 39000 + - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-2.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="SECONDARY"} + values: 2 2000+1000x10 41000 + + - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-1", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-0.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="PRIMARY"} + values: 5 34000+1000x10 44000 + - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-1", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-1.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="SECONDARY"} + values: 0 26000+1000x10 40000 + - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-1", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-2.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="SECONDARY"} + values: 2 1000+1000x10 40000 + + - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-2", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-0.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="PRIMARY"} + values: 5 12000+1000x10 43000 + - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-2", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-1.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="SECONDARY"} + values: 0 9000+1000x10 38000 + - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-2", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-2.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="SECONDARY"} + values: 2 3000+1000x10 42000 + + - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-configsvr-1", rs_nm="configsvr", member_idx="data-db-mongodb-sharded-cfgsvr-0.data-db-mongodb-sharded-cfgsvr-headless.svc.cluster.local", member_state="PRIMARY"} + values: 71 8300 9500 alert_rule_test: - alertname: ReplicationLagWarning eval_time: 1m exp_alerts: [] - alertname: ReplicationLagWarning - eval_time: 1m30s + eval_time: 2m + exp_alerts: [] + - alertname: ReplicationLagWarning + eval_time: 10m + exp_alerts: [] + - alertname: ReplicationLagWarning + eval_time: 11m exp_alerts: - exp_labels: severity: warning - job: shard0-data + member_idx: data-db-mongodb-sharded-shard0-data-2 + rs_nm: shard0-data exp_annotations: - description: Mongodb replication lag for `shard0-data` is more than 10s. + description: Mongodb replication lag for `data-db-mongodb-sharded-shard0-data-2` is more than 30 seconds. summary: MongoDB replication lag - alertname: ReplicationLagWarning - eval_time: 2m + eval_time: 12m exp_alerts: [] - name: TooManyClientConnectionsWarning diff --git a/monitoring/mongodb/alerts.yaml b/monitoring/mongodb/alerts.yaml index 64ad609f68..761702cacb 100644 --- a/monitoring/mongodb/alerts.yaml +++ b/monitoring/mongodb/alerts.yaml @@ -97,17 +97,19 @@ groups: - alert: ReplicationLagWarning expr: | label_replace( - max(mongodb_rs_members_optimeDate{namespace="${namespace}",pod=~"${service}.*",member_state="PRIMARY"}) - by(job) - -min(mongodb_rs_members_optimeDate{namespace="${namespace}",pod=~"${service}.*",member_state="SECONDARY"}) - by(job) - > 10 - , "job", "$1", "job", "(?:${namespace}/)?${service}-?(.*)") - for: 30s + max( + max(mongodb_rs_members_optimeDate{namespace="${namespace}",pod=~"${service}.*",member_state="PRIMARY"}) + by(pod, rs_nm) + - ignoring(member_idx) group_right + min(mongodb_rs_members_optimeDate{namespace="${namespace}",pod=~"${service}.*",member_state="SECONDARY"}) + by(pod, rs_nm, member_idx) + ) by(member_idx, rs_nm) / 1000 > 30 + , "member_idx", "$1", "member_idx", "(${service}[^.]*)\\.${service}.*") + for: 10m labels: severity: warning annotations: - description: "Mongodb replication lag for `{{ $labels.job }}` is more than 10s." + description: "Mongodb replication lag for `{{ $labels.member_idx }}` is more than 30 seconds." summary: MongoDB replication lag - alert: TooManyClientConnectionsWarning From 4dd2aee2572530e8703507245c9d2122e9b55cd0 Mon Sep 17 00:00:00 2001 From: Francois Ferrand Date: Mon, 18 Dec 2023 14:51:46 +0100 Subject: [PATCH 2/6] Filter NaN/Inf from kafka ConsumerLagWarning Issue: ZENKO-4715 --- monitoring/kafka/alerts.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/monitoring/kafka/alerts.yaml b/monitoring/kafka/alerts.yaml index 461526a193..d5fc2aa725 100644 --- a/monitoring/kafka/alerts.yaml +++ b/monitoring/kafka/alerts.yaml @@ -111,6 +111,7 @@ groups: > ${maxConsumerLagMessagesWarningThreshold} or kafka_consumergroup_group_max_lag_seconds{namespace="${namespace}",cluster_name="${cluster}",group!=""} + < (1/0) > ${maxConsumerLagSecondsWarningThreshold} for: 5m labels: From 476941f0b87161929bfcad2af3353ffc1cd3daa7 Mon Sep 17 00:00:00 2001 From: Francois Ferrand Date: Mon, 18 Dec 2023 23:20:12 +0100 Subject: [PATCH 3/6] Bump zenko-operator 1.5.39 Issue: ZENKO-4715 --- solution/deps.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solution/deps.yaml b/solution/deps.yaml index b2a21dc01d..df7e4ef2a9 100644 --- a/solution/deps.yaml +++ b/solution/deps.yaml @@ -98,7 +98,7 @@ vault: zenko-operator: sourceRegistry: registry.scality.com/zenko-operator image: zenko-operator - tag: 1.5.38 + tag: 1.5.39 envsubst: ZENKO_OPERATOR_TAG zenko-ui: sourceRegistry: registry.scality.com/zenko-ui From ccb2aea6730c9a4abbc60a3acc224a5cfd587a44 Mon Sep 17 00:00:00 2001 From: Francois Ferrand Date: Mon, 18 Dec 2023 23:20:35 +0100 Subject: [PATCH 4/6] Bump backbeat 8.6.32 Issue: ZENKO-4715 --- solution/deps.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solution/deps.yaml b/solution/deps.yaml index df7e4ef2a9..9a50a358a8 100644 --- a/solution/deps.yaml +++ b/solution/deps.yaml @@ -6,7 +6,7 @@ backbeat: dashboard: backbeat-dashboards image: backbeat policy: backbeat-policies - tag: 8.6.31 + tag: 8.6.32 envsubst: BACKBEAT_TAG busybox: image: busybox From 954c26282999c62b7828e81c9068d817a5e79f2a Mon Sep 17 00:00:00 2001 From: Francois Ferrand Date: Mon, 18 Dec 2023 23:24:05 +0100 Subject: [PATCH 5/6] Release Zenko 2.6.40 Issue: ZENKO-4715 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 1c0f159c11..231366f6e3 100644 --- a/VERSION +++ b/VERSION @@ -1,4 +1,4 @@ -VERSION="2.6.39" +VERSION="2.6.40" VERSION_SUFFIX= From 08bead7d70f2ed422cad140a2c0f4c7a69bdd454 Mon Sep 17 00:00:00 2001 From: Francois Ferrand Date: Mon, 18 Dec 2023 23:26:46 +0100 Subject: [PATCH 6/6] Release Zenko 2.7.36 Issue: ZENKO-4715 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index de7e4d8168..7fc5e13d75 100644 --- a/VERSION +++ b/VERSION @@ -1,4 +1,4 @@ -VERSION="2.7.35" +VERSION="2.7.36" VERSION_SUFFIX=