From 4bbb1aceae203a7bf47525a3db56a41672ca8875 Mon Sep 17 00:00:00 2001 From: Denis Shipkov Date: Thu, 21 Nov 2024 16:37:01 +0300 Subject: [PATCH] rules changed Signed-off-by: Denis Shipkov --- .../controller/linstor_resources_watcher.go | 2 +- ...replicated-pv-with-incorrect-settings.yaml | 86 +++++++++---------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/images/sds-replicated-volume-controller/src/pkg/controller/linstor_resources_watcher.go b/images/sds-replicated-volume-controller/src/pkg/controller/linstor_resources_watcher.go index 0048b1ac..18b85400 100644 --- a/images/sds-replicated-volume-controller/src/pkg/controller/linstor_resources_watcher.go +++ b/images/sds-replicated-volume-controller/src/pkg/controller/linstor_resources_watcher.go @@ -294,7 +294,7 @@ func checkPVMinReplicasCount(ctx context.Context, log logger.Logger, lc *lapi.Cl return "" } else if upVols <= 1 { return "fatal" - } else if (upVols*100)/placeCount < 66 { + } else if (upVols*100)/placeCount <= 50 { return "error" } else { return "warning" diff --git a/monitoring/prometheus-rules/replicated-pv-with-incorrect-settings.yaml b/monitoring/prometheus-rules/replicated-pv-with-incorrect-settings.yaml index 250dc4f6..740bae18 100644 --- a/monitoring/prometheus-rules/replicated-pv-with-incorrect-settings.yaml +++ b/monitoring/prometheus-rules/replicated-pv-with-incorrect-settings.yaml @@ -35,14 +35,14 @@ plk_grouped_by__d8_drbd_device_health: "ReplicatedPVSettingsCheck,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes" summary: Replicated PVs has incorrect quorum-minimum-redundancy setting description: | - There are persistent volumes in the cluster that has incorrect quorum-minimum-redundancy setting. + Persistent volumes in the cluster have an incorrect quorum-minimum-redundancy setting. Please, contact tech support for assistance. 
- - alert: ReplicatedPVWithIncorrectReplicasCountWarning15m - expr: count(kube_persistentvolume_labels{label_storage_deckhouse_io_pv_not_enough_replicas="warning", label_storage_deckhouse_io_pv_not_enough_replicas="error", label_storage_deckhouse_io_pv_not_enough_replicas="fatal"}) > 0 + - alert: ReplicatedPVIncorrectReplicasCountFatalS3 + expr: count(kube_persistentvolume_labels{label_storage_deckhouse_io_pv_not_enough_replicas="fatal"}) > 0 for: 15m labels: - severity_level: "5" + severity_level: "3" tier: cluster annotations: plk_markup_format: "markdown" @@ -51,15 +51,15 @@ plk_grouped_by__d8_drbd_device_health: "ReplicatedPVSettingsCheck,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes" summary: Replicated PVs has not enough replicas description: | - There are persistent volumes in the cluster that has not enough replicas (set of UpToDate resources less than minimal count) - - You can get minimal limit for StorageClass with command `kubectl get sc -o yaml | grep -E "(\sname|placementCount)"` - And view all Resource States with `linstor r l` - - alert: ReplicatedPVWithIncorrectReplicasCountError15m + Persistent volumes in the cluster have fewer than 2 replicas (set of UpToDate resources) +# +# You can get minimal limit for StorageClass with command `kubectl get sc -o yaml | grep -E "(\sname|placementCount)"` +# And view all Resource States with `linstor r l` + - alert: ReplicatedPVIncorrectReplicasCountErrorS3 expr: count(kube_persistentvolume_labels{label_storage_deckhouse_io_pv_not_enough_replicas="error", label_storage_deckhouse_io_pv_not_enough_replicas="fatal"}) > 0 - for: 15m + for: 30m labels: - severity_level: "4" + severity_level: "3" tier: cluster annotations: plk_markup_format: "markdown" @@ -68,15 +68,15 @@ plk_grouped_by__d8_drbd_device_health: "ReplicatedPVSettingsCheck,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes" summary: Replicated PVs has not enough replicas description: | - There are persistent volumes in the cluster that has 
not enough replicas (set of UpToDate resources less than minimal count) - - You can get minimal limit for StorageClass with command `kubectl get sc -o yaml | grep -E "(\sname|placementCount)"` - And view all Resource States with `linstor r l` - - alert: ReplicatedPVWithIncorrectReplicasCountError30m + Persistent volumes in the cluster do not have enough replicas for quorum for 30min (set of UpToDate resources) +# +# You can get minimal limit for StorageClass with command `kubectl get sc -o yaml | grep -E "(\sname|placementCount)"` +# And view all Resource States with `linstor r l` + - alert: ReplicatedPVIncorrectReplicasCountWarningS3 expr: count(kube_persistentvolume_labels{label_storage_deckhouse_io_pv_not_enough_replicas="warning", label_storage_deckhouse_io_pv_not_enough_replicas="error", label_storage_deckhouse_io_pv_not_enough_replicas="fatal"}) > 0 - for: 30m + for: 24h labels: - severity_level: "4" + severity_level: "3" tier: cluster annotations: plk_markup_format: "markdown" @@ -85,15 +85,15 @@ plk_grouped_by__d8_drbd_device_health: "ReplicatedPVSettingsCheck,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes" summary: Replicated PVs has not enough replicas description: | - There are persistent volumes in the cluster that has not enough replicas (set of UpToDate resources less than minimal count) - - You can get minimal limit for StorageClass with command `kubectl get sc -o yaml | grep -E "(\sname|placementCount)"` - And view all Resource States with `linstor r l` - - alert: ReplicatedPVWithIncorrectReplicasCountFatal15m - expr: count(kube_persistentvolume_labels{label_storage_deckhouse_io_pv_not_enough_replicas="fatal"}) > 0 + Persistent volumes in the cluster do not have enough replicas for a long time (set of UpToDate resources less than minimal count) +# +# You can get minimal limit for StorageClass with command `kubectl get sc -o yaml | grep -E "(\sname|placementCount)"` +# And view all Resource States with `linstor r l` + - alert: 
ReplicatedPVIncorrectReplicasCountErrorS4 + expr: count(kube_persistentvolume_labels{label_storage_deckhouse_io_pv_not_enough_replicas="error", label_storage_deckhouse_io_pv_not_enough_replicas="fatal"}) > 0 for: 15m labels: - severity_level: "3" + severity_level: "4" tier: cluster annotations: plk_markup_format: "markdown" @@ -102,15 +102,15 @@ plk_grouped_by__d8_drbd_device_health: "ReplicatedPVSettingsCheck,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes" summary: Replicated PVs has not enough replicas description: | - There are persistent volumes in the cluster that has not enough replicas (set of UpToDate resources less than minimal count) - - You can get minimal limit for StorageClass with command `kubectl get sc -o yaml | grep -E "(\sname|placementCount)"` - And view all Resource States with `linstor r l` - - alert: ReplicatedPVWithIncorrectReplicasCountFatal30m - expr: count(kube_persistentvolume_labels{label_storage_deckhouse_io_pv_not_enough_replicas="error", label_storage_deckhouse_io_pv_not_enough_replicas="fatal"}) > 0 + Persistent volumes in the cluster do not have enough replicas for quorum for 15min (set of UpToDate resources) +# +# You can get minimal limit for StorageClass with command `kubectl get sc -o yaml | grep -E "(\sname|placementCount)"` +# And view all Resource States with `linstor r l` + - alert: ReplicatedPVIncorrectReplicasCountWarningS4 + expr: count(kube_persistentvolume_labels{label_storage_deckhouse_io_pv_not_enough_replicas="warning", label_storage_deckhouse_io_pv_not_enough_replicas="error", label_storage_deckhouse_io_pv_not_enough_replicas="fatal"}) > 0 for: 30m labels: - severity_level: "3" + severity_level: "4" tier: cluster annotations: plk_markup_format: "markdown" @@ -119,15 +119,15 @@ plk_grouped_by__d8_drbd_device_health: "ReplicatedPVSettingsCheck,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes" summary: Replicated PVs has not enough replicas description: | - There are persistent volumes in the cluster that has 
not enough replicas (set of UpToDate resources less than minimal count) - - You can get minimal limit for StorageClass with command `kubectl get sc -o yaml | grep -E "(\sname|placementCount)"` - And view all Resource States with `linstor r l` - - alert: ReplicatedPVWithIncorrectReplicasCountFatal60m + Persistent volumes in the cluster do not have enough replicas for 30min (set of UpToDate resources less than minimal count) +# +# You can get minimal limit for StorageClass with command `kubectl get sc -o yaml | grep -E "(\sname|placementCount)"` +# And view all Resource States with `linstor r l` + - alert: ReplicatedPVIncorrectReplicasCountWarningS5 expr: count(kube_persistentvolume_labels{label_storage_deckhouse_io_pv_not_enough_replicas="warning", label_storage_deckhouse_io_pv_not_enough_replicas="error", label_storage_deckhouse_io_pv_not_enough_replicas="fatal"}) > 0 - for: 60m + for: 15m labels: - severity_level: "3" + severity_level: "5" tier: cluster annotations: plk_markup_format: "markdown" @@ -136,7 +136,7 @@ plk_grouped_by__d8_drbd_device_health: "ReplicatedPVSettingsCheck,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes" summary: Replicated PVs has not enough replicas description: | - There are persistent volumes in the cluster that has not enough replicas (set of UpToDate resources less than minimal count) - - You can get minimal limit for StorageClass with command `kubectl get sc -o yaml | grep -E "(\sname|placementCount)"` - And view all Resource States with `linstor r l` + Persistent volumes in the cluster do not have enough replicas (set of UpToDate resources less than minimal count) +# +# You can get minimal limit for StorageClass with command `kubectl get sc -o yaml | grep -E "(\sname|placementCount)"` +# And view all Resource States with `linstor r l`