Enable PodDisruptionBudgetAtLimit alert to find maintenance blocking PDBs

Checked against internal cluster telemetry; this seems to be a high-fidelity alert. Only very few alerts need to be silenced due to operator-managed PDBs.
bastjan committed Nov 27, 2024
1 parent 122ba06 commit 56cb9a1
Showing 12 changed files with 276 additions and 100 deletions.
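For context: a PDB "at limit" is one whose budget currently allows zero voluntary disruptions, which blocks node drains during maintenance. A minimal, hypothetical example that would trigger the new alert: with a single replica and minAvailable: 1, the number of currently healthy pods always equals the desired minimum, so no eviction is ever permitted. All names here are illustrative, not from this repository.

    apiVersion: policy/v1
    kind: PodDisruptionBudget
    metadata:
      name: example-blocking-pdb    # hypothetical name, for illustration only
    spec:
      minAvailable: 1               # equal to the replica count: no pod may be evicted
      selector:
        matchLabels:
          app: single-replica-app   # hypothetical selector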
1 change: 0 additions & 1 deletion class/defaults.yml
@@ -151,7 +151,6 @@ parameters:
       - NodeFilesystemAlmostOutOfFiles
       - NodeFilesystemAlmostOutOfSpace
       - NodeFilesystemFilesFillingUp
-      - PodDisruptionBudgetAtLimit
       - ThanosRuleRuleEvaluationLatencyHigh
       - etcdDatabaseHighFragmentationRatio
       - etcdExcessiveDatabaseGrowth
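With PodDisruptionBudgetAtLimit removed from this list, the upstream alert is no longer dropped and ships as SYN_PodDisruptionBudgetAtLimit. A cluster where operator-managed PDBs make the alert noisy could opt back out; a sketch, assuming the list above lives at parameters.openshift4_monitoring.alerts.ignoreNames (the exact key is not visible in this excerpt):

    parameters:
      openshift4_monitoring:
        alerts:
          ignoreNames:              # assumed key, re-adds the alert to the ignore list
            - PodDisruptionBudgetAtLimit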
@@ -369,6 +369,22 @@ spec:
         severity: critical
         syn: 'true'
         syn_component: openshift4-monitoring
+    - alert: SYN_PodDisruptionBudgetAtLimit
+      annotations:
+        description: The pod disruption budget is at the minimum disruptions allowed
+          level. The number of current healthy pods is equal to the desired healthy
+          pods.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
+        summary: The pod disruption budget is preventing further disruption to
+          pods.
+        syn_component: openshift4-monitoring
+      expr: |
+        max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
+      for: 60m
+      labels:
+        severity: warning
+        syn: 'true'
+        syn_component: openshift4-monitoring
     - alert: SYN_PodDisruptionBudgetLimit
       annotations:
         description: The pod disruption budget is below the minimum disruptions
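The expression compares two kube-state-metrics gauges per PDB and only considers PDBs that actually select pods: when the current number of healthy pods equals the desired (minimum) healthy count, the budget allows zero further disruptions. The same expression, reformatted for readability (whitespace only, semantics unchanged):

    max by (namespace, poddisruptionbudget) (
        kube_poddisruptionbudget_status_current_healthy
      ==
        kube_poddisruptionbudget_status_desired_healthy
      and on (namespace, poddisruptionbudget)
        kube_poddisruptionbudget_status_expected_pods > 0
    )

The for: 60m hold-off means a PDB has to sit at its limit for a full hour before the warning fires, which should keep short rollouts and brief maintenance windows from alerting.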
@@ -638,13 +654,13 @@ spec:
     rules:
     - alert: SYN_KubeContainerWaiting
       annotations:
-        description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
-          on container {{ $labels.container}} has been in waiting state for longer
-          than 1 hour.
+        description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
+          }} on container {{ $labels.container}} has been in waiting state for
+          longer than 1 hour. (reason: "{{ $labels.reason }}").'
         summary: Pod container waiting longer than 1 hour
         syn_component: openshift4-monitoring
       expr: |
-        sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
+        kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
       for: 1h
       labels:
         severity: warning
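Two changes to this alert: the sum by (...) aggregation is gone, so the reason label survives and the new description can render {{ $labels.reason }}, and containers waiting with reason CrashLoopBackOff are excluded, presumably because crash loops are already covered by a dedicated alert. A hypothetical spot-check query for what would match now:

    # Waiting containers by namespace and reason, CrashLoopBackOff excluded (illustrative)
    sort_desc(sum by (namespace, reason) (
      kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff",job="kube-state-metrics"}
    ))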
@@ -1908,11 +1924,11 @@ spec:
         syn_component: openshift4-monitoring
     - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
       annotations:
-        description: '{{ printf "%.1f" $value }}% errors while sending alerts
-          from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
-          {{$labels.alertmanager}}.'
-        summary: Prometheus has encountered more than 1% errors sending alerts
-          to a specific Alertmanager.
+        description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
+          {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
+          were affected by errors.'
+        summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
+          were affected by errors.
         syn_component: openshift4-monitoring
       expr: |
         (
@@ -369,6 +369,22 @@ spec:
         severity: critical
         syn: 'true'
         syn_component: openshift4-monitoring
+    - alert: SYN_PodDisruptionBudgetAtLimit
+      annotations:
+        description: The pod disruption budget is at the minimum disruptions allowed
+          level. The number of current healthy pods is equal to the desired healthy
+          pods.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
+        summary: The pod disruption budget is preventing further disruption to
+          pods.
+        syn_component: openshift4-monitoring
+      expr: |
+        max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
+      for: 60m
+      labels:
+        severity: warning
+        syn: 'true'
+        syn_component: openshift4-monitoring
     - alert: SYN_PodDisruptionBudgetLimit
       annotations:
         description: The pod disruption budget is below the minimum disruptions
@@ -638,13 +654,13 @@ spec:
     rules:
     - alert: SYN_KubeContainerWaiting
       annotations:
-        description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
-          on container {{ $labels.container}} has been in waiting state for longer
-          than 1 hour.
+        description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
+          }} on container {{ $labels.container}} has been in waiting state for
+          longer than 1 hour. (reason: "{{ $labels.reason }}").'
         summary: Pod container waiting longer than 1 hour
         syn_component: openshift4-monitoring
       expr: |
-        sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
+        kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
       for: 1h
       labels:
         severity: warning
@@ -1908,11 +1924,11 @@ spec:
         syn_component: openshift4-monitoring
     - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
      annotations:
-        description: '{{ printf "%.1f" $value }}% errors while sending alerts
-          from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
-          {{$labels.alertmanager}}.'
-        summary: Prometheus has encountered more than 1% errors sending alerts
-          to a specific Alertmanager.
+        description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
+          {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
+          were affected by errors.'
+        summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
+          were affected by errors.
         syn_component: openshift4-monitoring
       expr: |
         (
@@ -375,6 +375,22 @@ spec:
         severity: critical
         syn: 'true'
         syn_component: openshift4-monitoring
+    - alert: SYN_PodDisruptionBudgetAtLimit
+      annotations:
+        description: The pod disruption budget is at the minimum disruptions allowed
+          level. The number of current healthy pods is equal to the desired healthy
+          pods.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
+        summary: The pod disruption budget is preventing further disruption to
+          pods.
+        syn_component: openshift4-monitoring
+      expr: |
+        max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
+      for: 60m
+      labels:
+        severity: warning
+        syn: 'true'
+        syn_component: openshift4-monitoring
     - alert: SYN_PodDisruptionBudgetLimit
       annotations:
         description: The pod disruption budget is below the minimum disruptions
@@ -647,13 +663,13 @@ spec:
     rules:
     - alert: SYN_KubeContainerWaiting
       annotations:
-        description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
-          on container {{ $labels.container}} has been in waiting state for longer
-          than 1 hour.
+        description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
+          }} on container {{ $labels.container}} has been in waiting state for
+          longer than 1 hour. (reason: "{{ $labels.reason }}").'
         summary: Pod container waiting longer than 1 hour
         syn_component: openshift4-monitoring
       expr: |
-        sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",namespace!~"(openshift-adp)",job="kube-state-metrics"}) > 0
+        kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",namespace!~"(openshift-adp)",job="kube-state-metrics"} > 0
       for: 1h
       labels:
         severity: warning
@@ -1983,11 +1999,11 @@ spec:
         syn_component: openshift4-monitoring
     - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
       annotations:
-        description: '{{ printf "%.1f" $value }}% errors while sending alerts
-          from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
-          {{$labels.alertmanager}}.'
-        summary: Prometheus has encountered more than 1% errors sending alerts
-          to a specific Alertmanager.
+        description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
+          {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
+          were affected by errors.'
+        summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
+          were affected by errors.
         syn_component: openshift4-monitoring
       expr: |
         (
@@ -520,6 +520,22 @@ spec:
         severity: critical
         syn: 'true'
         syn_component: openshift4-monitoring
+    - alert: SYN_PodDisruptionBudgetAtLimit
+      annotations:
+        description: The pod disruption budget is at the minimum disruptions allowed
+          level. The number of current healthy pods is equal to the desired healthy
+          pods.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
+        summary: The pod disruption budget is preventing further disruption to
+          pods.
+        syn_component: openshift4-monitoring
+      expr: |
+        max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
+      for: 60m
+      labels:
+        severity: warning
+        syn: 'true'
+        syn_component: openshift4-monitoring
     - alert: SYN_PodDisruptionBudgetLimit
       annotations:
         description: The pod disruption budget is below the minimum disruptions
@@ -789,13 +805,13 @@ spec:
     rules:
     - alert: SYN_KubeContainerWaiting
       annotations:
-        description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
-          on container {{ $labels.container}} has been in waiting state for longer
-          than 1 hour.
+        description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
+          }} on container {{ $labels.container}} has been in waiting state for
+          longer than 1 hour. (reason: "{{ $labels.reason }}").'
         summary: Pod container waiting longer than 1 hour
         syn_component: openshift4-monitoring
       expr: |
-        sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
+        kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
       for: 1h
       labels:
         severity: warning
@@ -2059,11 +2075,11 @@ spec:
         syn_component: openshift4-monitoring
     - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
       annotations:
-        description: '{{ printf "%.1f" $value }}% errors while sending alerts
-          from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
-          {{$labels.alertmanager}}.'
-        summary: Prometheus has encountered more than 1% errors sending alerts
-          to a specific Alertmanager.
+        description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
+          {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
+          were affected by errors.'
+        summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
+          were affected by errors.
         syn_component: openshift4-monitoring
       expr: |
         (
@@ -350,6 +350,22 @@ spec:
         severity: critical
         syn: 'true'
         syn_component: openshift4-monitoring
+    - alert: SYN_PodDisruptionBudgetAtLimit
+      annotations:
+        description: The pod disruption budget is at the minimum disruptions allowed
+          level. The number of current healthy pods is equal to the desired healthy
+          pods.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
+        summary: The pod disruption budget is preventing further disruption to
+          pods.
+        syn_component: openshift4-monitoring
+      expr: |
+        max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
+      for: 60m
+      labels:
+        severity: warning
+        syn: 'true'
+        syn_component: openshift4-monitoring
     - alert: SYN_PodDisruptionBudgetLimit
       annotations:
         description: The pod disruption budget is below the minimum disruptions
@@ -619,13 +635,13 @@ spec:
     rules:
     - alert: SYN_KubeContainerWaiting
       annotations:
-        description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
-          on container {{ $labels.container}} has been in waiting state for longer
-          than 1 hour.
+        description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
+          }} on container {{ $labels.container}} has been in waiting state for
+          longer than 1 hour. (reason: "{{ $labels.reason }}").'
         summary: Pod container waiting longer than 1 hour
         syn_component: openshift4-monitoring
       expr: |
-        sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
+        kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
       for: 1h
       labels:
         severity: warning
@@ -1885,11 +1901,11 @@ spec:
         syn_component: openshift4-monitoring
     - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
       annotations:
-        description: '{{ printf "%.1f" $value }}% errors while sending alerts
-          from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
-          {{$labels.alertmanager}}.'
-        summary: Prometheus has encountered more than 1% errors sending alerts
-          to a specific Alertmanager.
+        description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
+          {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
+          were affected by errors.'
+        summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
+          were affected by errors.
         syn_component: openshift4-monitoring
       expr: |
         (
@@ -369,6 +369,22 @@ spec:
         severity: critical
         syn: 'true'
         syn_component: openshift4-monitoring
+    - alert: SYN_PodDisruptionBudgetAtLimit
+      annotations:
+        description: The pod disruption budget is at the minimum disruptions allowed
+          level. The number of current healthy pods is equal to the desired healthy
+          pods.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
+        summary: The pod disruption budget is preventing further disruption to
+          pods.
+        syn_component: openshift4-monitoring
+      expr: |
+        max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
+      for: 60m
+      labels:
+        severity: warning
+        syn: 'true'
+        syn_component: openshift4-monitoring
     - alert: SYN_PodDisruptionBudgetLimit
       annotations:
         description: The pod disruption budget is below the minimum disruptions
@@ -638,13 +654,13 @@ spec:
     rules:
     - alert: SYN_KubeContainerWaiting
       annotations:
-        description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
-          on container {{ $labels.container}} has been in waiting state for longer
-          than 1 hour.
+        description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
+          }} on container {{ $labels.container}} has been in waiting state for
+          longer than 1 hour. (reason: "{{ $labels.reason }}").'
         summary: Pod container waiting longer than 1 hour
         syn_component: openshift4-monitoring
       expr: |
-        sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
+        kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
       for: 1h
       labels:
         severity: warning
@@ -1908,11 +1924,11 @@ spec:
         syn_component: openshift4-monitoring
     - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
       annotations:
-        description: '{{ printf "%.1f" $value }}% errors while sending alerts
-          from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
-          {{$labels.alertmanager}}.'
-        summary: Prometheus has encountered more than 1% errors sending alerts
-          to a specific Alertmanager.
+        description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
+          {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
+          were affected by errors.'
+        summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
+          were affected by errors.
         syn_component: openshift4-monitoring
       expr: |
         (