From 54fc07c912f3b765414c3e72e2dc6e4046161913 Mon Sep 17 00:00:00 2001 From: Simon Gerber Date: Fri, 12 Apr 2024 15:30:42 +0200 Subject: [PATCH] Make OpenShift 4.14 the default --- class/defaults.yml | 2 +- .../prometheus_rules.yaml | 93 ++-- .../prometheus_rules.yaml | 93 ++-- .../prometheus_rules.yaml | 433 +++--------------- .../prometheus_rules.yaml | 93 ++-- .../prometheus_rules.yaml | 96 ++-- .../prometheus_rules.yaml | 93 ++-- .../prometheus_rules.yaml | 110 +++-- 8 files changed, 379 insertions(+), 634 deletions(-) diff --git a/class/defaults.yml b/class/defaults.yml index 7f0bc8fd..5b1948b2 100644 --- a/class/defaults.yml +++ b/class/defaults.yml @@ -8,7 +8,7 @@ parameters: alert-patching.libsonnet: openshift4-monitoring-alert-patching.libsonnet namespace: openshift-monitoring # TODO: select based on reported OCP version once we have dynamic facts - manifests_version: release-4.13 + manifests_version: release-4.14 =_cluster_monitoring_operator_version_map: release-4.13: release-4.13 release-4.14: release-4.14 diff --git a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 836cf8fe..95929000 100644 --- a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -249,7 +249,7 @@ spec: - alert: SYN_ClusterOperatorDown annotations: description: The {{ $labels.name }} operator may be down or disabled because - ${{ $labels.reason }}, and the components it manages may be unavailable + {{ $labels.reason }}, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first @@ -285,7 +285,7 @@ spec: - alert: SYN_ClusterReleaseNotAccepted annotations: description: The desired cluster release has not been accepted because - ${{ $labels.reason }}, and the cluster will continue to reconcile an + {{ $labels.reason }}, and the cluster will continue to reconcile an earlier release instead of moving towards that desired release. For more information refer to 'oc adm upgrade'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or @@ -339,6 +339,7 @@ spec: - alert: SYN_KubeSchedulerDown annotations: description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-scheduler-operator/KubeSchedulerDown.md summary: Target disappeared from Prometheus target discovery. 
syn_component: openshift4-monitoring expr: | @@ -610,6 +611,7 @@ spec: > 0.01 for: 15m labels: + namespace: openshift-monitoring severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1076,6 +1078,7 @@ spec: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1129,6 +1132,7 @@ spec: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 for: 5m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1142,6 +1146,7 @@ spec: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1170,7 +1175,7 @@ spec: mapi_mao_collector_up == 0 for: 5m labels: - severity: critical + severity: warning syn: 'true' syn_component: openshift4-monitoring - name: syn-machine-health-check-unterminated-short-circuit @@ -1245,10 +1250,12 @@ spec: rules: - alert: SYN_MCCDrainError annotations: - message: 'Drain failed on {{ $labels.exported_node }} , updates may be - blocked. For more details check MachineConfigController pod logs: oc - logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx -c - machine-config-controller' + description: 'Drain failed on {{ $labels.exported_node }} , updates may + be blocked. For more details check MachineConfigController pod logs: + oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Alerts the user to a failed node drain. Always triggers when + the failure happens one or more times. syn_component: openshift4-monitoring expr: | mcc_drain_err > 0 @@ -1257,11 +1264,32 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring + - name: syn-mcc-pool-alert + rules: + - alert: SYN_MCCPoolAlert + annotations: + description: 'Node {{ $labels.exported_node }} has triggered a pool alert + due to a label change. For more details check MachineConfigController + pod logs: oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Triggers when nodes in a pool have overlapping labels such as + master, worker, and a custom label therefore a choice must be made as + to which is honored. + syn_component: openshift4-monitoring + expr: | + mcc_pool_alert > 0 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - name: syn-mcd-kubelet-health-state-error rules: - alert: SYN_KubeletHealthState annotations: - message: Kubelet health failure threshold reached + description: Kubelet health failure threshold reached + summary: This keeps track of Kubelet health failures, and tallys them. + The warning is triggered if 2 or more failures occur. syn_component: openshift4-monitoring expr: | mcd_kubelet_state > 2 @@ -1274,9 +1302,11 @@ spec: rules: - alert: SYN_MCDPivotError annotations: - message: 'Error detected in pivot logs on {{ $labels.node }} , upgrade + description: 'Error detected in pivot logs on {{ $labels.node }} , upgrade may be blocked. 
For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user when an error is detected upon pivot. This triggers + if the pivot errors are above zero for 2 minutes. syn_component: openshift4-monitoring expr: | mcd_pivot_errors_total > 0 @@ -1290,9 +1320,11 @@ spec: rules: - alert: SYN_MCDRebootError annotations: - message: 'Reboot failed on {{ $labels.node }} , update may be blocked. + description: 'Reboot failed on {{ $labels.node }} , update may be blocked. For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user that a node failed to reboot one or more times + over a span of 5 minutes. syn_component: openshift4-monitoring expr: | mcd_reboots_failed_total > 0 @@ -1356,20 +1388,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeDiskIOSaturation - annotations: - description: | - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. - This symptom might indicate disk saturation. - summary: Disk IO queue is high. - syn_component: openshift4-monitoring - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) > 10 - for: 30m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeFileDescriptorLimit annotations: description: File descriptors limit at {{ $labels.instance }} is currently @@ -1502,19 +1520,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeMemoryHighUtilization - annotations: - description: | - Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. - summary: Host is running out of memory. - syn_component: openshift4-monitoring - expr: | - 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeMemoryMajorPagesFaults annotations: description: | @@ -1565,7 +1570,7 @@ spec: syn_component: openshift4-monitoring expr: | node_systemd_unit_state{job="node-exporter", state="failed"} == 1 - for: 5m + for: 15m labels: severity: warning syn: 'true' @@ -2166,7 +2171,7 @@ spec: summary: Prometheus operator not ready syn_component: openshift4-monitoring expr: | - min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) + min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) for: 5m labels: severity: warning @@ -2180,7 +2185,7 @@ spec: summary: Errors while reconciling objects. 
syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2208,7 +2213,7 @@ spec: summary: Errors while updating objects status. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2234,7 +2239,7 @@ spec: summary: Errors while performing watch operations in controller. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 for: 15m labels: severity: warning @@ -2244,7 +2249,7 @@ spec: rules: - alert: SYN_SystemMemoryExceedsReservation annotations: - message: System memory usage of {{ $value | humanize }} on {{ $labels.node + description: System memory usage of {{ $value | humanize }} on {{ $labels.node }} exceeds 95% of the reservation. Reserved memory ensures system processes can function even when the node is fully allocated and protects against workload out of memory events impacting the proper functioning of the @@ -2252,6 +2257,8 @@ spec: configurations and should be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) when running nodes with high numbers of pods (either due to rate of change or at steady state). 
+ summary: Alerts the user when, for 15 miutes, a specific node is using + more memory than is reserved syn_component: openshift4-monitoring expr: | sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.95) diff --git a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 836cf8fe..95929000 100644 --- a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -249,7 +249,7 @@ spec: - alert: SYN_ClusterOperatorDown annotations: description: The {{ $labels.name }} operator may be down or disabled because - ${{ $labels.reason }}, and the components it manages may be unavailable + {{ $labels.reason }}, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first @@ -285,7 +285,7 @@ spec: - alert: SYN_ClusterReleaseNotAccepted annotations: description: The desired cluster release has not been accepted because - ${{ $labels.reason }}, and the cluster will continue to reconcile an + {{ $labels.reason }}, and the cluster will continue to reconcile an earlier release instead of moving towards that desired release. For more information refer to 'oc adm upgrade'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or @@ -339,6 +339,7 @@ spec: - alert: SYN_KubeSchedulerDown annotations: description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-scheduler-operator/KubeSchedulerDown.md summary: Target disappeared from Prometheus target discovery. 
syn_component: openshift4-monitoring expr: | @@ -610,6 +611,7 @@ spec: > 0.01 for: 15m labels: + namespace: openshift-monitoring severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1076,6 +1078,7 @@ spec: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1129,6 +1132,7 @@ spec: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 for: 5m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1142,6 +1146,7 @@ spec: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1170,7 +1175,7 @@ spec: mapi_mao_collector_up == 0 for: 5m labels: - severity: critical + severity: warning syn: 'true' syn_component: openshift4-monitoring - name: syn-machine-health-check-unterminated-short-circuit @@ -1245,10 +1250,12 @@ spec: rules: - alert: SYN_MCCDrainError annotations: - message: 'Drain failed on {{ $labels.exported_node }} , updates may be - blocked. For more details check MachineConfigController pod logs: oc - logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx -c - machine-config-controller' + description: 'Drain failed on {{ $labels.exported_node }} , updates may + be blocked. For more details check MachineConfigController pod logs: + oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Alerts the user to a failed node drain. Always triggers when + the failure happens one or more times. syn_component: openshift4-monitoring expr: | mcc_drain_err > 0 @@ -1257,11 +1264,32 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring + - name: syn-mcc-pool-alert + rules: + - alert: SYN_MCCPoolAlert + annotations: + description: 'Node {{ $labels.exported_node }} has triggered a pool alert + due to a label change. For more details check MachineConfigController + pod logs: oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Triggers when nodes in a pool have overlapping labels such as + master, worker, and a custom label therefore a choice must be made as + to which is honored. + syn_component: openshift4-monitoring + expr: | + mcc_pool_alert > 0 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - name: syn-mcd-kubelet-health-state-error rules: - alert: SYN_KubeletHealthState annotations: - message: Kubelet health failure threshold reached + description: Kubelet health failure threshold reached + summary: This keeps track of Kubelet health failures, and tallys them. + The warning is triggered if 2 or more failures occur. syn_component: openshift4-monitoring expr: | mcd_kubelet_state > 2 @@ -1274,9 +1302,11 @@ spec: rules: - alert: SYN_MCDPivotError annotations: - message: 'Error detected in pivot logs on {{ $labels.node }} , upgrade + description: 'Error detected in pivot logs on {{ $labels.node }} , upgrade may be blocked. 
For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user when an error is detected upon pivot. This triggers + if the pivot errors are above zero for 2 minutes. syn_component: openshift4-monitoring expr: | mcd_pivot_errors_total > 0 @@ -1290,9 +1320,11 @@ spec: rules: - alert: SYN_MCDRebootError annotations: - message: 'Reboot failed on {{ $labels.node }} , update may be blocked. + description: 'Reboot failed on {{ $labels.node }} , update may be blocked. For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user that a node failed to reboot one or more times + over a span of 5 minutes. syn_component: openshift4-monitoring expr: | mcd_reboots_failed_total > 0 @@ -1356,20 +1388,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeDiskIOSaturation - annotations: - description: | - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. - This symptom might indicate disk saturation. - summary: Disk IO queue is high. - syn_component: openshift4-monitoring - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) > 10 - for: 30m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeFileDescriptorLimit annotations: description: File descriptors limit at {{ $labels.instance }} is currently @@ -1502,19 +1520,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeMemoryHighUtilization - annotations: - description: | - Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. - summary: Host is running out of memory. - syn_component: openshift4-monitoring - expr: | - 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeMemoryMajorPagesFaults annotations: description: | @@ -1565,7 +1570,7 @@ spec: syn_component: openshift4-monitoring expr: | node_systemd_unit_state{job="node-exporter", state="failed"} == 1 - for: 5m + for: 15m labels: severity: warning syn: 'true' @@ -2166,7 +2171,7 @@ spec: summary: Prometheus operator not ready syn_component: openshift4-monitoring expr: | - min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) + min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) for: 5m labels: severity: warning @@ -2180,7 +2185,7 @@ spec: summary: Errors while reconciling objects. 
syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2208,7 +2213,7 @@ spec: summary: Errors while updating objects status. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2234,7 +2239,7 @@ spec: summary: Errors while performing watch operations in controller. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 for: 15m labels: severity: warning @@ -2244,7 +2249,7 @@ spec: rules: - alert: SYN_SystemMemoryExceedsReservation annotations: - message: System memory usage of {{ $value | humanize }} on {{ $labels.node + description: System memory usage of {{ $value | humanize }} on {{ $labels.node }} exceeds 95% of the reservation. Reserved memory ensures system processes can function even when the node is fully allocated and protects against workload out of memory events impacting the proper functioning of the @@ -2252,6 +2257,8 @@ spec: configurations and should be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) when running nodes with high numbers of pods (either due to rate of change or at steady state). 
+ summary: Alerts the user when, for 15 miutes, a specific node is using + more memory than is reserved syn_component: openshift4-monitoring expr: | sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.95) diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index c8a58cd6..b13b9a69 100644 --- a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -146,34 +146,34 @@ spec: syn_component: openshift4-monitoring - name: syn-cluster-network-operator-master.rules rules: - - alert: SYN_NoOvnMasterLeader + - alert: SYN_NoOvnClusterManagerLeader annotations: description: | Networking control plane is degraded. Networking configuration updates applied to the cluster will not be implemented while there is no OVN Kubernetes leader. Existing workloads should continue to have connectivity. OVN-Kubernetes control plane is not functional. runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoOvnMasterLeader.md - summary: There is no ovn-kubernetes master leader. + summary: There is no ovn-kubernetes cluster manager leader. syn_component: openshift4-monitoring expr: | # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max by (namespace) (max_over_time(ovnkube_master_leader[5m])) == 0 + max by (namespace) (max_over_time(ovnkube_clustermanager_leader[5m])) == 0 for: 5m labels: severity: critical syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NoRunningOvnMaster + - alert: SYN_NoRunningOvnControlPlane annotations: description: | Networking control plane is degraded. Networking configuration updates applied to the cluster will not be implemented while there are no OVN Kubernetes pods. runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoRunningOvnMaster.md - summary: There is no running ovn-kubernetes master. + summary: There is no running ovn-kubernetes control plane. syn_component: openshift4-monitoring expr: | - absent(up{job="ovnkube-master", namespace="openshift-ovn-kubernetes"} == 1) + absent(up{job="ovnkube-control-plane", namespace="openshift-ovn-kubernetes"} == 1) for: 5m labels: namespace: openshift-ovn-kubernetes @@ -193,333 +193,12 @@ spec: expr: | # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - time() - max_over_time(ovnkube_master_nb_e2e_timestamp[5m]) > 120 + time() - max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) > 120 for: 10m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesNorthboundDatabaseClusterIDError - annotations: - description: More than one OVN northbound database cluster ID indicates - degraded OVN database high availability and possible database split - brain. - summary: Multiple OVN northbound database cluster IDs exist. 
- syn_component: openshift4-monitoring - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - count(count(min_over_time(ovn_db_cluster_id{db_name="OVN_Northbound"}[5m])) by (cluster_id, namespace)) by (namespace) > 1 - for: 5m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesNorthboundDatabaseClusterMemberError - annotations: - description: OVN northbound database server(s) has not been a RAFT cluster - member for a period of time which may indicate degraded OVN database - high availability cluster. - summary: OVN northbound database server(s) has not been a member of the - databases high availability for a period of time. - syn_component: openshift4-monitoring - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - min_over_time(cluster:ovn_db_nbdb_not_cluster_member:abs[5m]) != 0 - for: 5m - labels: - namespace: openshift-ovn-kubernetes - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesNorthboundDatabaseInboundConnectionError - annotations: - description: OVN northbound database server(s) is experiencing inbound - RAFT connectivity errors which may indicate degraded OVN database high - availability. - summary: OVN northbound database server(s) is experiencing inbound RAFT - connectivity errors. - syn_component: openshift4-monitoring - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - # ..error_total is set to zero when error resolves itself - min_over_time(ovn_db_cluster_inbound_connections_error_total{db_name="OVN_Northbound"}[5m]) > 0 - for: 5m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesNorthboundDatabaseInboundConnectionMissing - annotations: - description: OVN northbound database server(s) do not have expected number - of inbound connections for a RAFT cluster which may indicate degraded - OVN database high availability. - summary: OVN northbound database server(s) do not have expected number - of inbound RAFT connections. - syn_component: openshift4-monitoring - expr: | - # Expected sum of inbound connections is number of control plane nodes * number of control plane nodes minus one - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - min_over_time(cluster:ovn_db_nbdb_missing_inbound_connections:abs[5m]) != 0 - for: 5m - labels: - namespace: openshift-ovn-kubernetes - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesNorthboundDatabaseLeaderError - annotations: - description: OVN northbound database(s) have no RAFT leader. Networking - control plane is degraded. - summary: OVN northbound database(s) have no RAFT leader - syn_component: openshift4-monitoring - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
- count(max_over_time(ovn_db_cluster_server_role{db_name="OVN_Northbound", server_role="leader"}[5m])) by (namespace) == 0 - for: 5m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesNorthboundDatabaseMultipleLeadersError - annotations: - description: OVN northbound database(s) have multiple RAFT leaders which - may indicate degraded OVN database high availability. - summary: OVN northbound database(s) have multiple RAFT leaders - syn_component: openshift4-monitoring - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - count(min_over_time(ovn_db_cluster_server_role{db_name="OVN_Northbound", server_role="leader"}[1m])) by (leader, namespace) > 1 - for: 5m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesNorthboundDatabaseOutboundConnectionError - annotations: - description: OVN northbound database server(s) outbound RAFT connectivity - errors may indicate degraded OVN database high availability. - summary: OVN northbound database server(s) is experiencing outbound RAFT - connectivity errors. - syn_component: openshift4-monitoring - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - # ..error_total is set to zero when error resolves itself - min_over_time(ovn_db_cluster_outbound_connections_error_total{db_name="OVN_Northbound"}[5m]) > 0 - for: 5m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesNorthboundDatabaseOutboundConnectionMissing - annotations: - description: OVN northbound database server(s) do not have expected number - of outbound connections for a RAFT cluster which may indicate degraded - OVN database high availability. - summary: OVN northbound database server(s) do not have expected number - of outbound RAFT connections. - syn_component: openshift4-monitoring - expr: | - # Expected sum of outbound connections is number of control plane nodes * number of control plane nodes minus one - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - min_over_time(cluster:ovn_db_nbdb_missing_outbound_connections:abs[5m]) != 0 - for: 5m - labels: - namespace: openshift-ovn-kubernetes - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesNorthboundDatabaseTermLag - annotations: - description: OVN northbound database(s) RAFT term have not been equal - which may indicate degraded OVN database high availability. - summary: OVN northbound databases RAFT term have not been equal for a - period of time. - syn_component: openshift4-monitoring - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
- max(max_over_time(ovn_db_cluster_term{db_name="OVN_Northbound"}[5m])) by (namespace) - min(max_over_time(ovn_db_cluster_term{db_name="OVN_Northbound"}[5m])) by (namespace) > 0 - for: 25m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesNorthdInactive - annotations: - description: Exactly one OVN northd must have an active status within - the high availability set. Networking control plane is degraded. - summary: Exactly one OVN northd instance must have an active status. - syn_component: openshift4-monitoring - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - count(ovn_northd_status == 1) by (namespace) != 1 - for: 5m labels: severity: critical syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesSouthboundDatabaseClusterIDError - annotations: - description: More than one OVN southbound database cluster ID indicates - degraded OVN database high availability and possible database split - brain. - summary: Multiple OVN southbound database cluster IDs exist. - syn_component: openshift4-monitoring - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - count(count(min_over_time(ovn_db_cluster_id{db_name="OVN_Southbound"}[5m])) by (cluster_id, namespace)) by (namespace) > 1 - for: 5m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesSouthboundDatabaseClusterMemberError - annotations: - description: OVN southbound database server(s) has not been a RAFT cluster - member for a period of time which may indicate degraded OVN database - high availability. - summary: OVN southbound database server(s) has not been a member of the - databases high availability for a period of time. - syn_component: openshift4-monitoring - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - min_over_time(cluster:ovn_db_sbdb_not_cluster_member:abs[5m]) != 0 - for: 5m - labels: - namespace: openshift-ovn-kubernetes - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesSouthboundDatabaseInboundConnectionError - annotations: - description: OVN southbound database server(s) is experiencing inbound - RAFT connectivity errors which may indicate degraded OVN database high - availability. - summary: OVN southbound database server(s) is experiencing inbound RAFT - connectivity errors. - syn_component: openshift4-monitoring - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - # ..error_total is set to zero when error resolves itself - min_over_time(ovn_db_cluster_inbound_connections_error_total{db_name="OVN_Southbound"}[5m]) > 0 - for: 5m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesSouthboundDatabaseInboundConnectionMissing - annotations: - description: OVN southbound database server(s) do not have expected number - of inbound connections for a RAFT cluster which may indicate degraded - OVN database high availability. 
- summary: OVN southbound database server(s) do not have expected number - of inbound RAFT connections. - syn_component: openshift4-monitoring - expr: | - # Expected sum of inbound connections is number of control plane nodes * number of control plane nodes minus one - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - min_over_time(cluster:ovn_db_sbdb_missing_inbound_connections:abs[5m]) != 0 - for: 5m - labels: - namespace: openshift-ovn-kubernetes - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesSouthboundDatabaseLeaderError - annotations: - description: OVN southbound database(s) have no leader. Networking control - plane is degraded. - summary: OVN southbound database(s) have no RAFT leader - syn_component: openshift4-monitoring - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - count(max_over_time(ovn_db_cluster_server_role{db_name="OVN_Southbound", server_role="leader"}[5m])) by (namespace) == 0 - for: 5m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesSouthboundDatabaseMultipleLeadersError - annotations: - description: OVN southbound database(s) have multiple RAFT leaders which - may indicate degraded OVN database high availability. - summary: OVN southbound database(s) have multiple RAFT leaders - syn_component: openshift4-monitoring - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - count(min_over_time(ovn_db_cluster_server_role{db_name="OVN_Southbound", server_role="leader"}[1m])) by (leader, namespace) > 1 - for: 5m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesSouthboundDatabaseOutboundConnectionError - annotations: - description: OVN southbound database server(s) outbound RAFT connectivity - errors which may indicate degraded OVN database high availability. - summary: OVN southbound database server(s) is experiencing outbound RAFT - connectivity errors. - syn_component: openshift4-monitoring - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - # ..error_total is set to zero when error resolves itself - min_over_time(ovn_db_cluster_outbound_connections_error_total{db_name="OVN_Southbound"}[5m]) > 0 - for: 5m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesSouthboundDatabaseOutboundConnectionMissing - annotations: - description: OVN southbound database server(s) do not have expected number - of outbound connections for a RAFT cluster which may indicate degraded - OVN database high availability. - summary: OVN southbound database server(s) do not have expected number - of outbound RAFT connections. - syn_component: openshift4-monitoring - expr: | - # Expected sum of outbound connections is number of control plane nodes * number of control plane nodes minus one - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
- min_over_time(cluster:ovn_db_sbdb_missing_outbound_connections:abs[5m]) != 0 - for: 5m - labels: - namespace: openshift-ovn-kubernetes - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_OVNKubernetesSouthboundDatabaseTermLag - annotations: - description: OVN southbound database(s) RAFT term have not been equal - which may indicate degraded OVN database high availability. - summary: OVN southbound databases RAFT term have not been equal for a - period of time. - syn_component: openshift4-monitoring - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max(max_over_time(ovn_db_cluster_term{db_name="OVN_Southbound"}[5m])) by (namespace) - min(max_over_time(ovn_db_cluster_term{db_name="OVN_Southbound"}[5m])) by (namespace) > 0 - for: 25m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_SouthboundStale annotations: description: | @@ -533,10 +212,10 @@ spec: expr: | # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(ovnkube_master_nb_e2e_timestamp[5m]) - max_over_time(ovnkube_master_sb_e2e_timestamp[5m]) > 120 + max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) - max_over_time(ovnkube_controller_sb_e2e_timestamp[5m]) > 120 for: 10m labels: - severity: warning + severity: critical syn: 'true' syn_component: openshift4-monitoring - alert: SYN_V4SubnetAllocationThresholdExceeded @@ -593,6 +272,7 @@ spec: annotations: description: | Networking is degraded on nodes when OVN controller is not connected to OVN southbound database connection. No networking control plane updates will be applied to the node. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/OVNKubernetesControllerDisconnectedSouthboundDatabase.md summary: Networking control plane is degraded on node {{ $labels.node }} because OVN controller is not connected to OVN southbound database. syn_component: openshift4-monitoring @@ -703,7 +383,7 @@ spec: - alert: SYN_ClusterOperatorDown annotations: description: The {{ $labels.name }} operator may be down or disabled because - ${{ $labels.reason }}, and the components it manages may be unavailable + {{ $labels.reason }}, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first @@ -739,7 +419,7 @@ spec: - alert: SYN_ClusterReleaseNotAccepted annotations: description: The desired cluster release has not been accepted because - ${{ $labels.reason }}, and the cluster will continue to reconcile an + {{ $labels.reason }}, and the cluster will continue to reconcile an earlier release instead of moving towards that desired release. For more information refer to 'oc adm upgrade'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or @@ -793,6 +473,7 @@ spec: - alert: SYN_KubeSchedulerDown annotations: description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-scheduler-operator/KubeSchedulerDown.md summary: Target disappeared from Prometheus target discovery. 
syn_component: openshift4-monitoring expr: | @@ -1064,6 +745,7 @@ spec: > 0.01 for: 15m labels: + namespace: openshift-monitoring severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1530,6 +1212,7 @@ spec: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1583,6 +1266,7 @@ spec: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 for: 5m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1596,6 +1280,7 @@ spec: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1624,7 +1309,7 @@ spec: mapi_mao_collector_up == 0 for: 5m labels: - severity: critical + severity: warning syn: 'true' syn_component: openshift4-monitoring - name: syn-machine-health-check-unterminated-short-circuit @@ -1699,10 +1384,12 @@ spec: rules: - alert: SYN_MCCDrainError annotations: - message: 'Drain failed on {{ $labels.exported_node }} , updates may be - blocked. For more details check MachineConfigController pod logs: oc - logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx -c - machine-config-controller' + description: 'Drain failed on {{ $labels.exported_node }} , updates may + be blocked. For more details check MachineConfigController pod logs: + oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Alerts the user to a failed node drain. Always triggers when + the failure happens one or more times. syn_component: openshift4-monitoring expr: | mcc_drain_err > 0 @@ -1711,11 +1398,32 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring + - name: syn-mcc-pool-alert + rules: + - alert: SYN_MCCPoolAlert + annotations: + description: 'Node {{ $labels.exported_node }} has triggered a pool alert + due to a label change. For more details check MachineConfigController + pod logs: oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Triggers when nodes in a pool have overlapping labels such as + master, worker, and a custom label therefore a choice must be made as + to which is honored. + syn_component: openshift4-monitoring + expr: | + mcc_pool_alert > 0 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - name: syn-mcd-kubelet-health-state-error rules: - alert: SYN_KubeletHealthState annotations: - message: Kubelet health failure threshold reached + description: Kubelet health failure threshold reached + summary: This keeps track of Kubelet health failures, and tallys them. + The warning is triggered if 2 or more failures occur. syn_component: openshift4-monitoring expr: | mcd_kubelet_state > 2 @@ -1728,9 +1436,11 @@ spec: rules: - alert: SYN_MCDPivotError annotations: - message: 'Error detected in pivot logs on {{ $labels.node }} , upgrade + description: 'Error detected in pivot logs on {{ $labels.node }} , upgrade may be blocked. 
For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user when an error is detected upon pivot. This triggers + if the pivot errors are above zero for 2 minutes. syn_component: openshift4-monitoring expr: | mcd_pivot_errors_total > 0 @@ -1744,9 +1454,11 @@ spec: rules: - alert: SYN_MCDRebootError annotations: - message: 'Reboot failed on {{ $labels.node }} , update may be blocked. + description: 'Reboot failed on {{ $labels.node }} , update may be blocked. For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user that a node failed to reboot one or more times + over a span of 5 minutes. syn_component: openshift4-monitoring expr: | mcd_reboots_failed_total > 0 @@ -1810,20 +1522,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeDiskIOSaturation - annotations: - description: | - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. - This symptom might indicate disk saturation. - summary: Disk IO queue is high. - syn_component: openshift4-monitoring - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) > 10 - for: 30m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeFileDescriptorLimit annotations: description: File descriptors limit at {{ $labels.instance }} is currently @@ -1956,19 +1654,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeMemoryHighUtilization - annotations: - description: | - Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. - summary: Host is running out of memory. - syn_component: openshift4-monitoring - expr: | - 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeMemoryMajorPagesFaults annotations: description: | @@ -2019,7 +1704,7 @@ spec: syn_component: openshift4-monitoring expr: | node_systemd_unit_state{job="node-exporter", state="failed"} == 1 - for: 5m + for: 15m labels: severity: warning syn: 'true' @@ -2620,7 +2305,7 @@ spec: summary: Prometheus operator not ready syn_component: openshift4-monitoring expr: | - min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) + min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) for: 5m labels: severity: warning @@ -2634,7 +2319,7 @@ spec: summary: Errors while reconciling objects. 
syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2662,7 +2347,7 @@ spec: summary: Errors while updating objects status. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2688,7 +2373,7 @@ spec: summary: Errors while performing watch operations in controller. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 for: 15m labels: severity: warning @@ -2698,7 +2383,7 @@ spec: rules: - alert: SYN_SystemMemoryExceedsReservation annotations: - message: System memory usage of {{ $value | humanize }} on {{ $labels.node + description: System memory usage of {{ $value | humanize }} on {{ $labels.node }} exceeds 95% of the reservation. Reserved memory ensures system processes can function even when the node is fully allocated and protects against workload out of memory events impacting the proper functioning of the @@ -2706,6 +2391,8 @@ spec: configurations and should be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) when running nodes with high numbers of pods (either due to rate of change or at steady state). 
+ summary: Alerts the user when, for 15 miutes, a specific node is using + more memory than is reserved syn_component: openshift4-monitoring expr: | sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.95) diff --git a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 836cf8fe..95929000 100644 --- a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -249,7 +249,7 @@ spec: - alert: SYN_ClusterOperatorDown annotations: description: The {{ $labels.name }} operator may be down or disabled because - ${{ $labels.reason }}, and the components it manages may be unavailable + {{ $labels.reason }}, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first @@ -285,7 +285,7 @@ spec: - alert: SYN_ClusterReleaseNotAccepted annotations: description: The desired cluster release has not been accepted because - ${{ $labels.reason }}, and the cluster will continue to reconcile an + {{ $labels.reason }}, and the cluster will continue to reconcile an earlier release instead of moving towards that desired release. For more information refer to 'oc adm upgrade'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or @@ -339,6 +339,7 @@ spec: - alert: SYN_KubeSchedulerDown annotations: description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-scheduler-operator/KubeSchedulerDown.md summary: Target disappeared from Prometheus target discovery. 
syn_component: openshift4-monitoring expr: | @@ -610,6 +611,7 @@ spec: > 0.01 for: 15m labels: + namespace: openshift-monitoring severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1076,6 +1078,7 @@ spec: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1129,6 +1132,7 @@ spec: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 for: 5m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1142,6 +1146,7 @@ spec: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1170,7 +1175,7 @@ spec: mapi_mao_collector_up == 0 for: 5m labels: - severity: critical + severity: warning syn: 'true' syn_component: openshift4-monitoring - name: syn-machine-health-check-unterminated-short-circuit @@ -1245,10 +1250,12 @@ spec: rules: - alert: SYN_MCCDrainError annotations: - message: 'Drain failed on {{ $labels.exported_node }} , updates may be - blocked. For more details check MachineConfigController pod logs: oc - logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx -c - machine-config-controller' + description: 'Drain failed on {{ $labels.exported_node }} , updates may + be blocked. For more details check MachineConfigController pod logs: + oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Alerts the user to a failed node drain. Always triggers when + the failure happens one or more times. syn_component: openshift4-monitoring expr: | mcc_drain_err > 0 @@ -1257,11 +1264,32 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring + - name: syn-mcc-pool-alert + rules: + - alert: SYN_MCCPoolAlert + annotations: + description: 'Node {{ $labels.exported_node }} has triggered a pool alert + due to a label change. For more details check MachineConfigController + pod logs: oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Triggers when nodes in a pool have overlapping labels such as + master, worker, and a custom label therefore a choice must be made as + to which is honored. + syn_component: openshift4-monitoring + expr: | + mcc_pool_alert > 0 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - name: syn-mcd-kubelet-health-state-error rules: - alert: SYN_KubeletHealthState annotations: - message: Kubelet health failure threshold reached + description: Kubelet health failure threshold reached + summary: This keeps track of Kubelet health failures, and tallys them. + The warning is triggered if 2 or more failures occur. syn_component: openshift4-monitoring expr: | mcd_kubelet_state > 2 @@ -1274,9 +1302,11 @@ spec: rules: - alert: SYN_MCDPivotError annotations: - message: 'Error detected in pivot logs on {{ $labels.node }} , upgrade + description: 'Error detected in pivot logs on {{ $labels.node }} , upgrade may be blocked. 
For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user when an error is detected upon pivot. This triggers + if the pivot errors are above zero for 2 minutes. syn_component: openshift4-monitoring expr: | mcd_pivot_errors_total > 0 @@ -1290,9 +1320,11 @@ spec: rules: - alert: SYN_MCDRebootError annotations: - message: 'Reboot failed on {{ $labels.node }} , update may be blocked. + description: 'Reboot failed on {{ $labels.node }} , update may be blocked. For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user that a node failed to reboot one or more times + over a span of 5 minutes. syn_component: openshift4-monitoring expr: | mcd_reboots_failed_total > 0 @@ -1356,20 +1388,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeDiskIOSaturation - annotations: - description: | - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. - This symptom might indicate disk saturation. - summary: Disk IO queue is high. - syn_component: openshift4-monitoring - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) > 10 - for: 30m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeFileDescriptorLimit annotations: description: File descriptors limit at {{ $labels.instance }} is currently @@ -1502,19 +1520,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeMemoryHighUtilization - annotations: - description: | - Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. - summary: Host is running out of memory. - syn_component: openshift4-monitoring - expr: | - 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeMemoryMajorPagesFaults annotations: description: | @@ -1565,7 +1570,7 @@ spec: syn_component: openshift4-monitoring expr: | node_systemd_unit_state{job="node-exporter", state="failed"} == 1 - for: 5m + for: 15m labels: severity: warning syn: 'true' @@ -2166,7 +2171,7 @@ spec: summary: Prometheus operator not ready syn_component: openshift4-monitoring expr: | - min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) + min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) for: 5m labels: severity: warning @@ -2180,7 +2185,7 @@ spec: summary: Errors while reconciling objects. 
syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2208,7 +2213,7 @@ spec: summary: Errors while updating objects status. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2234,7 +2239,7 @@ spec: summary: Errors while performing watch operations in controller. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 for: 15m labels: severity: warning @@ -2244,7 +2249,7 @@ spec: rules: - alert: SYN_SystemMemoryExceedsReservation annotations: - message: System memory usage of {{ $value | humanize }} on {{ $labels.node + description: System memory usage of {{ $value | humanize }} on {{ $labels.node }} exceeds 95% of the reservation. Reserved memory ensures system processes can function even when the node is fully allocated and protects against workload out of memory events impacting the proper functioning of the @@ -2252,6 +2257,8 @@ spec: configurations and should be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) when running nodes with high numbers of pods (either due to rate of change or at steady state). 
+ summary: Alerts the user when, for 15 miutes, a specific node is using + more memory than is reserved syn_component: openshift4-monitoring expr: | sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.95) diff --git a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 39a5b929..f82725a8 100644 --- a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -262,7 +262,7 @@ spec: - alert: SYN_ClusterOperatorDown annotations: description: The {{ $labels.name }} operator may be down or disabled because - ${{ $labels.reason }}, and the components it manages may be unavailable + {{ $labels.reason }}, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first @@ -300,7 +300,7 @@ spec: - alert: SYN_ClusterReleaseNotAccepted annotations: description: The desired cluster release has not been accepted because - ${{ $labels.reason }}, and the cluster will continue to reconcile an + {{ $labels.reason }}, and the cluster will continue to reconcile an earlier release instead of moving towards that desired release. For more information refer to 'oc adm upgrade'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or @@ -357,6 +357,7 @@ spec: - alert: SYN_KubeSchedulerDown annotations: description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-scheduler-operator/KubeSchedulerDown.md summary: Target disappeared from Prometheus target discovery. syn_component: openshift4-monitoring expr: | @@ -640,6 +641,7 @@ spec: > 0.01 for: 15m labels: + namespace: openshift-monitoring severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1131,6 +1133,7 @@ spec: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1188,6 +1191,7 @@ spec: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 for: 5m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1202,6 +1206,7 @@ spec: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1232,7 +1237,7 @@ spec: mapi_mao_collector_up == 0 for: 5m labels: - severity: critical + severity: warning syn: 'true' syn_component: openshift4-monitoring syn_team: clumsy-donkeys @@ -1312,10 +1317,12 @@ spec: rules: - alert: SYN_MCCDrainError annotations: - message: 'Drain failed on {{ $labels.exported_node }} , updates may be - blocked. 
For more details check MachineConfigController pod logs: oc - logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx -c - machine-config-controller' + description: 'Drain failed on {{ $labels.exported_node }} , updates may + be blocked. For more details check MachineConfigController pod logs: + oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Alerts the user to a failed node drain. Always triggers when + the failure happens one or more times. syn_component: openshift4-monitoring expr: | mcc_drain_err > 0 @@ -1325,11 +1332,33 @@ spec: syn: 'true' syn_component: openshift4-monitoring syn_team: clumsy-donkeys + - name: syn-mcc-pool-alert + rules: + - alert: SYN_MCCPoolAlert + annotations: + description: 'Node {{ $labels.exported_node }} has triggered a pool alert + due to a label change. For more details check MachineConfigController + pod logs: oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Triggers when nodes in a pool have overlapping labels such as + master, worker, and a custom label therefore a choice must be made as + to which is honored. + syn_component: openshift4-monitoring + expr: | + mcc_pool_alert > 0 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + syn_team: clumsy-donkeys - name: syn-mcd-kubelet-health-state-error rules: - alert: SYN_KubeletHealthState annotations: - message: Kubelet health failure threshold reached + description: Kubelet health failure threshold reached + summary: This keeps track of Kubelet health failures, and tallys them. + The warning is triggered if 2 or more failures occur. syn_component: openshift4-monitoring expr: | mcd_kubelet_state > 2 @@ -1343,9 +1372,11 @@ spec: rules: - alert: SYN_MCDPivotError annotations: - message: 'Error detected in pivot logs on {{ $labels.node }} , upgrade + description: 'Error detected in pivot logs on {{ $labels.node }} , upgrade may be blocked. For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user when an error is detected upon pivot. This triggers + if the pivot errors are above zero for 2 minutes. syn_component: openshift4-monitoring expr: | mcd_pivot_errors_total > 0 @@ -1360,9 +1391,11 @@ spec: rules: - alert: SYN_MCDRebootError annotations: - message: 'Reboot failed on {{ $labels.node }} , update may be blocked. + description: 'Reboot failed on {{ $labels.node }} , update may be blocked. For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user that a node failed to reboot one or more times + over a span of 5 minutes. syn_component: openshift4-monitoring expr: | mcd_reboots_failed_total > 0 @@ -1430,21 +1463,6 @@ spec: syn: 'true' syn_component: openshift4-monitoring syn_team: clumsy-donkeys - - alert: SYN_NodeDiskIOSaturation - annotations: - description: | - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. - This symptom might indicate disk saturation. - summary: Disk IO queue is high. 
- syn_component: openshift4-monitoring - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) > 10 - for: 30m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - syn_team: clumsy-donkeys - alert: SYN_NodeFileDescriptorLimit annotations: description: File descriptors limit at {{ $labels.instance }} is currently @@ -1584,20 +1602,6 @@ spec: syn: 'true' syn_component: openshift4-monitoring syn_team: clumsy-donkeys - - alert: SYN_NodeMemoryHighUtilization - annotations: - description: | - Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. - summary: Host is running out of memory. - syn_component: openshift4-monitoring - expr: | - 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - syn_team: clumsy-donkeys - alert: SYN_NodeMemoryMajorPagesFaults annotations: description: | @@ -1651,7 +1655,7 @@ spec: syn_component: openshift4-monitoring expr: | node_systemd_unit_state{job="node-exporter", state="failed"} == 1 - for: 5m + for: 15m labels: severity: warning syn: 'true' @@ -2288,7 +2292,7 @@ spec: summary: Prometheus operator not ready syn_component: openshift4-monitoring expr: | - min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) + min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) for: 5m labels: severity: warning @@ -2303,7 +2307,7 @@ spec: summary: Errors while reconciling objects. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2333,7 +2337,7 @@ spec: summary: Errors while updating objects status. 
syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2361,7 +2365,7 @@ spec: summary: Errors while performing watch operations in controller. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 for: 15m labels: severity: warning @@ -2372,7 +2376,7 @@ spec: rules: - alert: SYN_SystemMemoryExceedsReservation annotations: - message: System memory usage of {{ $value | humanize }} on {{ $labels.node + description: System memory usage of {{ $value | humanize }} on {{ $labels.node }} exceeds 95% of the reservation. Reserved memory ensures system processes can function even when the node is fully allocated and protects against workload out of memory events impacting the proper functioning of the @@ -2380,6 +2384,8 @@ spec: configurations and should be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) when running nodes with high numbers of pods (either due to rate of change or at steady state). 
+ summary: Alerts the user when, for 15 miutes, a specific node is using + more memory than is reserved syn_component: openshift4-monitoring expr: | sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.95) diff --git a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 0f1fa02e..2b7ff60f 100644 --- a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -249,7 +249,7 @@ spec: - alert: SYN_ClusterOperatorDown annotations: description: The {{ $labels.name }} operator may be down or disabled because - ${{ $labels.reason }}, and the components it manages may be unavailable + {{ $labels.reason }}, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first @@ -285,7 +285,7 @@ spec: - alert: SYN_ClusterReleaseNotAccepted annotations: description: The desired cluster release has not been accepted because - ${{ $labels.reason }}, and the cluster will continue to reconcile an + {{ $labels.reason }}, and the cluster will continue to reconcile an earlier release instead of moving towards that desired release. For more information refer to 'oc adm upgrade'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or @@ -339,6 +339,7 @@ spec: - alert: SYN_KubeSchedulerDown annotations: description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-scheduler-operator/KubeSchedulerDown.md summary: Target disappeared from Prometheus target discovery. 
syn_component: openshift4-monitoring expr: | @@ -610,6 +611,7 @@ spec: > 0.01 for: 15m labels: + namespace: openshift-monitoring severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1076,6 +1078,7 @@ spec: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1129,6 +1132,7 @@ spec: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 for: 5m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1142,6 +1146,7 @@ spec: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1170,7 +1175,7 @@ spec: mapi_mao_collector_up == 0 for: 5m labels: - severity: critical + severity: warning syn: 'true' syn_component: openshift4-monitoring - name: syn-machine-health-check-unterminated-short-circuit @@ -1245,10 +1250,12 @@ spec: rules: - alert: SYN_MCCDrainError annotations: - message: 'Drain failed on {{ $labels.exported_node }} , updates may be - blocked. For more details check MachineConfigController pod logs: oc - logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx -c - machine-config-controller' + description: 'Drain failed on {{ $labels.exported_node }} , updates may + be blocked. For more details check MachineConfigController pod logs: + oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Alerts the user to a failed node drain. Always triggers when + the failure happens one or more times. syn_component: openshift4-monitoring expr: | mcc_drain_err > 0 @@ -1257,11 +1264,32 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring + - name: syn-mcc-pool-alert + rules: + - alert: SYN_MCCPoolAlert + annotations: + description: 'Node {{ $labels.exported_node }} has triggered a pool alert + due to a label change. For more details check MachineConfigController + pod logs: oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Triggers when nodes in a pool have overlapping labels such as + master, worker, and a custom label therefore a choice must be made as + to which is honored. + syn_component: openshift4-monitoring + expr: | + mcc_pool_alert > 0 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - name: syn-mcd-kubelet-health-state-error rules: - alert: SYN_KubeletHealthState annotations: - message: Kubelet health failure threshold reached + description: Kubelet health failure threshold reached + summary: This keeps track of Kubelet health failures, and tallys them. + The warning is triggered if 2 or more failures occur. syn_component: openshift4-monitoring expr: | mcd_kubelet_state > 2 @@ -1274,9 +1302,11 @@ spec: rules: - alert: SYN_MCDPivotError annotations: - message: 'Error detected in pivot logs on {{ $labels.node }} , upgrade + description: 'Error detected in pivot logs on {{ $labels.node }} , upgrade may be blocked. 
For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user when an error is detected upon pivot. This triggers + if the pivot errors are above zero for 2 minutes. syn_component: openshift4-monitoring expr: | mcd_pivot_errors_total > 0 @@ -1290,9 +1320,11 @@ spec: rules: - alert: SYN_MCDRebootError annotations: - message: 'Reboot failed on {{ $labels.node }} , update may be blocked. + description: 'Reboot failed on {{ $labels.node }} , update may be blocked. For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user that a node failed to reboot one or more times + over a span of 5 minutes. syn_component: openshift4-monitoring expr: | mcd_reboots_failed_total > 0 @@ -1356,20 +1388,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeDiskIOSaturation - annotations: - description: | - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. - This symptom might indicate disk saturation. - summary: Disk IO queue is high. - syn_component: openshift4-monitoring - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) > 10 - for: 30m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeFileDescriptorLimit annotations: description: File descriptors limit at {{ $labels.instance }} is currently @@ -1502,19 +1520,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeMemoryHighUtilization - annotations: - description: | - Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. - summary: Host is running out of memory. - syn_component: openshift4-monitoring - expr: | - 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeMemoryMajorPagesFaults annotations: description: | @@ -1565,7 +1570,7 @@ spec: syn_component: openshift4-monitoring expr: | node_systemd_unit_state{job="node-exporter", state="failed"} == 1 - for: 5m + for: 15m labels: severity: warning syn: 'true' @@ -2166,7 +2171,7 @@ spec: summary: Prometheus operator not ready syn_component: openshift4-monitoring expr: | - min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) + min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) for: 5m labels: severity: warning @@ -2180,7 +2185,7 @@ spec: summary: Errors while reconciling objects. 
syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2208,7 +2213,7 @@ spec: summary: Errors while updating objects status. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2234,7 +2239,7 @@ spec: summary: Errors while performing watch operations in controller. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 for: 15m labels: severity: warning @@ -2244,7 +2249,7 @@ spec: rules: - alert: SYN_SystemMemoryExceedsReservation annotations: - message: System memory usage of {{ $value | humanize }} on {{ $labels.node + description: System memory usage of {{ $value | humanize }} on {{ $labels.node }} exceeds 95% of the reservation. Reserved memory ensures system processes can function even when the node is fully allocated and protects against workload out of memory events impacting the proper functioning of the @@ -2252,6 +2257,8 @@ spec: configurations and should be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) when running nodes with high numbers of pods (either due to rate of change or at steady state). 
+ summary: Alerts the user when, for 15 miutes, a specific node is using + more memory than is reserved syn_component: openshift4-monitoring expr: | sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.95) diff --git a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 1428daf0..8a35c738 100644 --- a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -249,7 +249,7 @@ spec: - alert: SYN_ClusterOperatorDown annotations: description: The {{ $labels.name }} operator may be down or disabled because - ${{ $labels.reason }}, and the components it manages may be unavailable + {{ $labels.reason }}, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first @@ -285,7 +285,7 @@ spec: - alert: SYN_ClusterReleaseNotAccepted annotations: description: The desired cluster release has not been accepted because - ${{ $labels.reason }}, and the cluster will continue to reconcile an + {{ $labels.reason }}, and the cluster will continue to reconcile an earlier release instead of moving towards that desired release. For more information refer to 'oc adm upgrade'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or @@ -339,6 +339,7 @@ spec: - alert: SYN_KubeSchedulerDown annotations: description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-scheduler-operator/KubeSchedulerDown.md summary: Target disappeared from Prometheus target discovery. syn_component: openshift4-monitoring expr: | @@ -610,6 +611,7 @@ spec: > 0.01 for: 15m labels: + namespace: openshift-monitoring severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1076,6 +1078,7 @@ spec: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1129,6 +1132,7 @@ spec: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 for: 5m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1142,6 +1146,7 @@ spec: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1170,7 +1175,7 @@ spec: mapi_mao_collector_up == 0 for: 5m labels: - severity: critical + severity: warning syn: 'true' syn_component: openshift4-monitoring - name: syn-machine-health-check-unterminated-short-circuit @@ -1245,10 +1250,12 @@ spec: rules: - alert: SYN_MCCDrainError annotations: - message: 'Drain failed on {{ $labels.exported_node }} , updates may be - blocked. 
For more details check MachineConfigController pod logs: oc - logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx -c - machine-config-controller' + description: 'Drain failed on {{ $labels.exported_node }} , updates may + be blocked. For more details check MachineConfigController pod logs: + oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Alerts the user to a failed node drain. Always triggers when + the failure happens one or more times. syn_component: openshift4-monitoring expr: | mcc_drain_err > 0 @@ -1257,11 +1264,32 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring + - name: syn-mcc-pool-alert + rules: + - alert: SYN_MCCPoolAlert + annotations: + description: 'Node {{ $labels.exported_node }} has triggered a pool alert + due to a label change. For more details check MachineConfigController + pod logs: oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Triggers when nodes in a pool have overlapping labels such as + master, worker, and a custom label therefore a choice must be made as + to which is honored. + syn_component: openshift4-monitoring + expr: | + mcc_pool_alert > 0 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - name: syn-mcd-kubelet-health-state-error rules: - alert: SYN_KubeletHealthState annotations: - message: Kubelet health failure threshold reached + description: Kubelet health failure threshold reached + summary: This keeps track of Kubelet health failures, and tallys them. + The warning is triggered if 2 or more failures occur. syn_component: openshift4-monitoring expr: | mcd_kubelet_state > 2 @@ -1274,9 +1302,11 @@ spec: rules: - alert: SYN_MCDPivotError annotations: - message: 'Error detected in pivot logs on {{ $labels.node }} , upgrade + description: 'Error detected in pivot logs on {{ $labels.node }} , upgrade may be blocked. For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user when an error is detected upon pivot. This triggers + if the pivot errors are above zero for 2 minutes. syn_component: openshift4-monitoring expr: | mcd_pivot_errors_total > 0 @@ -1290,9 +1320,11 @@ spec: rules: - alert: SYN_MCDRebootError annotations: - message: 'Reboot failed on {{ $labels.node }} , update may be blocked. + description: 'Reboot failed on {{ $labels.node }} , update may be blocked. For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user that a node failed to reboot one or more times + over a span of 5 minutes. syn_component: openshift4-monitoring expr: | mcd_reboots_failed_total > 0 @@ -1356,20 +1388,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeDiskIOSaturation - annotations: - description: | - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. - This symptom might indicate disk saturation. - summary: Disk IO queue is high. 
- syn_component: openshift4-monitoring - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) > 10 - for: 30m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeFileDescriptorLimit annotations: description: File descriptors limit at {{ $labels.instance }} is currently @@ -1502,19 +1520,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeMemoryHighUtilization - annotations: - description: | - Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. - summary: Host is running out of memory. - syn_component: openshift4-monitoring - expr: | - 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeMemoryMajorPagesFaults annotations: description: | @@ -1565,7 +1570,7 @@ spec: syn_component: openshift4-monitoring expr: | node_systemd_unit_state{job="node-exporter", state="failed"} == 1 - for: 5m + for: 15m labels: severity: warning syn: 'true' @@ -2166,7 +2171,7 @@ spec: summary: Prometheus operator not ready syn_component: openshift4-monitoring expr: | - min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) + min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) for: 5m labels: severity: warning @@ -2180,7 +2185,7 @@ spec: summary: Errors while reconciling objects. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2208,7 +2213,7 @@ spec: summary: Errors while updating objects status. 
syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2234,7 +2239,7 @@ spec: summary: Errors while performing watch operations in controller. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 for: 15m labels: severity: warning @@ -2244,7 +2249,7 @@ spec: rules: - alert: SYN_SystemMemoryExceedsReservation annotations: - message: System memory usage of {{ $value | humanize }} on {{ $labels.node + description: System memory usage of {{ $value | humanize }} on {{ $labels.node }} exceeds 95% of the reservation. Reserved memory ensures system processes can function even when the node is fully allocated and protects against workload out of memory events impacting the proper functioning of the @@ -2252,6 +2257,8 @@ spec: configurations and should be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) when running nodes with high numbers of pods (either due to rate of change or at steady state). + summary: Alerts the user when, for 15 miutes, a specific node is using + more memory than is reserved syn_component: openshift4-monitoring expr: | sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.95) @@ -2455,3 +2462,20 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_VSphereOpenshiftVmsCBTMismatch + annotations: + description: | + Cluster node VMs are not configured the same for CBT feature. + message: Cluster node VMs are not configured the same for CBT feature. + summary: | + Periodic vSphere health check is failing due to some nodes not having ctkEnabled matching the other nodes. 
+ To get details about the failure, please see the logs in the vsphere-problem-detector-operator pod in namespace openshift-cluster-storage-operator: + ' oc logs -l name=vsphere-problem-detector-operator -n openshift-cluster-storage-operator --tail=-1 | grep "node_cbt"' + syn_component: openshift4-monitoring + expr: min_over_time(vsphere_vm_cbt_checks{cbt=~"enabled"}[5m]) > 0 and on() + min_over_time(vsphere_vm_cbt_checks{cbt=~"disabled"}[5m]) > 0 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring
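Note: the PromQL change that recurs across the prometheus-operator hunks above is the addition of a `cluster` label to the error-ratio aggregations. A minimal sketch of that pattern, reformatted for readability and taken verbatim from the reconcile-error expression in the generated rules (no names or thresholds beyond what the hunks already contain):

    # ratio of failed to total prometheus-operator reconcile operations,
    # now grouped per cluster as well as per controller and namespace
    (
      sum by (cluster, controller, namespace) (
        rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])
      )
    )
    /
    (
      sum by (cluster, controller, namespace) (
        rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])
      )
    )
    > 0.1

The same `by (cluster,controller,namespace)` grouping is applied to the status-update, watch-operation, and operator-readiness expressions in each golden file.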